1 diff --unified --recursive --new-file linux-2.6.21.4/include/linux/ring.h linux-2.6.21.4-1-686-smp-ring3/include/linux/ring.h
2 --- linux-2.6.21.4/include/linux/ring.h 1970-01-01 00:00:00.000000000 +0000
3 +++ linux-2.6.21.4-1-686-smp-ring3/include/linux/ring.h 2007-06-10 16:43:04.346421348 +0000
6 + * Definitions for packet ring
8 + * 2004-07 Luca Deri <deri@ntop.org>
13 +#define INCLUDE_MAC_INFO
15 +#ifdef INCLUDE_MAC_INFO
16 +#define SKB_DISPLACEMENT 14 /* Include MAC address information */
18 +#define SKB_DISPLACEMENT 0 /* Do NOT include MAC address information */
22 +#define RING_MAGIC_VALUE 0x88
23 +#define RING_FLOWSLOT_VERSION 6
24 +#define RING_VERSION "3.4.1"
26 +#define SO_ADD_TO_CLUSTER 99
27 +#define SO_REMOVE_FROM_CLUSTER 100
28 +#define SO_SET_REFLECTOR 101
29 +#define SO_SET_BLOOM 102
30 +#define SO_SET_STRING 103
31 +#define SO_TOGGLE_BLOOM_STATE 104
32 +#define SO_RESET_BLOOM_FILTERS 105
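In userland these codes drive a ring socket through setsockopt(). A minimal sketch of joining and leaving a cluster, assuming the PF_RING library convention of socket level 0 and a u_int cluster id; fd is an already-open ring socket (see the socket example further down) and the id value 5 is hypothetical:

    #include <sys/socket.h>

    u_int cluster_id = 5;                    /* hypothetical cluster id */
    if (setsockopt(fd, 0, SO_ADD_TO_CLUSTER,
                   &cluster_id, sizeof(cluster_id)) < 0)
      perror("SO_ADD_TO_CLUSTER");
    /* ... capture; sockets in cluster 5 share the packet load ... */
    setsockopt(fd, 0, SO_REMOVE_FROM_CLUSTER, NULL, 0);

How a cluster spreads packets across its members is chosen by the cluster type defined further down (per-flow or round-robin).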
34 +#define BITMASK_SET(n, p) (((char*)p->bits_memory)[n/8] |= (1<<(n % 8)))
35 +#define BITMASK_CLR(n, p) (((char*)p->bits_memory)[n/8] &= ~(1<<(n % 8)))
36 +#define BITMASK_ISSET(n, p) (((char*)p->bits_memory)[n/8] & (1<<(n % 8)))
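These macros treat p->bits_memory as a flat bit array indexed from 0, so the caller must point it at least (num_bits + 7) / 8 zeroed bytes. A minimal sketch, assuming sel points at the bloom-filter selector structure defined below:

    BITMASK_SET(42, sel);                 /* mark bit 42 */
    if (BITMASK_ISSET(42, sel))           /* non-zero while the bit is on */
      BITMASK_CLR(42, sel);               /* clear it again */

Note that p is expanded unparenthesized, so sel must be a plain pointer expression.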
38 +/* *********************************** */
41 + Aho-Corasick code taken from Snort
45 + * DEFINES and Typedef's
47 +#define MAX_ALPHABET_SIZE 256
50 + FAIL STATE for 1, 2, or 4 bytes for state transitions
52 + Uncomment this define to use 32 bit state values
56 +typedef unsigned short acstate_t;
57 +#define ACSM_FAIL_STATE2 0xffff
63 +struct _acsm_pattern2
65 + struct _acsm_pattern2 *next;
67 + unsigned char *patrn;
68 + unsigned char *casepatrn;
79 + * transition nodes - either 8 or 12 bytes
82 +struct trans_node_s {
84 + acstate_t key; /* The character that got us here - sized to keep structure aligned on 4 bytes */
85 + /* to improve caching opportunities. A value that crosses the cache line */
86 + /* forces an expensive reconstruction; typing it as acstate_t prevents that. */
87 + acstate_t next_state; /* */
88 + struct trans_node_s * next; /* next transition for this state */
94 + * User specified final storage type for the state transitions
104 + * User specified machine types
106 + * TRIE : Keyword trie
117 + * Aho-Corasick State Machine Struct - one per group of patterns
123 + ACSM_PATTERN2 * acsmPatterns;
124 + acstate_t * acsmFailState;
125 + ACSM_PATTERN2 ** acsmMatchList;
127 + /* list of transitions in each state; this is used to build the nfa & dfa. */
128 + /* After construction we convert to a sparse or full format matrix and free */
129 + /* the transition lists */
130 + trans_node_t ** acsmTransTable;
132 + acstate_t ** acsmNextState;
134 + int acsmSparseMaxRowNodes;
135 + int acsmSparseMaxZcnt;
138 + int acsmAlphabetSize;
143 +/* *********************************** */
146 +struct pcap_pkthdr {
147 + struct timeval ts; /* time stamp */
148 + u_int32_t caplen; /* length of portion present */
149 + u_int32_t len; /* length this packet (off wire) */
150 + /* packet parsing info */
151 + u_int16_t eth_type; /* Ethernet type */
152 + u_int16_t vlan_id; /* VLAN Id, or 0xFFFF (-1) if untagged */
153 + u_int8_t l3_proto; /* Layer 3 protocol */
154 + u_int16_t l3_offset, l4_offset, payload_offset; /* Offsets of L3/L4/payload elements */
155 + u_int32_t ipv4_src, ipv4_dst; /* IPv4 src/dst IP addresses */
156 + u_int16_t l4_src_port, l4_dst_port; /* Layer 4 src/dst ports */
160 +/* *********************************** */
162 +typedef struct _counter_list {
164 + u_int32_t bit_counter;
165 + struct _counter_list *next;
166 +} bitmask_counter_list;
169 + u_int32_t num_bits, order, num_pages;
170 + unsigned long bits_memory;
171 + bitmask_counter_list *clashes;
174 +/* *********************************** */
177 + cluster_per_flow = 0,
178 + cluster_round_robin
181 +/* *********************************** */
183 +#define RING_MIN_SLOT_SIZE (60+sizeof(struct pcap_pkthdr))
184 +#define RING_MAX_SLOT_SIZE (1514+sizeof(struct pcap_pkthdr))
186 +/* *********************************** */
188 +typedef struct flowSlotInfo {
189 + u_int16_t version, sample_rate;
190 + u_int32_t tot_slots, slot_len, data_len, tot_mem;
192 + u_int64_t tot_pkts, tot_lost;
193 + u_int64_t tot_insert, tot_read;
194 + u_int32_t insert_idx, remove_idx;
197 +/* *********************************** */
199 +typedef struct flowSlot {
201 + u_char magic; /* It must always be zero */
203 + u_char slot_state; /* 0=empty, 1=full */
204 + u_char bucket; /* bucket[bucketLen] */
207 +/* *********************************** */
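Together these two structures define the shared-memory layout: one FlowSlotInfo header followed by tot_slots slots of slot_len bytes each, the kernel advancing insert_idx as producer and userland advancing remove_idx as consumer. A minimal consumer sketch, assuming the slot array begins immediately after the header, the flowSlot typedef is named FlowSlot, and ring is the mmap()ed buffer:

    FlowSlotInfo *info  = (FlowSlotInfo *) ring;
    char         *slots = ring + sizeof(FlowSlotInfo);

    for (;;) {
      FlowSlot *slot = (FlowSlot *) &slots[info->remove_idx * info->slot_len];

      if (slot->slot_state != 1)          /* 0=empty: wait (or block in poll()) */
        continue;

      struct pcap_pkthdr *hdr = (struct pcap_pkthdr *) &slot->bucket;
      /* hdr->caplen bytes of packet data follow the pcap_pkthdr in the bucket */

      slot->slot_state = 0;               /* hand the slot back to the kernel */
      info->remove_idx = (info->remove_idx + 1) % info->tot_slots;
    }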
211 +FlowSlotInfo* getRingPtr(void);
212 +int allocateRing(char *deviceName, u_int numSlots,
213 + u_int bucketLen, u_int sampleRate);
214 +unsigned int pollRing(struct file *fp, struct poll_table_struct * wait);
215 +void deallocateRing(void);
217 +/* ************************* */
219 +typedef int (*handle_ring_skb)(struct sk_buff *skb,
220 + u_char recv_packet, u_char real_skb);
221 +extern handle_ring_skb get_skb_ring_handler(void);
222 +extern void set_skb_ring_handler(handle_ring_skb the_handler);
223 +extern void do_skb_ring_handler(struct sk_buff *skb,
224 + u_char recv_packet, u_char real_skb);
226 +typedef int (*handle_ring_buffer)(struct net_device *dev,
227 + char *data, int len);
228 +extern handle_ring_buffer get_buffer_ring_handler(void);
229 +extern void set_buffer_ring_handler(handle_ring_buffer the_handler);
230 +extern int do_buffer_ring_handler(struct net_device *dev,
231 + char *data, int len);
232 +#endif /* __KERNEL__ */
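The registration API above is deliberately small: the ring module installs a single function pointer and net/core/dev.c invokes it on every packet (see the hooks in the dev.c hunks below). A minimal sketch of the module side, assuming one registrant at a time; the my_* names are hypothetical:

    #include <linux/module.h>
    #include <linux/netdevice.h>
    #include <linux/ring.h>

    static int my_skb_handler(struct sk_buff *skb,
                              u_char recv_packet, u_char real_skb)
    {
      /* return non-zero once the packet has been copied into a ring, so
         that netif_rx()/netif_receive_skb() short-circuit the stack */
      return 0;
    }

    static int __init my_ring_init(void)
    {
      if (get_skb_ring_handler() != NULL)
        return -EBUSY;                    /* another handler already installed */
      set_skb_ring_handler(my_skb_handler);
      return 0;
    }

    static void __exit my_ring_exit(void)
    {
      set_skb_ring_handler(NULL);         /* unhook before unloading */
    }

    module_init(my_ring_init);
    module_exit(my_ring_exit);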
234 +/* *********************************** */
236 +#define PF_RING 27 /* Packet Ring */
237 +#define SOCK_RING PF_RING
240 +#define SIORINGPOLL 0x8888
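From userland the ring is reached as a new socket family. A minimal open sketch, assuming (as in the PF_RING userland library) that the device name travels in sa_data on bind() and that the full ring size is published in FlowSlotInfo.tot_mem, so the buffer is mapped in two steps:

    #include <sys/socket.h>
    #include <sys/mman.h>
    #include <arpa/inet.h>
    #include <linux/if_ether.h>
    #include <linux/ring.h>

    int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));

    struct sockaddr sa;
    sa.sa_family = PF_RING;
    snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", "eth0");
    bind(fd, &sa, sizeof(sa));

    /* map one page to read the header, then remap the whole ring */
    char *ring = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    u_int tot_mem = ((FlowSlotInfo *) ring)->tot_mem;
    munmap(ring, 4096);
    ring = mmap(NULL, tot_mem, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);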
242 +/* *********************************** */
244 +#endif /* __RING_H */
245 diff --unified --recursive --new-file linux-2.6.21.4/net/Kconfig linux-2.6.21.4-1-686-smp-ring3/net/Kconfig
246 --- linux-2.6.21.4/net/Kconfig 2007-06-07 21:27:31.000000000 +0000
247 +++ linux-2.6.21.4-1-686-smp-ring3/net/Kconfig 2007-06-10 16:43:04.402423771 +0000
249 source "net/xfrm/Kconfig"
250 source "net/iucv/Kconfig"
252 +source "net/ring/Kconfig"
254 bool "TCP/IP networking"
256 diff --unified --recursive --new-file linux-2.6.21.4/net/Makefile linux-2.6.21.4-1-686-smp-ring3/net/Makefile
257 --- linux-2.6.21.4/net/Makefile 2007-06-07 21:27:31.000000000 +0000
258 +++ linux-2.6.21.4-1-686-smp-ring3/net/Makefile 2007-06-10 16:43:04.394423425 +0000
260 obj-$(CONFIG_DECNET) += decnet/
261 obj-$(CONFIG_ECONET) += econet/
262 obj-$(CONFIG_VLAN_8021Q) += 8021q/
263 +obj-$(CONFIG_RING) += ring/
264 obj-$(CONFIG_IP_DCCP) += dccp/
265 obj-$(CONFIG_IP_SCTP) += sctp/
266 obj-$(CONFIG_IEEE80211) += ieee80211/
267 diff --unified --recursive --new-file linux-2.6.21.4/net/Makefile.ORG linux-2.6.21.4-1-686-smp-ring3/net/Makefile.ORG
268 --- linux-2.6.21.4/net/Makefile.ORG 1970-01-01 00:00:00.000000000 +0000
269 +++ linux-2.6.21.4-1-686-smp-ring3/net/Makefile.ORG 2007-06-10 16:43:04.386423079 +0000
272 +# Makefile for the linux networking.
274 +# 2 Sep 2000, Christoph Hellwig <hch@infradead.org>
275 +# Rewritten to use lists instead of if-statements.
280 +obj-$(CONFIG_NET) := socket.o core/
282 +tmp-$(CONFIG_COMPAT) := compat.o
283 +obj-$(CONFIG_NET) += $(tmp-y)
285 +# LLC has to be linked before the files in net/802/
286 +obj-$(CONFIG_LLC) += llc/
287 +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
288 +obj-$(CONFIG_NETFILTER) += netfilter/
289 +obj-$(CONFIG_INET) += ipv4/
290 +obj-$(CONFIG_XFRM) += xfrm/
291 +obj-$(CONFIG_UNIX) += unix/
292 +ifneq ($(CONFIG_IPV6),)
295 +obj-$(CONFIG_PACKET) += packet/
296 +obj-$(CONFIG_NET_KEY) += key/
297 +obj-$(CONFIG_NET_SCHED) += sched/
298 +obj-$(CONFIG_BRIDGE) += bridge/
299 +obj-$(CONFIG_IPX) += ipx/
300 +obj-$(CONFIG_ATALK) += appletalk/
301 +obj-$(CONFIG_WAN_ROUTER) += wanrouter/
302 +obj-$(CONFIG_X25) += x25/
303 +obj-$(CONFIG_LAPB) += lapb/
304 +obj-$(CONFIG_NETROM) += netrom/
305 +obj-$(CONFIG_ROSE) += rose/
306 +obj-$(CONFIG_AX25) += ax25/
307 +obj-$(CONFIG_IRDA) += irda/
308 +obj-$(CONFIG_BT) += bluetooth/
309 +obj-$(CONFIG_SUNRPC) += sunrpc/
310 +obj-$(CONFIG_RXRPC) += rxrpc/
311 +obj-$(CONFIG_ATM) += atm/
312 +obj-$(CONFIG_DECNET) += decnet/
313 +obj-$(CONFIG_ECONET) += econet/
314 +obj-$(CONFIG_VLAN_8021Q) += 8021q/
315 +obj-$(CONFIG_IP_DCCP) += dccp/
316 +obj-$(CONFIG_IP_SCTP) += sctp/
317 +obj-$(CONFIG_IEEE80211) += ieee80211/
318 +obj-$(CONFIG_TIPC) += tipc/
319 +obj-$(CONFIG_NETLABEL) += netlabel/
320 +obj-$(CONFIG_IUCV) += iucv/
322 +ifeq ($(CONFIG_NET),y)
323 +obj-$(CONFIG_SYSCTL) += sysctl_net.o
325 diff --unified --recursive --new-file linux-2.6.21.4/net/core/dev.c linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c
326 --- linux-2.6.21.4/net/core/dev.c 2007-06-07 21:27:31.000000000 +0000
327 +++ linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c 2007-06-10 16:43:04.382422906 +0000
329 #include <linux/err.h>
330 #include <linux/ctype.h>
332 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
334 +/* #define RING_DEBUG */
336 +#include <linux/ring.h>
337 +#include <linux/version.h>
339 +static handle_ring_skb ring_handler = NULL;
341 +handle_ring_skb get_skb_ring_handler() { return(ring_handler); }
343 +void set_skb_ring_handler(handle_ring_skb the_handler) {
344 + ring_handler = the_handler;
347 +void do_skb_ring_handler(struct sk_buff *skb,
348 + u_char recv_packet, u_char real_skb) {
350 + ring_handler(skb, recv_packet, real_skb);
353 +/* ******************* */
355 +static handle_ring_buffer buffer_ring_handler = NULL;
357 +handle_ring_buffer get_buffer_ring_handler() { return(buffer_ring_handler); }
359 +void set_buffer_ring_handler(handle_ring_buffer the_handler) {
360 + buffer_ring_handler = the_handler;
363 +int do_buffer_ring_handler(struct net_device *dev, char *data, int len) {
364 + if(buffer_ring_handler) {
365 + buffer_ring_handler(dev, data, len);
371 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
372 +EXPORT_SYMBOL(get_skb_ring_handler);
373 +EXPORT_SYMBOL(set_skb_ring_handler);
374 +EXPORT_SYMBOL(do_skb_ring_handler);
376 +EXPORT_SYMBOL(get_buffer_ring_handler);
377 +EXPORT_SYMBOL(set_buffer_ring_handler);
378 +EXPORT_SYMBOL(do_buffer_ring_handler);
383 * The list of packet types we will receive (as opposed to discard)
384 * and the routines to invoke.
385 @@ -1474,6 +1524,10 @@
386 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
389 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
390 + if(ring_handler) ring_handler(skb, 0, 1);
391 +#endif /* CONFIG_RING */
393 /* Grab device queue */
394 spin_lock(&dev->queue_lock);
396 @@ -1574,6 +1628,13 @@
399 /* if netpoll wants it, pretend we never saw it */
400 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
401 + if(ring_handler && ring_handler(skb, 1, 1)) {
402 + /* The packet has been copied into a ring */
403 + return(NET_RX_SUCCESS);
405 +#endif /* CONFIG_RING */
410 @@ -1764,6 +1825,13 @@
411 struct net_device *orig_dev;
412 int ret = NET_RX_DROP;
414 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
415 + if(ring_handler && ring_handler(skb, 1, 1)) {
416 + /* The packet has been copied into a ring */
417 + return(NET_RX_SUCCESS);
419 +#endif /* CONFIG_RING */
422 /* if we've gotten here through NAPI, check netpoll */
423 if (skb->dev->poll && netpoll_rx(skb))
424 diff --unified --recursive --new-file linux-2.6.21.4/net/core/dev.c.ORG linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c.ORG
425 --- linux-2.6.21.4/net/core/dev.c.ORG 1970-01-01 00:00:00.000000000 +0000
426 +++ linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c.ORG 2007-06-10 16:43:04.354421694 +0000
429 + * NET3 Protocol independent device support routines.
431 + * This program is free software; you can redistribute it and/or
432 + * modify it under the terms of the GNU General Public License
433 + * as published by the Free Software Foundation; either version
434 + * 2 of the License, or (at your option) any later version.
436 + * Derived from the non IP parts of dev.c 1.0.19
437 + * Authors: Ross Biro
438 + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
439 + * Mark Evans, <evansmp@uhura.aston.ac.uk>
441 + * Additional Authors:
442 + * Florian la Roche <rzsfl@rz.uni-sb.de>
443 + * Alan Cox <gw4pts@gw4pts.ampr.org>
444 + * David Hinds <dahinds@users.sourceforge.net>
445 + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
446 + * Adam Sulmicki <adam@cfar.umd.edu>
447 + * Pekka Riikonen <priikone@poesidon.pspt.fi>
450 + * D.J. Barrow : Fixed bug where dev->refcnt gets set
451 + * to 2 if register_netdev gets called
452 + * before net_dev_init & also removed a
453 + * few lines of code in the process.
454 + * Alan Cox : device private ioctl copies fields back.
455 + * Alan Cox : Transmit queue code does relevant
456 + * stunts to keep the queue safe.
457 + * Alan Cox : Fixed double lock.
458 + * Alan Cox : Fixed promisc NULL pointer trap
459 + * ???????? : Support the full private ioctl range
460 + * Alan Cox : Moved ioctl permission check into
462 + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
463 + * Alan Cox : 100 backlog just doesn't cut it when
464 + * you start doing multicast video 8)
465 + * Alan Cox : Rewrote net_bh and list manager.
466 + * Alan Cox : Fix ETH_P_ALL echoback lengths.
467 + * Alan Cox : Took out transmit every packet pass
468 + * Saved a few bytes in the ioctl handler
469 + * Alan Cox : Network driver sets packet type before
470 + * calling netif_rx. Saves a function
472 + * Alan Cox : Hashed net_bh()
473 + * Richard Kooijman: Timestamp fixes.
474 + * Alan Cox : Wrong field in SIOCGIFDSTADDR
475 + * Alan Cox : Device lock protection.
476 + * Alan Cox : Fixed nasty side effect of device close
478 + * Rudi Cilibrasi : Pass the right thing to
479 + * set_mac_address()
480 + * Dave Miller : 32bit quantity for the device lock to
481 + * make it work out on a Sparc.
482 + * Bjorn Ekwall : Added KERNELD hack.
483 + * Alan Cox : Cleaned up the backlog initialise.
484 + * Craig Metz : SIOCGIFCONF fix if space for under
486 + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
487 + * is no device open function.
488 + * Andi Kleen : Fix error reporting for SIOCGIFCONF
489 + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
490 + * Cyrus Durgin : Cleaned for KMOD
491 + * Adam Sulmicki : Bug Fix : Network Device Unload
492 + * A network device unload needs to purge
493 + * the backlog queue.
494 + * Paul Rusty Russell : SIOCSIFNAME
495 + * Pekka Riikonen : Netdev boot-time settings code
496 + * Andrew Morton : Make unregister_netdevice wait
497 + * indefinitely on dev->refcnt
498 + * J Hadi Salim : - Backlog queue sampling
499 + * - netif_rx() feedback
502 +#include <asm/uaccess.h>
503 +#include <asm/system.h>
504 +#include <linux/bitops.h>
505 +#include <linux/capability.h>
506 +#include <linux/cpu.h>
507 +#include <linux/types.h>
508 +#include <linux/kernel.h>
509 +#include <linux/sched.h>
510 +#include <linux/mutex.h>
511 +#include <linux/string.h>
512 +#include <linux/mm.h>
513 +#include <linux/socket.h>
514 +#include <linux/sockios.h>
515 +#include <linux/errno.h>
516 +#include <linux/interrupt.h>
517 +#include <linux/if_ether.h>
518 +#include <linux/netdevice.h>
519 +#include <linux/etherdevice.h>
520 +#include <linux/notifier.h>
521 +#include <linux/skbuff.h>
522 +#include <net/sock.h>
523 +#include <linux/rtnetlink.h>
524 +#include <linux/proc_fs.h>
525 +#include <linux/seq_file.h>
526 +#include <linux/stat.h>
527 +#include <linux/if_bridge.h>
528 +#include <net/dst.h>
529 +#include <net/pkt_sched.h>
530 +#include <net/checksum.h>
531 +#include <linux/highmem.h>
532 +#include <linux/init.h>
533 +#include <linux/kmod.h>
534 +#include <linux/module.h>
535 +#include <linux/kallsyms.h>
536 +#include <linux/netpoll.h>
537 +#include <linux/rcupdate.h>
538 +#include <linux/delay.h>
539 +#include <linux/wireless.h>
540 +#include <net/iw_handler.h>
541 +#include <asm/current.h>
542 +#include <linux/audit.h>
543 +#include <linux/dmaengine.h>
544 +#include <linux/err.h>
545 +#include <linux/ctype.h>
548 + * The list of packet types we will receive (as opposed to discard)
549 + * and the routines to invoke.
551 + * Why 16. Because with 16 the only overlap we get on a hash of the
552 + * low nibble of the protocol value is RARP/SNAP/X.25.
554 + * NOTE: That is no longer true with the addition of VLAN tags. Not
555 + * sure which should go first, but I bet it won't make much
556 + * difference if we are running VLANs. The good news is that
557 + * this protocol won't be in the list unless compiled in, so
558 + * the average user (w/out VLANs) will not be adversely affected.
575 +static DEFINE_SPINLOCK(ptype_lock);
576 +static struct list_head ptype_base[16]; /* 16 way hashed list */
577 +static struct list_head ptype_all; /* Taps */
579 +#ifdef CONFIG_NET_DMA
580 +static struct dma_client *net_dma_client;
581 +static unsigned int net_dma_count;
582 +static spinlock_t net_dma_event_lock;
586 + * The @dev_base list is protected by @dev_base_lock and the rtnl
589 + * Pure readers hold dev_base_lock for reading.
591 + * Writers must hold the rtnl semaphore while they loop through the
592 + * dev_base list, and hold dev_base_lock for writing when they do the
593 + * actual updates. This allows pure readers to access the list even
594 + * while a writer is preparing to update it.
596 + * To put it another way, dev_base_lock is held for writing only to
597 + * protect against pure readers; the rtnl semaphore provides the
598 + * protection against other writers.
600 + * See, for example usages, register_netdevice() and
601 + * unregister_netdevice(), which must be called with the rtnl
604 +struct net_device *dev_base;
605 +static struct net_device **dev_tail = &dev_base;
606 +DEFINE_RWLOCK(dev_base_lock);
608 +EXPORT_SYMBOL(dev_base);
609 +EXPORT_SYMBOL(dev_base_lock);
611 +#define NETDEV_HASHBITS 8
612 +static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
613 +static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
615 +static inline struct hlist_head *dev_name_hash(const char *name)
617 + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
618 + return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
621 +static inline struct hlist_head *dev_index_hash(int ifindex)
623 + return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
627 + * Our notifier list
630 +static RAW_NOTIFIER_HEAD(netdev_chain);
633 + * Device drivers call our routines to queue packets here. We empty the
634 + * queue in the local softnet handler.
636 +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
639 +extern int netdev_sysfs_init(void);
640 +extern int netdev_register_sysfs(struct net_device *);
641 +extern void netdev_unregister_sysfs(struct net_device *);
643 +#define netdev_sysfs_init() (0)
644 +#define netdev_register_sysfs(dev) (0)
645 +#define netdev_unregister_sysfs(dev) do { } while(0)
649 +/*******************************************************************************
651 + Protocol management and registration routines
653 +*******************************************************************************/
659 +static int netdev_nit;
662 + * Add a protocol ID to the list. Now that the input handler is
663 + * smarter we can dispense with all the messy stuff that used to be
666 + * BEWARE!!! Protocol handlers, mangling input packets,
667 + * MUST BE last in hash buckets and checking protocol handlers
668 + * MUST start from promiscuous ptype_all chain in net_bh.
669 + * It is true now, do not change it.
670 + * Explanation follows: if a protocol handler that mangles packets
671 + * were first on the list, it could not sense that the packet
672 + * is cloned and should be copied-on-write, so it would
673 + * change it and subsequent readers would get a broken packet.
678 + * dev_add_pack - add packet handler
679 + * @pt: packet type declaration
681 + * Add a protocol handler to the networking stack. The passed &packet_type
682 + * is linked into kernel lists and may not be freed until it has been
683 + * removed from the kernel lists.
685 + * This call does not sleep, therefore it cannot
686 + * guarantee that all CPUs in the middle of receiving packets
687 + * will see the new packet type (until the next received packet).
690 +void dev_add_pack(struct packet_type *pt)
694 + spin_lock_bh(&ptype_lock);
695 + if (pt->type == htons(ETH_P_ALL)) {
697 + list_add_rcu(&pt->list, &ptype_all);
699 + hash = ntohs(pt->type) & 15;
700 + list_add_rcu(&pt->list, &ptype_base[hash]);
702 + spin_unlock_bh(&ptype_lock);
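Handlers registered for ETH_P_ALL land on the ptype_all taps list and see every frame; everything else goes on one of the 16 hashed chains. A minimal sketch of a promiscuous tap; the my_* names are hypothetical:

    static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                          struct packet_type *pt, struct net_device *orig_dev)
    {
      /* we were handed our own reference to the skb and must release it */
      kfree_skb(skb);
      return 0;
    }

    static struct packet_type my_tap = {
      .type = __constant_htons(ETH_P_ALL),
      .func = my_tap_rcv,
    };

    /* dev_add_pack(&my_tap) at module init, dev_remove_pack(&my_tap) at exit */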
706 + * __dev_remove_pack - remove packet handler
707 + * @pt: packet type declaration
709 + * Remove a protocol handler that was previously added to the kernel
710 + * protocol handlers by dev_add_pack(). The passed &packet_type is removed
711 + * from the kernel lists and can be freed or reused once this function
714 + * The packet type might still be in use by receivers
715 + * and must not be freed until after all the CPUs have gone
716 + * through a quiescent state.
718 +void __dev_remove_pack(struct packet_type *pt)
720 + struct list_head *head;
721 + struct packet_type *pt1;
723 + spin_lock_bh(&ptype_lock);
725 + if (pt->type == htons(ETH_P_ALL)) {
729 + head = &ptype_base[ntohs(pt->type) & 15];
731 + list_for_each_entry(pt1, head, list) {
733 + list_del_rcu(&pt->list);
738 + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
740 + spin_unlock_bh(&ptype_lock);
743 + * dev_remove_pack - remove packet handler
744 + * @pt: packet type declaration
746 + * Remove a protocol handler that was previously added to the kernel
747 + * protocol handlers by dev_add_pack(). The passed &packet_type is removed
748 + * from the kernel lists and can be freed or reused once this function
751 + * This call sleeps to guarantee that no CPU is looking at the packet
752 + * type after return.
754 +void dev_remove_pack(struct packet_type *pt)
756 + __dev_remove_pack(pt);
761 +/******************************************************************************
763 + Device Boot-time Settings Routines
765 +*******************************************************************************/
767 +/* Boot time configuration table */
768 +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
771 + * netdev_boot_setup_add - add new setup entry
772 + * @name: name of the device
773 + * @map: configured settings for the device
775 + * Adds a new setup entry to the dev_boot_setup list. The function
776 + * returns 0 on error and 1 on success. This is a generic routine to
779 +static int netdev_boot_setup_add(char *name, struct ifmap *map)
781 + struct netdev_boot_setup *s;
784 + s = dev_boot_setup;
785 + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
786 + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
787 + memset(s[i].name, 0, sizeof(s[i].name));
788 + strcpy(s[i].name, name);
789 + memcpy(&s[i].map, map, sizeof(s[i].map));
794 + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
798 + * netdev_boot_setup_check - check boot time settings
799 + * @dev: the netdevice
801 + * Check boot time settings for the device.
802 + * The found settings are set for the device to be used
803 + * later in the device probing.
804 + * Returns 0 if no settings found, 1 if they are.
806 +int netdev_boot_setup_check(struct net_device *dev)
808 + struct netdev_boot_setup *s = dev_boot_setup;
811 + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
812 + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
813 + !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
814 + dev->irq = s[i].map.irq;
815 + dev->base_addr = s[i].map.base_addr;
816 + dev->mem_start = s[i].map.mem_start;
817 + dev->mem_end = s[i].map.mem_end;
826 + * netdev_boot_base - get address from boot time settings
827 + * @prefix: prefix for network device
828 + * @unit: id for network device
830 + * Check boot time settings for the base address of device.
831 + * The found settings are set for the device to be used
832 + * later in the device probing.
833 + * Returns 0 if no settings found.
835 +unsigned long netdev_boot_base(const char *prefix, int unit)
837 + const struct netdev_boot_setup *s = dev_boot_setup;
838 + char name[IFNAMSIZ];
841 + sprintf(name, "%s%d", prefix, unit);
844 + * If the device is already registered then return a base of 1
845 + * to indicate not to probe for this interface
847 + if (__dev_get_by_name(name))
850 + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
851 + if (!strcmp(name, s[i].name))
852 + return s[i].map.base_addr;
857 + * Saves boot-time configured settings for any netdevice.
859 +int __init netdev_boot_setup(char *str)
864 + str = get_options(str, ARRAY_SIZE(ints), ints);
868 + /* Save settings */
869 + memset(&map, 0, sizeof(map));
873 + map.base_addr = ints[2];
875 + map.mem_start = ints[3];
877 + map.mem_end = ints[4];
879 + /* Add new entry to the list */
880 + return netdev_boot_setup_add(str, &map);
883 +__setup("netdev=", netdev_boot_setup);
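Given the parse order in netdev_boot_setup() above - irq first, then I/O base, mem_start and mem_end, with the unparsed tail taken as the device name - a boot line that pre-seeds eth0's resources would look like this (the resource values are illustrative):

    netdev=9,0x300,0,0,eth0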
885 +/*******************************************************************************
887 + Device Interface Subroutines
889 +*******************************************************************************/
892 + * __dev_get_by_name - find a device by its name
893 + * @name: name to find
895 + * Find an interface by name. Must be called under RTNL semaphore
896 + * or @dev_base_lock. If the name is found a pointer to the device
897 + * is returned. If the name is not found then %NULL is returned. The
898 + * reference counters are not incremented so the caller must be
899 + * careful with locks.
902 +struct net_device *__dev_get_by_name(const char *name)
904 + struct hlist_node *p;
906 + hlist_for_each(p, dev_name_hash(name)) {
907 + struct net_device *dev
908 + = hlist_entry(p, struct net_device, name_hlist);
909 + if (!strncmp(dev->name, name, IFNAMSIZ))
916 + * dev_get_by_name - find a device by its name
917 + * @name: name to find
919 + * Find an interface by name. This can be called from any
920 + * context and does its own locking. The returned handle has
921 + * the usage count incremented and the caller must use dev_put() to
922 + * release it when it is no longer needed. %NULL is returned if no
923 + * matching device is found.
926 +struct net_device *dev_get_by_name(const char *name)
928 + struct net_device *dev;
930 + read_lock(&dev_base_lock);
931 + dev = __dev_get_by_name(name);
934 + read_unlock(&dev_base_lock);
939 + * __dev_get_by_index - find a device by its ifindex
940 + * @ifindex: index of device
942 + * Search for an interface by index. Returns a pointer to the device,
943 + * or %NULL if it is not found. The device has not
944 + * had its reference counter increased so the caller must be careful
945 + * about locking. The caller must hold either the RTNL semaphore
946 + * or @dev_base_lock.
949 +struct net_device *__dev_get_by_index(int ifindex)
951 + struct hlist_node *p;
953 + hlist_for_each(p, dev_index_hash(ifindex)) {
954 + struct net_device *dev
955 + = hlist_entry(p, struct net_device, index_hlist);
956 + if (dev->ifindex == ifindex)
964 + * dev_get_by_index - find a device by its ifindex
965 + * @ifindex: index of device
967 + * Search for an interface by index. Returns a pointer to the device,
968 + * or NULL if it is not found. The device returned has
969 + * had a reference added and the pointer is safe until the user calls
970 + * dev_put to indicate they have finished with it.
973 +struct net_device *dev_get_by_index(int ifindex)
975 + struct net_device *dev;
977 + read_lock(&dev_base_lock);
978 + dev = __dev_get_by_index(ifindex);
981 + read_unlock(&dev_base_lock);
986 + * dev_getbyhwaddr - find a device by its hardware address
987 + * @type: media type of device
988 + * @ha: hardware address
990 + * Search for an interface by MAC address. Returns a pointer to the device,
991 + * or NULL if it is not found. The caller must hold the
992 + * rtnl semaphore. The returned device has not had its ref count increased
993 + * and the caller must therefore be careful about locking
996 + * If the API was consistent this would be __dev_get_by_hwaddr
999 +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
1001 + struct net_device *dev;
1005 + for (dev = dev_base; dev; dev = dev->next)
1006 + if (dev->type == type &&
1007 + !memcmp(dev->dev_addr, ha, dev->addr_len))
1012 +EXPORT_SYMBOL(dev_getbyhwaddr);
1014 +struct net_device *dev_getfirstbyhwtype(unsigned short type)
1016 + struct net_device *dev;
1019 + for (dev = dev_base; dev; dev = dev->next) {
1020 + if (dev->type == type) {
1029 +EXPORT_SYMBOL(dev_getfirstbyhwtype);
1032 + * dev_get_by_flags - find any device with given flags
1033 + * @if_flags: IFF_* values
1034 + * @mask: bitmask of bits in if_flags to check
1036 + * Search for any interface with the given flags. Returns a pointer to the
1037 + * device, or NULL if none is found. The device returned has
1038 + * had a reference added and the pointer is safe until the user calls
1039 + * dev_put to indicate they have finished with it.
1042 +struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
1044 + struct net_device *dev;
1046 + read_lock(&dev_base_lock);
1047 + for (dev = dev_base; dev != NULL; dev = dev->next) {
1048 + if (((dev->flags ^ if_flags) & mask) == 0) {
1053 + read_unlock(&dev_base_lock);
1058 + * dev_valid_name - check if name is okay for network device
1059 + * @name: name string
1061 + * Network device names need to be valid file names to
1062 + * allow sysfs to work. We also disallow any kind of
1065 +int dev_valid_name(const char *name)
1067 + if (*name == '\0')
1069 + if (strlen(name) >= IFNAMSIZ)
1071 + if (!strcmp(name, ".") || !strcmp(name, ".."))
1075 + if (*name == '/' || isspace(*name))
1083 + * dev_alloc_name - allocate a name for a device
1085 + * @name: name format string
1087 + * Passed a format string - e.g. "lt%d" - it will try to find a suitable
1088 + * id. It scans the list of devices to build up a free map, then chooses
1089 + * the first empty slot. The caller must hold the dev_base or rtnl lock
1090 + * while allocating the name and adding the device in order to avoid
1092 + * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1093 + * Returns the number of the unit assigned or a negative errno code.
1096 +int dev_alloc_name(struct net_device *dev, const char *name)
1099 + char buf[IFNAMSIZ];
1101 + const int max_netdevices = 8*PAGE_SIZE;
1103 + struct net_device *d;
1105 + p = strnchr(name, IFNAMSIZ-1, '%');
1108 + * Verify the string as this thing may have come from
1109 + * the user. There must be either one "%d" and no other "%"
1112 + if (p[1] != 'd' || strchr(p + 2, '%'))
1115 + /* Use one page as a bit array of possible slots */
1116 + inuse = (long *) get_zeroed_page(GFP_ATOMIC);
1120 + for (d = dev_base; d; d = d->next) {
1121 + if (!sscanf(d->name, name, &i))
1123 + if (i < 0 || i >= max_netdevices)
1126 + /* avoid cases where sscanf is not exact inverse of printf */
1127 + snprintf(buf, sizeof(buf), name, i);
1128 + if (!strncmp(buf, d->name, IFNAMSIZ))
1129 + set_bit(i, inuse);
1132 + i = find_first_zero_bit(inuse, max_netdevices);
1133 + free_page((unsigned long) inuse);
1136 + snprintf(buf, sizeof(buf), name, i);
1137 + if (!__dev_get_by_name(buf)) {
1138 + strlcpy(dev->name, buf, IFNAMSIZ);
1142 + /* It is possible to run out of possible slots
1143 + * when the name is long and there isn't enough space left
1144 + * for the digits, or if all bits are used.
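In practice a driver passes a template and lets this routine pick the unit number. A minimal sketch honoring the locking rule stated above; the "ring%d" template is illustrative:

    int err;

    rtnl_lock();                          /* caller must hold rtnl (or dev_base) */
    err = dev_alloc_name(dev, "ring%d");  /* >= 0: assigned unit, < 0: -errno */
    rtnl_unlock();
    if (err < 0)
      return err;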
1151 + * dev_change_name - change name of a device
1153 + * @newname: name (or format string) must be at least IFNAMSIZ
1155 + * Change the name of a device. Format strings such as "eth%d" can be passed
1156 + * for wildcarding.
1158 +int dev_change_name(struct net_device *dev, char *newname)
1164 + if (dev->flags & IFF_UP)
1167 + if (!dev_valid_name(newname))
1170 + if (strchr(newname, '%')) {
1171 + err = dev_alloc_name(dev, newname);
1174 + strcpy(newname, dev->name);
1176 + else if (__dev_get_by_name(newname))
1179 + strlcpy(dev->name, newname, IFNAMSIZ);
1181 + device_rename(&dev->dev, dev->name);
1182 + hlist_del(&dev->name_hlist);
1183 + hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
1184 + raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1190 + * netdev_features_change - device changes features
1191 + * @dev: device to cause notification
1193 + * Called to indicate a device has changed features.
1195 +void netdev_features_change(struct net_device *dev)
1197 + raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
1199 +EXPORT_SYMBOL(netdev_features_change);
1202 + * netdev_state_change - device changes state
1203 + * @dev: device to cause notification
1205 + * Called to indicate a device has changed state. This function calls
1206 + * the notifier chains for netdev_chain and sends a NEWLINK message
1207 + * to the routing socket.
1209 +void netdev_state_change(struct net_device *dev)
1211 + if (dev->flags & IFF_UP) {
1212 + raw_notifier_call_chain(&netdev_chain,
1213 + NETDEV_CHANGE, dev);
1214 + rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1219 + * dev_load - load a network module
1220 + * @name: name of interface
1222 + * If a network interface is not present and the process has suitable
1223 + * privileges this function loads the module. If module loading is not
1224 + * available in this kernel then it becomes a nop.
1227 +void dev_load(const char *name)
1229 + struct net_device *dev;
1231 + read_lock(&dev_base_lock);
1232 + dev = __dev_get_by_name(name);
1233 + read_unlock(&dev_base_lock);
1235 + if (!dev && capable(CAP_SYS_MODULE))
1236 + request_module("%s", name);
1239 +static int default_rebuild_header(struct sk_buff *skb)
1241 + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
1242 + skb->dev ? skb->dev->name : "NULL!!!");
1249 + * dev_open - prepare an interface for use.
1250 + * @dev: device to open
1252 + * Takes a device from down to up state. The device's private open
1253 + * function is invoked and then the multicast lists are loaded. Finally
1254 + * the device is moved into the up state and a %NETDEV_UP message is
1255 + * sent to the netdev notifier chain.
1257 + * Calling this function on an active interface is a nop. On a failure
1258 + * a negative errno code is returned.
1260 +int dev_open(struct net_device *dev)
1265 + * Is it already up?
1268 + if (dev->flags & IFF_UP)
1272 + * Is it even present?
1274 + if (!netif_device_present(dev))
1278 + * Call device private open method
1280 + set_bit(__LINK_STATE_START, &dev->state);
1282 + ret = dev->open(dev);
1284 + clear_bit(__LINK_STATE_START, &dev->state);
1288 + * If it went open OK then:
1295 + dev->flags |= IFF_UP;
1298 + * Initialize multicasting status
1300 + dev_mc_upload(dev);
1303 + * Wakeup transmit queue engine
1305 + dev_activate(dev);
1308 + * ... and announce new interface.
1310 + raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
1316 + * dev_close - shutdown an interface.
1317 + * @dev: device to shutdown
1319 + * This function moves an active device into down state. A
1320 + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1321 + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1324 +int dev_close(struct net_device *dev)
1326 + if (!(dev->flags & IFF_UP))
1330 + * Tell people we are going down, so that they can
1331 + * prepare for death while the device is still operating.
1333 + raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
1335 + dev_deactivate(dev);
1337 + clear_bit(__LINK_STATE_START, &dev->state);
1339 + /* Synchronize to scheduled poll. We cannot touch the poll list,
1340 + * it can even be on a different cpu. So just clear netif_running(),
1341 + * and wait until the poll really happens. Actually, the best place
1342 + * for this is inside dev->stop() after the device has stopped its irq
1343 + * engine, but this requires more changes in devices. */
1345 + smp_mb__after_clear_bit(); /* Commit netif_running(). */
1346 + while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
1352 + * Call the device specific close. This cannot fail.
1353 + * Only if device is UP
1355 + * We allow it to be called even after a DETACH hot-plug
1362 + * Device is now down.
1365 + dev->flags &= ~IFF_UP;
1368 + * Tell people we are down
1370 + raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
1377 + * Device change register/unregister. These are not inline or static
1378 + * as we export them to the world.
1382 + * register_netdevice_notifier - register a network notifier block
1385 + * Register a notifier to be called when network device events occur.
1386 + * The notifier passed is linked into the kernel structures and must
1387 + * not be reused until it has been unregistered. A negative errno code
1388 + * is returned on a failure.
1390 + * When registered, all registration and up events are replayed
1391 + * to the new notifier to allow the device to have a race-free
1392 + * view of the network device list.
1395 +int register_netdevice_notifier(struct notifier_block *nb)
1397 + struct net_device *dev;
1401 + err = raw_notifier_chain_register(&netdev_chain, nb);
1403 + for (dev = dev_base; dev; dev = dev->next) {
1404 + nb->notifier_call(nb, NETDEV_REGISTER, dev);
1406 + if (dev->flags & IFF_UP)
1407 + nb->notifier_call(nb, NETDEV_UP, dev);
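Because registration replays NETDEV_REGISTER and NETDEV_UP for devices that already exist, a late-loading module still sees the complete device list. A minimal sketch; the my_* names are hypothetical:

    static int my_netdev_event(struct notifier_block *nb,
                               unsigned long event, void *ptr)
    {
      struct net_device *dev = ptr;

      if (event == NETDEV_UP)
        printk(KERN_INFO "%s is up\n", dev->name);
      else if (event == NETDEV_DOWN)
        printk(KERN_INFO "%s is down\n", dev->name);
      return NOTIFY_DONE;
    }

    static struct notifier_block my_netdev_notifier = {
      .notifier_call = my_netdev_event,
    };

    /* register_netdevice_notifier(&my_netdev_notifier) at module init */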
1415 + * unregister_netdevice_notifier - unregister a network notifier block
1418 + * Unregister a notifier previously registered by
1419 + * register_netdevice_notifier(). The notifier is unlinked from the
1420 + * kernel structures and may then be reused. A negative errno code
1421 + * is returned on a failure.
1424 +int unregister_netdevice_notifier(struct notifier_block *nb)
1429 + err = raw_notifier_chain_unregister(&netdev_chain, nb);
1435 + * call_netdevice_notifiers - call all network notifier blocks
1436 + * @val: value passed unmodified to notifier function
1437 + * @v: pointer passed unmodified to notifier function
1439 + * Call all network notifier blocks. Parameters and return value
1440 + * are as for raw_notifier_call_chain().
1443 +int call_netdevice_notifiers(unsigned long val, void *v)
1445 + return raw_notifier_call_chain(&netdev_chain, val, v);
1448 +/* When > 0 there are consumers of rx skb time stamps */
1449 +static atomic_t netstamp_needed = ATOMIC_INIT(0);
1451 +void net_enable_timestamp(void)
1453 + atomic_inc(&netstamp_needed);
1456 +void net_disable_timestamp(void)
1458 + atomic_dec(&netstamp_needed);
1461 +void __net_timestamp(struct sk_buff *skb)
1463 + struct timeval tv;
1465 + do_gettimeofday(&tv);
1466 + skb_set_timestamp(skb, &tv);
1468 +EXPORT_SYMBOL(__net_timestamp);
1470 +static inline void net_timestamp(struct sk_buff *skb)
1472 + if (atomic_read(&netstamp_needed))
1473 + __net_timestamp(skb);
1475 + skb->tstamp.off_sec = 0;
1476 + skb->tstamp.off_usec = 0;
1481 + * Support routine. Sends outgoing frames to any network
1482 + * taps currently in use.
1485 +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1487 + struct packet_type *ptype;
1489 + net_timestamp(skb);
1492 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
1493 + /* Never send packets back to the socket
1494 + * they originated from - MvS (miquels@drinkel.ow.org)
1496 + if ((ptype->dev == dev || !ptype->dev) &&
1497 + (ptype->af_packet_priv == NULL ||
1498 + (struct sock *)ptype->af_packet_priv != skb->sk)) {
1499 + struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1503 + /* skb->nh should be correctly
1504 + set by sender, so that the second statement is
1505 + just protection against buggy protocols.
1507 + skb2->mac.raw = skb2->data;
1509 + if (skb2->nh.raw < skb2->data ||
1510 + skb2->nh.raw > skb2->tail) {
1511 + if (net_ratelimit())
1512 + printk(KERN_CRIT "protocol %04x is "
1513 + "buggy, dev %s\n",
1514 + skb2->protocol, dev->name);
1515 + skb2->nh.raw = skb2->data;
1518 + skb2->h.raw = skb2->nh.raw;
1519 + skb2->pkt_type = PACKET_OUTGOING;
1520 + ptype->func(skb2, skb->dev, ptype, skb->dev);
1523 + rcu_read_unlock();
1527 +void __netif_schedule(struct net_device *dev)
1529 + if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1530 + unsigned long flags;
1531 + struct softnet_data *sd;
1533 + local_irq_save(flags);
1534 + sd = &__get_cpu_var(softnet_data);
1535 + dev->next_sched = sd->output_queue;
1536 + sd->output_queue = dev;
1537 + raise_softirq_irqoff(NET_TX_SOFTIRQ);
1538 + local_irq_restore(flags);
1541 +EXPORT_SYMBOL(__netif_schedule);
1543 +void __netif_rx_schedule(struct net_device *dev)
1545 + unsigned long flags;
1547 + local_irq_save(flags);
1549 + list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1550 + if (dev->quota < 0)
1551 + dev->quota += dev->weight;
1553 + dev->quota = dev->weight;
1554 + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1555 + local_irq_restore(flags);
1557 +EXPORT_SYMBOL(__netif_rx_schedule);
1559 +void dev_kfree_skb_any(struct sk_buff *skb)
1561 + if (in_irq() || irqs_disabled())
1562 + dev_kfree_skb_irq(skb);
1564 + dev_kfree_skb(skb);
1566 +EXPORT_SYMBOL(dev_kfree_skb_any);
1569 +/* Hot-plugging. */
1570 +void netif_device_detach(struct net_device *dev)
1572 + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1573 + netif_running(dev)) {
1574 + netif_stop_queue(dev);
1577 +EXPORT_SYMBOL(netif_device_detach);
1579 +void netif_device_attach(struct net_device *dev)
1581 + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1582 + netif_running(dev)) {
1583 + netif_wake_queue(dev);
1584 + __netdev_watchdog_up(dev);
1587 +EXPORT_SYMBOL(netif_device_attach);
1591 + * Invalidate hardware checksum when packet is to be mangled, and
1592 + * complete checksum manually on outgoing path.
1594 +int skb_checksum_help(struct sk_buff *skb)
1597 + int ret = 0, offset = skb->h.raw - skb->data;
1599 + if (skb->ip_summed == CHECKSUM_COMPLETE)
1600 + goto out_set_summed;
1602 + if (unlikely(skb_shinfo(skb)->gso_size)) {
1603 + /* Let GSO fix up the checksum. */
1604 + goto out_set_summed;
1607 + if (skb_cloned(skb)) {
1608 + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1613 + BUG_ON(offset > (int)skb->len);
1614 + csum = skb_checksum(skb, offset, skb->len-offset, 0);
1616 + offset = skb->tail - skb->h.raw;
1617 + BUG_ON(offset <= 0);
1618 + BUG_ON(skb->csum_offset + 2 > offset);
1620 + *(__sum16*)(skb->h.raw + skb->csum_offset) = csum_fold(csum);
1623 + skb->ip_summed = CHECKSUM_NONE;
1629 + * skb_gso_segment - Perform segmentation on skb.
1630 + * @skb: buffer to segment
1631 + * @features: features for the output path (see dev->features)
1633 + * This function segments the given skb and returns a list of segments.
1635 + * It may return NULL if the skb requires no segmentation. This is
1636 + * only possible when GSO is used for verifying header integrity.
1638 +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1640 + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1641 + struct packet_type *ptype;
1642 + __be16 type = skb->protocol;
1645 + BUG_ON(skb_shinfo(skb)->frag_list);
1647 + skb->mac.raw = skb->data;
1648 + skb->mac_len = skb->nh.raw - skb->data;
1649 + __skb_pull(skb, skb->mac_len);
1651 + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1652 + if (skb_header_cloned(skb) &&
1653 + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1654 + return ERR_PTR(err);
1658 + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1659 + if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1660 + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1661 + err = ptype->gso_send_check(skb);
1662 + segs = ERR_PTR(err);
1663 + if (err || skb_gso_ok(skb, features))
1665 + __skb_push(skb, skb->data - skb->nh.raw);
1667 + segs = ptype->gso_segment(skb, features);
1671 + rcu_read_unlock();
1673 + __skb_push(skb, skb->data - skb->mac.raw);
1678 +EXPORT_SYMBOL(skb_gso_segment);
1680 +/* Take action when hardware reception checksum errors are detected. */
1682 +void netdev_rx_csum_fault(struct net_device *dev)
1684 + if (net_ratelimit()) {
1685 + printk(KERN_ERR "%s: hw csum failure.\n",
1686 + dev ? dev->name : "<unknown>");
1690 +EXPORT_SYMBOL(netdev_rx_csum_fault);
1693 +/* Actually, we should eliminate this check as soon as we know that:
1694 + * 1. IOMMU is present and can map all the memory.
1695 + * 2. No high memory really exists on this machine.
1698 +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1700 +#ifdef CONFIG_HIGHMEM
1703 + if (dev->features & NETIF_F_HIGHDMA)
1706 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1707 + if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1714 +struct dev_gso_cb {
1715 + void (*destructor)(struct sk_buff *skb);
1718 +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1720 +static void dev_gso_skb_destructor(struct sk_buff *skb)
1722 + struct dev_gso_cb *cb;
1725 + struct sk_buff *nskb = skb->next;
1727 + skb->next = nskb->next;
1728 + nskb->next = NULL;
1730 + } while (skb->next);
1732 + cb = DEV_GSO_CB(skb);
1733 + if (cb->destructor)
1734 + cb->destructor(skb);
1738 + * dev_gso_segment - Perform emulated hardware segmentation on skb.
1739 + * @skb: buffer to segment
1741 + * This function segments the given skb and stores the list of segments
1744 +static int dev_gso_segment(struct sk_buff *skb)
1746 + struct net_device *dev = skb->dev;
1747 + struct sk_buff *segs;
1748 + int features = dev->features & ~(illegal_highdma(dev, skb) ?
1751 + segs = skb_gso_segment(skb, features);
1753 + /* Verifying header integrity only. */
1757 + if (unlikely(IS_ERR(segs)))
1758 + return PTR_ERR(segs);
1761 + DEV_GSO_CB(skb)->destructor = skb->destructor;
1762 + skb->destructor = dev_gso_skb_destructor;
1767 +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1769 + if (likely(!skb->next)) {
1771 + dev_queue_xmit_nit(skb, dev);
1773 + if (netif_needs_gso(dev, skb)) {
1774 + if (unlikely(dev_gso_segment(skb)))
1775 + goto out_kfree_skb;
1780 + return dev->hard_start_xmit(skb, dev);
1785 + struct sk_buff *nskb = skb->next;
1788 + skb->next = nskb->next;
1789 + nskb->next = NULL;
1790 + rc = dev->hard_start_xmit(nskb, dev);
1791 + if (unlikely(rc)) {
1792 + nskb->next = skb->next;
1796 + if (unlikely(netif_queue_stopped(dev) && skb->next))
1797 + return NETDEV_TX_BUSY;
1798 + } while (skb->next);
1800 + skb->destructor = DEV_GSO_CB(skb)->destructor;
1807 +#define HARD_TX_LOCK(dev, cpu) { \
1808 + if ((dev->features & NETIF_F_LLTX) == 0) { \
1809 + netif_tx_lock(dev); \
1813 +#define HARD_TX_UNLOCK(dev) { \
1814 + if ((dev->features & NETIF_F_LLTX) == 0) { \
1815 + netif_tx_unlock(dev); \
1820 + * dev_queue_xmit - transmit a buffer
1821 + * @skb: buffer to transmit
1823 + * Queue a buffer for transmission to a network device. The caller must
1824 + * have set the device and priority and built the buffer before calling
1825 + * this function. The function can be called from an interrupt.
1827 + * A negative errno code is returned on a failure. A success does not
1828 + * guarantee the frame will be transmitted as it may be dropped due
1829 + * to congestion or traffic shaping.
1831 + * -----------------------------------------------------------------------------------
1832 + * I notice this method can also return errors from the queue disciplines,
1833 + * including NET_XMIT_DROP, which is a positive value. So, errors can also
1836 + * Regardless of the return value, the skb is consumed, so it is currently
1837 + * difficult to retry a send to this method. (You can bump the ref count
1838 + * before sending to hold a reference for retry if you are careful.)
1840 + * When calling this method, interrupts MUST be enabled. This is because
1841 + * the BH enable code must have IRQs enabled so that it will not deadlock.
1845 +int dev_queue_xmit(struct sk_buff *skb)
1847 + struct net_device *dev = skb->dev;
1851 + /* GSO will handle the following emulations directly. */
1852 + if (netif_needs_gso(dev, skb))
1855 + if (skb_shinfo(skb)->frag_list &&
1856 + !(dev->features & NETIF_F_FRAGLIST) &&
1857 + __skb_linearize(skb))
1858 + goto out_kfree_skb;
1860 + /* Fragmented skb is linearized if device does not support SG,
1861 + * or if at least one of the fragments is in highmem and device
1862 + * does not support DMA from it.
1864 + if (skb_shinfo(skb)->nr_frags &&
1865 + (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1866 + __skb_linearize(skb))
1867 + goto out_kfree_skb;
1869 + /* If packet is not checksummed and device does not support
1870 + * checksumming for this protocol, complete checksumming here.
1872 + if (skb->ip_summed == CHECKSUM_PARTIAL &&
1873 + (!(dev->features & NETIF_F_GEN_CSUM) &&
1874 + (!(dev->features & NETIF_F_IP_CSUM) ||
1875 + skb->protocol != htons(ETH_P_IP))))
1876 + if (skb_checksum_help(skb))
1877 + goto out_kfree_skb;
1880 + spin_lock_prefetch(&dev->queue_lock);
1882 + /* Disable soft irqs for various locks below. Also
1883 + * stops preemption for RCU.
1885 + rcu_read_lock_bh();
1887 + /* Updates of qdisc are serialized by queue_lock.
1888 + * The struct Qdisc which is pointed to by qdisc is now a
1889 + * rcu structure - it may be accessed without acquiring
1890 + * a lock (but the structure may be stale.) The freeing of the
1891 + * qdisc will be deferred until it's known that there are no
1892 + * more references to it.
1894 + * If the qdisc has an enqueue function, we still need to
1895 + * hold the queue_lock before calling it, since queue_lock
1896 + * also serializes access to the device queue.
1899 + q = rcu_dereference(dev->qdisc);
1900 +#ifdef CONFIG_NET_CLS_ACT
1901 + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1904 + /* Grab device queue */
1905 + spin_lock(&dev->queue_lock);
1908 + rc = q->enqueue(skb, q);
1910 + spin_unlock(&dev->queue_lock);
1912 + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1915 + spin_unlock(&dev->queue_lock);
1918 + /* The device has no queue. Common case for software devices:
1919 + loopback, all the sorts of tunnels...
1921 + Really, it is unlikely that netif_tx_lock protection is necessary
1922 + here. (e.g. loopback and IP tunnels are clean ignoring statistics
1924 + However, it is possible that they rely on protection
1927 + Check this and shoot the lock. It is not prone to deadlocks.
1928 + Either shoot the noqueue qdisc, it is even simpler 8)
1930 + if (dev->flags & IFF_UP) {
1931 + int cpu = smp_processor_id(); /* ok because BHs are off */
1933 + if (dev->xmit_lock_owner != cpu) {
1935 + HARD_TX_LOCK(dev, cpu);
1937 + if (!netif_queue_stopped(dev)) {
1939 + if (!dev_hard_start_xmit(skb, dev)) {
1940 + HARD_TX_UNLOCK(dev);
1944 + HARD_TX_UNLOCK(dev);
1945 + if (net_ratelimit())
1946 + printk(KERN_CRIT "Virtual device %s asks to "
1947 + "queue packet!\n", dev->name);
1949 + /* Recursion is detected! It is possible,
1950 + * unfortunately */
1951 + if (net_ratelimit())
1952 + printk(KERN_CRIT "Dead loop on virtual device "
1953 + "%s, fix it urgently!\n", dev->name);
1958 + rcu_read_unlock_bh();
1964 + rcu_read_unlock_bh();
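The contract spelled out in the comment above bears repeating: the skb is consumed whatever the outcome, so the caller builds a complete frame and lets go of it. A minimal caller sketch; frame, len and the protocol value are illustrative:

    struct sk_buff *skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
    if (skb == NULL)
      return -ENOMEM;

    skb_reserve(skb, LL_RESERVED_SPACE(dev));   /* room for the link header */
    memcpy(skb_put(skb, len), frame, len);
    skb->dev = dev;
    skb->protocol = htons(ETH_P_IP);

    return dev_queue_xmit(skb);   /* may also return positive NET_XMIT_* codes */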
1969 +/*=======================================================================
1971 + =======================================================================*/
1973 +int netdev_max_backlog = 1000;
1974 +int netdev_budget = 300;
1975 +int weight_p = 64; /* old backlog weight */
1977 +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1981 + * netif_rx - post buffer to the network code
1982 + * @skb: buffer to post
1984 + * This function receives a packet from a device driver and queues it for
1985 + * the upper (protocol) levels to process. It always succeeds. The buffer
1986 + * may be dropped during processing for congestion control or by the
1987 + * protocol layers.
1990 + * NET_RX_SUCCESS (no congestion)
1991 + * NET_RX_CN_LOW (low congestion)
1992 + * NET_RX_CN_MOD (moderate congestion)
1993 + * NET_RX_CN_HIGH (high congestion)
1994 + * NET_RX_DROP (packet was dropped)
1998 +int netif_rx(struct sk_buff *skb)
2000 + struct softnet_data *queue;
2001 + unsigned long flags;
2003 + /* if netpoll wants it, pretend we never saw it */
2004 + if (netpoll_rx(skb))
2005 + return NET_RX_DROP;
2007 + if (!skb->tstamp.off_sec)
2008 + net_timestamp(skb);
2011 + * The code is rearranged so that the path is
2012 + * shortest when the CPU is congested, but still operating.
2014 + local_irq_save(flags);
2015 + queue = &__get_cpu_var(softnet_data);
2017 + __get_cpu_var(netdev_rx_stat).total++;
2018 + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2019 + if (queue->input_pkt_queue.qlen) {
2021 + dev_hold(skb->dev);
2022 + __skb_queue_tail(&queue->input_pkt_queue, skb);
2023 + local_irq_restore(flags);
2024 + return NET_RX_SUCCESS;
2027 + netif_rx_schedule(&queue->backlog_dev);
2031 + __get_cpu_var(netdev_rx_stat).dropped++;
2032 + local_irq_restore(flags);
2035 + return NET_RX_DROP;
2038 +int netif_rx_ni(struct sk_buff *skb)
2042 + preempt_disable();
2043 + err = netif_rx(skb);
2044 + if (local_softirq_pending())
2051 +EXPORT_SYMBOL(netif_rx_ni);
2053 +static inline struct net_device *skb_bond(struct sk_buff *skb)
2055 + struct net_device *dev = skb->dev;
2057 + if (dev->master) {
2058 + if (skb_bond_should_drop(skb)) {
2062 + skb->dev = dev->master;
2068 +static void net_tx_action(struct softirq_action *h)
2070 + struct softnet_data *sd = &__get_cpu_var(softnet_data);
2072 + if (sd->completion_queue) {
2073 + struct sk_buff *clist;
2075 + local_irq_disable();
2076 + clist = sd->completion_queue;
2077 + sd->completion_queue = NULL;
2078 + local_irq_enable();
2081 + struct sk_buff *skb = clist;
2082 + clist = clist->next;
2084 + BUG_TRAP(!atomic_read(&skb->users));
2089 + if (sd->output_queue) {
2090 + struct net_device *head;
2092 + local_irq_disable();
2093 + head = sd->output_queue;
2094 + sd->output_queue = NULL;
2095 + local_irq_enable();
2098 + struct net_device *dev = head;
2099 + head = head->next_sched;
2101 + smp_mb__before_clear_bit();
2102 + clear_bit(__LINK_STATE_SCHED, &dev->state);
2104 + if (spin_trylock(&dev->queue_lock)) {
2106 + spin_unlock(&dev->queue_lock);
2108 + netif_schedule(dev);
2114 +static __inline__ int deliver_skb(struct sk_buff *skb,
2115 + struct packet_type *pt_prev,
2116 + struct net_device *orig_dev)
2118 + atomic_inc(&skb->users);
2119 + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2122 +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2123 +int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
2125 +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2126 + unsigned char *addr);
2127 +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
2129 +static __inline__ int handle_bridge(struct sk_buff **pskb,
2130 + struct packet_type **pt_prev, int *ret,
2131 + struct net_device *orig_dev)
2133 + struct net_bridge_port *port;
2135 + if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
2136 + (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
2140 + *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
2144 + return br_handle_frame_hook(port, pskb);
2147 +#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
2150 +#ifdef CONFIG_NET_CLS_ACT
2151 +/* TODO: Maybe we should just force sch_ingress to be compiled in
2152 + * when CONFIG_NET_CLS_ACT is? Otherwise we pay some useless instructions:
2153 + * a compare and 2 extra stores right now if we don't have it on
2154 + * but do have CONFIG_NET_CLS_ACT
2155 + * NOTE: This doesn't remove any functionality; if you don't have
2156 + * the ingress scheduler, you just can't add policies on ingress.
2159 +static int ing_filter(struct sk_buff *skb)
2162 + struct net_device *dev = skb->dev;
2163 + int result = TC_ACT_OK;
2165 + if (dev->qdisc_ingress) {
2166 + __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
2167 + if (MAX_RED_LOOP < ttl++) {
2168 + printk(KERN_WARNING "Redir loop detected, dropping packet (%d->%d)\n",
2169 + skb->iif, skb->dev->ifindex);
2170 + return TC_ACT_SHOT;
2173 + skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
2175 + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
2177 + spin_lock(&dev->queue_lock);
2178 + if ((q = dev->qdisc_ingress) != NULL)
2179 + result = q->enqueue(skb, q);
2180 + spin_unlock(&dev->queue_lock);
2188 +int netif_receive_skb(struct sk_buff *skb)
2190 + struct packet_type *ptype, *pt_prev;
2191 + struct net_device *orig_dev;
2192 + int ret = NET_RX_DROP;
2195 + /* if we've gotten here through NAPI, check netpoll */
2196 + if (skb->dev->poll && netpoll_rx(skb))
2197 + return NET_RX_DROP;
2199 + if (!skb->tstamp.off_sec)
2200 + net_timestamp(skb);
2203 + skb->iif = skb->dev->ifindex;
2205 + orig_dev = skb_bond(skb);
2208 + return NET_RX_DROP;
2210 + __get_cpu_var(netdev_rx_stat).total++;
2212 + skb->h.raw = skb->nh.raw = skb->data;
2213 + skb->mac_len = skb->nh.raw - skb->mac.raw;
2219 +#ifdef CONFIG_NET_CLS_ACT
2220 + if (skb->tc_verd & TC_NCLS) {
2221 + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2226 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
2227 + if (!ptype->dev || ptype->dev == skb->dev) {
2229 + ret = deliver_skb(skb, pt_prev, orig_dev);
2234 +#ifdef CONFIG_NET_CLS_ACT
2236 + ret = deliver_skb(skb, pt_prev, orig_dev);
2237 +		pt_prev = NULL; /* no one else should process this afterwards */
2239 + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2242 + ret = ing_filter(skb);
2244 + if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
2253 + if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
2256 + type = skb->protocol;
2257 + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
2258 + if (ptype->type == type &&
2259 + (!ptype->dev || ptype->dev == skb->dev)) {
2261 + ret = deliver_skb(skb, pt_prev, orig_dev);
2267 + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2270 +		/* Jamal, now you will not be able to escape explaining
2271 +		 * to me how you were going to use this. :-)
2273 + ret = NET_RX_DROP;
2277 + rcu_read_unlock();
2281 +static int process_backlog(struct net_device *backlog_dev, int *budget)
2284 + int quota = min(backlog_dev->quota, *budget);
2285 + struct softnet_data *queue = &__get_cpu_var(softnet_data);
2286 + unsigned long start_time = jiffies;
2288 + backlog_dev->weight = weight_p;
2290 + struct sk_buff *skb;
2291 + struct net_device *dev;
2293 + local_irq_disable();
2294 + skb = __skb_dequeue(&queue->input_pkt_queue);
2297 + local_irq_enable();
2301 + netif_receive_skb(skb);
2307 + if (work >= quota || jiffies - start_time > 1)
2312 + backlog_dev->quota -= work;
2317 + backlog_dev->quota -= work;
2320 + list_del(&backlog_dev->poll_list);
2321 + smp_mb__before_clear_bit();
2322 + netif_poll_enable(backlog_dev);
2324 + local_irq_enable();
2328 +static void net_rx_action(struct softirq_action *h)
2330 + struct softnet_data *queue = &__get_cpu_var(softnet_data);
2331 + unsigned long start_time = jiffies;
2332 + int budget = netdev_budget;
2335 + local_irq_disable();
2337 + while (!list_empty(&queue->poll_list)) {
2338 + struct net_device *dev;
2340 + if (budget <= 0 || jiffies - start_time > 1)
2341 + goto softnet_break;
2343 + local_irq_enable();
2345 + dev = list_entry(queue->poll_list.next,
2346 + struct net_device, poll_list);
2347 + have = netpoll_poll_lock(dev);
2349 + if (dev->quota <= 0 || dev->poll(dev, &budget)) {
2350 + netpoll_poll_unlock(have);
2351 + local_irq_disable();
2352 + list_move_tail(&dev->poll_list, &queue->poll_list);
2353 + if (dev->quota < 0)
2354 + dev->quota += dev->weight;
2356 + dev->quota = dev->weight;
2358 + netpoll_poll_unlock(have);
2360 + local_irq_disable();
2364 +#ifdef CONFIG_NET_DMA
2366 + * There may not be any more sk_buffs coming right now, so push
2367 + * any pending DMA copies to hardware
2369 + if (net_dma_client) {
2370 + struct dma_chan *chan;
2372 + list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
2373 + dma_async_memcpy_issue_pending(chan);
2374 + rcu_read_unlock();
2377 + local_irq_enable();
2381 + __get_cpu_var(netdev_rx_stat).time_squeeze++;
2382 + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2386 +static gifconf_func_t * gifconf_list [NPROTO];
2389 + * register_gifconf - register a SIOCGIF handler
2390 + * @family: Address family
2391 + * @gifconf: Function handler
2393 + * Register protocol dependent address dumping routines. The handler
2394 + * that is passed must not be freed or reused until it has been replaced
2395 + * by another handler.
2397 +int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2399 + if (family >= NPROTO)
2401 + gifconf_list[family] = gifconf;
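/*
 * [editorial sketch] How a protocol family is expected to hook in here,
 * assuming the gifconf_func_t signature used above (device, user buffer,
 * buffer length; returns bytes used, or bytes needed when the buffer is
 * NULL). The names are illustrative; IPv4 registers its own equivalent.
 */
static int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* walk dev's addresses and copy struct ifreq entries into buf */
	return 0;
}

static int __init example_proto_init(void)
{
	/* the handler must stay valid until replaced by another one */
	return register_gifconf(PF_INET, example_gifconf);
}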
2407 + * Map an interface index to its name (SIOCGIFNAME)
2411 + * We need this ioctl for efficient implementation of the
2412 + * if_indextoname() function required by the IPv6 API. Without
2413 + *	it, we would have to search all the interfaces to find a match.
2417 +static int dev_ifname(struct ifreq __user *arg)
2419 + struct net_device *dev;
2423 + * Fetch the caller's info block.
2426 + if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2429 + read_lock(&dev_base_lock);
2430 + dev = __dev_get_by_index(ifr.ifr_ifindex);
2432 + read_unlock(&dev_base_lock);
2436 + strcpy(ifr.ifr_name, dev->name);
2437 + read_unlock(&dev_base_lock);
2439 + if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2445 + * Perform a SIOCGIFCONF call. This structure will change
2446 + * size eventually, and there is nothing I can do about it.
2447 + * Thus we will need a 'compatibility mode'.
2450 +static int dev_ifconf(char __user *arg)
2452 + struct ifconf ifc;
2453 + struct net_device *dev;
2460 + * Fetch the caller's info block.
2463 + if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2466 + pos = ifc.ifc_buf;
2467 + len = ifc.ifc_len;
2470 + * Loop over the interfaces, and write an info block for each.
2474 + for (dev = dev_base; dev; dev = dev->next) {
2475 + for (i = 0; i < NPROTO; i++) {
2476 + if (gifconf_list[i]) {
2479 + done = gifconf_list[i](dev, NULL, 0);
2481 + done = gifconf_list[i](dev, pos + total,
2491 + * All done. Write the updated control block back to the caller.
2493 + ifc.ifc_len = total;
2496 + * Both BSD and Solaris return 0 here, so we do too.
2498 + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
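/*
 * [editorial sketch] The matching user-space side of the SIOCGIFCONF
 * loop above; a minimal example, error handling trimmed.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
	struct ifreq reqs[16];
	struct ifconf ifc;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	unsigned int i;

	if (fd < 0)
		return 1;
	ifc.ifc_len = sizeof(reqs);	/* kernel trims this to what fits */
	ifc.ifc_req = reqs;
	if (ioctl(fd, SIOCGIFCONF, &ifc) == 0)
		for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
			printf("%s\n", reqs[i].ifr_name);
	close(fd);
	return 0;
}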
2501 +#ifdef CONFIG_PROC_FS
2503 + *	This is invoked by the /proc filesystem handler to display a device in detail.
2506 +static __inline__ struct net_device *dev_get_idx(loff_t pos)
2508 + struct net_device *dev;
2511 + for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
2513 + return i == pos ? dev : NULL;
2516 +void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2518 + read_lock(&dev_base_lock);
2519 + return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
2522 +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2525 + return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
2528 +void dev_seq_stop(struct seq_file *seq, void *v)
2530 + read_unlock(&dev_base_lock);
2533 +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2535 + if (dev->get_stats) {
2536 + struct net_device_stats *stats = dev->get_stats(dev);
2538 + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2539 + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2540 + dev->name, stats->rx_bytes, stats->rx_packets,
2542 + stats->rx_dropped + stats->rx_missed_errors,
2543 + stats->rx_fifo_errors,
2544 + stats->rx_length_errors + stats->rx_over_errors +
2545 + stats->rx_crc_errors + stats->rx_frame_errors,
2546 + stats->rx_compressed, stats->multicast,
2547 + stats->tx_bytes, stats->tx_packets,
2548 + stats->tx_errors, stats->tx_dropped,
2549 + stats->tx_fifo_errors, stats->collisions,
2550 + stats->tx_carrier_errors +
2551 + stats->tx_aborted_errors +
2552 + stats->tx_window_errors +
2553 + stats->tx_heartbeat_errors,
2554 + stats->tx_compressed);
2556 + seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2560 + *	Called from the PROCfs module. This now uses the new arbitrarily sized
2561 + *	/proc/net interface to create /proc/net/dev.
2563 +static int dev_seq_show(struct seq_file *seq, void *v)
2565 + if (v == SEQ_START_TOKEN)
2566 + seq_puts(seq, "Inter-| Receive "
2568 + " face |bytes packets errs drop fifo frame "
2569 + "compressed multicast|bytes packets errs "
2570 + "drop fifo colls carrier compressed\n");
2572 + dev_seq_printf_stats(seq, v);
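/*
 * [editorial sketch] A user-space consumer of the seq_file output built
 * above: two header lines from dev_seq_show(), then one line per device
 * from dev_seq_printf_stats(). Minimal example, error handling trimmed.
 */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/dev", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip "Inter-| Receive..." */
	fgets(line, sizeof(line), f);	/* skip the column header */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "  eth0: bytes packets ..." */
	fclose(f);
	return 0;
}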
2576 +static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2578 + struct netif_rx_stats *rc = NULL;
2580 + while (*pos < NR_CPUS)
2581 + if (cpu_online(*pos)) {
2582 + rc = &per_cpu(netdev_rx_stat, *pos);
2589 +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2591 + return softnet_get_online(pos);
2594 +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2597 + return softnet_get_online(pos);
2600 +static void softnet_seq_stop(struct seq_file *seq, void *v)
2604 +static int softnet_seq_show(struct seq_file *seq, void *v)
2606 + struct netif_rx_stats *s = v;
2608 + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2609 + s->total, s->dropped, s->time_squeeze, 0,
2610 + 0, 0, 0, 0, /* was fastroute */
2611 + s->cpu_collision );
2615 +static struct seq_operations dev_seq_ops = {
2616 + .start = dev_seq_start,
2617 + .next = dev_seq_next,
2618 + .stop = dev_seq_stop,
2619 + .show = dev_seq_show,
2622 +static int dev_seq_open(struct inode *inode, struct file *file)
2624 + return seq_open(file, &dev_seq_ops);
2627 +static const struct file_operations dev_seq_fops = {
2628 + .owner = THIS_MODULE,
2629 + .open = dev_seq_open,
2631 + .llseek = seq_lseek,
2632 + .release = seq_release,
2635 +static struct seq_operations softnet_seq_ops = {
2636 + .start = softnet_seq_start,
2637 + .next = softnet_seq_next,
2638 + .stop = softnet_seq_stop,
2639 + .show = softnet_seq_show,
2642 +static int softnet_seq_open(struct inode *inode, struct file *file)
2644 + return seq_open(file, &softnet_seq_ops);
2647 +static const struct file_operations softnet_seq_fops = {
2648 + .owner = THIS_MODULE,
2649 + .open = softnet_seq_open,
2651 + .llseek = seq_lseek,
2652 + .release = seq_release,
2655 +#ifdef CONFIG_WIRELESS_EXT
2656 +extern int wireless_proc_init(void);
2658 +#define wireless_proc_init() 0
2661 +static int __init dev_proc_init(void)
2665 + if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2667 + if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2669 + if (wireless_proc_init())
2675 + proc_net_remove("softnet_stat");
2677 + proc_net_remove("dev");
2681 +#define dev_proc_init() 0
2682 +#endif /* CONFIG_PROC_FS */
2686 + * netdev_set_master - set up master/slave pair
2687 + * @slave: slave device
2688 + * @master: new master device
2690 + * Changes the master device of the slave. Pass %NULL to break the
2691 + * bonding. The caller must hold the RTNL semaphore. On a failure
2692 + * a negative errno code is returned. On success the reference counts
2693 + * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2694 + * function returns zero.
2696 +int netdev_set_master(struct net_device *slave, struct net_device *master)
2698 + struct net_device *old = slave->master;
2708 + slave->master = master;
2710 + synchronize_net();
2716 + slave->flags |= IFF_SLAVE;
2718 + slave->flags &= ~IFF_SLAVE;
2720 + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2725 + * dev_set_promiscuity - update promiscuity count on a device
2729 + * Add or remove promiscuity from a device. While the count in the device
2730 + * remains above zero the interface remains promiscuous. Once it hits zero
2731 + *	the device reverts to normal filtering operation. A negative @inc
2732 + * value is used to drop promiscuity on the device.
2734 +void dev_set_promiscuity(struct net_device *dev, int inc)
2736 + unsigned short old_flags = dev->flags;
2738 + if ((dev->promiscuity += inc) == 0)
2739 + dev->flags &= ~IFF_PROMISC;
2741 + dev->flags |= IFF_PROMISC;
2742 + if (dev->flags != old_flags) {
2743 + dev_mc_upload(dev);
2744 + printk(KERN_INFO "device %s %s promiscuous mode\n",
2745 + dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2747 + audit_log(current->audit_context, GFP_ATOMIC,
2748 + AUDIT_ANOM_PROMISCUOUS,
2749 + "dev=%s prom=%d old_prom=%d auid=%u",
2750 + dev->name, (dev->flags & IFF_PROMISC),
2751 + (old_flags & IFF_PROMISC),
2752 + audit_get_loginuid(current->audit_context));
2757 + * dev_set_allmulti - update allmulti count on a device
2761 + * Add or remove reception of all multicast frames to a device. While the
2762 + *	count in the device remains above zero the interface keeps receiving
2763 + *	all multicast frames. Once it hits zero the device reverts to normal
2764 + *	filtering operation. A negative @inc value is used to drop the counter
2765 + * when releasing a resource needing all multicasts.
2768 +void dev_set_allmulti(struct net_device *dev, int inc)
2770 + unsigned short old_flags = dev->flags;
2772 + dev->flags |= IFF_ALLMULTI;
2773 + if ((dev->allmulti += inc) == 0)
2774 + dev->flags &= ~IFF_ALLMULTI;
2775 + if (dev->flags ^ old_flags)
2776 + dev_mc_upload(dev);
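/*
 * [editorial sketch] Both counters above are reference counts, so nested
 * users compose. The expected calling discipline, for a hypothetical
 * capture driver running in this file's context:
 */
static void capture_start(struct net_device *dev)
{
	dev_set_promiscuity(dev, 1);	/* count 0 -> 1: IFF_PROMISC set */
	dev_set_allmulti(dev, 1);	/* also accept all multicast */
}

static void capture_stop(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);	/* count 1 -> 0: flag cleared */
	dev_set_allmulti(dev, -1);
}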
2779 +unsigned dev_get_flags(const struct net_device *dev)
2783 + flags = (dev->flags & ~(IFF_PROMISC |
2788 + (dev->gflags & (IFF_PROMISC |
2791 + if (netif_running(dev)) {
2792 + if (netif_oper_up(dev))
2793 + flags |= IFF_RUNNING;
2794 + if (netif_carrier_ok(dev))
2795 + flags |= IFF_LOWER_UP;
2796 + if (netif_dormant(dev))
2797 + flags |= IFF_DORMANT;
2803 +int dev_change_flags(struct net_device *dev, unsigned flags)
2806 + int old_flags = dev->flags;
2809 + * Set the flags on our device.
2812 + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2813 + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2815 + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2819 + * Load in the correct multicast list now the flags have changed.
2822 + dev_mc_upload(dev);
2825 +	 *	Have we downed the interface? We handle IFF_UP ourselves
2826 + * according to user attempts to set it, rather than blindly
2831 + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
2832 + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2835 + dev_mc_upload(dev);
2838 + if (dev->flags & IFF_UP &&
2839 + ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2841 + raw_notifier_call_chain(&netdev_chain,
2842 + NETDEV_CHANGE, dev);
2844 + if ((flags ^ dev->gflags) & IFF_PROMISC) {
2845 + int inc = (flags & IFF_PROMISC) ? +1 : -1;
2846 + dev->gflags ^= IFF_PROMISC;
2847 + dev_set_promiscuity(dev, inc);
2850 + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2851 +	   is important. Some (broken) drivers set IFF_PROMISC when
2852 +	   IFF_ALLMULTI is requested, without asking us and without reporting it.
2854 + if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2855 + int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2856 + dev->gflags ^= IFF_ALLMULTI;
2857 + dev_set_allmulti(dev, inc);
2860 + if (old_flags ^ dev->flags)
2861 + rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2866 +int dev_set_mtu(struct net_device *dev, int new_mtu)
2870 + if (new_mtu == dev->mtu)
2873 + /* MTU must be positive. */
2877 + if (!netif_device_present(dev))
2881 + if (dev->change_mtu)
2882 + err = dev->change_mtu(dev, new_mtu);
2884 + dev->mtu = new_mtu;
2885 + if (!err && dev->flags & IFF_UP)
2886 + raw_notifier_call_chain(&netdev_chain,
2887 + NETDEV_CHANGEMTU, dev);
2891 +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2895 + if (!dev->set_mac_address)
2896 + return -EOPNOTSUPP;
2897 + if (sa->sa_family != dev->type)
2899 + if (!netif_device_present(dev))
2901 + err = dev->set_mac_address(dev, sa);
2903 + raw_notifier_call_chain(&netdev_chain,
2904 + NETDEV_CHANGEADDR, dev);
2909 + * Perform the SIOCxIFxxx calls.
2911 +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2914 + struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2920 + case SIOCGIFFLAGS: /* Get interface flags */
2921 + ifr->ifr_flags = dev_get_flags(dev);
2924 + case SIOCSIFFLAGS: /* Set interface flags */
2925 + return dev_change_flags(dev, ifr->ifr_flags);
2927 + case SIOCGIFMETRIC: /* Get the metric on the interface
2928 + (currently unused) */
2929 + ifr->ifr_metric = 0;
2932 + case SIOCSIFMETRIC: /* Set the metric on the interface
2933 + (currently unused) */
2934 + return -EOPNOTSUPP;
2936 + case SIOCGIFMTU: /* Get the MTU of a device */
2937 + ifr->ifr_mtu = dev->mtu;
2940 + case SIOCSIFMTU: /* Set the MTU of a device */
2941 + return dev_set_mtu(dev, ifr->ifr_mtu);
2943 + case SIOCGIFHWADDR:
2944 + if (!dev->addr_len)
2945 + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2947 + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2948 + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2949 + ifr->ifr_hwaddr.sa_family = dev->type;
2952 + case SIOCSIFHWADDR:
2953 + return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2955 + case SIOCSIFHWBROADCAST:
2956 + if (ifr->ifr_hwaddr.sa_family != dev->type)
2958 + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2959 + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2960 + raw_notifier_call_chain(&netdev_chain,
2961 + NETDEV_CHANGEADDR, dev);
2965 + ifr->ifr_map.mem_start = dev->mem_start;
2966 + ifr->ifr_map.mem_end = dev->mem_end;
2967 + ifr->ifr_map.base_addr = dev->base_addr;
2968 + ifr->ifr_map.irq = dev->irq;
2969 + ifr->ifr_map.dma = dev->dma;
2970 + ifr->ifr_map.port = dev->if_port;
2974 + if (dev->set_config) {
2975 + if (!netif_device_present(dev))
2977 + return dev->set_config(dev, &ifr->ifr_map);
2979 + return -EOPNOTSUPP;
2981 + case SIOCADDMULTI:
2982 + if (!dev->set_multicast_list ||
2983 + ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2985 + if (!netif_device_present(dev))
2987 + return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2988 + dev->addr_len, 1);
2990 + case SIOCDELMULTI:
2991 + if (!dev->set_multicast_list ||
2992 + ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2994 + if (!netif_device_present(dev))
2996 + return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2997 + dev->addr_len, 1);
2999 + case SIOCGIFINDEX:
3000 + ifr->ifr_ifindex = dev->ifindex;
3003 + case SIOCGIFTXQLEN:
3004 + ifr->ifr_qlen = dev->tx_queue_len;
3007 + case SIOCSIFTXQLEN:
3008 + if (ifr->ifr_qlen < 0)
3010 + dev->tx_queue_len = ifr->ifr_qlen;
3014 + ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3015 + return dev_change_name(dev, ifr->ifr_newname);
3018 + * Unknown or private ioctl
3022 + if ((cmd >= SIOCDEVPRIVATE &&
3023 + cmd <= SIOCDEVPRIVATE + 15) ||
3024 + cmd == SIOCBONDENSLAVE ||
3025 + cmd == SIOCBONDRELEASE ||
3026 + cmd == SIOCBONDSETHWADDR ||
3027 + cmd == SIOCBONDSLAVEINFOQUERY ||
3028 + cmd == SIOCBONDINFOQUERY ||
3029 + cmd == SIOCBONDCHANGEACTIVE ||
3030 + cmd == SIOCGMIIPHY ||
3031 + cmd == SIOCGMIIREG ||
3032 + cmd == SIOCSMIIREG ||
3033 + cmd == SIOCBRADDIF ||
3034 + cmd == SIOCBRDELIF ||
3035 + cmd == SIOCWANDEV) {
3036 + err = -EOPNOTSUPP;
3037 + if (dev->do_ioctl) {
3038 + if (netif_device_present(dev))
3039 + err = dev->do_ioctl(dev, ifr,
3052 + * This function handles all "interface"-type I/O control requests. The actual
3053 + * 'doing' part of this is dev_ifsioc above.
3057 + * dev_ioctl - network device ioctl
3058 + * @cmd: command to issue
3059 + * @arg: pointer to a struct ifreq in user space
3061 + * Issue ioctl functions to devices. This is normally called by the
3062 + * user space syscall interfaces but can sometimes be useful for
3063 + * other purposes. The return value is the return from the syscall if
3064 + * positive or a negative errno code on error.
3067 +int dev_ioctl(unsigned int cmd, void __user *arg)
3073 + /* One special case: SIOCGIFCONF takes ifconf argument
3074 +	   and requires shared lock, because it sleeps writing to user space.
3078 + if (cmd == SIOCGIFCONF) {
3080 + ret = dev_ifconf((char __user *) arg);
3084 + if (cmd == SIOCGIFNAME)
3085 + return dev_ifname((struct ifreq __user *)arg);
3087 + if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3090 + ifr.ifr_name[IFNAMSIZ-1] = 0;
3092 + colon = strchr(ifr.ifr_name, ':');
3097 + * See which interface the caller is talking about.
3102 + * These ioctl calls:
3103 + * - can be done by all.
3104 + * - atomic and do not require locking.
3105 + * - return a value
3107 + case SIOCGIFFLAGS:
3108 + case SIOCGIFMETRIC:
3110 + case SIOCGIFHWADDR:
3111 + case SIOCGIFSLAVE:
3113 + case SIOCGIFINDEX:
3114 + case SIOCGIFTXQLEN:
3115 + dev_load(ifr.ifr_name);
3116 + read_lock(&dev_base_lock);
3117 + ret = dev_ifsioc(&ifr, cmd);
3118 + read_unlock(&dev_base_lock);
3122 + if (copy_to_user(arg, &ifr,
3123 + sizeof(struct ifreq)))
3129 + dev_load(ifr.ifr_name);
3131 + ret = dev_ethtool(&ifr);
3136 + if (copy_to_user(arg, &ifr,
3137 + sizeof(struct ifreq)))
3143 + * These ioctl calls:
3144 + * - require superuser power.
3145 + * - require strict serialization.
3146 + * - return a value
3151 + if (!capable(CAP_NET_ADMIN))
3153 + dev_load(ifr.ifr_name);
3155 + ret = dev_ifsioc(&ifr, cmd);
3160 + if (copy_to_user(arg, &ifr,
3161 + sizeof(struct ifreq)))
3167 + * These ioctl calls:
3168 + * - require superuser power.
3169 + * - require strict serialization.
3170 + * - do not return a value
3172 + case SIOCSIFFLAGS:
3173 + case SIOCSIFMETRIC:
3176 + case SIOCSIFHWADDR:
3177 + case SIOCSIFSLAVE:
3178 + case SIOCADDMULTI:
3179 + case SIOCDELMULTI:
3180 + case SIOCSIFHWBROADCAST:
3181 + case SIOCSIFTXQLEN:
3183 + case SIOCBONDENSLAVE:
3184 + case SIOCBONDRELEASE:
3185 + case SIOCBONDSETHWADDR:
3186 + case SIOCBONDCHANGEACTIVE:
3189 + if (!capable(CAP_NET_ADMIN))
3191 + /* fall through */
3192 + case SIOCBONDSLAVEINFOQUERY:
3193 + case SIOCBONDINFOQUERY:
3194 + dev_load(ifr.ifr_name);
3196 + ret = dev_ifsioc(&ifr, cmd);
3201 + /* Get the per device memory space. We can add this but
3202 + * currently do not support it */
3204 + /* Set the per device memory buffer space.
3205 + * Not applicable in our case */
3210 + * Unknown or private ioctl.
3213 + if (cmd == SIOCWANDEV ||
3214 + (cmd >= SIOCDEVPRIVATE &&
3215 + cmd <= SIOCDEVPRIVATE + 15)) {
3216 + dev_load(ifr.ifr_name);
3218 + ret = dev_ifsioc(&ifr, cmd);
3220 + if (!ret && copy_to_user(arg, &ifr,
3221 + sizeof(struct ifreq)))
3225 +#ifdef CONFIG_WIRELESS_EXT
3226 + /* Take care of Wireless Extensions */
3227 + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
3228 + /* If command is `set a parameter', or
3229 + * `get the encoding parameters', check if
3230 + * the user has the right to do it */
3231 + if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE
3232 + || cmd == SIOCGIWENCODEEXT) {
3233 + if (!capable(CAP_NET_ADMIN))
3236 + dev_load(ifr.ifr_name);
3238 + /* Follow me in net/core/wireless.c */
3239 + ret = wireless_process_ioctl(&ifr, cmd);
3241 + if (IW_IS_GET(cmd) &&
3242 + copy_to_user(arg, &ifr,
3243 + sizeof(struct ifreq)))
3247 +#endif /* CONFIG_WIRELESS_EXT */
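/*
 * [editorial sketch] The user-space view of the flag path dispatched
 * above: a read-modify-write via SIOCGIFFLAGS then SIOCSIFFLAGS. Minimal
 * example; the set step needs CAP_NET_ADMIN, error handling trimmed.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static int set_iface_up(int fd, const char *name)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0)	/* read current flags */
		return -1;
	ifr.ifr_flags |= IFF_UP;
	return ioctl(fd, SIOCSIFFLAGS, &ifr);	/* write them back */
}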
3254 + * dev_new_index - allocate an ifindex
3256 + * Returns a suitable unique value for a new device interface
3257 + * number. The caller must hold the rtnl semaphore or the
3258 + * dev_base_lock to be sure it remains unique.
3260 +static int dev_new_index(void)
3262 + static int ifindex;
3264 + if (++ifindex <= 0)
3266 + if (!__dev_get_by_index(ifindex))
3271 +static int dev_boot_phase = 1;
3273 +/* Delayed registration/unregistration */
3274 +static DEFINE_SPINLOCK(net_todo_list_lock);
3275 +static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
3277 +static inline void net_set_todo(struct net_device *dev)
3279 + spin_lock(&net_todo_list_lock);
3280 + list_add_tail(&dev->todo_list, &net_todo_list);
3281 + spin_unlock(&net_todo_list_lock);
3285 + * register_netdevice - register a network device
3286 + * @dev: device to register
3288 + * Take a completed network device structure and add it to the kernel
3289 + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3290 + * chain. 0 is returned on success. A negative errno code is returned
3291 + * on a failure to set up the device, or if the name is a duplicate.
3293 + * Callers must hold the rtnl semaphore. You may want
3294 + * register_netdev() instead of this.
3297 + * The locking appears insufficient to guarantee two parallel registers
3298 + * will not get the same name.
3301 +int register_netdevice(struct net_device *dev)
3303 + struct hlist_head *head;
3304 + struct hlist_node *p;
3307 + BUG_ON(dev_boot_phase);
3312 + /* When net_device's are persistent, this will be fatal. */
3313 + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3315 + spin_lock_init(&dev->queue_lock);
3316 + spin_lock_init(&dev->_xmit_lock);
3317 + dev->xmit_lock_owner = -1;
3318 +#ifdef CONFIG_NET_CLS_ACT
3319 + spin_lock_init(&dev->ingress_lock);
3324 + /* Init, if this function is available */
3326 + ret = dev->init(dev);
3334 + if (!dev_valid_name(dev->name)) {
3339 + dev->ifindex = dev_new_index();
3340 + if (dev->iflink == -1)
3341 + dev->iflink = dev->ifindex;
3343 + /* Check for existence of name */
3344 + head = dev_name_hash(dev->name);
3345 + hlist_for_each(p, head) {
3346 + struct net_device *d
3347 + = hlist_entry(p, struct net_device, name_hlist);
3348 + if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3354 + /* Fix illegal SG+CSUM combinations. */
3355 + if ((dev->features & NETIF_F_SG) &&
3356 + !(dev->features & NETIF_F_ALL_CSUM)) {
3357 + printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3359 + dev->features &= ~NETIF_F_SG;
3362 + /* TSO requires that SG is present as well. */
3363 + if ((dev->features & NETIF_F_TSO) &&
3364 + !(dev->features & NETIF_F_SG)) {
3365 + printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3367 + dev->features &= ~NETIF_F_TSO;
3369 + if (dev->features & NETIF_F_UFO) {
3370 + if (!(dev->features & NETIF_F_HW_CSUM)) {
3371 + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3372 + "NETIF_F_HW_CSUM feature.\n",
3374 + dev->features &= ~NETIF_F_UFO;
3376 + if (!(dev->features & NETIF_F_SG)) {
3377 + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3378 + "NETIF_F_SG feature.\n",
3380 + dev->features &= ~NETIF_F_UFO;
3385 +	 *	nil rebuild_header routine;
3386 +	 *	it should never be called and is used only as a bug trap.
3389 + if (!dev->rebuild_header)
3390 + dev->rebuild_header = default_rebuild_header;
3392 + ret = netdev_register_sysfs(dev);
3395 + dev->reg_state = NETREG_REGISTERED;
3398 + * Default initial state at registry is that the
3399 + * device is present.
3402 + set_bit(__LINK_STATE_PRESENT, &dev->state);
3405 + dev_init_scheduler(dev);
3406 + write_lock_bh(&dev_base_lock);
3408 + dev_tail = &dev->next;
3409 + hlist_add_head(&dev->name_hlist, head);
3410 + hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
3412 + write_unlock_bh(&dev_base_lock);
3414 +	/* Notify protocols that a new device has appeared. */
3415 + raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
3424 + * register_netdev - register a network device
3425 + * @dev: device to register
3427 + * Take a completed network device structure and add it to the kernel
3428 + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3429 + * chain. 0 is returned on success. A negative errno code is returned
3430 + * on a failure to set up the device, or if the name is a duplicate.
3432 + *	This is a wrapper around register_netdevice() that takes the rtnl semaphore
3433 + *	and expands the device name if you passed a format string to alloc_netdev().
3436 +int register_netdev(struct net_device *dev)
3443 + * If the name is a format string the caller wants us to do a
3444 + * name allocation.
3446 + if (strchr(dev->name, '%')) {
3447 + err = dev_alloc_name(dev, dev->name);
3452 + err = register_netdevice(dev);
3457 +EXPORT_SYMBOL(register_netdev);
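/*
 * [editorial sketch] The usual driver-side pairing of the two calls
 * documented above. "exdev%d" exercises the '%'-format name expansion;
 * ether_setup() is the stock Ethernet initializer. Hypothetical driver
 * in this file's context, error paths trimmed to the essentials.
 */
struct example_priv { int placeholder; };

static int __init example_driver_init(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(sizeof(struct example_priv), "exdev%d", ether_setup);
	if (!dev)
		return -ENOMEM;
	err = register_netdev(dev);	/* takes rtnl, expands the name */
	if (err)
		free_netdev(dev);	/* only on failure; after success the
					 * teardown is unregister_netdev()
					 * followed by free_netdev() */
	return err;
}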
3460 + * netdev_wait_allrefs - wait until all references are gone.
3462 + * This is called when unregistering network devices.
3464 + * Any protocol or device that holds a reference should register
3465 + * for netdevice notification, and cleanup and put back the
3466 + * reference if they receive an UNREGISTER event.
3467 + * We can get stuck here if buggy protocols don't correctly call dev_put().
3470 +static void netdev_wait_allrefs(struct net_device *dev)
3472 + unsigned long rebroadcast_time, warning_time;
3474 + rebroadcast_time = warning_time = jiffies;
3475 + while (atomic_read(&dev->refcnt) != 0) {
3476 + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3479 + /* Rebroadcast unregister notification */
3480 + raw_notifier_call_chain(&netdev_chain,
3481 + NETDEV_UNREGISTER, dev);
3483 + if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3485 + /* We must not have linkwatch events
3486 + * pending on unregister. If this
3487 + * happens, we simply run the queue
3488 + * unscheduled, resulting in a noop
3489 + * for this device.
3491 + linkwatch_run_queue();
3496 + rebroadcast_time = jiffies;
3501 + if (time_after(jiffies, warning_time + 10 * HZ)) {
3502 + printk(KERN_EMERG "unregister_netdevice: "
3503 + "waiting for %s to become free. Usage "
3505 + dev->name, atomic_read(&dev->refcnt));
3506 + warning_time = jiffies;
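/*
 * [editorial sketch] netdev_wait_allrefs() above only terminates once
 * every dev_hold() has been balanced by dev_put(). A hypothetical
 * subsystem caching a device pointer avoids the 10-second warning by
 * dropping its reference from a netdevice notifier:
 */
static struct net_device *cached_dev;	/* taken with dev_hold() */

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == cached_dev) {
		cached_dev = NULL;
		dev_put(dev);	/* release our reference promptly */
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};
/* registered once with register_netdevice_notifier(&example_nb) */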
3511 +/* The sequence is:
3515 + * register_netdevice(x1);
3516 + * register_netdevice(x2);
3518 + * unregister_netdevice(y1);
3519 + * unregister_netdevice(y2);
3522 + * free_netdev(y1);
3523 + * free_netdev(y2);
3525 + * We are invoked by rtnl_unlock() after it drops the semaphore.
3526 + * This allows us to deal with problems:
3527 + * 1) We can delete sysfs objects which invoke hotplug
3528 + * without deadlocking with linkwatch via keventd.
3529 + * 2) Since we run with the RTNL semaphore not held, we can sleep
3530 + * safely in order to wait for the netdev refcnt to drop to zero.
3532 +static DEFINE_MUTEX(net_todo_run_mutex);
3533 +void netdev_run_todo(void)
3535 + struct list_head list;
3537 +	/* Need to guard against multiple CPUs getting out of order. */
3538 + mutex_lock(&net_todo_run_mutex);
3540 + /* Not safe to do outside the semaphore. We must not return
3541 + * until all unregister events invoked by the local processor
3542 + * have been completed (either by this todo run, or one on
3545 + if (list_empty(&net_todo_list))
3548 + /* Snapshot list, allow later requests */
3549 + spin_lock(&net_todo_list_lock);
3550 + list_replace_init(&net_todo_list, &list);
3551 + spin_unlock(&net_todo_list_lock);
3553 + while (!list_empty(&list)) {
3554 + struct net_device *dev
3555 + = list_entry(list.next, struct net_device, todo_list);
3556 + list_del(&dev->todo_list);
3558 + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3559 + printk(KERN_ERR "network todo '%s' but state %d\n",
3560 + dev->name, dev->reg_state);
3565 + netdev_unregister_sysfs(dev);
3566 + dev->reg_state = NETREG_UNREGISTERED;
3568 + netdev_wait_allrefs(dev);
3571 + BUG_ON(atomic_read(&dev->refcnt));
3572 + BUG_TRAP(!dev->ip_ptr);
3573 + BUG_TRAP(!dev->ip6_ptr);
3574 + BUG_TRAP(!dev->dn_ptr);
3576 + /* It must be the very last action,
3577 + * after this 'dev' may point to freed up memory.
3579 + if (dev->destructor)
3580 + dev->destructor(dev);
3584 + mutex_unlock(&net_todo_run_mutex);
3588 + * alloc_netdev - allocate network device
3589 + * @sizeof_priv: size of private data to allocate space for
3590 + * @name: device name format string
3591 + * @setup: callback to initialize device
3593 + * Allocates a struct net_device with private data area for driver use
3594 + * and performs basic initialization.
3596 +struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3597 + void (*setup)(struct net_device *))
3600 + struct net_device *dev;
3603 + BUG_ON(strlen(name) >= sizeof(dev->name));
3605 + /* ensure 32-byte alignment of both the device and private area */
3606 + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3607 + alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3609 + p = kzalloc(alloc_size, GFP_KERNEL);
3611 + printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3615 + dev = (struct net_device *)
3616 + (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3617 + dev->padded = (char *)dev - (char *)p;
3620 + dev->priv = netdev_priv(dev);
3623 + strcpy(dev->name, name);
3626 +EXPORT_SYMBOL(alloc_netdev);
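/*
 * [editorial note] A worked instance of the alignment arithmetic above,
 * assuming NETDEV_ALIGN_CONST == 31 (32-byte alignment) and, purely for
 * illustration, sizeof(*dev) == 1068 and sizeof_priv == 100:
 *
 *   alloc_size  = (1068 + 31) & ~31  = 1088   struct rounded up to 32
 *   alloc_size += 100 + 31           = 1219   priv + worst-case padding
 *
 * After kzalloc(), dev is the first 32-byte boundary at or after p, and
 * dev->padded records the offset so free_netdev() can recover p.
 */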
3629 + * free_netdev - free network device
3632 + * This function does the last stage of destroying an allocated device
3633 + * interface. The reference to the device object is released.
3634 + * If this is the last reference then it will be freed.
3636 +void free_netdev(struct net_device *dev)
3638 +#ifdef CONFIG_SYSFS
3639 + /* Compatibility with error handling in drivers */
3640 + if (dev->reg_state == NETREG_UNINITIALIZED) {
3641 + kfree((char *)dev - dev->padded);
3645 + BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3646 + dev->reg_state = NETREG_RELEASED;
3648 + /* will free via device release */
3649 + put_device(&dev->dev);
3651 + kfree((char *)dev - dev->padded);
3655 +/* Synchronize with packet receive processing. */
3656 +void synchronize_net(void)
3659 + synchronize_rcu();
3663 + * unregister_netdevice - remove device from the kernel
3666 + * This function shuts down a device interface and removes it
3667 + * from the kernel tables. On success 0 is returned, on a failure
3668 + * a negative errno code is returned.
3670 + * Callers must hold the rtnl semaphore. You may want
3671 + * unregister_netdev() instead of this.
3674 +void unregister_netdevice(struct net_device *dev)
3676 + struct net_device *d, **dp;
3678 + BUG_ON(dev_boot_phase);
3681 + /* Some devices call without registering for initialization unwind. */
3682 + if (dev->reg_state == NETREG_UNINITIALIZED) {
3683 + printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3684 + "was registered\n", dev->name, dev);
3690 + BUG_ON(dev->reg_state != NETREG_REGISTERED);
3692 + /* If device is running, close it first. */
3693 + if (dev->flags & IFF_UP)
3696 + /* And unlink it from device chain. */
3697 + for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3699 + write_lock_bh(&dev_base_lock);
3700 + hlist_del(&dev->name_hlist);
3701 + hlist_del(&dev->index_hlist);
3702 + if (dev_tail == &dev->next)
3705 + write_unlock_bh(&dev_base_lock);
3711 + dev->reg_state = NETREG_UNREGISTERING;
3713 + synchronize_net();
3715 + /* Shutdown queueing discipline. */
3716 + dev_shutdown(dev);
3719 +	/* Notify protocols that we are about to destroy
3720 +	   this device. They should clean up all their state.
3722 + raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3725 + * Flush the multicast chain
3727 + dev_mc_discard(dev);
3732 + /* Notifier chain MUST detach us from master device. */
3733 + BUG_TRAP(!dev->master);
3735 + /* Finish processing unregister after unlock */
3736 + net_set_todo(dev);
3738 + synchronize_net();
3744 + * unregister_netdev - remove device from the kernel
3747 + * This function shuts down a device interface and removes it
3748 + * from the kernel tables. On success 0 is returned, on a failure
3749 + * a negative errno code is returned.
3751 + * This is just a wrapper for unregister_netdevice that takes
3752 + * the rtnl semaphore. In general you want to use this and not
3753 + * unregister_netdevice.
3755 +void unregister_netdev(struct net_device *dev)
3758 + unregister_netdevice(dev);
3762 +EXPORT_SYMBOL(unregister_netdev);
3764 +static int dev_cpu_callback(struct notifier_block *nfb,
3765 + unsigned long action,
3768 + struct sk_buff **list_skb;
3769 + struct net_device **list_net;
3770 + struct sk_buff *skb;
3771 + unsigned int cpu, oldcpu = (unsigned long)ocpu;
3772 + struct softnet_data *sd, *oldsd;
3774 + if (action != CPU_DEAD)
3777 + local_irq_disable();
3778 + cpu = smp_processor_id();
3779 + sd = &per_cpu(softnet_data, cpu);
3780 + oldsd = &per_cpu(softnet_data, oldcpu);
3782 + /* Find end of our completion_queue. */
3783 + list_skb = &sd->completion_queue;
3785 + list_skb = &(*list_skb)->next;
3786 + /* Append completion queue from offline CPU. */
3787 + *list_skb = oldsd->completion_queue;
3788 + oldsd->completion_queue = NULL;
3790 + /* Find end of our output_queue. */
3791 + list_net = &sd->output_queue;
3793 + list_net = &(*list_net)->next_sched;
3794 + /* Append output queue from offline CPU. */
3795 + *list_net = oldsd->output_queue;
3796 + oldsd->output_queue = NULL;
3798 + raise_softirq_irqoff(NET_TX_SOFTIRQ);
3799 + local_irq_enable();
3801 + /* Process offline CPU's input_pkt_queue */
3802 + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3808 +#ifdef CONFIG_NET_DMA
3810 + * net_dma_rebalance - redistribute the available DMA channels among CPUs
3811 + * This is called when the number of channels allocated to the net_dma_client
3812 + * changes. The net_dma_client tries to have one DMA channel per CPU.
3814 +static void net_dma_rebalance(void)
3816 + unsigned int cpu, i, n;
3817 + struct dma_chan *chan;
3819 + if (net_dma_count == 0) {
3820 + for_each_online_cpu(cpu)
3821 + rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
3826 + cpu = first_cpu(cpu_online_map);
3829 + list_for_each_entry(chan, &net_dma_client->channels, client_node) {
3830 + n = ((num_online_cpus() / net_dma_count)
3831 + + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
3834 + per_cpu(softnet_data, cpu).net_dma = chan;
3835 + cpu = next_cpu(cpu, cpu_online_map);
3840 + rcu_read_unlock();
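/*
 * [editorial note] A worked instance of the distribution above: with 8
 * online CPUs and net_dma_count == 3,
 *
 *   n = 8/3 + (i < 8%3 ? 1 : 0)
 *
 * gives channel 0 three CPUs, channel 1 three CPUs and channel 2 two
 * CPUs, so every online CPU ends up with exactly one channel pointer.
 */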
3844 + * netdev_dma_event - event callback for the net_dma_client
3845 + * @client: should always be net_dma_client
3846 + * @chan: DMA channel for the event
3847 + * @event: event type
3849 +static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3850 + enum dma_event event)
3852 + spin_lock(&net_dma_event_lock);
3854 + case DMA_RESOURCE_ADDED:
3856 + net_dma_rebalance();
3858 + case DMA_RESOURCE_REMOVED:
3860 + net_dma_rebalance();
3865 + spin_unlock(&net_dma_event_lock);
3869 + * netdev_dma_register - register the networking subsystem as a DMA client
3871 +static int __init netdev_dma_register(void)
3873 + spin_lock_init(&net_dma_event_lock);
3874 + net_dma_client = dma_async_client_register(netdev_dma_event);
3875 + if (net_dma_client == NULL)
3878 + dma_async_client_chan_request(net_dma_client, num_online_cpus());
3883 +static int __init netdev_dma_register(void) { return -ENODEV; }
3884 +#endif /* CONFIG_NET_DMA */
3887 + * Initialize the DEV module. At boot time this walks the device list and
3888 + * unhooks any devices that fail to initialise (normally hardware not
3889 + * present) and leaves us with a valid list of present and active devices.
3894 + * This is called single threaded during boot, so no need
3895 + * to take the rtnl semaphore.
3897 +static int __init net_dev_init(void)
3899 + int i, rc = -ENOMEM;
3901 + BUG_ON(!dev_boot_phase);
3903 + if (dev_proc_init())
3906 + if (netdev_sysfs_init())
3909 + INIT_LIST_HEAD(&ptype_all);
3910 + for (i = 0; i < 16; i++)
3911 + INIT_LIST_HEAD(&ptype_base[i]);
3913 + for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3914 + INIT_HLIST_HEAD(&dev_name_head[i]);
3916 + for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3917 + INIT_HLIST_HEAD(&dev_index_head[i]);
3920 + * Initialise the packet receive queues.
3923 + for_each_possible_cpu(i) {
3924 + struct softnet_data *queue;
3926 + queue = &per_cpu(softnet_data, i);
3927 + skb_queue_head_init(&queue->input_pkt_queue);
3928 + queue->completion_queue = NULL;
3929 + INIT_LIST_HEAD(&queue->poll_list);
3930 + set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3931 + queue->backlog_dev.weight = weight_p;
3932 + queue->backlog_dev.poll = process_backlog;
3933 + atomic_set(&queue->backlog_dev.refcnt, 1);
3936 + netdev_dma_register();
3938 + dev_boot_phase = 0;
3940 + open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3941 + open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3943 + hotcpu_notifier(dev_cpu_callback, 0);
3951 +subsys_initcall(net_dev_init);
3953 +EXPORT_SYMBOL(__dev_get_by_index);
3954 +EXPORT_SYMBOL(__dev_get_by_name);
3955 +EXPORT_SYMBOL(__dev_remove_pack);
3956 +EXPORT_SYMBOL(dev_valid_name);
3957 +EXPORT_SYMBOL(dev_add_pack);
3958 +EXPORT_SYMBOL(dev_alloc_name);
3959 +EXPORT_SYMBOL(dev_close);
3960 +EXPORT_SYMBOL(dev_get_by_flags);
3961 +EXPORT_SYMBOL(dev_get_by_index);
3962 +EXPORT_SYMBOL(dev_get_by_name);
3963 +EXPORT_SYMBOL(dev_open);
3964 +EXPORT_SYMBOL(dev_queue_xmit);
3965 +EXPORT_SYMBOL(dev_remove_pack);
3966 +EXPORT_SYMBOL(dev_set_allmulti);
3967 +EXPORT_SYMBOL(dev_set_promiscuity);
3968 +EXPORT_SYMBOL(dev_change_flags);
3969 +EXPORT_SYMBOL(dev_set_mtu);
3970 +EXPORT_SYMBOL(dev_set_mac_address);
3971 +EXPORT_SYMBOL(free_netdev);
3972 +EXPORT_SYMBOL(netdev_boot_setup_check);
3973 +EXPORT_SYMBOL(netdev_set_master);
3974 +EXPORT_SYMBOL(netdev_state_change);
3975 +EXPORT_SYMBOL(netif_receive_skb);
3976 +EXPORT_SYMBOL(netif_rx);
3977 +EXPORT_SYMBOL(register_gifconf);
3978 +EXPORT_SYMBOL(register_netdevice);
3979 +EXPORT_SYMBOL(register_netdevice_notifier);
3980 +EXPORT_SYMBOL(skb_checksum_help);
3981 +EXPORT_SYMBOL(synchronize_net);
3982 +EXPORT_SYMBOL(unregister_netdevice);
3983 +EXPORT_SYMBOL(unregister_netdevice_notifier);
3984 +EXPORT_SYMBOL(net_enable_timestamp);
3985 +EXPORT_SYMBOL(net_disable_timestamp);
3986 +EXPORT_SYMBOL(dev_get_flags);
3988 +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3989 +EXPORT_SYMBOL(br_handle_frame_hook);
3990 +EXPORT_SYMBOL(br_fdb_get_hook);
3991 +EXPORT_SYMBOL(br_fdb_put_hook);
3995 +EXPORT_SYMBOL(dev_load);
3998 +EXPORT_PER_CPU_SYMBOL(softnet_data);
3999 diff --unified --recursive --new-file linux-2.6.21.4/net/ring/Kconfig linux-2.6.21.4-1-686-smp-ring3/net/ring/Kconfig
4000 --- linux-2.6.21.4/net/ring/Kconfig 1970-01-01 00:00:00.000000000 +0000
4001 +++ linux-2.6.21.4-1-686-smp-ring3/net/ring/Kconfig 2007-06-10 16:43:04.406423944 +0000
4004 + tristate "PF_RING sockets (EXPERIMENTAL)"
4005 + depends on EXPERIMENTAL
4007 + PF_RING socket family, optimized for packet capture.
4008 + If a PF_RING socket is bound to an adapter (via the bind() system
4009 +	  call), that adapter will be used in read-only mode until the socket
4010 +	  is destroyed. Whenever an incoming packet is received from the adapter,
4011 +	  it is not passed to the upper layers; instead it is copied to a ring
4012 + buffer, which in turn is exported to user space applications via mmap.
4013 +	  Please refer to http://luca.ntop.org/Ring.pdf for more details.
4015 + Say N unless you know what you are doing.
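[editorial sketch] A minimal user-space capture skeleton for this option,
assuming PF_RING is the protocol family number registered by this patch
(27 in contemporary releases) and that the mapped area begins with the
FlowSlotInfo header from linux/ring.h; error handling trimmed:

	#include <sys/socket.h>
	#include <sys/mman.h>
	#include <netinet/in.h>
	#include <linux/if_ether.h>
	#include <string.h>

	#define PF_RING 27	/* assumption: value defined by the patch */

	int main(void)
	{
		struct sockaddr sa;
		int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));
		char *ring;

		memset(&sa, 0, sizeof(sa));
		sa.sa_family = PF_RING;
		strncpy(sa.sa_data, "eth0", sizeof(sa.sa_data));
		bind(fd, &sa, sizeof(sa));	/* adapter now read-only */
		/* map the header page first; the FlowSlotInfo it holds
		 * tells the application how large the full ring is */
		ring = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			    MAP_SHARED, fd, 0);
		/* ... poll fd, walk slots, advance the read index ... */
		return 0;
	}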
4017 diff --unified --recursive --new-file linux-2.6.21.4/net/ring/Makefile linux-2.6.21.4-1-686-smp-ring3/net/ring/Makefile
4018 --- linux-2.6.21.4/net/ring/Makefile 1970-01-01 00:00:00.000000000 +0000
4019 +++ linux-2.6.21.4-1-686-smp-ring3/net/ring/Makefile 2007-06-10 16:43:04.350421521 +0000
4022 +# Makefile for the ring driver.
4027 +ring-objs := ring_packet.o
4028 diff --unified --recursive --new-file linux-2.6.21.4/net/ring/ring_packet.c linux-2.6.21.4-1-686-smp-ring3/net/ring/ring_packet.c
4029 --- linux-2.6.21.4/net/ring/ring_packet.c 1970-01-01 00:00:00.000000000 +0000
4030 +++ linux-2.6.21.4-1-686-smp-ring3/net/ring/ring_packet.c 2007-06-10 16:43:04.354421694 +0000
4032 +/* ***************************************************************
4034 + * (C) 2004-07 - Luca Deri <deri@ntop.org>
4036 + * This code includes contributions courtesy of
4037 + * - Jeff Randall <jrandall@nexvu.com>
4038 + * - Helmut Manck <helmut.manck@secunet.com>
4039 + * - Brad Doctor <brad@stillsecure.com>
4040 + * - Amit D. Chaudhary <amit_ml@rajgad.com>
4041 + * - Francesco Fusco <fusco@ntop.org>
4042 + * - Michael Stiller <ms@2scale.net>
4045 + * This program is free software; you can redistribute it and/or modify
4046 + * it under the terms of the GNU General Public License as published by
4047 + * the Free Software Foundation; either version 2 of the License, or
4048 + * (at your option) any later version.
4050 + * This program is distributed in the hope that it will be useful,
4051 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
4052 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4053 + * GNU General Public License for more details.
4055 + * You should have received a copy of the GNU General Public License
4056 + * along with this program; if not, write to the Free Software Foundation,
4057 + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
4061 +#include <linux/version.h>
4062 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19))
4063 +#include <linux/autoconf.h>
4065 +#include <linux/config.h>
4067 +#include <linux/module.h>
4068 +#include <linux/kernel.h>
4069 +#include <linux/socket.h>
4070 +#include <linux/skbuff.h>
4071 +#include <linux/rtnetlink.h>
4072 +#include <linux/in.h>
4073 +#include <linux/inet.h>
4074 +#include <linux/in6.h>
4075 +#include <linux/init.h>
4076 +#include <linux/filter.h>
4077 +#include <linux/ring.h>
4078 +#include <linux/ip.h>
4079 +#include <linux/tcp.h>
4080 +#include <linux/udp.h>
4081 +#include <linux/list.h>
4082 +#include <linux/proc_fs.h>
4083 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
4084 +#include <net/xfrm.h>
4086 +#include <linux/poll.h>
4088 +#include <net/sock.h>
4089 +#include <asm/io.h> /* needed for virt_to_phys() */
4091 +#include <net/inet_common.h>
4094 +/* #define RING_DEBUG */
4096 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
4097 +static inline int remap_page_range(struct vm_area_struct *vma,
4098 + unsigned long uvaddr,
4099 + unsigned long paddr,
4100 + unsigned long size,
4102 + return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT,
4107 +/* ************************************************* */
4109 +#define CLUSTER_LEN 8
4111 +struct ring_cluster {
4112 + u_short cluster_id; /* 0 = no cluster */
4113 + u_short num_cluster_elements;
4114 + enum cluster_type hashing_mode;
4115 + u_short hashing_id;
4116 + struct sock *sk[CLUSTER_LEN];
4117 + struct ring_cluster *next; /* NULL = last element of the cluster */
4120 +/* ************************************************* */
4122 +struct ring_element {
4123 + struct list_head list;
4127 +/* ************************************************* */
4130 + struct net_device *ring_netdev;
4135 + u_short cluster_id; /* 0 = no cluster */
4138 + struct net_device *reflector_dev;
4140 + /* Packet buffers */
4141 + unsigned long order;
4144 + unsigned long ring_memory;
4145 + FlowSlotInfo *slots_info; /* Basically it points to ring_memory */
4146 + char *ring_slots; /* Basically it points to ring_memory
4147 + +sizeof(FlowSlotInfo) */
4149 + /* Packet Sampling */
4150 + u_int pktToSample, sample_rate;
4153 + struct sk_filter *bpfFilter;
4155 + /* Aho-Corasick */
4156 + ACSM_STRUCT2 * acsm;
4159 + atomic_t num_ring_slots_waiters;
4160 + wait_queue_head_t ring_slots_waitqueue;
4161 + rwlock_t ring_index_lock;
4163 + /* Bloom Filters */
4164 + u_char bitmask_enabled;
4165 + bitmask_selector mac_bitmask, vlan_bitmask, ip_bitmask, twin_ip_bitmask,
4166 + port_bitmask, twin_port_bitmask, proto_bitmask;
4167 + u_int32_t num_mac_bitmask_add, num_mac_bitmask_remove;
4168 + u_int32_t num_vlan_bitmask_add, num_vlan_bitmask_remove;
4169 + u_int32_t num_ip_bitmask_add, num_ip_bitmask_remove;
4170 + u_int32_t num_port_bitmask_add, num_port_bitmask_remove;
4171 + u_int32_t num_proto_bitmask_add, num_proto_bitmask_remove;
4173 + /* Indexes (Internal) */
4174 + u_int insert_page_id, insert_slot_id;
4177 +/* ************************************************* */
4179 +/* List of all ring sockets. */
4180 +static struct list_head ring_table;
4181 +static u_int ring_table_size;
4183 +/* List of all clusters */
4184 +static struct ring_cluster *ring_cluster_list;
4186 +static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED;
4188 +/* ********************************** */
4190 +/* /proc entry for ring module */
4191 +struct proc_dir_entry *ring_proc_dir = NULL;
4192 +struct proc_dir_entry *ring_proc = NULL;
4194 +static int ring_proc_get_info(char *, char **, off_t, int, int *, void *);
4195 +static void ring_proc_add(struct ring_opt *pfr);
4196 +static void ring_proc_remove(struct ring_opt *pfr);
4197 +static void ring_proc_init(void);
4198 +static void ring_proc_term(void);
4200 +/* ********************************** */
4203 +static struct proto_ops ring_ops;
4205 +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
4206 +static struct proto ring_proto;
4209 +static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet,
4211 +static int buffer_ring_handler(struct net_device *dev, char *data, int len);
4212 +static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr);
4216 +/* ********************************** */
4219 +static unsigned int bucket_len = 128, num_slots = 4096, sample_rate = 1,
4220 + transparent_mode = 1, enable_tx_capture = 1;
4222 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16))
4223 +module_param(bucket_len, uint, 0644);
4224 +module_param(num_slots, uint, 0644);
4225 +module_param(sample_rate, uint, 0644);
4226 +module_param(transparent_mode, uint, 0644);
4227 +module_param(enable_tx_capture, uint, 0644);
4229 +MODULE_PARM(bucket_len, "i");
4230 +MODULE_PARM(num_slots, "i");
4231 +MODULE_PARM(sample_rate, "i");
4232 +MODULE_PARM(transparent_mode, "i");
4233 +MODULE_PARM(enable_tx_capture, "i");
4236 +MODULE_PARM_DESC(bucket_len, "Number of ring buckets");
4237 +MODULE_PARM_DESC(num_slots, "Number of ring slots");
4238 +MODULE_PARM_DESC(sample_rate, "Ring packet sample rate");
4239 +MODULE_PARM_DESC(transparent_mode,
4240 + "Set to 1 to set transparent mode "
4241 + "(slower but backwards compatible)");
4243 +MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets");
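/*
 * [editorial note] With the 0644 permissions above (2.6.16+), these
 * parameters appear under /sys/module/ring/parameters/ and can be
 * overridden at load time, e.g. a hypothetical
 * "insmod ring.ko num_slots=8192 bucket_len=256 transparent_mode=0".
 */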
4245 +/* ********************************** */
4247 +#define MIN_QUEUED_PKTS 64
4248 +#define MAX_QUEUE_LOOPS 64
4251 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
4252 +#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk)
4253 +#define ring_sk(__sk) ((__sk)->sk_protinfo)
4255 +#define ring_sk_datatype(a) (a)
4256 +#define ring_sk(__sk) ((__sk)->protinfo.pf_ring)
4259 +#define _rdtsc() ({ uint64_t x; asm volatile("rdtsc" : "=A" (x)); x; })
4262 + int dev_queue_xmit(struct sk_buff *skb)
4264 + struct net_device *dev_get_by_name(const char *name)
4267 +/* ********************************** */
4274 +** Multi-Pattern Search Engine
4276 +** Aho-Corasick State Machine - version 2.0
4278 +** Supports both Non-Deterministic and Deterministic Finite Automata
4281 +** Reference - Efficient String matching: An Aid to Bibliographic Search
4282 +** Alfred V Aho and Margaret J Corasick
4283 +** Bell Laboratories
4284 +** Copyright(C) 1975 Association for Computing Machinery,Inc
4287 +** +++ Version 1.0 notes - Marc Norton:
4290 +** Original implementation based on the 4 algorithms in the paper by Aho & Corasick,
4291 +** some implementation ideas from 'Practical Algorithms in C', and some
4294 +** 1) Finds all occurrences of all patterns within a text.
4297 +** +++ Version 2.0 Notes - Marc Norton/Dan Roelker:
4300 +** New implementation modifies the state table storage and access model to use
4301 +** compacted sparse vector storage. Dan Roelker and I hammered this strategy out
4302 +** amongst many others in order to reduce memory usage and improve caching performance.
4303 +** The memory usage is greatly reduced; we only use 1/4 of what we used to. The caching
4304 +** performance is better in pure benchmarking tests, but does not show overall improvement
4305 +** in Snort. Unfortunately, once a pattern match test has been performed Snort moves on to doing
4306 +** many other things before we get back to a pattern match test, so the cache is voided.
4308 +** This version has better caching performance characteristics, reduced memory,
4309 +** more state table storage options, and requires no a priori case conversions.
4310 +** It does maintain the same public interface. (Snort only used banded storage).
4312 +** 1) Supports NFA and DFA state machines, and basic keyword state machines
4313 +** 2) Initial transition table uses Linked Lists
4314 +** 3) Improved state table memory options. NFA and DFA state
4315 +** transition tables are converted to one of 4 formats during compilation.
4317 +** b) Sparse matrix
4318 +** c) Banded matrix (Default-this is the only one used in snort)
4319 +** d) Sparse-Banded matrix
4320 +** 4) Added support for acstate_t in .h file so we can compile states as
4321 +** 16, or 32 bit state values for another reduction in memory consumption,
4322 +** smaller states allows more of the state table to be cached, and improves
4323 +** performance on x86-P4. Your mileage may vary, especially on risc systems.
4324 +** 5) Added a bool to each state transition list to indicate if there is a matching
4325 +** pattern in the state. This prevents us from accessing another data array
4326 +** and can improve caching/performance.
4327 +** 6) The search functions are very sensitive; don't change them without extensive testing,
4328 +** or you'll just spoil the caching and prefetching opportunities.
4330 +** Extras for fellow pattern matchers:
4331 +** The table below explains the storage format used at each step.
4332 +** You can use an NFA or DFA to match with, the NFA is slower but tiny - set the structure directly.
4333 +** You can use any of the 4 storage modes above -full,sparse,banded,sparse-bands, set the structure directly.
4334 +** For applications where you have lots of data and a pattern set to search, this version was up to 3x faster
4335 +** than the previous version, due to caching performance. This cannot be fully realized in Snort yet,
4336 +** but other applications may have better caching opportunities.
4337 +** Snort only needs to use the banded or full storage.
4339 +** Transition table format at each processing stage.
4340 +** -------------------------------------------------
4341 +** Patterns -> Keyword State Table (List)
4342 +** Keyword State Table -> NFA (List)
4343 +** NFA -> DFA (List)
4344 +** DFA (List)-> Sparse Rows O(m-avg # transitions per state)
4345 +** -> Banded Rows O(1)
4346 +** -> Sparse-Banded Rows O(nb-# bands)
4347 +** -> Full Matrix O(1)
4349 +** Copyright(C) 2002,2003,2004 Marc Norton
4350 +** Copyright(C) 2003,2004 Daniel Roelker
4351 +** Copyright(C) 2002,2003,2004 Sourcefire,Inc.
4353 +** This program is free software; you can redistribute it and/or modify
4354 +** it under the terms of the GNU General Public License as published by
4355 +** the Free Software Foundation; either version 2 of the License, or
4356 +** (at your option) any later version.
4358 +** This program is distributed in the hope that it will be useful,
4359 +** but WITHOUT ANY WARRANTY; without even the implied warranty of
4360 +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4361 +** GNU General Public License for more details.
4363 +** You should have received a copy of the GNU General Public License
4364 +** along with this program; if not, write to the Free Software
4365 +** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
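/*
 * [editorial note] A worked example of the goto/failure construction the
 * routines below implement, for the classic pattern set {he, she, his, hers}:
 *
 *   goto:    0 -h-> 1 -e-> 2 (he)       1 -i-> 6 -s-> 7 (his)
 *            0 -s-> 3 -h-> 4 -e-> 5 (she)     2 -r-> 8 -s-> 9 (hers)
 *   failure: f(4)=1 ("sh" falls back to "h"); f(5)=2 ("she" ends in "he",
 *            so state 5 also reports "he"); f(7)=3; f(9)=3; every other
 *            state fails to 0.
 *
 * Scanning "ushers" visits states 0,0,3,4,5,8,9 and reports "she" and
 * "he" at state 5, then "hers" at state 9. Build_NFA() below computes
 * exactly these failure links breadth-first; a later DFA-conversion pass
 * folds them into direct transitions.
 */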
4372 +#define MEMASSERT(p,s) if(!p){printk("ACSM-No Memory: %s!\n",s);}
4377 +static int max_memory = 0;
4382 +typedef struct acsm_summary_s
4384 + unsigned num_states;
4385 + unsigned num_transitions;
4386 + ACSM_STRUCT2 acsm;
4393 +static acsm_summary_t summary={0,0};
4396 +** Case Translation Table
4398 +static unsigned char xlatcase[256];
4403 +inline int toupper(int ch) {
4404 + if ( (unsigned int)(ch - 'a') < 26u )
4409 +static void init_xlatcase(void)
4412 + for (i = 0; i < 256; i++)
4414 + xlatcase[i] = toupper(i);
4424 +ConvertCaseEx (unsigned char *d, unsigned char *s, int m)
4432 + for (i = 0; i < m; i++ )
4434 + d[0] = xlatcase[ s[0] ];
4435 + d[2] = xlatcase[ s[2] ];
4436 + d[1] = xlatcase[ s[1] ];
4437 + d[3] = xlatcase[ s[3] ];
4442 + for (i=0; i < n; i++)
4444 + d[i] = xlatcase[ s[i] ];
4447 + for (i=0; i < m; i++)
4449 + d[i] = xlatcase[ s[i] ];
4463 + p = kmalloc (n, GFP_KERNEL);
4482 + * Simple QUEUE NODE
4484 +typedef struct _qnode
4487 + struct _qnode *next;
4492 + * Simple QUEUE Structure
4494 +typedef struct _queue
4496 + QNODE * head, *tail;
4502 + * Initialize the queue
4505 +queue_init (QUEUE * s)
4507 + s->head = s->tail = 0;
4512 + * Find a State in the queue
4515 +queue_find (QUEUE * s, int state)
4521 + if( q->state == state ) return 1;
4528 + * Add Tail Item to queue (FiFo/LiLo)
4531 +queue_add (QUEUE * s, int state)
4535 + if( queue_find( s, state ) ) return;
4539 + q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));
4540 + MEMASSERT (q, "queue_add");
4546 + q = (QNODE *) AC_MALLOC (sizeof (QNODE));
4549 + s->tail->next = q;
4557 + * Remove Head Item from queue
4560 +queue_remove (QUEUE * s)
4568 + s->head = s->head->next;
4583 + * Return items in the queue
4586 +queue_count (QUEUE * s)
4596 +queue_free (QUEUE * s)
4598 + while (queue_count (s))
4605 + * Get Next State-NFA
4608 +int List_GetNextState( ACSM_STRUCT2 * acsm, int state, int input )
4610 + trans_node_t * t = acsm->acsmTransTable[state];
4614 + if( t->key == input )
4616 + return t->next_state;
4621 + if( state == 0 ) return 0;
4623 + return ACSM_FAIL_STATE2; /* Fail state ??? */
4627 + * Get Next State-DFA
4630 +int List_GetNextState2( ACSM_STRUCT2 * acsm, int state, int input )
4632 + trans_node_t * t = acsm->acsmTransTable[state];
4636 + if( t->key == input )
4638 + return t->next_state;
4643 + return 0; /* default state */
4646 + * Put Next State - Head insertion, and transition updates
4649 +int List_PutNextState( ACSM_STRUCT2 * acsm, int state, int input, int next_state )
4652 + trans_node_t * tnew;
4654 + // printk(" List_PutNextState: state=%d, input='%c', next_state=%d\n",state,input,next_state);
4657 + /* Check if the transition already exists, if so just update the next_state */
4658 + p = acsm->acsmTransTable[state];
4661 + if( p->key == input ) /* transition already exists- reset the next state */
4663 + p->next_state = next_state;
4669 + /* Definitely not an existing transition - add it */
4670 + tnew = (trans_node_t*)AC_MALLOC(sizeof(trans_node_t));
4671 + if( !tnew ) return -1;
4673 + tnew->key = input;
4674 + tnew->next_state = next_state;
4677 + tnew->next = acsm->acsmTransTable[state];
4678 + acsm->acsmTransTable[state] = tnew;
4680 + acsm->acsmNumTrans++;
4685 + * Free the entire transition table
4688 +int List_FreeTransTable( ACSM_STRUCT2 * acsm )
4691 + trans_node_t * t, *p;
4693 + if( !acsm->acsmTransTable ) return 0;
4695 + for(i=0;i< acsm->acsmMaxStates;i++)
4697 + t = acsm->acsmTransTable[i];
4704 + max_memory -= sizeof(trans_node_t);
4708 + kfree(acsm->acsmTransTable);
4710 + max_memory -= sizeof(void*) * acsm->acsmMaxStates;
4712 + acsm->acsmTransTable = 0;
4722 + int List_FreeList( trans_node_t * t )
4733 + max_memory -= sizeof(trans_node_t);
4742 + * Converts a row of states from list format to a full vector format
4745 +int List_ConvToFull(ACSM_STRUCT2 * acsm, acstate_t state, acstate_t * full )
4748 + trans_node_t * t = acsm->acsmTransTable[ state ];
4750 + memset(full,0,sizeof(acstate_t)*acsm->acsmAlphabetSize);
4752 + if( !t ) return 0;
4756 + full[ t->key ] = t->next_state;
4764 + * Copy a Match List Entry - don't dup the pattern data
4766 +static ACSM_PATTERN2*
4767 +CopyMatchListEntry (ACSM_PATTERN2 * px)
4769 + ACSM_PATTERN2 * p;
4771 + p = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
4772 + MEMASSERT (p, "CopyMatchListEntry");
4774 + memcpy (p, px, sizeof (ACSM_PATTERN2));
4782 + * Check if a pattern is in the list already,
4783 + * validate it using the 'id' field. This must be unique
4784 + * for every pattern.
4788 + int FindMatchListEntry (ACSM_STRUCT2 * acsm, int state, ACSM_PATTERN2 * px)
4790 + ACSM_PATTERN2 * p;
4792 + p = acsm->acsmMatchList[state];
4795 + if( p->id == px->id ) return 1;
4805 + * Add a pattern to the list of patterns terminated at this state.
4806 + * Insert at front of list.
4809 +AddMatchListEntry (ACSM_STRUCT2 * acsm, int state, ACSM_PATTERN2 * px)
4811 + ACSM_PATTERN2 * p;
4813 + p = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
4815 + MEMASSERT (p, "AddMatchListEntry");
4817 + memcpy (p, px, sizeof (ACSM_PATTERN2));
4819 + p->next = acsm->acsmMatchList[state];
4821 + acsm->acsmMatchList[state] = p;
4826 +AddPatternStates (ACSM_STRUCT2 * acsm, ACSM_PATTERN2 * p)
4828 + int state, next, n;
4829 + unsigned char *pattern;
4832 + pattern = p->patrn;
4836 + * Match up pattern with existing states
4838 + for (; n > 0; pattern++, n--)
4840 + next = List_GetNextState(acsm,state,*pattern);
4841 + if (next == ACSM_FAIL_STATE2 || next == 0)
4849 + * Add new states for the rest of the pattern bytes, 1 state per byte
4851 + for (; n > 0; pattern++, n--)
4853 + acsm->acsmNumStates++;
4854 + List_PutNextState(acsm,state,*pattern,acsm->acsmNumStates);
4855 + state = acsm->acsmNumStates;
4858 + AddMatchListEntry (acsm, state, p );
4862 + * Build A Non-Deterministic Finite Automata
4863 + * The keyword state table must already be built, via AddPatternStates().
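+ *
+ * Worked example (classic Aho-Corasick, for illustration only): for the
+ * patterns "he", "she", "his" and "hers", the trie state reached by "sh"
+ * fails back to the state for "h", and the state for "she" fails back to
+ * the state for "he" -- so matching "she" also reports the suffix pattern
+ * "he", via the MatchList copy performed below.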
4866 +Build_NFA (ACSM_STRUCT2 * acsm)
4869 + QUEUE q, *queue = &q;
4870 + acstate_t * FailState = acsm->acsmFailState;
4871 + ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
4872 + ACSM_PATTERN2 * mlist,* px;
4874 + /* Init a Queue */
4875 + queue_init (queue);
4878 + /* Add the state 0 transitions first: the states at depth 1 fail back to state 0 */
4879 + for (i = 0; i < acsm->acsmAlphabetSize; i++)
4881 + s = List_GetNextState2(acsm,0,i);
4884 + queue_add (queue, s);
4889 + /* Build the fail states for each successive layer of transitions */
4890 + while (queue_count (queue) > 0)
4892 + r = queue_remove (queue);
4894 + /* Find Final States for any Failure */
4895 + for (i = 0; i < acsm->acsmAlphabetSize; i++)
4899 + s = List_GetNextState(acsm,r,i);
4901 + if( s != ACSM_FAIL_STATE2 )
4903 + queue_add (queue, s);
4905 + fs = FailState[r];
4908 + * Locate the next valid state for 'i' starting at fs
4910 + while( (next=List_GetNextState(acsm,fs,i)) == ACSM_FAIL_STATE2 )
4912 + fs = FailState[fs];
4916 + * Update the failure state of 's' to point to the next valid state
4918 + FailState[s] = next;
4921 + * Copy the 'next' state's MatchList to the 's' state's MatchList;
4922 + * we copy them so each list can be AC_FREE'd later,
4923 + * else we could just manipulate pointers to fake the copy.
4925 + for( mlist = MatchList[next];
4927 + mlist = mlist->next)
4929 + px = CopyMatchListEntry (mlist);
4931 + /* Insert at front of MatchList */
4932 + px->next = MatchList[s];
4933 + MatchList[s] = px;
4939 + /* Clean up the queue */
4940 + queue_free (queue);
4944 + * Build Deterministic Finite Automata from the NFA
4947 +Convert_NFA_To_DFA (ACSM_STRUCT2 * acsm)
4949 + int i, r, s, cFailState;
4950 + QUEUE q, *queue = &q;
4951 + acstate_t * FailState = acsm->acsmFailState;
4953 + /* Init a Queue */
4954 + queue_init (queue);
4956 + /* Add the state 0 transitions 1st */
4957 + for(i=0; i<acsm->acsmAlphabetSize; i++)
4959 + s = List_GetNextState(acsm,0,i);
4962 + queue_add (queue, s);
4966 + /* Start building the next layer of transitions */
4967 + while( queue_count(queue) > 0 )
4969 + r = queue_remove(queue);
4971 + /* Process this state's layer of transitions */
4972 + for (i = 0; i < acsm->acsmAlphabetSize; i++)
4974 + s = List_GetNextState(acsm,r,i);
4976 + if( s != ACSM_FAIL_STATE2 && s!= 0)
4978 + queue_add (queue, s);
4982 + cFailState = List_GetNextState(acsm,FailState[r],i);
4984 + if( cFailState != 0 && cFailState != ACSM_FAIL_STATE2 )
4986 + List_PutNextState(acsm,r,i,cFailState);
4992 + /* Clean up the queue */
4993 + queue_free (queue);
4998 + * Convert the row lists of the state table to a full vector format
5002 +Conv_List_To_Full(ACSM_STRUCT2 * acsm)
5006 + acstate_t ** NextState = acsm->acsmNextState;
5008 + for(k=0;k<acsm->acsmMaxStates;k++)
5010 + p = AC_MALLOC( sizeof(acstate_t) * (acsm->acsmAlphabetSize+2) );
5013 + tcnt = List_ConvToFull( acsm, (acstate_t)k, p+2 );
5016 + p[1] = 0; /* no matches yet */
5018 + NextState[k] = p; /* now we have a full format row vector */
5025 + * Convert DFA memory usage from list-based storage to sparse-row storage.
5027 + * The Sparse format allows each row to be either full or sparse formatted. If the sparse row has
5028 + * too many transitions, performance or space may dictate that we use the standard full formatting
5029 + * for the row. More than 5 or 10 transitions per state can hurt performance badly, so the
5030 + * user can specify the maximum number of transitions per state allowed in the sparse format.
5032 + * Standard Full Matrix Format
5033 + * ---------------------------
5034 + * acstate_t ** NextState ( 1st index is row/state, 2nd index is column=event/input)
5038 + * events -> a b c d e f g h i j k l m n o p
5040 + * N 1 7 0 0 0 3 0 0 0 0 0 0 0 0 0 0
5042 + * Sparse Format, each row : Words Value
5043 + * 1-1 fmt(0-full,1-sparse,2-banded,3-sparsebands)
5044 + * 2-2 bool match flag (indicates this state has pattern matches)
5045 + * 3-3 sparse state count ( # of input/next-state pairs )
5046 + * 4-3+2*cnt 'input,next-state' pairs... each sizeof(acstate_t)
5048 + * above example case yields:
5049 + * Full Format: 0, 0, 1 7 0 0 0 3 0 0 0 0 0 0 0 0 0 0 ... (fmt, match flag, then the next states)
5050 + * Sparse format: 1, 0, 3, 'a',1,'b',7,'f',3 - uses 3+2*ntransitions words (non-default transitions only)
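+ *
+ * A minimal lookup over one sparse-formatted row, following the word layout
+ * above (illustrative sketch, not part of the patch; 'ps' points at the row):
+ *
+ *   acstate_t n = ps[2];                 -- number of input/next-state pairs
+ *   for (ps += 3; n > 0; n--, ps += 2)
+ *     if (ps[0] == input) return ps[1];  -- matching pair -> next state
+ *   return 0;                            -- default state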
5053 +Conv_Full_DFA_To_Sparse(ACSM_STRUCT2 * acsm)
5056 + acstate_t * p, state, maxstates=0;
5057 + acstate_t ** NextState = acsm->acsmNextState;
5058 + acstate_t full[MAX_ALPHABET_SIZE];
5060 + for(k=0;k<acsm->acsmMaxStates;k++)
5064 + List_ConvToFull(acsm, (acstate_t)k, full );
5066 + for (i = 0; i < acsm->acsmAlphabetSize; i++)
5069 + if( state != 0 && state != ACSM_FAIL_STATE2 ) cnt++;
5072 + if( cnt > 0 ) maxstates++;
5074 + if( k == 0 || cnt > acsm->acsmSparseMaxRowNodes )
5076 + p = AC_MALLOC(sizeof(acstate_t)*(acsm->acsmAlphabetSize+2) );
5081 + memcpy(&p[2],full,acsm->acsmAlphabetSize*sizeof(acstate_t));
5085 + p = AC_MALLOC(sizeof(acstate_t)*(3+2*cnt));
5089 + p[m++] = ACF_SPARSE;
5090 + p[m++] = 0; /* no matches */
5093 + for(i = 0; i < acsm->acsmAlphabetSize ; i++)
5096 + if( state != 0 && state != ACSM_FAIL_STATE2 )
5104 + NextState[k] = p; /* now we are a sparse formatted state transition array */
5110 + Convert Full matrix to Banded row format.
5114 + 2 n number of values
5115 + 3 i index of 1st value (0-255)
5116 + 4 - 3+n next-state values at each index
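+
+ Lookup is then a simple range check (illustrative sketch; 'ps' points at
+ the row and 'c' is the input byte):
+
+   n = ps[2]; first = ps[3];
+   next = (c < first || c >= first + n) ? 0 : ps[4 + c - first];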
5120 +Conv_Full_DFA_To_Banded(ACSM_STRUCT2 * acsm)
5122 + int first = -1, last;
5123 + acstate_t * p, state, full[MAX_ALPHABET_SIZE];
5124 + acstate_t ** NextState = acsm->acsmNextState;
5127 + for(k=0;k<acsm->acsmMaxStates;k++)
5131 + List_ConvToFull(acsm, (acstate_t)k, full );
5136 + for (i = 0; i < acsm->acsmAlphabetSize; i++)
5140 + if( state !=0 && state != ACSM_FAIL_STATE2 )
5142 + if( first < 0 ) first = i;
5147 + /* calc band width */
5148 + cnt= last - first + 1;
5150 + p = AC_MALLOC(sizeof(acstate_t)*(4+cnt));
5155 + p[m++] = ACF_BANDED;
5156 + p[m++] = 0; /* no matches */
5160 + for(i = first; i <= last; i++)
5165 + NextState[k] = p; /* now we are a banded formatted state transition array */
5172 + * Convert full matrix to Sparse Band row format.
5174 + * next - Full formatted row of next states
5175 + * asize - size of alphabet
5176 + * zcnt - max number of zeros in a run of zeros in any given band.
5179 + * 1 ACF_SPARSEBANDS
5180 + * 2 number of bands
5181 + * repeat 3 - 5+ ....once for each band in this row.
5182 + * 3 number of items in this band
+ * 4 start index of this band
5183 + * 5- next-state values in this band...
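+ *
+ * Example (illustrative): with zmax = 2, a row whose only non-zero next
+ * states sit at inputs 'a'..'c' and 'x'..'z' is split into two bands,
+ * begin/end = ('a','c') and ('x','z'), because the run of zeros between
+ * them is longer than zmax.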
5186 +int calcSparseBands( acstate_t * next, int * begin, int * end, int asize, int zmax )
5188 + int i, nbands,zcnt,last=0;
5192 + for( i=0; i<asize; i++ )
5196 + if( state !=0 && state != ACSM_FAIL_STATE2 )
5198 + begin[nbands] = i;
5201 + for( ; i< asize; i++ )
5204 + if( state ==0 || state == ACSM_FAIL_STATE2 )
5207 + if( zcnt > zmax ) break;
5216 + end[nbands++] = last;
5230 + * 1 SPARSEBANDS format indicator
5231 + * 2 bool indicates a pattern match in this state
5232 + * 3 number of sparse bands
5233 + * 4 number of elements in this band
5234 + * 5 start index of this band
5235 + * 6- list of next states
5237 + * m number of elements in this band
5238 + * m+1 start index of this band
5239 + * m+2- list of next states
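+ *
+ * Decoding sketch (illustrative; it mirrors SparseGetNextStateDFA below):
+ *   nb = ps[2]; ps += 3;
+ *   while (nb-- > 0) {              -- walk the bands in order
+ *     n = ps[0]; index = ps[1];     -- band length and first input
+ *     if (input < index) return 0;  -- bands are sorted: no match possible
+ *     if (input < index + n) return ps[2 + input - index];
+ *     ps += n + 2;                  -- skip to the next band
+ *   }
+ *   return 0;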
5242 +Conv_Full_DFA_To_SparseBands(ACSM_STRUCT2 * acsm)
5245 + acstate_t ** NextState = acsm->acsmNextState;
5246 + int cnt,m,k,i,zcnt=acsm->acsmSparseMaxZcnt;
5248 + int band_begin[MAX_ALPHABET_SIZE];
5249 + int band_end[MAX_ALPHABET_SIZE];
5251 + acstate_t full[MAX_ALPHABET_SIZE];
5253 + for(k=0;k<acsm->acsmMaxStates;k++)
5257 + List_ConvToFull(acsm, (acstate_t)k, full );
5259 + nbands = calcSparseBands( full, band_begin, band_end, acsm->acsmAlphabetSize, zcnt );
5261 + /* calc band width space*/
5263 + for(i=0;i<nbands;i++)
5266 + cnt += band_end[i] - band_begin[i] + 1;
5268 + /*printk("state %d: sparseband %d, first=%d, last=%d, cnt=%d\n",k,i,band_begin[i],band_end[i],band_end[i]-band_begin[i]+1); */
5271 + p = AC_MALLOC(sizeof(acstate_t)*(cnt));
5276 + p[m++] = ACF_SPARSEBANDS;
5277 + p[m++] = 0; /* no matches */
5280 + for( i=0;i<nbands;i++ )
5282 + p[m++] = band_end[i] - band_begin[i] + 1; /* # states in this band */
5283 + p[m++] = band_begin[i]; /* start index */
5285 + for( j=band_begin[i]; j<=band_end[i]; j++ )
5287 + p[m++] = full[j]; /* some states may be state zero */
5291 + NextState[k] = p; /* now we are a sparse-banded formatted state transition array */
5299 + * Convert an NFA or DFA row from sparse to full format
5300 + * and store into the 'full' buffer.
5303 + * 0 - failed, no state transitions
5304 + * *p - pointer to 'full' buffer
5309 + acstate_t * acsmConvToFull(ACSM_STRUCT2 * acsm, acstate_t k, acstate_t * full )
5312 + acstate_t * p, n, fmt, index, nb, bmatch;
5313 + acstate_t ** NextState = acsm->acsmNextState;
5317 + if( !p ) return 0;
5323 + if( fmt == ACF_SPARSE )
5326 + for( ; n>0; n--, p+=2 )
5328 + full[ p[0] ] = p[1];
5331 + else if( fmt == ACF_BANDED )
5337 + for( ; n>0; n--, p++ )
5339 + full[ index++ ] = p[0];
5342 + else if( fmt == ACF_SPARSEBANDS )
5349 + for( ; n>0; n--, p++ )
5351 + full[ index++ ] = p[0];
5355 + else if( fmt == ACF_FULL )
5357 + memcpy(full,p,acsm->acsmAlphabetSize*sizeof(acstate_t));
5365 + * Select the desired storage mode
5367 +int acsmSelectFormat2( ACSM_STRUCT2 * acsm, int m )
5374 + case ACF_SPARSEBANDS:
5375 + acsm->acsmFormat = m;
5386 +void acsmSetMaxSparseBandZeros2( ACSM_STRUCT2 * acsm, int n )
5388 + acsm->acsmSparseMaxZcnt = n;
5393 +void acsmSetMaxSparseElements2( ACSM_STRUCT2 * acsm, int n )
5395 + acsm->acsmSparseMaxRowNodes = n;
5400 +int acsmSelectFSA2( ACSM_STRUCT2 * acsm, int m )
5407 + acsm->acsmFSA = m;
5415 +int acsmSetAlphabetSize2( ACSM_STRUCT2 * acsm, int n )
5417 + if( n <= MAX_ALPHABET_SIZE )
5419 + acsm->acsmAlphabetSize = n;
5428 + * Create a new AC state machine
5430 +static ACSM_STRUCT2 * acsmNew2 (void)
5436 + p = (ACSM_STRUCT2 *) AC_MALLOC(sizeof (ACSM_STRUCT2));
5437 + MEMASSERT (p, "acsmNew");
5441 + memset (p, 0, sizeof (ACSM_STRUCT2));
5443 + /* Some defaults */
5444 + p->acsmFSA = FSA_DFA;
5445 + p->acsmFormat = ACF_BANDED;
5446 + p->acsmAlphabetSize = 256;
5447 + p->acsmSparseMaxRowNodes = 256;
5448 + p->acsmSparseMaxZcnt = 10;
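+
+/*
+ * Typical lifecycle of a state machine built with this API (illustrative
+ * sketch, compiled out; the pattern, callback and buffer below are
+ * hypothetical examples, not part of the patch):
+ */
+#if 0
+static int my_match(void *id, int index, void *data) { return 0; /* 0 = keep searching */ }
+
+static void acsm_example(void)
+{
+  ACSM_STRUCT2 *acsm = acsmNew2();
+  acsmSelectFormat2(acsm, ACF_FULL);   /* or ACF_BANDED / ACF_SPARSE / ACF_SPARSEBANDS */
+  acsmSelectFSA2(acsm, FSA_DFA);
+  acsmAddPattern2(acsm, (unsigned char *)"GET", 3, 1 /* nocase */, 0, 0, NULL, 0);
+  acsmCompile2(acsm);
+  acsmSearch2(acsm, (unsigned char *)"get /index.html", 15, my_match, NULL);
+  acsmFree2(acsm);
+}
+#endif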
5454 + * Add a pattern to the list of patterns for this state machine
5458 +acsmAddPattern2 (ACSM_STRUCT2 * p, unsigned char *pat, int n, int nocase,
5459 + int offset, int depth, void * id, int iid)
5461 + ACSM_PATTERN2 * plist;
5463 + plist = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
5464 + MEMASSERT (plist, "acsmAddPattern");
5466 + plist->patrn = (unsigned char *) AC_MALLOC ( n );
5467 + MEMASSERT (plist->patrn, "acsmAddPattern");
5469 + ConvertCaseEx(plist->patrn, pat, n);
5471 + plist->casepatrn = (unsigned char *) AC_MALLOC ( n );
5472 + MEMASSERT (plist->casepatrn, "acsmAddPattern");
5474 + memcpy (plist->casepatrn, pat, n);
5477 + plist->nocase = nocase;
5478 + plist->offset = offset;
5479 + plist->depth = depth;
5483 + plist->next = p->acsmPatterns;
5484 + p->acsmPatterns = plist;
5489 + * Add a Key to the list of key+data pairs
5491 +int acsmAddKey2(ACSM_STRUCT2 * p, unsigned char *key, int klen, int nocase, void * data)
5493 + ACSM_PATTERN2 * plist;
5495 + plist = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
5496 + MEMASSERT (plist, "acsmAddPattern");
5498 + plist->patrn = (unsigned char *) AC_MALLOC (klen);
5499 + memcpy (plist->patrn, key, klen);
5501 + plist->casepatrn = (unsigned char *) AC_MALLOC (klen);
5502 + memcpy (plist->casepatrn, key, klen);
5505 + plist->nocase = nocase;
5506 + plist->offset = 0;
5511 + plist->next = p->acsmPatterns;
5512 + p->acsmPatterns = plist;
5518 + * Copy the boolean match flag into the NextState table, for caching purposes.
5521 +void acsmUpdateMatchStates( ACSM_STRUCT2 * acsm )
5524 + acstate_t ** NextState = acsm->acsmNextState;
5525 + ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
5527 + for( state=0; state<acsm->acsmNumStates; state++ )
5529 + if( MatchList[state] )
5531 + NextState[state][1] = 1;
5535 + NextState[state][1] = 0;
5541 + * Compile State Machine - NFA or DFA and Full or Banded or Sparse or SparseBands
5544 +acsmCompile2 (ACSM_STRUCT2 * acsm)
5547 + ACSM_PATTERN2 * plist;
5549 + /* Count number of states */
5550 + for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
5552 + acsm->acsmMaxStates += plist->n;
5553 + /* acsm->acsmMaxStates += plist->n*2; if we handle case in the table */
5555 + acsm->acsmMaxStates++; /* one extra */
5557 + /* Alloc a List based State Transition table */
5558 + acsm->acsmTransTable =(trans_node_t**) AC_MALLOC(sizeof(trans_node_t*) * acsm->acsmMaxStates );
5559 + MEMASSERT (acsm->acsmTransTable, "acsmCompile");
5561 + memset (acsm->acsmTransTable, 0, sizeof(trans_node_t*) * acsm->acsmMaxStates);
5563 + /* Alloc a failure table - this has a failure state, and a match list for each state */
5564 + acsm->acsmFailState =(acstate_t*) AC_MALLOC(sizeof(acstate_t) * acsm->acsmMaxStates );
5565 + MEMASSERT (acsm->acsmFailState, "acsmCompile");
5567 + memset (acsm->acsmFailState, 0, sizeof(acstate_t) * acsm->acsmMaxStates );
5569 + /* Alloc a MatchList table - this has a list of pattern matches for each state, if any */
5570 + acsm->acsmMatchList=(ACSM_PATTERN2**) AC_MALLOC(sizeof(ACSM_PATTERN2*) * acsm->acsmMaxStates );
5571 + MEMASSERT (acsm->acsmMatchList, "acsmCompile");
5573 + memset (acsm->acsmMatchList, 0, sizeof(ACSM_PATTERN2*) * acsm->acsmMaxStates );
5575 + /* Alloc a separate state transition table: in state 's', on event 'k', transition to the 'next' state */
5576 + acsm->acsmNextState=(acstate_t**)AC_MALLOC( acsm->acsmMaxStates * sizeof(acstate_t*) );
5577 + MEMASSERT(acsm->acsmNextState, "acsmCompile-NextState");
5579 + for (k = 0; k < acsm->acsmMaxStates; k++)
5581 + acsm->acsmNextState[k]=(acstate_t*)0;
5584 + /* Initialize state zero as a branch */
5585 + acsm->acsmNumStates = 0;
5587 + /* Add the 0'th state, */
5588 + //acsm->acsmNumStates++;
5590 + /* Add each Pattern to the State Table - This forms a keywords state table */
5591 + for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
5593 + AddPatternStates (acsm, plist);
5596 + acsm->acsmNumStates++;
5598 + if( acsm->acsmFSA == FSA_DFA || acsm->acsmFSA == FSA_NFA )
5600 + /* Build the NFA */
5604 + if( acsm->acsmFSA == FSA_DFA )
5606 + /* Convert the NFA to a DFA */
5607 + Convert_NFA_To_DFA (acsm);
5612 + * Select Final Transition Table Storage Mode
5615 + if( acsm->acsmFormat == ACF_SPARSE )
5617 + /* Convert DFA Full matrix to a Sparse matrix */
5618 + if( Conv_Full_DFA_To_Sparse(acsm) )
5622 + else if( acsm->acsmFormat == ACF_BANDED )
5624 + /* Convert DFA Full matrix to a Sparse matrix */
5625 + if( Conv_Full_DFA_To_Banded(acsm) )
5629 + else if( acsm->acsmFormat == ACF_SPARSEBANDS )
5631 + /* Convert DFA Full matrix to a Sparse matrix */
5632 + if( Conv_Full_DFA_To_SparseBands(acsm) )
5635 + else if( acsm->acsmFormat == ACF_FULL )
5637 + if( Conv_List_To_Full( acsm ) )
5641 + acsmUpdateMatchStates( acsm ); /* load boolean match flags into state table */
5643 + /* Free up the Table Of Transition Lists */
5644 + List_FreeTransTable( acsm );
5646 + /* For now -- show this info */
5648 + * acsmPrintInfo( acsm );
5652 + /* Accrue Summary State Stats */
5653 + summary.num_states += acsm->acsmNumStates;
5654 + summary.num_transitions += acsm->acsmNumTrans;
5656 + memcpy( &summary.acsm, acsm, sizeof(ACSM_STRUCT2));
5662 + * Get the NextState from the NFA, all NFA storage formats use this
5665 +acstate_t SparseGetNextStateNFA(acstate_t * ps, acstate_t state, unsigned input)
5674 + ps++; /* skip bMatchState */
5683 + if( input < index )
5691 + return (acstate_t)ACSM_FAIL_STATE2;
5694 + if( input >= index + n )
5702 + return (acstate_t)ACSM_FAIL_STATE2;
5705 + if( ps[input-index] == 0 )
5709 + return ACSM_FAIL_STATE2;
5713 + return (acstate_t) ps[input-index];
5718 + n = *ps++; /* number of sparse index-value entries */
5720 + for( ; n>0 ; n-- )
5722 + if( ps[0] > input ) /* cannot match the input, already a higher value than the input */
5724 + return (acstate_t)ACSM_FAIL_STATE2; /* default state */
5726 + else if( ps[0] == input )
5728 + return ps[1]; /* next state */
5736 + return ACSM_FAIL_STATE2;
5739 + case ACF_SPARSEBANDS:
5741 + nb = *ps++; /* number of bands */
5743 + while( nb > 0 ) /* for each band */
5745 + n = *ps++; /* number of elements */
5746 + index = *ps++; /* 1st element value */
5748 + if( input < index )
5752 + return (acstate_t)ACSM_FAIL_STATE2;
5754 + return (acstate_t)0;
5756 + if( (input >= index) && (input < (index + n)) )
5758 + if( ps[input-index] == 0 )
5762 + return ACSM_FAIL_STATE2;
5765 + return (acstate_t) ps[input-index];
5772 + return (acstate_t)ACSM_FAIL_STATE2;
5774 + return (acstate_t)0;
5779 + if( ps[input] == 0 )
5783 + return ACSM_FAIL_STATE2;
5796 + * Get the NextState from the DFA Next State Transition table
5797 + * Full and banded are supported separately, this is for
5798 + * sparse and sparse-bands
5801 +acstate_t SparseGetNextStateDFA(acstate_t * ps, acstate_t state, unsigned input)
5811 + /* n=ps[2] : number of entries in the band */
5812 + /* index=ps[3] : index of the 1st entry, sequential thereafter */
5814 + if( input < ps[3] ) return 0;
5815 + if( input >= (ps[3]+ps[2]) ) return 0;
5817 + return ps[4+input-ps[3]];
5823 + return ps[2+input];
5829 + n = ps[2]; /* number of entries/ key+next pairs */
5833 + for( ; n>0 ; n-- )
5835 + if( input < ps[0] ) /* cannot match the input, already a higher value than the input */
5837 + return (acstate_t)0; /* default state */
5839 + else if( ps[0] == input )
5841 + return ps[1]; /* next state */
5845 + return (acstate_t)0;
5850 + case ACF_SPARSEBANDS:
5852 + nb = ps[2]; /* number of bands */
5856 + while( nb > 0 ) /* for each band */
5858 + n = ps[0]; /* number of elements in this band */
5859 + index = ps[1]; /* start index/char of this band */
5860 + if( input < index )
5862 + return (acstate_t)0;
5864 + if( (input < (index + n)) )
5866 + return (acstate_t) ps[2+input-index];
5871 + return (acstate_t)0;
5878 + * Search Text or Binary Data for Pattern matches
5880 + * Sparse & Sparse-Banded Matrix search
5885 +acsmSearchSparseDFA(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
5886 + int (*Match) (void * id, int index, void *data),
5890 + ACSM_PATTERN2 * mlist;
5891 + unsigned char * Tend;
5893 + unsigned char * T, * Tc;
5895 + acstate_t ** NextState = acsm->acsmNextState;
5896 + ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
5902 + for( state = 0; T < Tend; T++ )
5904 + state = SparseGetNextStateDFA ( NextState[state], state, xlatcase[*T] );
5906 + /* test if this state has any matching patterns */
5907 + if( NextState[state][1] )
5909 + for( mlist = MatchList[state];
5911 + mlist = mlist->next )
5913 + index = T - mlist->n - Tc;
5914 + if( mlist->nocase )
5917 + if (Match (mlist->id, index, data))
5922 + if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
5925 + if (Match (mlist->id, index, data))
5935 + * Full format DFA search
5936 + * Do not change anything here without testing; caching and prefetching
5937 + * performance is very sensitive to any changes.
5940 + * 1) replaced ConvertCaseEx with inline xlatcase - this improves performance 5-10%
5941 + * 2) using 'nocase' improves performance again by 10-15%, since memcmp is not needed
5947 +acsmSearchSparseDFA_Full(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
5948 + int (*Match) (void * id, int index, void *data),
5951 + ACSM_PATTERN2 * mlist;
5952 + unsigned char * Tend;
5953 + unsigned char * T;
5958 + acstate_t ** NextState = acsm->acsmNextState;
5959 + ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
5965 + for( state = 0; T < Tend; T++ )
5967 + ps = NextState[ state ];
5969 + sindex = xlatcase[ T[0] ];
5971 + /* check the current state for a pattern match */
5974 + for( mlist = MatchList[state];
5976 + mlist = mlist->next )
5978 + index = T - mlist->n - Tx;
5981 + if( mlist->nocase )
5984 + if (Match (mlist->id, index, data))
5989 + if( memcmp (mlist->casepatrn, Tx + index, mlist->n ) == 0 )
5992 + if (Match (mlist->id, index, data))
6000 + state = ps[ 2u + sindex ];
6003 + /* Check the last state for a pattern match */
6004 + for( mlist = MatchList[state];
6006 + mlist = mlist->next )
6008 + index = T - mlist->n - Tx;
6010 + if( mlist->nocase )
6013 + if (Match (mlist->id, index, data))
6018 + if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6021 + if (Match (mlist->id, index, data))
6030 + * Banded-Row format DFA search
6031 + * Do not change anything here; caching and prefetching
6032 + * performance is very sensitive to any changes.
6034 + * ps[0] = storage fmt
6035 + * ps[1] = bool match flag
6036 + * ps[2] = # elements in band
6037 + * ps[3] = index of 1st element
6042 +acsmSearchSparseDFA_Banded(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
6043 + int (*Match) (void * id, int index, void *data),
6047 + unsigned char * Tend;
6048 + unsigned char * T;
6051 + acstate_t ** NextState = acsm->acsmNextState;
6052 + ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
6053 + ACSM_PATTERN2 * mlist;
6060 + for( state = 0; T < Tend; T++ )
6062 + ps = NextState[state];
6064 + sindex = xlatcase[ T[0] ];
6066 + /* test if this state has any matching patterns */
6069 + for( mlist = MatchList[state];
6071 + mlist = mlist->next )
6073 + index = T - mlist->n - Tx;
6075 + if( mlist->nocase )
6078 + if (Match (mlist->id, index, data))
6083 + if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6086 + if (Match (mlist->id, index, data))
6093 + if( sindex < ps[3] ) state = 0;
6094 + else if( sindex >= (ps[3] + ps[2]) ) state = 0;
6095 + else state = ps[ 4u + sindex - ps[3] ];
6098 + /* Check the last state for a pattern match */
6099 + for( mlist = MatchList[state];
6101 + mlist = mlist->next )
6103 + index = T - mlist->n - Tx;
6105 + if( mlist->nocase )
6108 + if (Match (mlist->id, index, data))
6113 + if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6116 + if (Match (mlist->id, index, data))
6128 + * Search Text or Binary Data for Pattern matches
6130 + * Sparse Storage Version
6135 +acsmSearchSparseNFA(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
6136 + int (*Match) (void * id, int index, void *data),
6140 + ACSM_PATTERN2 * mlist;
6141 + unsigned char * Tend;
6143 + unsigned char * T, *Tc;
6145 + acstate_t ** NextState= acsm->acsmNextState;
6146 + acstate_t * FailState= acsm->acsmFailState;
6147 + ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
6148 + unsigned char Tchar;
6154 + for( state = 0; T < Tend; T++ )
6158 + Tchar = xlatcase[ *T ];
6160 + while( (nstate=SparseGetNextStateNFA(NextState[state],state,Tchar))==ACSM_FAIL_STATE2 )
6161 + state = FailState[state];
6165 + for( mlist = MatchList[state];
6167 + mlist = mlist->next )
6169 + index = T - mlist->n - Tx;
6170 + if( mlist->nocase )
6173 + if (Match (mlist->id, index, data))
6178 + if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6181 + if (Match (mlist->id, index, data))
6195 +acsmSearch2(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
6196 + int (*Match) (void * id, int index, void *data),
6200 + switch( acsm->acsmFSA )
6204 + if( acsm->acsmFormat == ACF_FULL )
6206 + return acsmSearchSparseDFA_Full( acsm, Tx, n, Match,data );
6208 + else if( acsm->acsmFormat == ACF_BANDED )
6210 + return acsmSearchSparseDFA_Banded( acsm, Tx, n, Match,data );
6214 + return acsmSearchSparseDFA( acsm, Tx, n, Match,data );
6219 + return acsmSearchSparseNFA( acsm, Tx, n, Match,data );
6233 +acsmFree2 (ACSM_STRUCT2 * acsm)
6236 + ACSM_PATTERN2 * mlist, *ilist;
6237 + for (i = 0; i < acsm->acsmMaxStates; i++)
6239 + mlist = acsm->acsmMatchList[i];
6244 + mlist = mlist->next;
6247 + AC_FREE(acsm->acsmNextState[i]);
6249 + AC_FREE(acsm->acsmFailState);
6250 + AC_FREE(acsm->acsmMatchList);
6253 +/* ********************************** */
6255 +static void ring_sock_destruct(struct sock *sk) {
6257 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
6258 + skb_queue_purge(&sk->sk_receive_queue);
6260 + if (!sock_flag(sk, SOCK_DEAD)) {
6261 +#if defined(RING_DEBUG)
6262 + printk("Attempt to release alive ring socket: %p\n", sk);
6267 + BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
6268 + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
6271 + BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
6272 + BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
6275 +#if defined(RING_DEBUG)
6276 + printk("Attempt to release alive ring socket: %p\n", sk);
6282 + kfree(ring_sk(sk));
6284 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
6285 + MOD_DEC_USE_COUNT;
6289 +/* ********************************** */
6291 +static void ring_proc_add(struct ring_opt *pfr) {
6292 + if(ring_proc_dir != NULL) {
6295 + pfr->ring_pid = current->pid;
6297 + snprintf(name, sizeof(name), "%d", pfr->ring_pid);
6298 + create_proc_read_entry(name, 0, ring_proc_dir,
6299 + ring_proc_get_info, pfr);
6300 + /* printk("PF_RING: added /proc/net/pf_ring/%s\n", name); */
6304 +/* ********************************** */
6306 +static void ring_proc_remove(struct ring_opt *pfr) {
6307 + if(ring_proc_dir != NULL) {
6310 + snprintf(name, sizeof(name), "%d", pfr->ring_pid);
6311 + remove_proc_entry(name, ring_proc_dir);
6312 + /* printk("PF_RING: removed /proc/net/pf_ring/%s\n", name); */
6316 +/* ********************************** */
6318 +static int ring_proc_get_info(char *buf, char **start, off_t offset,
6319 + int len, int *unused, void *data)
6322 + struct ring_opt *pfr;
6323 + FlowSlotInfo *fsi;
6325 + if(data == NULL) {
6326 + /* /proc/net/pf_ring/info */
6327 + rlen = sprintf(buf,"Version : %s\n", RING_VERSION);
6328 + rlen += sprintf(buf + rlen,"Bucket length : %d bytes\n", bucket_len);
6329 + rlen += sprintf(buf + rlen,"Ring slots : %d\n", num_slots);
6330 + rlen += sprintf(buf + rlen,"Sample rate : %d [1=no sampling]\n", sample_rate);
6332 + rlen += sprintf(buf + rlen,"Capture TX : %s\n",
6333 + enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
6334 + rlen += sprintf(buf + rlen,"Transparent mode : %s\n",
6335 + transparent_mode ? "Yes" : "No");
6336 + rlen += sprintf(buf + rlen,"Total rings : %d\n", ring_table_size);
6338 + /* detailed statistics about a PF_RING */
6339 + pfr = (struct ring_opt*)data;
6342 + fsi = pfr->slots_info;
6345 + rlen = sprintf(buf, "Bound Device : %s\n",
6346 + pfr->ring_netdev->name == NULL ? "<NULL>" : pfr->ring_netdev->name);
6347 + rlen += sprintf(buf + rlen,"Version : %d\n", fsi->version);
6348 + rlen += sprintf(buf + rlen,"Sampling Rate : %d\n", pfr->sample_rate);
6349 + rlen += sprintf(buf + rlen,"BPF Filtering : %s\n", pfr->bpfFilter ? "Enabled" : "Disabled");
6350 + rlen += sprintf(buf + rlen,"Bloom Filters : %s\n", pfr->bitmask_enabled ? "Enabled" : "Disabled");
6351 + rlen += sprintf(buf + rlen,"Pattern Search: %s\n", pfr->acsm ? "Enabled" : "Disabled");
6352 + rlen += sprintf(buf + rlen,"Cluster Id : %d\n", pfr->cluster_id);
6353 + rlen += sprintf(buf + rlen,"Tot Slots : %d\n", fsi->tot_slots);
6354 + rlen += sprintf(buf + rlen,"Slot Len : %d\n", fsi->slot_len);
6355 + rlen += sprintf(buf + rlen,"Data Len : %d\n", fsi->data_len);
6356 + rlen += sprintf(buf + rlen,"Tot Memory : %d\n", fsi->tot_mem);
6357 + rlen += sprintf(buf + rlen,"Tot Packets : %lu\n", (unsigned long)fsi->tot_pkts);
6358 + rlen += sprintf(buf + rlen,"Tot Pkt Lost : %lu\n", (unsigned long)fsi->tot_lost);
6359 + rlen += sprintf(buf + rlen,"Tot Insert : %lu\n", (unsigned long)fsi->tot_insert);
6360 + rlen += sprintf(buf + rlen,"Tot Read : %lu\n", (unsigned long)fsi->tot_read);
6363 + rlen = sprintf(buf, "WARNING fsi == NULL\n");
6365 + rlen = sprintf(buf, "WARNING data == NULL\n");
6371 +/* ********************************** */
6373 +static void ring_proc_init(void) {
6374 + ring_proc_dir = proc_mkdir("pf_ring", proc_net);
6376 + if(ring_proc_dir) {
6377 + ring_proc_dir->owner = THIS_MODULE;
6378 + ring_proc = create_proc_read_entry("info", 0, ring_proc_dir,
6379 + ring_proc_get_info, NULL);
6381 + printk("PF_RING: unable to register proc file\n");
6383 + ring_proc->owner = THIS_MODULE;
6384 + printk("PF_RING: registered /proc/net/pf_ring/\n");
6387 + printk("PF_RING: unable to create /proc/net/pf_ring\n");
6390 +/* ********************************** */
6392 +static void ring_proc_term(void) {
6393 + if(ring_proc != NULL) {
6394 + remove_proc_entry("info", ring_proc_dir);
6395 + if(ring_proc_dir != NULL) remove_proc_entry("pf_ring", proc_net);
6397 + printk("PF_RING: deregistered /proc/net/pf_ring\n");
6401 +/* ********************************** */
6406 + * store the sk in a new element and add it
6407 + * to the head of the list.
6409 +static inline void ring_insert(struct sock *sk) {
6410 + struct ring_element *next;
6412 +#if defined(RING_DEBUG)
6413 + printk("RING: ring_insert()\n");
6416 + next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC);
6417 + if(next != NULL) {
6419 + write_lock_irq(&ring_mgmt_lock);
6420 + list_add(&next->list, &ring_table);
6421 + write_unlock_irq(&ring_mgmt_lock);
6423 + if(net_ratelimit())
6424 + printk("RING: could not kmalloc slot!!\n");
6427 + ring_table_size++;
6428 + ring_proc_add(ring_sk(sk));
6431 +/* ********************************** */
6436 + * For each of the elements in the list:
6437 + * - check if this is the element we want to delete
6438 + * - if it is, remove it from the list, and free it.
6440 + * stop when we find the one we're looking for (break),
6441 + * or when we reach the end of the list.
6443 +static inline void ring_remove(struct sock *sk) {
6444 + struct list_head *ptr;
6445 + struct ring_element *entry;
6447 + for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
6448 + entry = list_entry(ptr, struct ring_element, list);
6450 + if(entry->sk == sk) {
6453 + ring_table_size--;
6459 +/* ********************************** */
6461 +static u_int32_t num_queued_pkts(struct ring_opt *pfr) {
6463 + if(pfr->ring_slots != NULL) {
6465 + u_int32_t tot_insert = pfr->slots_info->insert_idx,
6466 +#if defined(RING_DEBUG)
6467 + tot_read = pfr->slots_info->tot_read, tot_pkts;
6469 + tot_read = pfr->slots_info->tot_read;
6472 + if(tot_insert >= tot_read) {
6473 +#if defined(RING_DEBUG)
6474 + tot_pkts = tot_insert-tot_read;
6476 + return(tot_insert-tot_read);
6478 +#if defined(RING_DEBUG)
6479 + tot_pkts = ((u_int32_t)-1)+tot_insert-tot_read;
6481 + return(((u_int32_t)-1)+tot_insert-tot_read);
6484 +#if defined(RING_DEBUG)
6485 + printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n",
6486 + tot_pkts, tot_insert, tot_read);
6493 +/* ********************************** */
6495 +static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) {
6496 +#if defined(RING_DEBUG)
6497 + printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx);
6500 + if(pfr->ring_slots != NULL) {
6501 + FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx
6502 + *pfr->slots_info->slot_len]);
6508 +/* ********************************** */
6510 +static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) {
6511 +#if defined(RING_DEBUG)
6512 + printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx);
6515 + if(pfr->ring_slots != NULL)
6516 + return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx*
6517 + pfr->slots_info->slot_len]));
6522 +/* ******************************************************* */
6524 +static int parse_pkt(struct sk_buff *skb, u_int16_t skb_displ,
6525 + u_int8_t *l3_proto, u_int16_t *eth_type,
6526 + u_int16_t *l3_offset, u_int16_t *l4_offset,
6527 + u_int16_t *vlan_id, u_int32_t *ipv4_src,
6528 + u_int32_t *ipv4_dst,
6529 + u_int16_t *l4_src_port, u_int16_t *l4_dst_port,
6530 + u_int16_t *payload_offset) {
6532 + struct ethhdr *eh = (struct ethhdr*)(skb->data-skb_displ);
6535 + *l3_offset = *l4_offset = *l3_proto = *payload_offset = 0;
6536 + *eth_type = ntohs(eh->h_proto);
6538 + if(*eth_type == 0x8100 /* 802.1q (VLAN) */) {
6539 + (*vlan_id) = (skb->data[14] & 15)*256 + skb->data[15];
6540 + *eth_type = (skb->data[16])*256 + skb->data[17];
6544 + (*vlan_id) = (u_int16_t)-1;
6547 + if(*eth_type == 0x0800 /* IP */) {
6548 + *l3_offset = displ+sizeof(struct ethhdr);
6549 + ip = (struct iphdr*)(skb->data-skb_displ+(*l3_offset));
6551 + *ipv4_src = ntohl(ip->saddr), *ipv4_dst = ntohl(ip->daddr), *l3_proto = ip->protocol;
6553 + if((ip->protocol == IPPROTO_TCP) || (ip->protocol == IPPROTO_UDP)) {
6554 + *l4_offset = (*l3_offset)+(ip->ihl*4);
6556 + if(ip->protocol == IPPROTO_TCP) {
6557 + struct tcphdr *tcp = (struct tcphdr*)(skb->data-skb_displ+(*l4_offset));
6558 + *l4_src_port = ntohs(tcp->source), *l4_dst_port = ntohs(tcp->dest);
6559 + *payload_offset = (*l4_offset)+(tcp->doff * 4);
6560 + } else if(ip->protocol == IPPROTO_UDP) {
6561 + struct udphdr *udp = (struct udphdr*)(skb->data-skb_displ+(*l4_offset));
6562 + *l4_src_port = ntohs(udp->source), *l4_dst_port = ntohs(udp->dest);
6563 + *payload_offset = (*l4_offset)+sizeof(struct udphdr);
6565 + *payload_offset = (*l4_offset);
6567 + *l4_src_port = *l4_dst_port = 0;
6569 + return(1); /* IP */
6570 + } /* TODO: handle IPv6 */
6572 + return(0); /* No IP */
6575 +/* **************************************************************** */
6577 +static void reset_bitmask(bitmask_selector *selector)
6579 + memset((char*)selector->bits_memory, 0, selector->num_bits/8);
6581 + while(selector->clashes != NULL) {
6582 + bitmask_counter_list *next = selector->clashes->next;
6583 + kfree(selector->clashes);
6584 + selector->clashes = next;
6588 +/* **************************************************************** */
6590 +static void alloc_bitmask(u_int32_t tot_bits, bitmask_selector *selector)
6592 + u_int tot_mem = tot_bits/8;
6594 + if(tot_mem <= PAGE_SIZE)
6595 + selector->order = 1;
6597 + for(selector->order = 0; (PAGE_SIZE << selector->order) < tot_mem; selector->order++)
6601 + printk("BITMASK: [order=%d][tot_mem=%d]\n", selector->order, tot_mem);
6603 + while((selector->bits_memory = __get_free_pages(GFP_ATOMIC, selector->order)) == 0)
6604 + if(selector->order-- == 0)
6607 + if(selector->order == 0) {
6608 + printk("BITMASK: ERROR not enough memory for bitmask\n");
6609 + selector->num_bits = 0;
6613 + tot_mem = PAGE_SIZE << selector->order;
6614 + printk("BITMASK: succesfully allocated [tot_mem=%d][order=%d]\n",
6615 + tot_mem, selector->order);
6617 + selector->num_bits = tot_mem*8;
6618 + selector->clashes = NULL;
6619 + reset_bitmask(selector);
6622 +/* ********************************** */
6624 +static void free_bitmask(bitmask_selector *selector)
6626 + if(selector->bits_memory > 0)
6627 + free_pages(selector->bits_memory, selector->order);
6630 +/* ********************************** */
6632 +static void set_bit_bitmask(bitmask_selector *selector, u_int32_t the_bit) {
6633 + u_int32_t idx = the_bit % selector->num_bits;
6635 + if(BITMASK_ISSET(idx, selector)) {
6636 + bitmask_counter_list *head = selector->clashes;
6638 + printk("BITMASK: bit %u was already set\n", the_bit);
6640 + while(head != NULL) {
6641 + if(head->bit_id == the_bit) {
6642 + head->bit_counter++;
6643 + printk("BITMASK: bit %u is now set to %d\n", the_bit, head->bit_counter);
6647 + head = head->next;
6650 + head = kmalloc(sizeof(bitmask_counter_list), GFP_KERNEL);
6652 + head->bit_id = the_bit;
6653 + head->bit_counter = 1 /* previous value */ + 1 /* the requested set */;
6654 + head->next = selector->clashes;
6655 + selector->clashes = head;
6657 + printk("BITMASK: not enough memory\n");
6661 + BITMASK_SET(idx, selector);
6662 + printk("BITMASK: bit %u is now set\n", the_bit);
6666 +/* ********************************** */
6668 +static u_char is_set_bit_bitmask(bitmask_selector *selector, u_int32_t the_bit) {
6669 + u_int32_t idx = the_bit % selector->num_bits;
6670 + return(BITMASK_ISSET(idx, selector));
6673 +/* ********************************** */
6675 +static void clear_bit_bitmask(bitmask_selector *selector, u_int32_t the_bit) {
6676 + u_int32_t idx = the_bit % selector->num_bits;
6678 + if(!BITMASK_ISSET(idx, selector))
6679 + printk("BITMASK: bit %u was not set\n", the_bit);
6681 + bitmask_counter_list *head = selector->clashes, *prev = NULL;
6683 + while(head != NULL) {
6684 + if(head->bit_id == the_bit) {
6685 + head->bit_counter--;
6687 + printk("BITMASK: bit %u is now set to %d\n",
6688 + the_bit, head->bit_counter);
6690 + if(head->bit_counter == 1) {
6691 + /* We can now delete this entry as '1' can be
6692 + accommodated into the bitmask */
6695 + selector->clashes = head->next;
6697 + prev->next = head->next;
6704 + prev = head; head = head->next;
6707 + BITMASK_CLR(idx, selector);
6708 + printk("BITMASK: bit %u is now reset\n", the_bit);
6712 +/* ********************************** */
6714 +/* Hash function */
6715 +static u_int32_t sdb_hash(u_int32_t value) {
6716 + u_int32_t hash = 0, i;
6717 + u_int8_t str[sizeof(value)];
6719 + memcpy(str, &value, sizeof(value));
6721 + for(i = 0; i < sizeof(value); i++) {
6722 + hash = str[i] + (hash << 6) + (hash << 16) - hash;
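+    /* equivalent to hash = hash * 65599 + str[i] -- the classic sdbm string hash */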
6728 +/* ********************************** */
6730 +static void handle_bloom_filter_rule(struct ring_opt *pfr, char *buf) {
6736 + count = strlen(buf);
6738 + printk("PF_RING: -> handle_bloom_filter_rule(%s)\n", buf);
6740 + if((buf[count-1] == '\n') || (buf[count-1] == '\r')) buf[count-1] = '\0';
6743 + u_int32_t the_bit;
6745 + if(!strncmp(&buf[1], "vlan=", 5)) {
6746 + sscanf(&buf[6], "%d", &the_bit);
6749 + set_bit_bitmask(&pfr->vlan_bitmask, the_bit), pfr->num_vlan_bitmask_add++;
6751 + clear_bit_bitmask(&pfr->vlan_bitmask, the_bit), pfr->num_vlan_bitmask_remove++;
6752 + } else if(!strncmp(&buf[1], "mac=", 4)) {
6753 + int a, b, c, d, e, f;
6755 + if(sscanf(&buf[5], "%02x:%02x:%02x:%02x:%02x:%02x:",
6756 + &a, &b, &c, &d, &e, &f) == 6) {
6757 + u_int32_t mac_addr = (a & 0xff) + (b & 0xff) + ((c & 0xff) << 24) + ((d & 0xff) << 16) + ((e & 0xff) << 8) + (f & 0xff);
6759 + /* printk("PF_RING: -> [%u][%u][%u][%u][%u][%u] -> [%u]\n", a, b, c, d, e, f, mac_addr); */
6762 + set_bit_bitmask(&pfr->mac_bitmask, mac_addr), pfr->num_mac_bitmask_add++;
6764 + clear_bit_bitmask(&pfr->mac_bitmask, mac_addr), pfr->num_mac_bitmask_remove++;
6766 + printk("PF_RING: -> Invalid MAC address '%s'\n", &buf[5]);
6767 + } else if(!strncmp(&buf[1], "ip=", 3)) {
6770 + if(sscanf(&buf[4], "%d.%d.%d.%d", &a, &b, &c, &d) == 4) {
6771 + u_int32_t ip_addr = ((a & 0xff) << 24) + ((b & 0xff) << 16) + ((c & 0xff) << 8) + (d & 0xff);
6774 + set_bit_bitmask(&pfr->ip_bitmask, ip_addr), set_bit_bitmask(&pfr->ip_bitmask, sdb_hash(ip_addr)), pfr->num_ip_bitmask_add++;
6776 + clear_bit_bitmask(&pfr->ip_bitmask, ip_addr), clear_bit_bitmask(&pfr->twin_ip_bitmask, sdb_hash(ip_addr)), pfr->num_ip_bitmask_remove++;
6778 + printk("PF_RING: -> Invalid IP address '%s'\n", &buf[4]);
6779 + } else if(!strncmp(&buf[1], "port=", 5)) {
6780 + sscanf(&buf[6], "%d", &the_bit);
6783 + set_bit_bitmask(&pfr->port_bitmask, the_bit), set_bit_bitmask(&pfr->port_bitmask, sdb_hash(the_bit)), pfr->num_port_bitmask_add++;
6785 + clear_bit_bitmask(&pfr->port_bitmask, the_bit), clear_bit_bitmask(&pfr->twin_port_bitmask, sdb_hash(the_bit)), pfr->num_port_bitmask_remove++;
6786 + } else if(!strncmp(&buf[1], "proto=", 6)) {
6787 + if(!strncmp(&buf[7], "tcp", 3)) the_bit = 6;
6788 + else if(!strncmp(&buf[7], "udp", 3)) the_bit = 17;
6789 + else if(!strncmp(&buf[7], "icmp", 4)) the_bit = 1;
6790 + else sscanf(&buf[7], "%d", &the_bit);
6793 + set_bit_bitmask(&pfr->proto_bitmask, the_bit);
6795 + clear_bit_bitmask(&pfr->proto_bitmask, the_bit);
6797 + printk("PF_RING: -> Unknown rule type '%s'\n", buf);
6801 +/* ********************************** */
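+
+/*
+ * Rule strings accepted by handle_bloom_filter_rule() above (illustrative
+ * examples based on the parser; the leading character selects whether the
+ * rule is added or removed):
+ *
+ *   "+vlan=100"                 add VLAN id 100
+ *   "+mac=00:11:22:33:44:55"    add a MAC address
+ *   "-ip=192.168.0.1"           remove an IPv4 address
+ *   "+port=80"                  add a TCP/UDP port
+ *   "+proto=tcp"                add a protocol (tcp, udp, icmp or a number)
+ */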
6803 +static void reset_bloom_filters(struct ring_opt *pfr) {
6804 + reset_bitmask(&pfr->mac_bitmask);
6805 + reset_bitmask(&pfr->vlan_bitmask);
6806 + reset_bitmask(&pfr->ip_bitmask); reset_bitmask(&pfr->twin_ip_bitmask);
6807 + reset_bitmask(&pfr->port_bitmask); reset_bitmask(&pfr->twin_port_bitmask);
6808 + reset_bitmask(&pfr->proto_bitmask);
6810 + pfr->num_mac_bitmask_add = pfr->num_mac_bitmask_remove = 0;
6811 + pfr->num_vlan_bitmask_add = pfr->num_vlan_bitmask_remove = 0;
6812 + pfr->num_ip_bitmask_add = pfr->num_ip_bitmask_remove = 0;
6813 + pfr->num_port_bitmask_add = pfr->num_port_bitmask_remove = 0;
6814 + pfr->num_proto_bitmask_add = pfr->num_proto_bitmask_remove = 0;
6816 + printk("PF_RING: rules have been reset\n");
6819 +/* ********************************** */
6821 +static void init_blooms(struct ring_opt *pfr) {
6822 + alloc_bitmask(4096, &pfr->mac_bitmask);
6823 + alloc_bitmask(4096, &pfr->vlan_bitmask);
6824 + alloc_bitmask(32768, &pfr->ip_bitmask); alloc_bitmask(32768, &pfr->twin_ip_bitmask);
6825 + alloc_bitmask(4096, &pfr->port_bitmask); alloc_bitmask(4096, &pfr->twin_port_bitmask);
6826 + alloc_bitmask(4096, &pfr->proto_bitmask);
6828 + pfr->num_mac_bitmask_add = pfr->num_mac_bitmask_remove = 0;
6829 + pfr->num_vlan_bitmask_add = pfr->num_vlan_bitmask_remove = 0;
6830 + pfr->num_ip_bitmask_add = pfr->num_ip_bitmask_remove = 0;
6831 + pfr->num_port_bitmask_add = pfr->num_port_bitmask_remove = 0;
6832 + pfr->num_proto_bitmask_add = pfr->num_proto_bitmask_remove = 0;
6834 + reset_bloom_filters(pfr);
6837 +/* ********************************** */
6839 +inline int MatchFound (void* id, int index, void *data) { return(0); }
6841 +/* ********************************** */
6843 +static void add_skb_to_ring(struct sk_buff *skb,
6844 + struct ring_opt *pfr,
6845 + u_char recv_packet,
6846 + u_char real_skb /* 1=skb 0=faked skb */) {
6847 + FlowSlot *theSlot;
6848 + int idx, displ, fwd_pkt = 0;
6851 + /* Hack for identifying a packet received by the e1000 */
6853 + displ = SKB_DISPLACEMENT;
6855 + displ = 0; /* Received by the e1000 wrapper */
6859 + write_lock(&pfr->ring_index_lock);
6860 + pfr->slots_info->tot_pkts++;
6861 + write_unlock(&pfr->ring_index_lock);
6863 + /* BPF Filtering (from af_packet.c) */
6864 + if(pfr->bpfFilter != NULL) {
6865 + unsigned res = 1, len;
6867 + len = skb->len-skb->data_len;
6869 + write_lock(&pfr->ring_index_lock);
6870 + skb->data -= displ;
6871 + res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len);
6872 + skb->data += displ;
6873 + write_unlock(&pfr->ring_index_lock);
6876 + /* Filter failed */
6878 +#if defined(RING_DEBUG)
6879 + printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]"
6880 + "[insertIdx=%d][pkt_type=%d][cloned=%d]\n",
6881 + (int)skb->len, pfr->slots_info->tot_pkts,
6882 + pfr->slots_info->insert_idx,
6883 + skb->pkt_type, skb->cloned);
6890 + /* ************************** */
6892 + if(pfr->sample_rate > 1) {
6893 + if(pfr->pktToSample == 0) {
6894 + write_lock(&pfr->ring_index_lock);
6895 + pfr->pktToSample = pfr->sample_rate;
6896 + write_unlock(&pfr->ring_index_lock);
6898 + write_lock(&pfr->ring_index_lock);
6899 + pfr->pktToSample--;
6900 + write_unlock(&pfr->ring_index_lock);
6902 +#if defined(RING_DEBUG)
6903 + printk("add_skb_to_ring(skb): sampled packet [len=%d]"
6904 + "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n",
6905 + (int)skb->len, pfr->slots_info->tot_pkts,
6906 + pfr->slots_info->insert_idx,
6907 + skb->pkt_type, skb->cloned);
6913 + /* ************************************* */
6915 + if((pfr->reflector_dev != NULL)
6916 + && (!netif_queue_stopped(pfr->reflector_dev))) {
6917 + int cpu = smp_processor_id();
6919 + /* increase reference counter so that this skb is not freed */
6920 + atomic_inc(&skb->users);
6922 + skb->data -= displ;
6925 + if (pfr->reflector_dev->xmit_lock_owner != cpu) {
6926 + /* Patch below courtesy of Matthew J. Roth <mroth@imminc.com> */
6927 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
6928 + spin_lock_bh(&pfr->reflector_dev->xmit_lock);
6929 + pfr->reflector_dev->xmit_lock_owner = cpu;
6930 + spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
6932 + netif_tx_lock_bh(pfr->reflector_dev);
6934 + if (pfr->reflector_dev->hard_start_xmit(skb, pfr->reflector_dev) == 0) {
6935 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
6936 + spin_lock_bh(&pfr->reflector_dev->xmit_lock);
6937 + pfr->reflector_dev->xmit_lock_owner = -1;
6938 + spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
6940 + netif_tx_unlock_bh(pfr->reflector_dev);
6942 + skb->data += displ;
6943 +#if defined(RING_DEBUG)
6944 + printk("++ hard_start_xmit succeeded\n");
6949 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
6950 + spin_lock_bh(&pfr->reflector_dev->xmit_lock);
6951 + pfr->reflector_dev->xmit_lock_owner = -1;
6952 + spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
6954 + netif_tx_unlock_bh(pfr->reflector_dev);
6958 +#if defined(RING_DEBUG)
6959 + printk("++ hard_start_xmit failed\n");
6961 + skb->data += displ;
6962 + return; /* -ENETDOWN */
6965 + /* ************************************* */
6967 +#if defined(RING_DEBUG)
6968 + printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]"
6969 + "[pkt_type=%d][cloned=%d]\n",
6970 + (int)skb->len, pfr->slots_info->tot_pkts,
6971 + pfr->slots_info->insert_idx,
6972 + skb->pkt_type, skb->cloned);
6975 + idx = pfr->slots_info->insert_idx;
6976 + theSlot = get_insert_slot(pfr);
6978 + if((theSlot != NULL) && (theSlot->slot_state == 0)) {
6979 + struct pcap_pkthdr *hdr;
6981 + int is_ip_pkt, debug = 0;
6983 + /* Update Index */
6986 + bucket = &theSlot->bucket;
6987 + hdr = (struct pcap_pkthdr*)bucket;
6989 + /* BD - API changed for time keeping */
6990 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
6991 + if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp);
6993 + hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec;
6995 + if(skb->tstamp.off_sec == 0) __net_timestamp(skb);
6997 + hdr->ts.tv_sec = skb->tstamp.off_sec, hdr->ts.tv_usec = skb->tstamp.off_usec;
6999 + hdr->caplen = skb->len+displ;
7001 + if(hdr->caplen > pfr->slots_info->data_len)
7002 + hdr->caplen = pfr->slots_info->data_len;
7004 + hdr->len = skb->len+displ;
7007 + is_ip_pkt = parse_pkt(skb, displ,
7015 + &hdr->l4_src_port,
7016 + &hdr->l4_dst_port,
7017 + &hdr->payload_offset);
7019 + if(is_ip_pkt && pfr->bitmask_enabled) {
7020 + int vlan_match = 0;
7026 + printk(KERN_INFO "PF_RING: [proto=%d][vlan=%d][sport=%d][dport=%d][src=%u][dst=%u]\n",
7027 + hdr->l3_proto, hdr->vlan_id, hdr->l4_src_port, hdr->l4_dst_port, hdr->ipv4_src, hdr->ipv4_dst);
7029 + printk(KERN_INFO "PF_RING: [proto=%d][vlan=%d]\n", hdr->l3_proto, hdr->vlan_id);
7032 + if(hdr->vlan_id != (u_int16_t)-1) {
7033 + vlan_match = is_set_bit_bitmask(&pfr->vlan_bitmask, hdr->vlan_id);
7038 + struct ethhdr *eh = (struct ethhdr*)(skb->data);
7039 + u_int32_t src_mac = (eh->h_source[0] & 0xff) + (eh->h_source[1] & 0xff) + ((eh->h_source[2] & 0xff) << 24)
7040 + + ((eh->h_source[3] & 0xff) << 16) + ((eh->h_source[4] & 0xff) << 8) + (eh->h_source[5] & 0xff);
7042 + if(debug) printk(KERN_INFO "PF_RING: [src_mac=%u]\n", src_mac);
7044 + fwd_pkt |= is_set_bit_bitmask(&pfr->mac_bitmask, src_mac);
7047 + u_int32_t dst_mac = (eh->h_dest[0] & 0xff) + (eh->h_dest[1] & 0xff) + ((eh->h_dest[2] & 0xff) << 24)
7048 + + ((eh->h_dest[3] & 0xff) << 16) + ((eh->h_dest[4] & 0xff) << 8) + (eh->h_dest[5] & 0xff);
7050 + if(debug) printk(KERN_INFO "PF_RING: [dst_mac=%u]\n", dst_mac);
7052 + fwd_pkt |= is_set_bit_bitmask(&pfr->mac_bitmask, dst_mac);
7054 + if(is_ip_pkt && (!fwd_pkt)) {
7055 + fwd_pkt |= is_set_bit_bitmask(&pfr->ip_bitmask, hdr->ipv4_src);
7058 + fwd_pkt |= is_set_bit_bitmask(&pfr->ip_bitmask, hdr->ipv4_dst);
7060 + if((!fwd_pkt) && ((hdr->l3_proto == IPPROTO_TCP)
7061 + || (hdr->l3_proto == IPPROTO_UDP))) {
7062 + fwd_pkt |= is_set_bit_bitmask(&pfr->port_bitmask, hdr->l4_src_port);
7063 + if(!fwd_pkt) fwd_pkt |= is_set_bit_bitmask(&pfr->port_bitmask, hdr->l4_dst_port);
7066 + if(!fwd_pkt) fwd_pkt |= is_set_bit_bitmask(&pfr->proto_bitmask, hdr->l3_proto);
7074 + if(fwd_pkt && (pfr->acsm != NULL)) {
7075 + if((hdr->payload_offset > 0) && ((skb->len+skb->mac_len) > hdr->payload_offset)) {
7076 + char *payload = (skb->data-displ+hdr->payload_offset);
7077 + int payload_len = skb->len /* + skb->mac_len */ - hdr->payload_offset;
7079 + if((payload_len > 0)
7080 + && ((hdr->l4_src_port == 80) || (hdr->l4_dst_port == 80))) {
7086 + memcpy(buf, payload, payload_len);
7087 + buf[payload_len] = '\0';
7088 + printk("[%s]\n", payload);
7091 + /* printk("Tring to match pattern [len=%d][%s]\n", payload_len, payload); */
7092 + rc = acsmSearch2(pfr->acsm, payload, payload_len, MatchFound, (void *)0) ? 1 : 0;
7094 + // printk("Match result: %d\n", fwd_pkt);
7096 + printk("Pattern matched!\n");
7107 + memcpy(&bucket[sizeof(struct pcap_pkthdr)], skb->data-displ, hdr->caplen);
7109 +#if defined(RING_DEBUG)
7111 + static unsigned int lastLoss = 0;
7113 + if(pfr->slots_info->tot_lost
7114 + && (lastLoss != pfr->slots_info->tot_lost)) {
7115 + printk("add_skb_to_ring(%d): [data_len=%d]"
7116 + "[hdr.caplen=%d][skb->len=%d]"
7117 + "[pcap_pkthdr=%d][removeIdx=%d]"
7118 + "[loss=%lu][page=%u][slot=%u]\n",
7119 + idx-1, pfr->slots_info->data_len, hdr->caplen, skb->len,
7120 + sizeof(struct pcap_pkthdr),
7121 + pfr->slots_info->remove_idx,
7122 + (long unsigned int)pfr->slots_info->tot_lost,
7123 + pfr->insert_page_id, pfr->insert_slot_id);
7125 + lastLoss = pfr->slots_info->tot_lost;
7130 + write_lock(&pfr->ring_index_lock);
7131 + if(idx == pfr->slots_info->tot_slots)
7132 + pfr->slots_info->insert_idx = 0;
7134 + pfr->slots_info->insert_idx = idx;
7136 + pfr->slots_info->tot_insert++;
7137 + theSlot->slot_state = 1;
7138 + write_unlock(&pfr->ring_index_lock);
7141 + write_lock(&pfr->ring_index_lock);
7142 + pfr->slots_info->tot_lost++;
7143 + write_unlock(&pfr->ring_index_lock);
7145 +#if defined(RING_DEBUG)
7146 + printk("add_skb_to_ring(skb): packet lost [loss=%lu]"
7147 + "[removeIdx=%u][insertIdx=%u]\n",
7148 + (long unsigned int)pfr->slots_info->tot_lost,
7149 + pfr->slots_info->remove_idx, pfr->slots_info->insert_idx);
7155 + /* wakeup in case of poll() */
7156 + if(waitqueue_active(&pfr->ring_slots_waitqueue))
7157 + wake_up_interruptible(&pfr->ring_slots_waitqueue);
7161 +/* ********************************** */
7163 +static u_int hash_skb(struct ring_cluster *cluster_ptr,
7164 + struct sk_buff *skb, u_char recv_packet) {
7169 + if(cluster_ptr->hashing_mode == cluster_round_robin) {
7170 + idx = cluster_ptr->hashing_id++;
7172 + /* Per-flow clustering */
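+    /* Note: addresses and ports are summed below, so both directions of a
+       flow hash to the same cluster element. */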
7173 + if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) {
7177 + displ = SKB_DISPLACEMENT;
7182 + Always points to the IP part of the packet
7185 + ip = (struct iphdr*)(skb->data+displ);
7187 + idx = ip->saddr+ip->daddr+ip->protocol;
7189 + if(ip->protocol == IPPROTO_TCP) {
7190 + struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ
7191 + +sizeof(struct iphdr));
7192 + idx += tcp->source+tcp->dest;
7193 + } else if(ip->protocol == IPPROTO_UDP) {
7194 + struct udphdr *udp = (struct udphdr*)(skb->data+displ
7195 + +sizeof(struct iphdr));
7196 + idx += udp->source+udp->dest;
7202 + return(idx % cluster_ptr->num_cluster_elements);
7205 +/* ********************************** */
7207 +static int skb_ring_handler(struct sk_buff *skb,
7208 + u_char recv_packet,
7209 + u_char real_skb /* 1=skb 0=faked skb */) {
7210 + struct sock *skElement;
7212 + struct list_head *ptr;
7213 + struct ring_cluster *cluster_ptr;
7216 + uint64_t rdt = _rdtsc(), rdt1, rdt2;
7219 + if((!skb) /* Invalid skb */
7220 + || ((!enable_tx_capture) && (!recv_packet))) {
7222 + An outgoing packet is about to be sent out
7223 + but we decided not to handle transmitted
7229 +#if defined(RING_DEBUG)
7231 + printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len,
7232 + skb->dev->name == NULL ? "<NULL>" : skb->dev->name);
7240 + /* [1] Check unclustered sockets */
7241 + for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
7242 + struct ring_opt *pfr;
7243 + struct ring_element *entry;
7245 + entry = list_entry(ptr, struct ring_element, list);
7247 + read_lock(&ring_mgmt_lock);
7248 + skElement = entry->sk;
7249 + pfr = ring_sk(skElement);
7250 + read_unlock(&ring_mgmt_lock);
7253 + && (pfr->cluster_id == 0 /* No cluster */)
7254 + && (pfr->ring_slots != NULL)
7255 + && ((pfr->ring_netdev == skb->dev) || ((skb->dev->flags & IFF_SLAVE) && pfr->ring_netdev == skb->dev->master))) {
7256 + /* We've found the ring where the packet can be stored */
7257 + read_lock(&ring_mgmt_lock);
7258 + add_skb_to_ring(skb, pfr, recv_packet, real_skb);
7259 + read_unlock(&ring_mgmt_lock);
7261 + rc = 1; /* Ring found: we've done our job */
7265 + /* [2] Check socket clusters */
7266 + cluster_ptr = ring_cluster_list;
7268 + while(cluster_ptr != NULL) {
7269 + struct ring_opt *pfr;
7271 + if(cluster_ptr->num_cluster_elements > 0) {
7272 + u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet);
7274 + read_lock(&ring_mgmt_lock);
7275 + skElement = cluster_ptr->sk[skb_hash];
7276 + read_unlock(&ring_mgmt_lock);
7278 + if(skElement != NULL) {
7279 + pfr = ring_sk(skElement);
7282 + && (pfr->ring_slots != NULL)
7283 + && ((pfr->ring_netdev == skb->dev) || ((skb->dev->flags & IFF_SLAVE) && pfr->ring_netdev == skb->dev->master))) {
7284 + /* We've found the ring where the packet can be stored */
7285 + read_lock(&ring_mgmt_lock);
7286 + add_skb_to_ring(skb, pfr, recv_packet, real_skb);
7287 + read_unlock(&ring_mgmt_lock);
7289 + rc = 1; /* Ring found: we've done our job */
7294 + cluster_ptr = cluster_ptr->next;
7298 + rdt1 = _rdtsc()-rdt1;
7305 + if(transparent_mode) rc = 0;
7307 + if((rc != 0) && real_skb)
7308 + dev_kfree_skb(skb); /* Free the skb */
7311 + rdt2 = _rdtsc()-rdt2;
7312 + rdt = _rdtsc()-rdt;
7314 +#if defined(RING_DEBUG)
7315 + printk("# cycles: %d [lock costed %d %d%%][free costed %d %d%%]\n",
7316 + (int)rdt, rdt-rdt1,
7317 + (int)((float)((rdt-rdt1)*100)/(float)rdt),
7319 + (int)((float)(rdt2*100)/(float)rdt));
7323 + return(rc); /* 0 = packet not handled */
7326 +/* ********************************** */
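+
+/* Scratch skb used by buffer_ring_handler() below to wrap raw buffers
+   (it is handed to skb_ring_handler() with real_skb=0). */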
7328 +struct sk_buff skb;
7330 +static int buffer_ring_handler(struct net_device *dev,
7331 + char *data, int len) {
7333 +#if defined(RING_DEBUG)
7334 + printk("buffer_ring_handler: [dev=%s][len=%d]\n",
7335 + dev->name == NULL ? "<NULL>" : dev->name, len);
7338 + /* BD - API changed for time keeping */
7339 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
7340 + skb.dev = dev, skb.len = len, skb.data = data,
7341 + skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */
7343 + skb.dev = dev, skb.len = len, skb.data = data,
7344 + skb.data_len = len, skb.tstamp.off_sec = 0; /* Calculate the time */
7347 + skb_ring_handler(&skb, 1, 0 /* fake skb */);
7352 +/* ********************************** */
7354 +static int ring_create(struct socket *sock, int protocol) {
7356 + struct ring_opt *pfr;
7359 +#if defined(RING_DEBUG)
7360 + printk("RING: ring_create()\n");
7363 +  /* Privileged operation: the caller must have CAP_NET_ADMIN */
7364 + if(!capable(CAP_NET_ADMIN))
7367 + if(sock->type != SOCK_RAW)
7368 + return -ESOCKTNOSUPPORT;
7370 + if(protocol != htons(ETH_P_ALL))
7371 + return -EPROTONOSUPPORT;
7373 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
7374 + MOD_INC_USE_COUNT;
7379 +  // BD: broke this out to keep it simpler and clearer as to what the
7381 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7382 +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
7383 + sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);
7385 + // BD: API changed in 2.6.12, ref:
7386 + // http://svn.clkao.org/svnweb/linux/revision/?rev=28201
7387 + sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);
7391 + sk = sk_alloc(PF_RING, GFP_KERNEL, 1);
7397 + sock->ops = &ring_ops;
7398 + sock_init_data(sock, sk);
7399 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7400 +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
7401 + sk_set_owner(sk, THIS_MODULE);
7406 + ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL));
7408 + if (!(pfr = ring_sk(sk))) {
7412 + memset(pfr, 0, sizeof(*pfr));
7413 + init_waitqueue_head(&pfr->ring_slots_waitqueue);
7414 + pfr->ring_index_lock = RW_LOCK_UNLOCKED;
7415 + atomic_set(&pfr->num_ring_slots_waiters, 0);
7419 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7420 + sk->sk_family = PF_RING;
7421 + sk->sk_destruct = ring_sock_destruct;
7423 + sk->family = PF_RING;
7424 + sk->destruct = ring_sock_destruct;
7425 + sk->num = protocol;
7430 +#if defined(RING_DEBUG)
7431 + printk("RING: ring_create() - created\n");
7436 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
7437 + MOD_DEC_USE_COUNT;
7442 +/* *********************************************** */
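+/*
+  Teardown order matters here: the /proc entry is removed before
+  taking ring_mgmt_lock (see the comment below), and the ring pages,
+  reserved one by one in packet_ring_bind(), must be unreserved with
+  ClearPageReserved() before free_pages() hands them back to the
+  allocator.
+*/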
7444 +static int ring_release(struct socket *sock)
7446 + struct sock *sk = sock->sk;
7447 + struct ring_opt *pfr = ring_sk(sk);
7451 +#if defined(RING_DEBUG)
7452 + printk("RING: called ring_release\n");
7455 +#if defined(RING_DEBUG)
7456 + printk("RING: ring_release entered\n");
7460 + The calls below must be placed outside the
7461 + write_lock_irq...write_unlock_irq block.
7464 + ring_proc_remove(ring_sk(sk));
7466 + write_lock_irq(&ring_mgmt_lock);
7470 + /* Free the ring buffer */
7471 + if(pfr->ring_memory) {
7472 + struct page *page, *page_end;
7474 + page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1);
7475 + for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++)
7476 + ClearPageReserved(page);
7478 + free_pages(pfr->ring_memory, pfr->order);
7481 + free_bitmask(&pfr->mac_bitmask);
7482 + free_bitmask(&pfr->vlan_bitmask);
7483 + free_bitmask(&pfr->ip_bitmask); free_bitmask(&pfr->twin_ip_bitmask);
7484 + free_bitmask(&pfr->port_bitmask); free_bitmask(&pfr->twin_port_bitmask);
7485 + free_bitmask(&pfr->proto_bitmask);
7487 + if(pfr->acsm != NULL) acsmFree2(pfr->acsm);
7490 + ring_sk(sk) = NULL;
7492 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7493 + skb_queue_purge(&sk->sk_write_queue);
7497 + write_unlock_irq(&ring_mgmt_lock);
7499 +#if defined(RING_DEBUG)
7500 + printk("RING: ring_release leaving\n");
7506 +/* ********************************** */
7508 + * We create a ring for this socket and bind it to the specified device
7510 +static int packet_ring_bind(struct sock *sk, struct net_device *dev)
7512 + u_int the_slot_len;
7513 + u_int32_t tot_mem;
7514 + struct ring_opt *pfr = ring_sk(sk);
7515 + struct page *page, *page_end;
7517 + if(!dev) return(-1);
7519 +#if defined(RING_DEBUG)
7520 + printk("RING: packet_ring_bind(%s) called\n", dev->name);
+  /* **********************************************
+
+     *************************************
+     *                                   *
+     *           FlowSlotInfo            *
+     *                                   *
+     ************************************* <-+
+     *             FlowSlot              *   |
+     *************************************   |
+     *               ...                 *   +- num_slots
+     *************************************   |
+     *             FlowSlot              *   |
+     ************************************* <-+
+
+     ********************************************** */
7541 + the_slot_len = sizeof(u_char) /* flowSlot.slot_state */
7545 + + sizeof(struct pcap_pkthdr)
7546 + + bucket_len /* flowSlot.bucket */;
7548 + tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len;
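+  /*
+    Illustrative sizing (assumed numbers, not module defaults): with
+    the_slot_len = 1600 bytes and num_slots = 4096, tot_mem is about
+    6.5 MB, so with 4 KB pages the loop below selects order = 11
+    (4 KB << 11 = 8 MB).
+  */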
7551 + Calculate the value of the order parameter used later.
7552 + See http://www.linuxjournal.com/article.php?sid=1133
7554 + for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++) ;
7557 + We now try to allocate the memory as required. If we fail
7558 +     we try to allocate a smaller amount of memory (hence a
7561 + while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0)
7562 + if(pfr->order-- == 0)
7565 + if(pfr->order == 0) {
7566 + printk("RING: ERROR not enough memory for ring\n");
7569 +    printk("RING: successfully allocated %lu KB [tot_mem=%u][order=%ld]\n",
7570 +	   (PAGE_SIZE << pfr->order) >> 10, tot_mem, pfr->order);
7573 + tot_mem = PAGE_SIZE << pfr->order;
7574 + memset((char*)pfr->ring_memory, 0, tot_mem);
7576 + /* Now we need to reserve the pages */
7577 + page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1);
7578 + for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++)
7579 + SetPageReserved(page);
7581 + pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory;
7582 + pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo));
7584 + pfr->slots_info->version = RING_FLOWSLOT_VERSION;
7585 + pfr->slots_info->slot_len = the_slot_len;
7586 + pfr->slots_info->data_len = bucket_len;
7587 + pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len;
7588 + pfr->slots_info->tot_mem = tot_mem;
7589 + pfr->slots_info->sample_rate = sample_rate;
7591 + printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n",
7592 + pfr->slots_info->tot_slots, pfr->slots_info->slot_len,
7593 + pfr->slots_info->tot_mem);
7599 + for(i=0; i<pfr->slots_info->tot_slots; i++) {
7600 + unsigned long idx = i*pfr->slots_info->slot_len;
7601 + FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx];
7602 + slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0;
7607 + pfr->insert_page_id = 1, pfr->insert_slot_id = 0;
7611 + Leave this statement here as last one. In fact when
7612 + the ring_netdev != NULL the socket is ready to be used.
7614 + pfr->ring_netdev = dev;
7619 +/* ************************************* */
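+/*
+  A hedged userspace sketch of binding the ring to an interface (the
+  device name travels in sa_data, exactly as ring_bind() expects;
+  "eth0" is just an example):
+
+    struct sockaddr sa;
+    sa.sa_family = PF_RING;
+    snprintf(sa.sa_data, sizeof(sa.sa_data), "eth0");
+    if(bind(fd, &sa, sizeof(sa)) < 0)
+      perror("bind(PF_RING)");
+*/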
7621 +/* Bind to a device */
7622 +static int ring_bind(struct socket *sock,
7623 + struct sockaddr *sa, int addr_len)
7625 + struct sock *sk=sock->sk;
7626 + struct net_device *dev = NULL;
7628 +#if defined(RING_DEBUG)
7629 + printk("RING: ring_bind() called\n");
7635 + if (addr_len != sizeof(struct sockaddr))
7637 + if (sa->sa_family != PF_RING)
7640 + /* Safety check: add trailing zero if missing */
7641 + sa->sa_data[sizeof(sa->sa_data)-1] = '\0';
7643 +#if defined(RING_DEBUG)
7644 + printk("RING: searching device %s\n", sa->sa_data);
7647 + if((dev = __dev_get_by_name(sa->sa_data)) == NULL) {
7648 +#if defined(RING_DEBUG)
7649 + printk("RING: search failed\n");
7653 + return(packet_ring_bind(sk, dev));
7656 +/* ************************************* */
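+/*
+  Userspace then maps the ring with mmap(2). ring_mmap() below
+  requires a length that is a multiple of PAGE_SIZE and no larger
+  than slots_info->tot_mem; a sketch, assuming tot_mem was obtained
+  elsewhere (e.g. read back from FlowSlotInfo):
+
+    char *ring = mmap(NULL, tot_mem, PROT_READ | PROT_WRITE,
+                      MAP_SHARED, fd, 0);
+    if(ring == MAP_FAILED)
+      perror("mmap(PF_RING)");
+*/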
7658 +static int ring_mmap(struct file *file,
7659 + struct socket *sock,
7660 + struct vm_area_struct *vma)
7662 + struct sock *sk = sock->sk;
7663 + struct ring_opt *pfr = ring_sk(sk);
7664 + unsigned long size, start;
7668 +#if defined(RING_DEBUG)
7669 + printk("RING: ring_mmap() called\n");
7672 + if(pfr->ring_memory == 0) {
7673 +#if defined(RING_DEBUG)
7674 + printk("RING: ring_mmap() failed: mapping area to an unbound socket\n");
7679 + size = (unsigned long)(vma->vm_end-vma->vm_start);
7681 + if(size % PAGE_SIZE) {
7682 +#if defined(RING_DEBUG)
7683 + printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n");
7688 + /* if userspace tries to mmap beyond end of our buffer, fail */
7689 + if(size > pfr->slots_info->tot_mem) {
7690 +#if defined(RING_DEBUG)
7691 +    printk("ring_mmap() failed: area too large [%lu > %u]\n", size, pfr->slots_info->tot_mem);
7696 + pagesToMap = size/PAGE_SIZE;
7698 +#if defined(RING_DEBUG)
7699 + printk("RING: ring_mmap() called. %d pages to map\n", pagesToMap);
7702 +#if defined(RING_DEBUG)
7703 + printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n",
7704 + pfr->slots_info->slot_len, pfr->slots_info->tot_slots,
7705 + pfr->ring_netdev->name);
7708 + /* we do not want to have this area swapped out, lock it */
7709 + vma->vm_flags |= VM_LOCKED;
7710 + start = vma->vm_start;
7712 + /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */
7713 + ptr = (char*)(start+PAGE_SIZE);
7715 + if(remap_page_range(
7716 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7720 + __pa(pfr->ring_memory),
7721 + PAGE_SIZE*pagesToMap, vma->vm_page_prot)) {
7722 +#if defined(RING_DEBUG)
7723 + printk("remap_page_range() failed\n");
7728 +#if defined(RING_DEBUG)
7729 +  printk("ring_mmap(pagesToMap=%d): success.\n", pagesToMap);
7735 +/* ************************************* */
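+/*
+  Note that ring_recvmsg() copies no packet data: payloads are read
+  directly from the mmap()ed ring. Because wait_event_interruptible()
+  is invoked with a constant-true condition, the loop below in effect
+  busy-polls until at least MIN_QUEUED_PKTS packets are queued
+  (bounded by MAX_QUEUE_LOOPS once something arrives), and the call
+  returns the number of queued packets.
+*/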
7737 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7738 +static int ring_recvmsg(struct kiocb *iocb, struct socket *sock,
7739 + struct msghdr *msg, size_t len, int flags)
7741 + static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len,
7742 + int flags, struct scm_cookie *scm)
7746 + struct ring_opt *pfr = ring_sk(sock->sk);
7747 + u_int32_t queued_pkts, num_loops = 0;
7749 +#if defined(RING_DEBUG)
7750 + printk("ring_recvmsg called\n");
7753 + slot = get_remove_slot(pfr);
7755 + while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) {
7756 + wait_event_interruptible(pfr->ring_slots_waitqueue, 1);
7758 +#if defined(RING_DEBUG)
7759 + printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n",
7760 + slot->slot_state, queued_pkts, num_loops);
7763 + if(queued_pkts > 0) {
7764 + if(num_loops++ > MAX_QUEUE_LOOPS)
7769 +#if defined(RING_DEBUG)
7771 + printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n",
7772 + queued_pkts, num_loops);
7775 + return(queued_pkts);
7778 +/* ************************************* */
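+/*
+  Typical way for userspace to wait for traffic; POLLIN|POLLRDNORM is
+  signalled once the next removal slot holds a packet
+  (slot_state == 1):
+
+    struct pollfd pfd = { .fd = fd, .events = POLLIN };
+    poll(&pfd, 1, -1 /* block until a packet is queued */);
+*/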
7780 +unsigned int ring_poll(struct file * file,
7781 + struct socket *sock, poll_table *wait)
7784 + struct ring_opt *pfr = ring_sk(sock->sk);
7786 +#if defined(RING_DEBUG)
7787 + printk("poll called\n");
7790 + slot = get_remove_slot(pfr);
7792 + if((slot != NULL) && (slot->slot_state == 0))
7793 + poll_wait(file, &pfr->ring_slots_waitqueue, wait);
7795 +#if defined(RING_DEBUG)
7796 +  printk("poll returning %d\n", slot ? (int)slot->slot_state : -1); /* slot may be NULL */
7799 + if((slot != NULL) && (slot->slot_state == 1))
7800 + return(POLLIN | POLLRDNORM);
7805 +/* ************************************* */
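+/*
+  Cluster bookkeeping. A cluster groups up to CLUSTER_LEN sockets
+  under one cluster_id; for every packet skb_ring_handler() selects a
+  single member via hash_skb() (per-flow by default), so the sockets
+  in a cluster share the load of one traffic stream.
+*/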
7807 +int add_to_cluster_list(struct ring_cluster *el,
7808 + struct sock *sock) {
7810 + if(el->num_cluster_elements == CLUSTER_LEN)
7811 + return(-1); /* Cluster full */
7813 + ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id;
7814 + el->sk[el->num_cluster_elements] = sock;
7815 + el->num_cluster_elements++;
7819 +/* ************************************* */
7821 +int remove_from_cluster_list(struct ring_cluster *el,
7822 + struct sock *sock) {
7825 + for(i=0; i<CLUSTER_LEN; i++)
7826 + if(el->sk[i] == sock) {
7827 + el->num_cluster_elements--;
7829 + if(el->num_cluster_elements > 0) {
7830 + /* The cluster contains other elements */
7831 + for(j=i; j<CLUSTER_LEN-1; j++)
7832 + el->sk[j] = el->sk[j+1];
7834 + el->sk[CLUSTER_LEN-1] = NULL;
7836 + /* Empty cluster */
7837 + memset(el->sk, 0, sizeof(el->sk));
7843 + return(-1); /* Not found */
7846 +/* ************************************* */
7848 +static int remove_from_cluster(struct sock *sock,
7849 + struct ring_opt *pfr)
7851 + struct ring_cluster *el;
7853 +#if defined(RING_DEBUG)
7854 + printk("--> remove_from_cluster(%d)\n", pfr->cluster_id);
7857 + if(pfr->cluster_id == 0 /* 0 = No Cluster */)
7858 +    return(0); /* Nothing to do */
7860 + el = ring_cluster_list;
7862 + while(el != NULL) {
7863 + if(el->cluster_id == pfr->cluster_id) {
7864 + return(remove_from_cluster_list(el, sock));
7869 + return(-EINVAL); /* Not found */
7872 +/* ************************************* */
7874 +static int add_to_cluster(struct sock *sock,
7875 + struct ring_opt *pfr,
7876 + u_short cluster_id)
7878 + struct ring_cluster *el;
7881 + printk("--> add_to_cluster(%d)\n", cluster_id);
7884 + if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL);
7886 + if(pfr->cluster_id != 0)
7887 + remove_from_cluster(sock, pfr);
7889 + el = ring_cluster_list;
7891 + while(el != NULL) {
7892 + if(el->cluster_id == cluster_id) {
7893 + return(add_to_cluster_list(el, sock));
7898 + /* There's no existing cluster. We need to create one */
7899 + if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL)
7902 + el->cluster_id = cluster_id;
7903 + el->num_cluster_elements = 1;
7904 + el->hashing_mode = cluster_per_flow; /* Default */
7905 + el->hashing_id = 0;
7907 + memset(el->sk, 0, sizeof(el->sk));
7909 + el->next = ring_cluster_list;
7910 + ring_cluster_list = el;
7911 + pfr->cluster_id = cluster_id;
7913 + return(0); /* 0 = OK */
7916 +/* ************************************* */
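+/*
+  The PF_RING-specific options below are driven with plain
+  setsockopt(2). A hedged sketch for joining cluster 5 (SOL_SOCKET is
+  an assumption; options this switch does not recognize simply fall
+  through to sock_setsockopt()):
+
+    u_int cluster_id = 5;
+    setsockopt(fd, SOL_SOCKET, SO_ADD_TO_CLUSTER,
+               &cluster_id, sizeof(cluster_id));
+*/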
7918 +/* Code taken/inspired from core/sock.c */
7919 +static int ring_setsockopt(struct socket *sock,
7920 + int level, int optname,
7921 + char *optval, int optlen)
7923 + struct ring_opt *pfr = ring_sk(sock->sk);
7924 + int val, found, ret = 0;
7925 + u_int cluster_id, do_enable;
7926 +  char devName[IFNAMSIZ], bloom_filter[256], aho_pattern[256]; /* IFNAMSIZ: 8 bytes was too small for long interface names */
7928 + if(pfr == NULL) return(-EINVAL);
7930 + if (get_user(val, (int *)optval))
7937 + case SO_ATTACH_FILTER:
7939 + if (optlen == sizeof(struct sock_fprog)) {
7940 + unsigned int fsize;
7941 + struct sock_fprog fprog;
7942 + struct sk_filter *filter;
7949 + Do not call copy_from_user within a held
7950 +      spinlock (e.g. ring_mgmt_lock) as this caused
7951 + problems when certain debugging was enabled under
7952 + 2.6.5 -- including hard lockups of the machine.
7954 + if(copy_from_user(&fprog, optval, sizeof(fprog)))
7957 + fsize = sizeof(struct sock_filter) * fprog.len;
7958 + filter = kmalloc(fsize, GFP_KERNEL);
7960 + if(filter == NULL) {
7965 + if(copy_from_user(filter->insns, fprog.filter, fsize))
7968 + filter->len = fprog.len;
7970 + if(sk_chk_filter(filter->insns, filter->len) != 0) {
7971 + /* Bad filter specified */
7973 + pfr->bpfFilter = NULL;
7977 + /* get the lock, set the filter, release the lock */
7978 + write_lock(&ring_mgmt_lock);
7979 + pfr->bpfFilter = filter;
7980 + write_unlock(&ring_mgmt_lock);
7985 + case SO_DETACH_FILTER:
7986 + write_lock(&ring_mgmt_lock);
7988 + if(pfr->bpfFilter != NULL) {
7989 + kfree(pfr->bpfFilter);
7990 + pfr->bpfFilter = NULL;
7991 + write_unlock(&ring_mgmt_lock);
7997 + case SO_ADD_TO_CLUSTER:
7998 +      if (optlen != sizeof(val))
8001 + if (copy_from_user(&cluster_id, optval, sizeof(cluster_id)))
8004 + write_lock(&ring_mgmt_lock);
8005 + ret = add_to_cluster(sock->sk, pfr, cluster_id);
8006 + write_unlock(&ring_mgmt_lock);
8009 + case SO_REMOVE_FROM_CLUSTER:
8010 + write_lock(&ring_mgmt_lock);
8011 + ret = remove_from_cluster(sock->sk, pfr);
8012 + write_unlock(&ring_mgmt_lock);
8015 + case SO_SET_REFLECTOR:
8016 + if(optlen >= (sizeof(devName)-1))
8020 + if(copy_from_user(devName, optval, optlen))
8024 + devName[optlen] = '\0';
8026 +#if defined(RING_DEBUG)
8027 + printk("+++ SO_SET_REFLECTOR(%s)\n", devName);
8030 + write_lock(&ring_mgmt_lock);
8031 + pfr->reflector_dev = dev_get_by_name(devName);
8032 + write_unlock(&ring_mgmt_lock);
8034 +#if defined(RING_DEBUG)
8035 + if(pfr->reflector_dev != NULL)
8036 +      printk("SO_SET_REFLECTOR(%s): succeeded\n", devName);
8038 + printk("SO_SET_REFLECTOR(%s): device unknown\n", devName);
8042 + case SO_SET_BLOOM:
8043 + if(optlen >= (sizeof(bloom_filter)-1))
8047 + if(copy_from_user(bloom_filter, optval, optlen))
8051 + bloom_filter[optlen] = '\0';
8053 + write_lock(&ring_mgmt_lock);
8054 + handle_bloom_filter_rule(pfr, bloom_filter);
8055 + write_unlock(&ring_mgmt_lock);
8058 + case SO_SET_STRING:
8059 + if(optlen >= (sizeof(aho_pattern)-1))
8063 + if(copy_from_user(aho_pattern, optval, optlen))
8067 + aho_pattern[optlen] = '\0';
8069 + write_lock(&ring_mgmt_lock);
8070 + if(pfr->acsm != NULL) acsmFree2(pfr->acsm);
8073 + if((pfr->acsm = acsmNew2()) != NULL) {
8074 + int nc=1 /* case sensitive */, i = 0;
8076 + pfr->acsm->acsmFormat = ACF_BANDED;
8077 + acsmAddPattern2(pfr->acsm, (unsigned char*)aho_pattern,
8078 + (int)strlen(aho_pattern), nc, 0, 0,(void*)aho_pattern, i);
8079 + acsmCompile2(pfr->acsm);
8082 + pfr->acsm = kmalloc (10, GFP_KERNEL); /* TEST */
8085 + write_unlock(&ring_mgmt_lock);
8088 + case SO_TOGGLE_BLOOM_STATE:
8089 + if(optlen >= (sizeof(bloom_filter)-1))
8093 +      if(copy_from_user(&do_enable, optval, sizeof(do_enable))) /* copy just sizeof(do_enable); optlen may be larger */
8097 + write_lock(&ring_mgmt_lock);
8099 + pfr->bitmask_enabled = 1;
8101 + pfr->bitmask_enabled = 0;
8102 + write_unlock(&ring_mgmt_lock);
8103 + printk("SO_TOGGLE_BLOOM_STATE: bloom bitmask %s\n",
8104 + pfr->bitmask_enabled ? "enabled" : "disabled");
8107 + case SO_RESET_BLOOM_FILTERS:
8108 + if(optlen >= (sizeof(bloom_filter)-1))
8112 +      if(copy_from_user(&do_enable, optval, sizeof(do_enable))) /* copy just sizeof(do_enable); optlen may be larger */
8116 + write_lock(&ring_mgmt_lock);
8117 + reset_bloom_filters(pfr);
8118 + write_unlock(&ring_mgmt_lock);
8129 + return(sock_setsockopt(sock, level, optname, optval, optlen));
8132 +/* ************************************* */
8134 +static int ring_ioctl(struct socket *sock,
8135 + unsigned int cmd, unsigned long arg)
8140 + case SIOCGIFFLAGS:
8141 + case SIOCSIFFLAGS:
8143 + case SIOCGIFMETRIC:
8144 + case SIOCSIFMETRIC:
8150 + case SIOCGIFHWADDR:
8151 + case SIOCSIFHWADDR:
8154 + case SIOCSIFSLAVE:
8155 + case SIOCGIFSLAVE:
8156 + case SIOCGIFINDEX:
8158 + case SIOCGIFCOUNT:
8159 + case SIOCSIFHWBROADCAST:
8160 + return(inet_dgram_ops.ioctl(sock, cmd, arg));
8164 + return -ENOIOCTLCMD;
8170 +/* ************************************* */
8172 +static struct proto_ops ring_ops = {
8173 + .family = PF_RING,
8174 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
8175 + .owner = THIS_MODULE,
8178 + /* Operations that make no sense on ring sockets. */
8179 + .connect = sock_no_connect,
8180 + .socketpair = sock_no_socketpair,
8181 + .accept = sock_no_accept,
8182 + .getname = sock_no_getname,
8183 + .listen = sock_no_listen,
8184 + .shutdown = sock_no_shutdown,
8185 + .sendpage = sock_no_sendpage,
8186 + .sendmsg = sock_no_sendmsg,
8187 + .getsockopt = sock_no_getsockopt,
8189 + /* Now the operations that really occur. */
8190 + .release = ring_release,
8191 + .bind = ring_bind,
8192 + .mmap = ring_mmap,
8193 + .poll = ring_poll,
8194 + .setsockopt = ring_setsockopt,
8195 + .ioctl = ring_ioctl,
8196 + .recvmsg = ring_recvmsg,
8199 +/* ************************************ */
8201 +static struct net_proto_family ring_family_ops = {
8202 + .family = PF_RING,
8203 + .create = ring_create,
8204 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
8205 + .owner = THIS_MODULE,
8209 +// BD: API changed in 2.6.12, ref:
8210 +// http://svn.clkao.org/svnweb/linux/revision/?rev=28201
8211 +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
8212 +static struct proto ring_proto = {
8213 + .name = "PF_RING",
8214 + .owner = THIS_MODULE,
8215 + .obj_size = sizeof(struct sock),
8219 +/* ************************************ */
8221 +static void __exit ring_exit(void)
8223 + struct list_head *ptr;
8224 + struct ring_element *entry;
8226 + for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
8227 + entry = list_entry(ptr, struct ring_element, list);
8231 + while(ring_cluster_list != NULL) {
8232 + struct ring_cluster *next = ring_cluster_list->next;
8233 + kfree(ring_cluster_list);
8234 + ring_cluster_list = next;
8237 + set_skb_ring_handler(NULL);
8238 + set_buffer_ring_handler(NULL);
8239 + sock_unregister(PF_RING);
8241 + printk("PF_RING shut down.\n");
8244 +/* ************************************ */
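+/*
+  Module bring-up: register the PF_RING socket family, then install
+  the two hooks (skb and raw-buffer) that the patched kernel invokes
+  for captured packets; if the buffer hook cannot be installed,
+  everything is rolled back. Assuming bucket_len, num_slots and
+  sample_rate are exposed as module parameters (only their values are
+  printed below), loading might look like:
+
+    # insmod ring.ko bucket_len=128 num_slots=4096 sample_rate=1
+*/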
8246 +static int __init ring_init(void)
8248 + printk("Welcome to PF_RING %s\n(C) 2004-07 L.Deri <deri@ntop.org>\n",
8251 + INIT_LIST_HEAD(&ring_table);
8252 + ring_cluster_list = NULL;
8254 + sock_register(&ring_family_ops);
8256 + set_skb_ring_handler(skb_ring_handler);
8257 + set_buffer_ring_handler(buffer_ring_handler);
8259 + if(get_buffer_ring_handler() != buffer_ring_handler) {
8260 + printk("PF_RING: set_buffer_ring_handler FAILED\n");
8262 + set_skb_ring_handler(NULL);
8263 + set_buffer_ring_handler(NULL);
8264 + sock_unregister(PF_RING);
8267 + printk("PF_RING: bucket length %d bytes\n", bucket_len);
8268 + printk("PF_RING: ring slots %d\n", num_slots);
8269 + printk("PF_RING: sample rate %d [1=no sampling]\n", sample_rate);
8270 + printk("PF_RING: capture TX %s\n",
8271 + enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
8272 + printk("PF_RING: transparent mode %s\n",
8273 + transparent_mode ? "Yes" : "No");
8275 + printk("PF_RING initialized correctly.\n");
8282 +module_init(ring_init);
8283 +module_exit(ring_exit);
8284 +MODULE_LICENSE("GPL");
8286 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
8287 +MODULE_ALIAS_NETPROTO(PF_RING);