1diff --unified --recursive --new-file linux-2.6.21.4/include/linux/ring.h linux-2.6.21.4-1-686-smp-ring3/include/linux/ring.h
2--- linux-2.6.21.4/include/linux/ring.h 1970-01-01 00:00:00.000000000 +0000
3+++ linux-2.6.21.4-1-686-smp-ring3/include/linux/ring.h 2007-06-10 16:43:04.346421348 +0000
4@@ -0,0 +1,240 @@
5+/*
6+ * Definitions for packet ring
7+ *
8+ * 2004-07 Luca Deri <deri@ntop.org>
9+ */
10+#ifndef __RING_H
11+#define __RING_H
12+
13+#define INCLUDE_MAC_INFO
14+
15+#ifdef INCLUDE_MAC_INFO
16+#define SKB_DISPLACEMENT 14 /* Include MAC address information */
17+#else
18+#define SKB_DISPLACEMENT 0 /* Do NOT include MAC address information */
19+#endif
20+
21+#define RING_MAGIC
22+#define RING_MAGIC_VALUE 0x88
23+#define RING_FLOWSLOT_VERSION 6
24+#define RING_VERSION "3.4.1"
25+
26+#define SO_ADD_TO_CLUSTER 99
27+#define SO_REMOVE_FROM_CLUSTER 100
28+#define SO_SET_REFLECTOR 101
29+#define SO_SET_BLOOM 102
30+#define SO_SET_STRING 103
31+#define SO_TOGGLE_BLOOM_STATE 104
32+#define SO_RESET_BLOOM_FILTERS 105
33+
34+#define BITMASK_SET(n, p) (((char*)p->bits_memory)[n/8] |= (1<<(n % 8)))
35+#define BITMASK_CLR(n, p) (((char*)p->bits_memory)[n/8] &= ~(1<<(n % 8)))
36+#define BITMASK_ISSET(n, p) (((char*)p->bits_memory)[n/8] & (1<<(n % 8)))
37+
38+/* *********************************** */
39+
40+/*
41+ Aho-Corasick code taken from Snort
42+ under GPL license
43+*/
44+/*
45+ * DEFINES and Typedef's
46+ */
47+#define MAX_ALPHABET_SIZE 256
48+
49+/*
50+ FAIL STATE for 1,2,or 4 bytes for state transitions
51+
52+ Uncomment this define to use 32 bit state values
53+ #define AC32
54+*/
55+
56+typedef unsigned short acstate_t;
57+#define ACSM_FAIL_STATE2 0xffff
58+
59+/*
60+ *
61+ */
62+typedef
63+struct _acsm_pattern2
64+{
65+ struct _acsm_pattern2 *next;
66+
67+ unsigned char *patrn;
68+ unsigned char *casepatrn;
69+ int n;
70+ int nocase;
71+ int offset;
72+ int depth;
73+ void * id;
74+ int iid;
75+
76+} ACSM_PATTERN2;
77+
78+/*
79+ * transition nodes - either 8 or 12 bytes
80+ */
81+typedef
82+struct trans_node_s {
83+
84+ acstate_t key; /* The character that got us here - sized to keep structure aligned on 4 bytes */
85+ /* to better the caching opportunities. A value that crosses the cache line */
86+ /* forces an expensive reconstruction, typing this as acstate_t stops that. */
87+ acstate_t next_state; /* */
88+ struct trans_node_s * next; /* next transition for this state */
89+
90+} trans_node_t;
91+
92+
93+/*
94+ * User specified final storage type for the state transitions
95+ */
96+enum {
97+ ACF_FULL,
98+ ACF_SPARSE,
99+ ACF_BANDED,
100+ ACF_SPARSEBANDS,
101+};
102+
103+/*
104+ * User specified machine types
105+ *
106+ * TRIE : Keyword trie
107+ * NFA :
108+ * DFA :
109+ */
110+enum {
111+ FSA_TRIE,
112+ FSA_NFA,
113+ FSA_DFA,
114+};
115+
116+/*
117+ * Aho-Corasick State Machine Struct - one per group of pattterns
118+ */
119+typedef struct {
120+ int acsmMaxStates;
121+ int acsmNumStates;
122+
123+ ACSM_PATTERN2 * acsmPatterns;
124+ acstate_t * acsmFailState;
125+ ACSM_PATTERN2 ** acsmMatchList;
126+
127+ /* list of transitions in each state, this is used to build the nfa & dfa */
128+ /* after construction we convert to sparse or full format matrix and free */
129+ /* the transition lists */
130+ trans_node_t ** acsmTransTable;
131+
132+ acstate_t ** acsmNextState;
133+ int acsmFormat;
134+ int acsmSparseMaxRowNodes;
135+ int acsmSparseMaxZcnt;
136+
137+ int acsmNumTrans;
138+ int acsmAlphabetSize;
139+ int acsmFSA;
140+
141+} ACSM_STRUCT2;
142+
143+/* *********************************** */
144+
145+#ifndef HAVE_PCAP
146+struct pcap_pkthdr {
147+ struct timeval ts; /* time stamp */
148+ u_int32_t caplen; /* length of portion present */
149+ u_int32_t len; /* length this packet (off wire) */
150+ /* packet parsing info */
151+ u_int16_t eth_type; /* Ethernet type */
152+ u_int16_t vlan_id; /* VLAN Id or -1 for no vlan */
153+ u_int8_t l3_proto; /* Layer 3 protocol */
154+ u_int16_t l3_offset, l4_offset, payload_offset; /* Offsets of L3/L4/payload elements */
155+ u_int32_t ipv4_src, ipv4_dst; /* IPv4 src/dst IP addresses */
156+ u_int16_t l4_src_port, l4_dst_port; /* Layer 4 src/dst ports */
157+};
158+#endif
159+
160+/* *********************************** */
161+
162+typedef struct _counter_list {
163+ u_int32_t bit_id;
164+ u_int32_t bit_counter;
165+ struct _counter_list *next;
166+} bitmask_counter_list;
167+
168+typedef struct {
169+ u_int32_t num_bits, order, num_pages;
170+ unsigned long bits_memory;
171+ bitmask_counter_list *clashes;
172+} bitmask_selector;
173+
174+/* *********************************** */
175+
176+enum cluster_type {
177+ cluster_per_flow = 0,
178+ cluster_round_robin
179+};
180+
181+/* *********************************** */
182+
183+#define RING_MIN_SLOT_SIZE (60+sizeof(struct pcap_pkthdr))
184+#define RING_MAX_SLOT_SIZE (1514+sizeof(struct pcap_pkthdr))
185+
186+/* *********************************** */
187+
188+typedef struct flowSlotInfo {
189+ u_int16_t version, sample_rate;
190+ u_int32_t tot_slots, slot_len, data_len, tot_mem;
191+
192+ u_int64_t tot_pkts, tot_lost;
193+ u_int64_t tot_insert, tot_read;
194+ u_int32_t insert_idx, remove_idx;
195+} FlowSlotInfo;
196+
197+/* *********************************** */
198+
199+typedef struct flowSlot {
200+#ifdef RING_MAGIC
201+ u_char magic; /* It must alwasy be zero */
202+#endif
203+ u_char slot_state; /* 0=empty, 1=full */
204+ u_char bucket; /* bucket[bucketLen] */
205+} FlowSlot;
206+
207+/* *********************************** */
208+
209+#ifdef __KERNEL__
210+
211+FlowSlotInfo* getRingPtr(void);
212+int allocateRing(char *deviceName, u_int numSlots,
213+ u_int bucketLen, u_int sampleRate);
214+unsigned int pollRing(struct file *fp, struct poll_table_struct * wait);
215+void deallocateRing(void);
216+
217+/* ************************* */
218+
219+typedef int (*handle_ring_skb)(struct sk_buff *skb,
220+ u_char recv_packet, u_char real_skb);
221+extern handle_ring_skb get_skb_ring_handler(void);
222+extern void set_skb_ring_handler(handle_ring_skb the_handler);
223+extern void do_skb_ring_handler(struct sk_buff *skb,
224+ u_char recv_packet, u_char real_skb);
225+
226+typedef int (*handle_ring_buffer)(struct net_device *dev,
227+ char *data, int len);
228+extern handle_ring_buffer get_buffer_ring_handler(void);
229+extern void set_buffer_ring_handler(handle_ring_buffer the_handler);
230+extern int do_buffer_ring_handler(struct net_device *dev,
231+ char *data, int len);
232+#endif /* __KERNEL__ */
233+
234+/* *********************************** */
235+
236+#define PF_RING 27 /* Packet Ring */
237+#define SOCK_RING PF_RING
238+
239+/* ioctl() */
240+#define SIORINGPOLL 0x8888
241+
242+/* *********************************** */
243+
244+#endif /* __RING_H */
245diff --unified --recursive --new-file linux-2.6.21.4/net/Kconfig linux-2.6.21.4-1-686-smp-ring3/net/Kconfig
246--- linux-2.6.21.4/net/Kconfig 2007-06-07 21:27:31.000000000 +0000
247+++ linux-2.6.21.4-1-686-smp-ring3/net/Kconfig 2007-06-10 16:43:04.402423771 +0000
248@@ -39,6 +39,7 @@
249 source "net/xfrm/Kconfig"
250 source "net/iucv/Kconfig"
251
252+source "net/ring/Kconfig"
253 config INET
254 bool "TCP/IP networking"
255 ---help---
256diff --unified --recursive --new-file linux-2.6.21.4/net/Makefile linux-2.6.21.4-1-686-smp-ring3/net/Makefile
257--- linux-2.6.21.4/net/Makefile 2007-06-07 21:27:31.000000000 +0000
258+++ linux-2.6.21.4-1-686-smp-ring3/net/Makefile 2007-06-10 16:43:04.394423425 +0000
259@@ -42,6 +42,7 @@
260 obj-$(CONFIG_DECNET) += decnet/
261 obj-$(CONFIG_ECONET) += econet/
262 obj-$(CONFIG_VLAN_8021Q) += 8021q/
263+obj-$(CONFIG_RING) += ring/
264 obj-$(CONFIG_IP_DCCP) += dccp/
265 obj-$(CONFIG_IP_SCTP) += sctp/
266 obj-$(CONFIG_IEEE80211) += ieee80211/
267diff --unified --recursive --new-file linux-2.6.21.4/net/Makefile.ORG linux-2.6.21.4-1-686-smp-ring3/net/Makefile.ORG
268--- linux-2.6.21.4/net/Makefile.ORG 1970-01-01 00:00:00.000000000 +0000
269+++ linux-2.6.21.4-1-686-smp-ring3/net/Makefile.ORG 2007-06-10 16:43:04.386423079 +0000
270@@ -0,0 +1,54 @@
271+#
272+# Makefile for the linux networking.
273+#
274+# 2 Sep 2000, Christoph Hellwig <hch@infradead.org>
275+# Rewritten to use lists instead of if-statements.
276+#
277+
278+obj-y := nonet.o
279+
280+obj-$(CONFIG_NET) := socket.o core/
281+
282+tmp-$(CONFIG_COMPAT) := compat.o
283+obj-$(CONFIG_NET) += $(tmp-y)
284+
285+# LLC has to be linked before the files in net/802/
286+obj-$(CONFIG_LLC) += llc/
287+obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
288+obj-$(CONFIG_NETFILTER) += netfilter/
289+obj-$(CONFIG_INET) += ipv4/
290+obj-$(CONFIG_XFRM) += xfrm/
291+obj-$(CONFIG_UNIX) += unix/
292+ifneq ($(CONFIG_IPV6),)
293+obj-y += ipv6/
294+endif
295+obj-$(CONFIG_PACKET) += packet/
296+obj-$(CONFIG_NET_KEY) += key/
297+obj-$(CONFIG_NET_SCHED) += sched/
298+obj-$(CONFIG_BRIDGE) += bridge/
299+obj-$(CONFIG_IPX) += ipx/
300+obj-$(CONFIG_ATALK) += appletalk/
301+obj-$(CONFIG_WAN_ROUTER) += wanrouter/
302+obj-$(CONFIG_X25) += x25/
303+obj-$(CONFIG_LAPB) += lapb/
304+obj-$(CONFIG_NETROM) += netrom/
305+obj-$(CONFIG_ROSE) += rose/
306+obj-$(CONFIG_AX25) += ax25/
307+obj-$(CONFIG_IRDA) += irda/
308+obj-$(CONFIG_BT) += bluetooth/
309+obj-$(CONFIG_SUNRPC) += sunrpc/
310+obj-$(CONFIG_RXRPC) += rxrpc/
311+obj-$(CONFIG_ATM) += atm/
312+obj-$(CONFIG_DECNET) += decnet/
313+obj-$(CONFIG_ECONET) += econet/
314+obj-$(CONFIG_VLAN_8021Q) += 8021q/
315+obj-$(CONFIG_IP_DCCP) += dccp/
316+obj-$(CONFIG_IP_SCTP) += sctp/
317+obj-$(CONFIG_IEEE80211) += ieee80211/
318+obj-$(CONFIG_TIPC) += tipc/
319+obj-$(CONFIG_NETLABEL) += netlabel/
320+obj-$(CONFIG_IUCV) += iucv/
321+
322+ifeq ($(CONFIG_NET),y)
323+obj-$(CONFIG_SYSCTL) += sysctl_net.o
324+endif
325diff --unified --recursive --new-file linux-2.6.21.4/net/core/dev.c linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c
326--- linux-2.6.21.4/net/core/dev.c 2007-06-07 21:27:31.000000000 +0000
327+++ linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c 2007-06-10 16:43:04.382422906 +0000
328@@ -117,6 +117,56 @@
329 #include <linux/err.h>
330 #include <linux/ctype.h>
331
332+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
333+
334+/* #define RING_DEBUG */
335+
336+#include <linux/ring.h>
337+#include <linux/version.h>
338+
339+static handle_ring_skb ring_handler = NULL;
340+
341+handle_ring_skb get_skb_ring_handler() { return(ring_handler); }
342+
343+void set_skb_ring_handler(handle_ring_skb the_handler) {
344+ ring_handler = the_handler;
345+}
346+
347+void do_skb_ring_handler(struct sk_buff *skb,
348+ u_char recv_packet, u_char real_skb) {
349+ if(ring_handler)
350+ ring_handler(skb, recv_packet, real_skb);
351+}
352+
353+/* ******************* */
354+
355+static handle_ring_buffer buffer_ring_handler = NULL;
356+
357+handle_ring_buffer get_buffer_ring_handler() { return(buffer_ring_handler); }
358+
359+void set_buffer_ring_handler(handle_ring_buffer the_handler) {
360+ buffer_ring_handler = the_handler;
361+}
362+
363+int do_buffer_ring_handler(struct net_device *dev, char *data, int len) {
364+ if(buffer_ring_handler) {
365+ buffer_ring_handler(dev, data, len);
366+ return(1);
367+ } else
368+ return(0);
369+}
370+
371+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
372+EXPORT_SYMBOL(get_skb_ring_handler);
373+EXPORT_SYMBOL(set_skb_ring_handler);
374+EXPORT_SYMBOL(do_skb_ring_handler);
375+
376+EXPORT_SYMBOL(get_buffer_ring_handler);
377+EXPORT_SYMBOL(set_buffer_ring_handler);
378+EXPORT_SYMBOL(do_buffer_ring_handler);
379+#endif
380+
381+#endif
382 /*
383 * The list of packet types we will receive (as opposed to discard)
384 * and the routines to invoke.
385@@ -1474,6 +1524,10 @@
386 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
387 #endif
388 if (q->enqueue) {
389+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
390+ if(ring_handler) ring_handler(skb, 0, 1);
391+#endif /* CONFIG_RING */
392+
393 /* Grab device queue */
394 spin_lock(&dev->queue_lock);
395 q = dev->qdisc;
396@@ -1574,6 +1628,13 @@
397 unsigned long flags;
398
399 /* if netpoll wants it, pretend we never saw it */
400+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
401+ if(ring_handler && ring_handler(skb, 1, 1)) {
402+ /* The packet has been copied into a ring */
403+ return(NET_RX_SUCCESS);
404+ }
405+#endif /* CONFIG_RING */
406+
407 if (netpoll_rx(skb))
408 return NET_RX_DROP;
409
410@@ -1764,6 +1825,13 @@
411 struct net_device *orig_dev;
412 int ret = NET_RX_DROP;
413 __be16 type;
414+#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
415+ if(ring_handler && ring_handler(skb, 1, 1)) {
416+ /* The packet has been copied into a ring */
417+ return(NET_RX_SUCCESS);
418+ }
419+#endif /* CONFIG_RING */
420+
421
422 /* if we've gotten here through NAPI, check netpoll */
423 if (skb->dev->poll && netpoll_rx(skb))
424diff --unified --recursive --new-file linux-2.6.21.4/net/core/dev.c.ORG linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c.ORG
425--- linux-2.6.21.4/net/core/dev.c.ORG 1970-01-01 00:00:00.000000000 +0000
426+++ linux-2.6.21.4-1-686-smp-ring3/net/core/dev.c.ORG 2007-06-10 16:43:04.354421694 +0000
427@@ -0,0 +1,3571 @@
428+/*
429+ * NET3 Protocol independent device support routines.
430+ *
431+ * This program is free software; you can redistribute it and/or
432+ * modify it under the terms of the GNU General Public License
433+ * as published by the Free Software Foundation; either version
434+ * 2 of the License, or (at your option) any later version.
435+ *
436+ * Derived from the non IP parts of dev.c 1.0.19
437+ * Authors: Ross Biro
438+ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
439+ * Mark Evans, <evansmp@uhura.aston.ac.uk>
440+ *
441+ * Additional Authors:
442+ * Florian la Roche <rzsfl@rz.uni-sb.de>
443+ * Alan Cox <gw4pts@gw4pts.ampr.org>
444+ * David Hinds <dahinds@users.sourceforge.net>
445+ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
446+ * Adam Sulmicki <adam@cfar.umd.edu>
447+ * Pekka Riikonen <priikone@poesidon.pspt.fi>
448+ *
449+ * Changes:
450+ * D.J. Barrow : Fixed bug where dev->refcnt gets set
451+ * to 2 if register_netdev gets called
452+ * before net_dev_init & also removed a
453+ * few lines of code in the process.
454+ * Alan Cox : device private ioctl copies fields back.
455+ * Alan Cox : Transmit queue code does relevant
456+ * stunts to keep the queue safe.
457+ * Alan Cox : Fixed double lock.
458+ * Alan Cox : Fixed promisc NULL pointer trap
459+ * ???????? : Support the full private ioctl range
460+ * Alan Cox : Moved ioctl permission check into
461+ * drivers
462+ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
463+ * Alan Cox : 100 backlog just doesn't cut it when
464+ * you start doing multicast video 8)
465+ * Alan Cox : Rewrote net_bh and list manager.
466+ * Alan Cox : Fix ETH_P_ALL echoback lengths.
467+ * Alan Cox : Took out transmit every packet pass
468+ * Saved a few bytes in the ioctl handler
469+ * Alan Cox : Network driver sets packet type before
470+ * calling netif_rx. Saves a function
471+ * call a packet.
472+ * Alan Cox : Hashed net_bh()
473+ * Richard Kooijman: Timestamp fixes.
474+ * Alan Cox : Wrong field in SIOCGIFDSTADDR
475+ * Alan Cox : Device lock protection.
476+ * Alan Cox : Fixed nasty side effect of device close
477+ * changes.
478+ * Rudi Cilibrasi : Pass the right thing to
479+ * set_mac_address()
480+ * Dave Miller : 32bit quantity for the device lock to
481+ * make it work out on a Sparc.
482+ * Bjorn Ekwall : Added KERNELD hack.
483+ * Alan Cox : Cleaned up the backlog initialise.
484+ * Craig Metz : SIOCGIFCONF fix if space for under
485+ * 1 device.
486+ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
487+ * is no device open function.
488+ * Andi Kleen : Fix error reporting for SIOCGIFCONF
489+ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
490+ * Cyrus Durgin : Cleaned for KMOD
491+ * Adam Sulmicki : Bug Fix : Network Device Unload
492+ * A network device unload needs to purge
493+ * the backlog queue.
494+ * Paul Rusty Russell : SIOCSIFNAME
495+ * Pekka Riikonen : Netdev boot-time settings code
496+ * Andrew Morton : Make unregister_netdevice wait
497+ * indefinitely on dev->refcnt
498+ * J Hadi Salim : - Backlog queue sampling
499+ * - netif_rx() feedback
500+ */
501+
502+#include <asm/uaccess.h>
503+#include <asm/system.h>
504+#include <linux/bitops.h>
505+#include <linux/capability.h>
506+#include <linux/cpu.h>
507+#include <linux/types.h>
508+#include <linux/kernel.h>
509+#include <linux/sched.h>
510+#include <linux/mutex.h>
511+#include <linux/string.h>
512+#include <linux/mm.h>
513+#include <linux/socket.h>
514+#include <linux/sockios.h>
515+#include <linux/errno.h>
516+#include <linux/interrupt.h>
517+#include <linux/if_ether.h>
518+#include <linux/netdevice.h>
519+#include <linux/etherdevice.h>
520+#include <linux/notifier.h>
521+#include <linux/skbuff.h>
522+#include <net/sock.h>
523+#include <linux/rtnetlink.h>
524+#include <linux/proc_fs.h>
525+#include <linux/seq_file.h>
526+#include <linux/stat.h>
527+#include <linux/if_bridge.h>
528+#include <net/dst.h>
529+#include <net/pkt_sched.h>
530+#include <net/checksum.h>
531+#include <linux/highmem.h>
532+#include <linux/init.h>
533+#include <linux/kmod.h>
534+#include <linux/module.h>
535+#include <linux/kallsyms.h>
536+#include <linux/netpoll.h>
537+#include <linux/rcupdate.h>
538+#include <linux/delay.h>
539+#include <linux/wireless.h>
540+#include <net/iw_handler.h>
541+#include <asm/current.h>
542+#include <linux/audit.h>
543+#include <linux/dmaengine.h>
544+#include <linux/err.h>
545+#include <linux/ctype.h>
546+
547+/*
548+ * The list of packet types we will receive (as opposed to discard)
549+ * and the routines to invoke.
550+ *
551+ * Why 16. Because with 16 the only overlap we get on a hash of the
552+ * low nibble of the protocol value is RARP/SNAP/X.25.
553+ *
554+ * NOTE: That is no longer true with the addition of VLAN tags. Not
555+ * sure which should go first, but I bet it won't make much
556+ * difference if we are running VLANs. The good news is that
557+ * this protocol won't be in the list unless compiled in, so
558+ * the average user (w/out VLANs) will not be adversely affected.
559+ * --BLG
560+ *
561+ * 0800 IP
562+ * 8100 802.1Q VLAN
563+ * 0001 802.3
564+ * 0002 AX.25
565+ * 0004 802.2
566+ * 8035 RARP
567+ * 0005 SNAP
568+ * 0805 X.25
569+ * 0806 ARP
570+ * 8137 IPX
571+ * 0009 Localtalk
572+ * 86DD IPv6
573+ */
574+
575+static DEFINE_SPINLOCK(ptype_lock);
576+static struct list_head ptype_base[16]; /* 16 way hashed list */
577+static struct list_head ptype_all; /* Taps */
578+
579+#ifdef CONFIG_NET_DMA
580+static struct dma_client *net_dma_client;
581+static unsigned int net_dma_count;
582+static spinlock_t net_dma_event_lock;
583+#endif
584+
585+/*
586+ * The @dev_base list is protected by @dev_base_lock and the rtnl
587+ * semaphore.
588+ *
589+ * Pure readers hold dev_base_lock for reading.
590+ *
591+ * Writers must hold the rtnl semaphore while they loop through the
592+ * dev_base list, and hold dev_base_lock for writing when they do the
593+ * actual updates. This allows pure readers to access the list even
594+ * while a writer is preparing to update it.
595+ *
596+ * To put it another way, dev_base_lock is held for writing only to
597+ * protect against pure readers; the rtnl semaphore provides the
598+ * protection against other writers.
599+ *
600+ * See, for example usages, register_netdevice() and
601+ * unregister_netdevice(), which must be called with the rtnl
602+ * semaphore held.
603+ */
604+struct net_device *dev_base;
605+static struct net_device **dev_tail = &dev_base;
606+DEFINE_RWLOCK(dev_base_lock);
607+
608+EXPORT_SYMBOL(dev_base);
609+EXPORT_SYMBOL(dev_base_lock);
610+
611+#define NETDEV_HASHBITS 8
612+static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
613+static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
614+
615+static inline struct hlist_head *dev_name_hash(const char *name)
616+{
617+ unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
618+ return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
619+}
620+
621+static inline struct hlist_head *dev_index_hash(int ifindex)
622+{
623+ return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
624+}
625+
626+/*
627+ * Our notifier list
628+ */
629+
630+static RAW_NOTIFIER_HEAD(netdev_chain);
631+
632+/*
633+ * Device drivers call our routines to queue packets here. We empty the
634+ * queue in the local softnet handler.
635+ */
636+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
637+
638+#ifdef CONFIG_SYSFS
639+extern int netdev_sysfs_init(void);
640+extern int netdev_register_sysfs(struct net_device *);
641+extern void netdev_unregister_sysfs(struct net_device *);
642+#else
643+#define netdev_sysfs_init() (0)
644+#define netdev_register_sysfs(dev) (0)
645+#define netdev_unregister_sysfs(dev) do { } while(0)
646+#endif
647+
648+
649+/*******************************************************************************
650+
651+ Protocol management and registration routines
652+
653+*******************************************************************************/
654+
655+/*
656+ * For efficiency
657+ */
658+
659+static int netdev_nit;
660+
661+/*
662+ * Add a protocol ID to the list. Now that the input handler is
663+ * smarter we can dispense with all the messy stuff that used to be
664+ * here.
665+ *
666+ * BEWARE!!! Protocol handlers, mangling input packets,
667+ * MUST BE last in hash buckets and checking protocol handlers
668+ * MUST start from promiscuous ptype_all chain in net_bh.
669+ * It is true now, do not change it.
670+ * Explanation follows: if protocol handler, mangling packet, will
671+ * be the first on list, it is not able to sense, that packet
672+ * is cloned and should be copied-on-write, so that it will
673+ * change it and subsequent readers will get broken packet.
674+ * --ANK (980803)
675+ */
676+
677+/**
678+ * dev_add_pack - add packet handler
679+ * @pt: packet type declaration
680+ *
681+ * Add a protocol handler to the networking stack. The passed &packet_type
682+ * is linked into kernel lists and may not be freed until it has been
683+ * removed from the kernel lists.
684+ *
685+ * This call does not sleep therefore it can not
686+ * guarantee all CPU's that are in middle of receiving packets
687+ * will see the new packet type (until the next received packet).
688+ */
689+
690+void dev_add_pack(struct packet_type *pt)
691+{
692+ int hash;
693+
694+ spin_lock_bh(&ptype_lock);
695+ if (pt->type == htons(ETH_P_ALL)) {
696+ netdev_nit++;
697+ list_add_rcu(&pt->list, &ptype_all);
698+ } else {
699+ hash = ntohs(pt->type) & 15;
700+ list_add_rcu(&pt->list, &ptype_base[hash]);
701+ }
702+ spin_unlock_bh(&ptype_lock);
703+}
704+
705+/**
706+ * __dev_remove_pack - remove packet handler
707+ * @pt: packet type declaration
708+ *
709+ * Remove a protocol handler that was previously added to the kernel
710+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
711+ * from the kernel lists and can be freed or reused once this function
712+ * returns.
713+ *
714+ * The packet type might still be in use by receivers
715+ * and must not be freed until after all the CPU's have gone
716+ * through a quiescent state.
717+ */
718+void __dev_remove_pack(struct packet_type *pt)
719+{
720+ struct list_head *head;
721+ struct packet_type *pt1;
722+
723+ spin_lock_bh(&ptype_lock);
724+
725+ if (pt->type == htons(ETH_P_ALL)) {
726+ netdev_nit--;
727+ head = &ptype_all;
728+ } else
729+ head = &ptype_base[ntohs(pt->type) & 15];
730+
731+ list_for_each_entry(pt1, head, list) {
732+ if (pt == pt1) {
733+ list_del_rcu(&pt->list);
734+ goto out;
735+ }
736+ }
737+
738+ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
739+out:
740+ spin_unlock_bh(&ptype_lock);
741+}
742+/**
743+ * dev_remove_pack - remove packet handler
744+ * @pt: packet type declaration
745+ *
746+ * Remove a protocol handler that was previously added to the kernel
747+ * protocol handlers by dev_add_pack(). The passed &packet_type is removed
748+ * from the kernel lists and can be freed or reused once this function
749+ * returns.
750+ *
751+ * This call sleeps to guarantee that no CPU is looking at the packet
752+ * type after return.
753+ */
754+void dev_remove_pack(struct packet_type *pt)
755+{
756+ __dev_remove_pack(pt);
757+
758+ synchronize_net();
759+}
760+
761+/******************************************************************************
762+
763+ Device Boot-time Settings Routines
764+
765+*******************************************************************************/
766+
767+/* Boot time configuration table */
768+static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
769+
770+/**
771+ * netdev_boot_setup_add - add new setup entry
772+ * @name: name of the device
773+ * @map: configured settings for the device
774+ *
775+ * Adds new setup entry to the dev_boot_setup list. The function
776+ * returns 0 on error and 1 on success. This is a generic routine to
777+ * all netdevices.
778+ */
779+static int netdev_boot_setup_add(char *name, struct ifmap *map)
780+{
781+ struct netdev_boot_setup *s;
782+ int i;
783+
784+ s = dev_boot_setup;
785+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
786+ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
787+ memset(s[i].name, 0, sizeof(s[i].name));
788+ strcpy(s[i].name, name);
789+ memcpy(&s[i].map, map, sizeof(s[i].map));
790+ break;
791+ }
792+ }
793+
794+ return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
795+}
796+
797+/**
798+ * netdev_boot_setup_check - check boot time settings
799+ * @dev: the netdevice
800+ *
801+ * Check boot time settings for the device.
802+ * The found settings are set for the device to be used
803+ * later in the device probing.
804+ * Returns 0 if no settings found, 1 if they are.
805+ */
806+int netdev_boot_setup_check(struct net_device *dev)
807+{
808+ struct netdev_boot_setup *s = dev_boot_setup;
809+ int i;
810+
811+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
812+ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
813+ !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
814+ dev->irq = s[i].map.irq;
815+ dev->base_addr = s[i].map.base_addr;
816+ dev->mem_start = s[i].map.mem_start;
817+ dev->mem_end = s[i].map.mem_end;
818+ return 1;
819+ }
820+ }
821+ return 0;
822+}
823+
824+
825+/**
826+ * netdev_boot_base - get address from boot time settings
827+ * @prefix: prefix for network device
828+ * @unit: id for network device
829+ *
830+ * Check boot time settings for the base address of device.
831+ * The found settings are set for the device to be used
832+ * later in the device probing.
833+ * Returns 0 if no settings found.
834+ */
835+unsigned long netdev_boot_base(const char *prefix, int unit)
836+{
837+ const struct netdev_boot_setup *s = dev_boot_setup;
838+ char name[IFNAMSIZ];
839+ int i;
840+
841+ sprintf(name, "%s%d", prefix, unit);
842+
843+ /*
844+ * If device already registered then return base of 1
845+ * to indicate not to probe for this interface
846+ */
847+ if (__dev_get_by_name(name))
848+ return 1;
849+
850+ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
851+ if (!strcmp(name, s[i].name))
852+ return s[i].map.base_addr;
853+ return 0;
854+}
855+
856+/*
857+ * Saves at boot time configured settings for any netdevice.
858+ */
859+int __init netdev_boot_setup(char *str)
860+{
861+ int ints[5];
862+ struct ifmap map;
863+
864+ str = get_options(str, ARRAY_SIZE(ints), ints);
865+ if (!str || !*str)
866+ return 0;
867+
868+ /* Save settings */
869+ memset(&map, 0, sizeof(map));
870+ if (ints[0] > 0)
871+ map.irq = ints[1];
872+ if (ints[0] > 1)
873+ map.base_addr = ints[2];
874+ if (ints[0] > 2)
875+ map.mem_start = ints[3];
876+ if (ints[0] > 3)
877+ map.mem_end = ints[4];
878+
879+ /* Add new entry to the list */
880+ return netdev_boot_setup_add(str, &map);
881+}
882+
883+__setup("netdev=", netdev_boot_setup);
884+
885+/*******************************************************************************
886+
887+ Device Interface Subroutines
888+
889+*******************************************************************************/
890+
891+/**
892+ * __dev_get_by_name - find a device by its name
893+ * @name: name to find
894+ *
895+ * Find an interface by name. Must be called under RTNL semaphore
896+ * or @dev_base_lock. If the name is found a pointer to the device
897+ * is returned. If the name is not found then %NULL is returned. The
898+ * reference counters are not incremented so the caller must be
899+ * careful with locks.
900+ */
901+
902+struct net_device *__dev_get_by_name(const char *name)
903+{
904+ struct hlist_node *p;
905+
906+ hlist_for_each(p, dev_name_hash(name)) {
907+ struct net_device *dev
908+ = hlist_entry(p, struct net_device, name_hlist);
909+ if (!strncmp(dev->name, name, IFNAMSIZ))
910+ return dev;
911+ }
912+ return NULL;
913+}
914+
915+/**
916+ * dev_get_by_name - find a device by its name
917+ * @name: name to find
918+ *
919+ * Find an interface by name. This can be called from any
920+ * context and does its own locking. The returned handle has
921+ * the usage count incremented and the caller must use dev_put() to
922+ * release it when it is no longer needed. %NULL is returned if no
923+ * matching device is found.
924+ */
925+
926+struct net_device *dev_get_by_name(const char *name)
927+{
928+ struct net_device *dev;
929+
930+ read_lock(&dev_base_lock);
931+ dev = __dev_get_by_name(name);
932+ if (dev)
933+ dev_hold(dev);
934+ read_unlock(&dev_base_lock);
935+ return dev;
936+}
937+
938+/**
939+ * __dev_get_by_index - find a device by its ifindex
940+ * @ifindex: index of device
941+ *
942+ * Search for an interface by index. Returns %NULL if the device
943+ * is not found or a pointer to the device. The device has not
944+ * had its reference counter increased so the caller must be careful
945+ * about locking. The caller must hold either the RTNL semaphore
946+ * or @dev_base_lock.
947+ */
948+
949+struct net_device *__dev_get_by_index(int ifindex)
950+{
951+ struct hlist_node *p;
952+
953+ hlist_for_each(p, dev_index_hash(ifindex)) {
954+ struct net_device *dev
955+ = hlist_entry(p, struct net_device, index_hlist);
956+ if (dev->ifindex == ifindex)
957+ return dev;
958+ }
959+ return NULL;
960+}
961+
962+
963+/**
964+ * dev_get_by_index - find a device by its ifindex
965+ * @ifindex: index of device
966+ *
967+ * Search for an interface by index. Returns NULL if the device
968+ * is not found or a pointer to the device. The device returned has
969+ * had a reference added and the pointer is safe until the user calls
970+ * dev_put to indicate they have finished with it.
971+ */
972+
973+struct net_device *dev_get_by_index(int ifindex)
974+{
975+ struct net_device *dev;
976+
977+ read_lock(&dev_base_lock);
978+ dev = __dev_get_by_index(ifindex);
979+ if (dev)
980+ dev_hold(dev);
981+ read_unlock(&dev_base_lock);
982+ return dev;
983+}
984+
985+/**
986+ * dev_getbyhwaddr - find a device by its hardware address
987+ * @type: media type of device
988+ * @ha: hardware address
989+ *
990+ * Search for an interface by MAC address. Returns NULL if the device
991+ * is not found or a pointer to the device. The caller must hold the
992+ * rtnl semaphore. The returned device has not had its ref count increased
993+ * and the caller must therefore be careful about locking
994+ *
995+ * BUGS:
996+ * If the API was consistent this would be __dev_get_by_hwaddr
997+ */
998+
999+struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
1000+{
1001+ struct net_device *dev;
1002+
1003+ ASSERT_RTNL();
1004+
1005+ for (dev = dev_base; dev; dev = dev->next)
1006+ if (dev->type == type &&
1007+ !memcmp(dev->dev_addr, ha, dev->addr_len))
1008+ break;
1009+ return dev;
1010+}
1011+
1012+EXPORT_SYMBOL(dev_getbyhwaddr);
1013+
1014+struct net_device *dev_getfirstbyhwtype(unsigned short type)
1015+{
1016+ struct net_device *dev;
1017+
1018+ rtnl_lock();
1019+ for (dev = dev_base; dev; dev = dev->next) {
1020+ if (dev->type == type) {
1021+ dev_hold(dev);
1022+ break;
1023+ }
1024+ }
1025+ rtnl_unlock();
1026+ return dev;
1027+}
1028+
1029+EXPORT_SYMBOL(dev_getfirstbyhwtype);
1030+
1031+/**
1032+ * dev_get_by_flags - find any device with given flags
1033+ * @if_flags: IFF_* values
1034+ * @mask: bitmask of bits in if_flags to check
1035+ *
1036+ * Search for any interface with the given flags. Returns NULL if a device
1037+ * is not found or a pointer to the device. The device returned has
1038+ * had a reference added and the pointer is safe until the user calls
1039+ * dev_put to indicate they have finished with it.
1040+ */
1041+
1042+struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
1043+{
1044+ struct net_device *dev;
1045+
1046+ read_lock(&dev_base_lock);
1047+ for (dev = dev_base; dev != NULL; dev = dev->next) {
1048+ if (((dev->flags ^ if_flags) & mask) == 0) {
1049+ dev_hold(dev);
1050+ break;
1051+ }
1052+ }
1053+ read_unlock(&dev_base_lock);
1054+ return dev;
1055+}
1056+
1057+/**
1058+ * dev_valid_name - check if name is okay for network device
1059+ * @name: name string
1060+ *
1061+ * Network device names need to be valid file names to
1062+ * to allow sysfs to work. We also disallow any kind of
1063+ * whitespace.
1064+ */
1065+int dev_valid_name(const char *name)
1066+{
1067+ if (*name == '\0')
1068+ return 0;
1069+ if (strlen(name) >= IFNAMSIZ)
1070+ return 0;
1071+ if (!strcmp(name, ".") || !strcmp(name, ".."))
1072+ return 0;
1073+
1074+ while (*name) {
1075+ if (*name == '/' || isspace(*name))
1076+ return 0;
1077+ name++;
1078+ }
1079+ return 1;
1080+}
1081+
1082+/**
1083+ * dev_alloc_name - allocate a name for a device
1084+ * @dev: device
1085+ * @name: name format string
1086+ *
1087+ * Passed a format string - eg "lt%d" it will try and find a suitable
1088+ * id. It scans list of devices to build up a free map, then chooses
1089+ * the first empty slot. The caller must hold the dev_base or rtnl lock
1090+ * while allocating the name and adding the device in order to avoid
1091+ * duplicates.
1092+ * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1093+ * Returns the number of the unit assigned or a negative errno code.
1094+ */
1095+
1096+int dev_alloc_name(struct net_device *dev, const char *name)
1097+{
1098+ int i = 0;
1099+ char buf[IFNAMSIZ];
1100+ const char *p;
1101+ const int max_netdevices = 8*PAGE_SIZE;
1102+ long *inuse;
1103+ struct net_device *d;
1104+
1105+ p = strnchr(name, IFNAMSIZ-1, '%');
1106+ if (p) {
1107+ /*
1108+ * Verify the string as this thing may have come from
1109+ * the user. There must be either one "%d" and no other "%"
1110+ * characters.
1111+ */
1112+ if (p[1] != 'd' || strchr(p + 2, '%'))
1113+ return -EINVAL;
1114+
1115+ /* Use one page as a bit array of possible slots */
1116+ inuse = (long *) get_zeroed_page(GFP_ATOMIC);
1117+ if (!inuse)
1118+ return -ENOMEM;
1119+
1120+ for (d = dev_base; d; d = d->next) {
1121+ if (!sscanf(d->name, name, &i))
1122+ continue;
1123+ if (i < 0 || i >= max_netdevices)
1124+ continue;
1125+
1126+ /* avoid cases where sscanf is not exact inverse of printf */
1127+ snprintf(buf, sizeof(buf), name, i);
1128+ if (!strncmp(buf, d->name, IFNAMSIZ))
1129+ set_bit(i, inuse);
1130+ }
1131+
1132+ i = find_first_zero_bit(inuse, max_netdevices);
1133+ free_page((unsigned long) inuse);
1134+ }
1135+
1136+ snprintf(buf, sizeof(buf), name, i);
1137+ if (!__dev_get_by_name(buf)) {
1138+ strlcpy(dev->name, buf, IFNAMSIZ);
1139+ return i;
1140+ }
1141+
1142+ /* It is possible to run out of possible slots
1143+ * when the name is long and there isn't enough space left
1144+ * for the digits, or if all bits are used.
1145+ */
1146+ return -ENFILE;
1147+}
1148+
1149+
1150+/**
1151+ * dev_change_name - change name of a device
1152+ * @dev: device
1153+ * @newname: name (or format string) must be at least IFNAMSIZ
1154+ *
1155+ * Change name of a device, can pass format strings "eth%d".
1156+ * for wildcarding.
1157+ */
1158+int dev_change_name(struct net_device *dev, char *newname)
1159+{
1160+ int err = 0;
1161+
1162+ ASSERT_RTNL();
1163+
1164+ if (dev->flags & IFF_UP)
1165+ return -EBUSY;
1166+
1167+ if (!dev_valid_name(newname))
1168+ return -EINVAL;
1169+
1170+ if (strchr(newname, '%')) {
1171+ err = dev_alloc_name(dev, newname);
1172+ if (err < 0)
1173+ return err;
1174+ strcpy(newname, dev->name);
1175+ }
1176+ else if (__dev_get_by_name(newname))
1177+ return -EEXIST;
1178+ else
1179+ strlcpy(dev->name, newname, IFNAMSIZ);
1180+
1181+ device_rename(&dev->dev, dev->name);
1182+ hlist_del(&dev->name_hlist);
1183+ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
1184+ raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
1185+
1186+ return err;
1187+}
1188+
1189+/**
1190+ * netdev_features_change - device changes features
1191+ * @dev: device to cause notification
1192+ *
1193+ * Called to indicate a device has changed features.
1194+ */
1195+void netdev_features_change(struct net_device *dev)
1196+{
1197+ raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
1198+}
1199+EXPORT_SYMBOL(netdev_features_change);
1200+
1201+/**
1202+ * netdev_state_change - device changes state
1203+ * @dev: device to cause notification
1204+ *
1205+ * Called to indicate a device has changed state. This function calls
1206+ * the notifier chains for netdev_chain and sends a NEWLINK message
1207+ * to the routing socket.
1208+ */
1209+void netdev_state_change(struct net_device *dev)
1210+{
1211+ if (dev->flags & IFF_UP) {
1212+ raw_notifier_call_chain(&netdev_chain,
1213+ NETDEV_CHANGE, dev);
1214+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1215+ }
1216+}
1217+
1218+/**
1219+ * dev_load - load a network module
1220+ * @name: name of interface
1221+ *
1222+ * If a network interface is not present and the process has suitable
1223+ * privileges this function loads the module. If module loading is not
1224+ * available in this kernel then it becomes a nop.
1225+ */
1226+
1227+void dev_load(const char *name)
1228+{
1229+ struct net_device *dev;
1230+
1231+ read_lock(&dev_base_lock);
1232+ dev = __dev_get_by_name(name);
1233+ read_unlock(&dev_base_lock);
1234+
1235+ if (!dev && capable(CAP_SYS_MODULE))
1236+ request_module("%s", name);
1237+}
1238+
1239+static int default_rebuild_header(struct sk_buff *skb)
1240+{
1241+ printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
1242+ skb->dev ? skb->dev->name : "NULL!!!");
1243+ kfree_skb(skb);
1244+ return 1;
1245+}
1246+
1247+
1248+/**
1249+ * dev_open - prepare an interface for use.
1250+ * @dev: device to open
1251+ *
1252+ * Takes a device from down to up state. The device's private open
1253+ * function is invoked and then the multicast lists are loaded. Finally
1254+ * the device is moved into the up state and a %NETDEV_UP message is
1255+ * sent to the netdev notifier chain.
1256+ *
1257+ * Calling this function on an active interface is a nop. On a failure
1258+ * a negative errno code is returned.
1259+ */
1260+int dev_open(struct net_device *dev)
1261+{
1262+ int ret = 0;
1263+
1264+ /*
1265+ * Is it already up?
1266+ */
1267+
1268+ if (dev->flags & IFF_UP)
1269+ return 0;
1270+
1271+ /*
1272+ * Is it even present?
1273+ */
1274+ if (!netif_device_present(dev))
1275+ return -ENODEV;
1276+
1277+ /*
1278+ * Call device private open method
1279+ */
1280+ set_bit(__LINK_STATE_START, &dev->state);
1281+ if (dev->open) {
1282+ ret = dev->open(dev);
1283+ if (ret)
1284+ clear_bit(__LINK_STATE_START, &dev->state);
1285+ }
1286+
1287+ /*
1288+ * If it went open OK then:
1289+ */
1290+
1291+ if (!ret) {
1292+ /*
1293+ * Set the flags.
1294+ */
1295+ dev->flags |= IFF_UP;
1296+
1297+ /*
1298+ * Initialize multicasting status
1299+ */
1300+ dev_mc_upload(dev);
1301+
1302+ /*
1303+ * Wakeup transmit queue engine
1304+ */
1305+ dev_activate(dev);
1306+
1307+ /*
1308+ * ... and announce new interface.
1309+ */
1310+ raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
1311+ }
1312+ return ret;
1313+}
1314+
1315+/**
1316+ * dev_close - shutdown an interface.
1317+ * @dev: device to shutdown
1318+ *
1319+ * This function moves an active device into down state. A
1320+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1321+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1322+ * chain.
1323+ */
1324+int dev_close(struct net_device *dev)
1325+{
1326+ if (!(dev->flags & IFF_UP))
1327+ return 0;
1328+
1329+ /*
1330+ * Tell people we are going down, so that they can
1331+ * prepare to death, when device is still operating.
1332+ */
1333+ raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
1334+
1335+ dev_deactivate(dev);
1336+
1337+ clear_bit(__LINK_STATE_START, &dev->state);
1338+
1339+ /* Synchronize to scheduled poll. We cannot touch poll list,
1340+ * it can be even on different cpu. So just clear netif_running(),
1341+ * and wait when poll really will happen. Actually, the best place
1342+ * for this is inside dev->stop() after device stopped its irq
1343+ * engine, but this requires more changes in devices. */
1344+
1345+ smp_mb__after_clear_bit(); /* Commit netif_running(). */
1346+ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
1347+ /* No hurry. */
1348+ msleep(1);
1349+ }
1350+
1351+ /*
1352+ * Call the device specific close. This cannot fail.
1353+ * Only if device is UP
1354+ *
1355+ * We allow it to be called even after a DETACH hot-plug
1356+ * event.
1357+ */
1358+ if (dev->stop)
1359+ dev->stop(dev);
1360+
1361+ /*
1362+ * Device is now down.
1363+ */
1364+
1365+ dev->flags &= ~IFF_UP;
1366+
1367+ /*
1368+ * Tell people we are down
1369+ */
1370+ raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
1371+
1372+ return 0;
1373+}
1374+
1375+
1376+/*
1377+ * Device change register/unregister. These are not inline or static
1378+ * as we export them to the world.
1379+ */
1380+
1381+/**
1382+ * register_netdevice_notifier - register a network notifier block
1383+ * @nb: notifier
1384+ *
1385+ * Register a notifier to be called when network device events occur.
1386+ * The notifier passed is linked into the kernel structures and must
1387+ * not be reused until it has been unregistered. A negative errno code
1388+ * is returned on a failure.
1389+ *
1390+ * When registered all registration and up events are replayed
1391+ * to the new notifier to allow device to have a race free
1392+ * view of the network device list.
1393+ */
1394+
1395+int register_netdevice_notifier(struct notifier_block *nb)
1396+{
1397+ struct net_device *dev;
1398+ int err;
1399+
1400+ rtnl_lock();
1401+ err = raw_notifier_chain_register(&netdev_chain, nb);
1402+ if (!err) {
1403+ for (dev = dev_base; dev; dev = dev->next) {
1404+ nb->notifier_call(nb, NETDEV_REGISTER, dev);
1405+
1406+ if (dev->flags & IFF_UP)
1407+ nb->notifier_call(nb, NETDEV_UP, dev);
1408+ }
1409+ }
1410+ rtnl_unlock();
1411+ return err;
1412+}
1413+
1414+/**
1415+ * unregister_netdevice_notifier - unregister a network notifier block
1416+ * @nb: notifier
1417+ *
1418+ * Unregister a notifier previously registered by
1419+ * register_netdevice_notifier(). The notifier is unlinked into the
1420+ * kernel structures and may then be reused. A negative errno code
1421+ * is returned on a failure.
1422+ */
1423+
1424+int unregister_netdevice_notifier(struct notifier_block *nb)
1425+{
1426+ int err;
1427+
1428+ rtnl_lock();
1429+ err = raw_notifier_chain_unregister(&netdev_chain, nb);
1430+ rtnl_unlock();
1431+ return err;
1432+}
1433+
1434+/**
1435+ * call_netdevice_notifiers - call all network notifier blocks
1436+ * @val: value passed unmodified to notifier function
1437+ * @v: pointer passed unmodified to notifier function
1438+ *
1439+ * Call all network notifier blocks. Parameters and return value
1440+ * are as for raw_notifier_call_chain().
1441+ */
1442+
1443+int call_netdevice_notifiers(unsigned long val, void *v)
1444+{
1445+ return raw_notifier_call_chain(&netdev_chain, val, v);
1446+}
1447+
1448+/* When > 0 there are consumers of rx skb time stamps */
1449+static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450+
1451+void net_enable_timestamp(void)
1452+{
1453+ atomic_inc(&netstamp_needed);
1454+}
1455+
1456+void net_disable_timestamp(void)
1457+{
1458+ atomic_dec(&netstamp_needed);
1459+}
1460+
1461+void __net_timestamp(struct sk_buff *skb)
1462+{
1463+ struct timeval tv;
1464+
1465+ do_gettimeofday(&tv);
1466+ skb_set_timestamp(skb, &tv);
1467+}
1468+EXPORT_SYMBOL(__net_timestamp);
1469+
1470+static inline void net_timestamp(struct sk_buff *skb)
1471+{
1472+ if (atomic_read(&netstamp_needed))
1473+ __net_timestamp(skb);
1474+ else {
1475+ skb->tstamp.off_sec = 0;
1476+ skb->tstamp.off_usec = 0;
1477+ }
1478+}
1479+
1480+/*
1481+ * Support routine. Sends outgoing frames to any network
1482+ * taps currently in use.
1483+ */
1484+
1485+static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1486+{
1487+ struct packet_type *ptype;
1488+
1489+ net_timestamp(skb);
1490+
1491+ rcu_read_lock();
1492+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
1493+ /* Never send packets back to the socket
1494+ * they originated from - MvS (miquels@drinkel.ow.org)
1495+ */
1496+ if ((ptype->dev == dev || !ptype->dev) &&
1497+ (ptype->af_packet_priv == NULL ||
1498+ (struct sock *)ptype->af_packet_priv != skb->sk)) {
1499+ struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1500+ if (!skb2)
1501+ break;
1502+
1503+ /* skb->nh should be correctly
1504+ set by sender, so that the second statement is
1505+ just protection against buggy protocols.
1506+ */
1507+ skb2->mac.raw = skb2->data;
1508+
1509+ if (skb2->nh.raw < skb2->data ||
1510+ skb2->nh.raw > skb2->tail) {
1511+ if (net_ratelimit())
1512+ printk(KERN_CRIT "protocol %04x is "
1513+ "buggy, dev %s\n",
1514+ skb2->protocol, dev->name);
1515+ skb2->nh.raw = skb2->data;
1516+ }
1517+
1518+ skb2->h.raw = skb2->nh.raw;
1519+ skb2->pkt_type = PACKET_OUTGOING;
1520+ ptype->func(skb2, skb->dev, ptype, skb->dev);
1521+ }
1522+ }
1523+ rcu_read_unlock();
1524+}
1525+
1526+
1527+void __netif_schedule(struct net_device *dev)
1528+{
1529+ if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1530+ unsigned long flags;
1531+ struct softnet_data *sd;
1532+
1533+ local_irq_save(flags);
1534+ sd = &__get_cpu_var(softnet_data);
1535+ dev->next_sched = sd->output_queue;
1536+ sd->output_queue = dev;
1537+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
1538+ local_irq_restore(flags);
1539+ }
1540+}
1541+EXPORT_SYMBOL(__netif_schedule);
1542+
1543+void __netif_rx_schedule(struct net_device *dev)
1544+{
1545+ unsigned long flags;
1546+
1547+ local_irq_save(flags);
1548+ dev_hold(dev);
1549+ list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1550+ if (dev->quota < 0)
1551+ dev->quota += dev->weight;
1552+ else
1553+ dev->quota = dev->weight;
1554+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1555+ local_irq_restore(flags);
1556+}
1557+EXPORT_SYMBOL(__netif_rx_schedule);
1558+
1559+void dev_kfree_skb_any(struct sk_buff *skb)
1560+{
1561+ if (in_irq() || irqs_disabled())
1562+ dev_kfree_skb_irq(skb);
1563+ else
1564+ dev_kfree_skb(skb);
1565+}
1566+EXPORT_SYMBOL(dev_kfree_skb_any);
1567+
1568+
1569+/* Hot-plugging. */
1570+void netif_device_detach(struct net_device *dev)
1571+{
1572+ if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1573+ netif_running(dev)) {
1574+ netif_stop_queue(dev);
1575+ }
1576+}
1577+EXPORT_SYMBOL(netif_device_detach);
1578+
1579+void netif_device_attach(struct net_device *dev)
1580+{
1581+ if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1582+ netif_running(dev)) {
1583+ netif_wake_queue(dev);
1584+ __netdev_watchdog_up(dev);
1585+ }
1586+}
1587+EXPORT_SYMBOL(netif_device_attach);
1588+
1589+
1590+/*
1591+ * Invalidate hardware checksum when packet is to be mangled, and
1592+ * complete checksum manually on outgoing path.
1593+ */
1594+int skb_checksum_help(struct sk_buff *skb)
1595+{
1596+ __wsum csum;
1597+ int ret = 0, offset = skb->h.raw - skb->data;
1598+
1599+ if (skb->ip_summed == CHECKSUM_COMPLETE)
1600+ goto out_set_summed;
1601+
1602+ if (unlikely(skb_shinfo(skb)->gso_size)) {
1603+ /* Let GSO fix up the checksum. */
1604+ goto out_set_summed;
1605+ }
1606+
1607+ if (skb_cloned(skb)) {
1608+ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1609+ if (ret)
1610+ goto out;
1611+ }
1612+
1613+ BUG_ON(offset > (int)skb->len);
1614+ csum = skb_checksum(skb, offset, skb->len-offset, 0);
1615+
1616+ offset = skb->tail - skb->h.raw;
1617+ BUG_ON(offset <= 0);
1618+ BUG_ON(skb->csum_offset + 2 > offset);
1619+
1620+ *(__sum16*)(skb->h.raw + skb->csum_offset) = csum_fold(csum);
1621+
1622+out_set_summed:
1623+ skb->ip_summed = CHECKSUM_NONE;
1624+out:
1625+ return ret;
1626+}
1627+
1628+/**
1629+ * skb_gso_segment - Perform segmentation on skb.
1630+ * @skb: buffer to segment
1631+ * @features: features for the output path (see dev->features)
1632+ *
1633+ * This function segments the given skb and returns a list of segments.
1634+ *
1635+ * It may return NULL if the skb requires no segmentation. This is
1636+ * only possible when GSO is used for verifying header integrity.
1637+ */
1638+struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1639+{
1640+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1641+ struct packet_type *ptype;
1642+ __be16 type = skb->protocol;
1643+ int err;
1644+
1645+ BUG_ON(skb_shinfo(skb)->frag_list);
1646+
1647+ skb->mac.raw = skb->data;
1648+ skb->mac_len = skb->nh.raw - skb->data;
1649+ __skb_pull(skb, skb->mac_len);
1650+
1651+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1652+ if (skb_header_cloned(skb) &&
1653+ (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1654+ return ERR_PTR(err);
1655+ }
1656+
1657+ rcu_read_lock();
1658+ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1659+ if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1660+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1661+ err = ptype->gso_send_check(skb);
1662+ segs = ERR_PTR(err);
1663+ if (err || skb_gso_ok(skb, features))
1664+ break;
1665+ __skb_push(skb, skb->data - skb->nh.raw);
1666+ }
1667+ segs = ptype->gso_segment(skb, features);
1668+ break;
1669+ }
1670+ }
1671+ rcu_read_unlock();
1672+
1673+ __skb_push(skb, skb->data - skb->mac.raw);
1674+
1675+ return segs;
1676+}
1677+
1678+EXPORT_SYMBOL(skb_gso_segment);
1679+
1680+/* Take action when hardware reception checksum errors are detected. */
1681+#ifdef CONFIG_BUG
1682+void netdev_rx_csum_fault(struct net_device *dev)
1683+{
1684+ if (net_ratelimit()) {
1685+ printk(KERN_ERR "%s: hw csum failure.\n",
1686+ dev ? dev->name : "<unknown>");
1687+ dump_stack();
1688+ }
1689+}
1690+EXPORT_SYMBOL(netdev_rx_csum_fault);
1691+#endif
1692+
1693+/* Actually, we should eliminate this check as soon as we know, that:
1694+ * 1. IOMMU is present and allows to map all the memory.
1695+ * 2. No high memory really exists on this machine.
1696+ */
1697+
1698+static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1699+{
1700+#ifdef CONFIG_HIGHMEM
1701+ int i;
1702+
1703+ if (dev->features & NETIF_F_HIGHDMA)
1704+ return 0;
1705+
1706+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1707+ if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1708+ return 1;
1709+
1710+#endif
1711+ return 0;
1712+}
1713+
1714+struct dev_gso_cb {
1715+ void (*destructor)(struct sk_buff *skb);
1716+};
1717+
1718+#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1719+
1720+static void dev_gso_skb_destructor(struct sk_buff *skb)
1721+{
1722+ struct dev_gso_cb *cb;
1723+
1724+ do {
1725+ struct sk_buff *nskb = skb->next;
1726+
1727+ skb->next = nskb->next;
1728+ nskb->next = NULL;
1729+ kfree_skb(nskb);
1730+ } while (skb->next);
1731+
1732+ cb = DEV_GSO_CB(skb);
1733+ if (cb->destructor)
1734+ cb->destructor(skb);
1735+}
1736+
1737+/**
1738+ * dev_gso_segment - Perform emulated hardware segmentation on skb.
1739+ * @skb: buffer to segment
1740+ *
1741+ * This function segments the given skb and stores the list of segments
1742+ * in skb->next.
1743+ */
1744+static int dev_gso_segment(struct sk_buff *skb)
1745+{
1746+ struct net_device *dev = skb->dev;
1747+ struct sk_buff *segs;
1748+ int features = dev->features & ~(illegal_highdma(dev, skb) ?
1749+ NETIF_F_SG : 0);
1750+
1751+ segs = skb_gso_segment(skb, features);
1752+
1753+ /* Verifying header integrity only. */
1754+ if (!segs)
1755+ return 0;
1756+
1757+ if (unlikely(IS_ERR(segs)))
1758+ return PTR_ERR(segs);
1759+
1760+ skb->next = segs;
1761+ DEV_GSO_CB(skb)->destructor = skb->destructor;
1762+ skb->destructor = dev_gso_skb_destructor;
1763+
1764+ return 0;
1765+}
1766+
1767+int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1768+{
1769+ if (likely(!skb->next)) {
1770+ if (netdev_nit)
1771+ dev_queue_xmit_nit(skb, dev);
1772+
1773+ if (netif_needs_gso(dev, skb)) {
1774+ if (unlikely(dev_gso_segment(skb)))
1775+ goto out_kfree_skb;
1776+ if (skb->next)
1777+ goto gso;
1778+ }
1779+
1780+ return dev->hard_start_xmit(skb, dev);
1781+ }
1782+
1783+gso:
1784+ do {
1785+ struct sk_buff *nskb = skb->next;
1786+ int rc;
1787+
1788+ skb->next = nskb->next;
1789+ nskb->next = NULL;
1790+ rc = dev->hard_start_xmit(nskb, dev);
1791+ if (unlikely(rc)) {
1792+ nskb->next = skb->next;
1793+ skb->next = nskb;
1794+ return rc;
1795+ }
1796+ if (unlikely(netif_queue_stopped(dev) && skb->next))
1797+ return NETDEV_TX_BUSY;
1798+ } while (skb->next);
1799+
1800+ skb->destructor = DEV_GSO_CB(skb)->destructor;
1801+
1802+out_kfree_skb:
1803+ kfree_skb(skb);
1804+ return 0;
1805+}
1806+
1807+#define HARD_TX_LOCK(dev, cpu) { \
1808+ if ((dev->features & NETIF_F_LLTX) == 0) { \
1809+ netif_tx_lock(dev); \
1810+ } \
1811+}
1812+
1813+#define HARD_TX_UNLOCK(dev) { \
1814+ if ((dev->features & NETIF_F_LLTX) == 0) { \
1815+ netif_tx_unlock(dev); \
1816+ } \
1817+}
1818+
1819+/**
1820+ * dev_queue_xmit - transmit a buffer
1821+ * @skb: buffer to transmit
1822+ *
1823+ * Queue a buffer for transmission to a network device. The caller must
1824+ * have set the device and priority and built the buffer before calling
1825+ * this function. The function can be called from an interrupt.
1826+ *
1827+ * A negative errno code is returned on a failure. A success does not
1828+ * guarantee the frame will be transmitted as it may be dropped due
1829+ * to congestion or traffic shaping.
1830+ *
1831+ * -----------------------------------------------------------------------------------
1832+ * I notice this method can also return errors from the queue disciplines,
1833+ * including NET_XMIT_DROP, which is a positive value. So, errors can also
1834+ * be positive.
1835+ *
1836+ * Regardless of the return value, the skb is consumed, so it is currently
1837+ * difficult to retry a send to this method. (You can bump the ref count
1838+ * before sending to hold a reference for retry if you are careful.)
1839+ *
1840+ * When calling this method, interrupts MUST be enabled. This is because
1841+ * the BH enable code must have IRQs enabled so that it will not deadlock.
1842+ * --BLG
1843+ */
1844+
1845+int dev_queue_xmit(struct sk_buff *skb)
1846+{
1847+ struct net_device *dev = skb->dev;
1848+ struct Qdisc *q;
1849+ int rc = -ENOMEM;
1850+
1851+ /* GSO will handle the following emulations directly. */
1852+ if (netif_needs_gso(dev, skb))
1853+ goto gso;
1854+
1855+ if (skb_shinfo(skb)->frag_list &&
1856+ !(dev->features & NETIF_F_FRAGLIST) &&
1857+ __skb_linearize(skb))
1858+ goto out_kfree_skb;
1859+
1860+ /* Fragmented skb is linearized if device does not support SG,
1861+ * or if at least one of fragments is in highmem and device
1862+ * does not support DMA from it.
1863+ */
1864+ if (skb_shinfo(skb)->nr_frags &&
1865+ (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1866+ __skb_linearize(skb))
1867+ goto out_kfree_skb;
1868+
1869+ /* If packet is not checksummed and device does not support
1870+ * checksumming for this protocol, complete checksumming here.
1871+ */
1872+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
1873+ (!(dev->features & NETIF_F_GEN_CSUM) &&
1874+ (!(dev->features & NETIF_F_IP_CSUM) ||
1875+ skb->protocol != htons(ETH_P_IP))))
1876+ if (skb_checksum_help(skb))
1877+ goto out_kfree_skb;
1878+
1879+gso:
1880+ spin_lock_prefetch(&dev->queue_lock);
1881+
1882+ /* Disable soft irqs for various locks below. Also
1883+ * stops preemption for RCU.
1884+ */
1885+ rcu_read_lock_bh();
1886+
1887+ /* Updates of qdisc are serialized by queue_lock.
1888+ * The struct Qdisc which is pointed to by qdisc is now a
1889+ * rcu structure - it may be accessed without acquiring
1890+ * a lock (but the structure may be stale.) The freeing of the
1891+ * qdisc will be deferred until it's known that there are no
1892+ * more references to it.
1893+ *
1894+ * If the qdisc has an enqueue function, we still need to
1895+ * hold the queue_lock before calling it, since queue_lock
1896+ * also serializes access to the device queue.
1897+ */
1898+
1899+ q = rcu_dereference(dev->qdisc);
1900+#ifdef CONFIG_NET_CLS_ACT
1901+ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1902+#endif
1903+ if (q->enqueue) {
1904+ /* Grab device queue */
1905+ spin_lock(&dev->queue_lock);
1906+ q = dev->qdisc;
1907+ if (q->enqueue) {
1908+ rc = q->enqueue(skb, q);
1909+ qdisc_run(dev);
1910+ spin_unlock(&dev->queue_lock);
1911+
1912+ rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1913+ goto out;
1914+ }
1915+ spin_unlock(&dev->queue_lock);
1916+ }
1917+
1918+ /* The device has no queue. Common case for software devices:
1919+ loopback, all the sorts of tunnels...
1920+
1921+ Really, it is unlikely that netif_tx_lock protection is necessary
1922+ here. (f.e. loopback and IP tunnels are clean ignoring statistics
1923+ counters.)
1924+	   However, it is possible that they rely on the protection
1925+	   we provide here.
1926+
1927+	   Check this and shoot the lock. It is not prone to deadlocks.
1928+	   Or shoot the noqueue qdisc; it is even simpler 8)
1929+ */
1930+ if (dev->flags & IFF_UP) {
1931+ int cpu = smp_processor_id(); /* ok because BHs are off */
1932+
1933+ if (dev->xmit_lock_owner != cpu) {
1934+
1935+ HARD_TX_LOCK(dev, cpu);
1936+
1937+ if (!netif_queue_stopped(dev)) {
1938+ rc = 0;
1939+ if (!dev_hard_start_xmit(skb, dev)) {
1940+ HARD_TX_UNLOCK(dev);
1941+ goto out;
1942+ }
1943+ }
1944+ HARD_TX_UNLOCK(dev);
1945+ if (net_ratelimit())
1946+ printk(KERN_CRIT "Virtual device %s asks to "
1947+ "queue packet!\n", dev->name);
1948+ } else {
1949+ /* Recursion is detected! It is possible,
1950+ * unfortunately */
1951+ if (net_ratelimit())
1952+ printk(KERN_CRIT "Dead loop on virtual device "
1953+ "%s, fix it urgently!\n", dev->name);
1954+ }
1955+ }
1956+
1957+ rc = -ENETDOWN;
1958+ rcu_read_unlock_bh();
1959+
1960+out_kfree_skb:
1961+ kfree_skb(skb);
1962+ return rc;
1963+out:
1964+ rcu_read_unlock_bh();
1965+ return rc;
1966+}
1967+
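For reference, a minimal sketch of how a module might hand a pre-built frame to dev_queue_xmit(); the interface name, the frame buffer and the ETH_P_802_3 protocol value are placeholders, and the helper name is hypothetical.

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/if_ether.h>

/* Hypothetical helper: transmit an already-built Ethernet frame on eth0. */
static int example_xmit(const void *frame, unsigned int frame_len)
{
	struct net_device *dev = dev_get_by_name("eth0");
	struct sk_buff *skb;
	int rc;

	if (!dev)
		return -ENODEV;

	skb = dev_alloc_skb(frame_len);
	if (!skb) {
		dev_put(dev);
		return -ENOMEM;
	}

	memcpy(skb_put(skb, frame_len), frame, frame_len);
	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_3);	/* illustrative only */

	rc = dev_queue_xmit(skb);	/* consumes skb whatever rc is */
	dev_put(dev);
	return rc;
}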
1968+
1969+/*=======================================================================
1970+ Receiver routines
1971+ =======================================================================*/
1972+
1973+int netdev_max_backlog = 1000;
1974+int netdev_budget = 300;
1975+int weight_p = 64; /* old backlog weight */
1976+
1977+DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1978+
1979+
1980+/**
1981+ * netif_rx - post buffer to the network code
1982+ * @skb: buffer to post
1983+ *
1984+ * This function receives a packet from a device driver and queues it for
1985+ * the upper (protocol) levels to process. It always succeeds. The buffer
1986+ * may be dropped during processing for congestion control or by the
1987+ * protocol layers.
1988+ *
1989+ * return values:
1990+ * NET_RX_SUCCESS (no congestion)
1991+ * NET_RX_CN_LOW (low congestion)
1992+ * NET_RX_CN_MOD (moderate congestion)
1993+ * NET_RX_CN_HIGH (high congestion)
1994+ * NET_RX_DROP (packet was dropped)
1995+ *
1996+ */
1997+
1998+int netif_rx(struct sk_buff *skb)
1999+{
2000+ struct softnet_data *queue;
2001+ unsigned long flags;
2002+
2003+ /* if netpoll wants it, pretend we never saw it */
2004+ if (netpoll_rx(skb))
2005+ return NET_RX_DROP;
2006+
2007+ if (!skb->tstamp.off_sec)
2008+ net_timestamp(skb);
2009+
2010+ /*
2011+	 * The code is rearranged so that the path is shortest
2012+	 * when the CPU is congested but still operating.
2013+ */
2014+ local_irq_save(flags);
2015+ queue = &__get_cpu_var(softnet_data);
2016+
2017+ __get_cpu_var(netdev_rx_stat).total++;
2018+ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2019+ if (queue->input_pkt_queue.qlen) {
2020+enqueue:
2021+ dev_hold(skb->dev);
2022+ __skb_queue_tail(&queue->input_pkt_queue, skb);
2023+ local_irq_restore(flags);
2024+ return NET_RX_SUCCESS;
2025+ }
2026+
2027+ netif_rx_schedule(&queue->backlog_dev);
2028+ goto enqueue;
2029+ }
2030+
2031+ __get_cpu_var(netdev_rx_stat).dropped++;
2032+ local_irq_restore(flags);
2033+
2034+ kfree_skb(skb);
2035+ return NET_RX_DROP;
2036+}
2037+
2038+int netif_rx_ni(struct sk_buff *skb)
2039+{
2040+ int err;
2041+
2042+ preempt_disable();
2043+ err = netif_rx(skb);
2044+ if (local_softirq_pending())
2045+ do_softirq();
2046+ preempt_enable();
2047+
2048+ return err;
2049+}
2050+
2051+EXPORT_SYMBOL(netif_rx_ni);
2052+
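As a reminder of the usual caller of netif_rx() (or netif_rx_ni() from process context), here is a hedged sketch of the receive path of a simple non-NAPI driver; dev, data and len stand in for whatever the hardware handed us, and the function name is hypothetical.

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Hypothetical RX handler of a simple non-NAPI driver. */
static void example_rx(struct net_device *dev, const void *data, unsigned int len)
{
	struct sk_buff *skb = dev_alloc_skb(len + 2);

	if (!skb)
		return;				/* the frame is simply dropped */

	skb_reserve(skb, 2);			/* align the IP header */
	memcpy(skb_put(skb, len), data, len);
	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* queue for the per-CPU backlog */
}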
2053+static inline struct net_device *skb_bond(struct sk_buff *skb)
2054+{
2055+ struct net_device *dev = skb->dev;
2056+
2057+ if (dev->master) {
2058+ if (skb_bond_should_drop(skb)) {
2059+ kfree_skb(skb);
2060+ return NULL;
2061+ }
2062+ skb->dev = dev->master;
2063+ }
2064+
2065+ return dev;
2066+}
2067+
2068+static void net_tx_action(struct softirq_action *h)
2069+{
2070+ struct softnet_data *sd = &__get_cpu_var(softnet_data);
2071+
2072+ if (sd->completion_queue) {
2073+ struct sk_buff *clist;
2074+
2075+ local_irq_disable();
2076+ clist = sd->completion_queue;
2077+ sd->completion_queue = NULL;
2078+ local_irq_enable();
2079+
2080+ while (clist) {
2081+ struct sk_buff *skb = clist;
2082+ clist = clist->next;
2083+
2084+ BUG_TRAP(!atomic_read(&skb->users));
2085+ __kfree_skb(skb);
2086+ }
2087+ }
2088+
2089+ if (sd->output_queue) {
2090+ struct net_device *head;
2091+
2092+ local_irq_disable();
2093+ head = sd->output_queue;
2094+ sd->output_queue = NULL;
2095+ local_irq_enable();
2096+
2097+ while (head) {
2098+ struct net_device *dev = head;
2099+ head = head->next_sched;
2100+
2101+ smp_mb__before_clear_bit();
2102+ clear_bit(__LINK_STATE_SCHED, &dev->state);
2103+
2104+ if (spin_trylock(&dev->queue_lock)) {
2105+ qdisc_run(dev);
2106+ spin_unlock(&dev->queue_lock);
2107+ } else {
2108+ netif_schedule(dev);
2109+ }
2110+ }
2111+ }
2112+}
2113+
2114+static __inline__ int deliver_skb(struct sk_buff *skb,
2115+ struct packet_type *pt_prev,
2116+ struct net_device *orig_dev)
2117+{
2118+ atomic_inc(&skb->users);
2119+ return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2120+}
2121+
2122+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2123+int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
2124+struct net_bridge;
2125+struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2126+ unsigned char *addr);
2127+void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);
2128+
2129+static __inline__ int handle_bridge(struct sk_buff **pskb,
2130+ struct packet_type **pt_prev, int *ret,
2131+ struct net_device *orig_dev)
2132+{
2133+ struct net_bridge_port *port;
2134+
2135+ if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
2136+ (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
2137+ return 0;
2138+
2139+ if (*pt_prev) {
2140+ *ret = deliver_skb(*pskb, *pt_prev, orig_dev);
2141+ *pt_prev = NULL;
2142+ }
2143+
2144+ return br_handle_frame_hook(port, pskb);
2145+}
2146+#else
2147+#define handle_bridge(skb, pt_prev, ret, orig_dev) (0)
2148+#endif
2149+
2150+#ifdef CONFIG_NET_CLS_ACT
2151+/* TODO: Maybe we should just force sch_ingress to be compiled in
2152+ * when CONFIG_NET_CLS_ACT is? Otherwise we pay a few useless instructions
2153+ * (a compare and two extra stores) when CONFIG_NET_CLS_ACT is enabled
2154+ * but the ingress scheduler is not.
2155+ * NOTE: This doesn't stop any functionality; if you don't have
2156+ * the ingress scheduler, you just can't add policies on ingress.
2157+ *
2158+ */
2159+static int ing_filter(struct sk_buff *skb)
2160+{
2161+ struct Qdisc *q;
2162+ struct net_device *dev = skb->dev;
2163+ int result = TC_ACT_OK;
2164+
2165+ if (dev->qdisc_ingress) {
2166+ __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
2167+ if (MAX_RED_LOOP < ttl++) {
2168+			printk(KERN_WARNING "Redir loop detected, dropping packet (%d->%d)\n",
2169+ skb->iif, skb->dev->ifindex);
2170+ return TC_ACT_SHOT;
2171+ }
2172+
2173+ skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
2174+
2175+ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
2176+
2177+ spin_lock(&dev->queue_lock);
2178+ if ((q = dev->qdisc_ingress) != NULL)
2179+ result = q->enqueue(skb, q);
2180+ spin_unlock(&dev->queue_lock);
2181+
2182+ }
2183+
2184+ return result;
2185+}
2186+#endif
2187+
2188+int netif_receive_skb(struct sk_buff *skb)
2189+{
2190+ struct packet_type *ptype, *pt_prev;
2191+ struct net_device *orig_dev;
2192+ int ret = NET_RX_DROP;
2193+ __be16 type;
2194+
2195+ /* if we've gotten here through NAPI, check netpoll */
2196+ if (skb->dev->poll && netpoll_rx(skb))
2197+ return NET_RX_DROP;
2198+
2199+ if (!skb->tstamp.off_sec)
2200+ net_timestamp(skb);
2201+
2202+ if (!skb->iif)
2203+ skb->iif = skb->dev->ifindex;
2204+
2205+ orig_dev = skb_bond(skb);
2206+
2207+ if (!orig_dev)
2208+ return NET_RX_DROP;
2209+
2210+ __get_cpu_var(netdev_rx_stat).total++;
2211+
2212+ skb->h.raw = skb->nh.raw = skb->data;
2213+ skb->mac_len = skb->nh.raw - skb->mac.raw;
2214+
2215+ pt_prev = NULL;
2216+
2217+ rcu_read_lock();
2218+
2219+#ifdef CONFIG_NET_CLS_ACT
2220+ if (skb->tc_verd & TC_NCLS) {
2221+ skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2222+ goto ncls;
2223+ }
2224+#endif
2225+
2226+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
2227+ if (!ptype->dev || ptype->dev == skb->dev) {
2228+ if (pt_prev)
2229+ ret = deliver_skb(skb, pt_prev, orig_dev);
2230+ pt_prev = ptype;
2231+ }
2232+ }
2233+
2234+#ifdef CONFIG_NET_CLS_ACT
2235+ if (pt_prev) {
2236+ ret = deliver_skb(skb, pt_prev, orig_dev);
2237+		pt_prev = NULL; /* no one else should process this after */
2238+ } else {
2239+ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2240+ }
2241+
2242+ ret = ing_filter(skb);
2243+
2244+ if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
2245+ kfree_skb(skb);
2246+ goto out;
2247+ }
2248+
2249+ skb->tc_verd = 0;
2250+ncls:
2251+#endif
2252+
2253+ if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
2254+ goto out;
2255+
2256+ type = skb->protocol;
2257+ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
2258+ if (ptype->type == type &&
2259+ (!ptype->dev || ptype->dev == skb->dev)) {
2260+ if (pt_prev)
2261+ ret = deliver_skb(skb, pt_prev, orig_dev);
2262+ pt_prev = ptype;
2263+ }
2264+ }
2265+
2266+ if (pt_prev) {
2267+ ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2268+ } else {
2269+ kfree_skb(skb);
2270+		/* Jamal, now you will not be able to escape explaining
2271+		 * to me how you were going to use this. :-)
2272+ */
2273+ ret = NET_RX_DROP;
2274+ }
2275+
2276+out:
2277+ rcu_read_unlock();
2278+ return ret;
2279+}
2280+
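Protocol taps see these packets by registering a struct packet_type; the sketch below (all names hypothetical) rides the ptype_all/deliver_skb path above, using dev_add_pack()/dev_remove_pack(), both of which this file exports.

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int example_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* inspect or count the packet here ... */
	kfree_skb(skb);		/* drop the reference deliver_skb() took for us */
	return 0;
}

static struct packet_type example_ptype = {
	.type = __constant_htons(ETH_P_ALL),	/* tap every protocol */
	.func = example_pkt_rcv,
	/* .dev left NULL: receive from all interfaces */
};

/* dev_add_pack(&example_ptype) in module init,
 * dev_remove_pack(&example_ptype) in module exit. */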
2281+static int process_backlog(struct net_device *backlog_dev, int *budget)
2282+{
2283+ int work = 0;
2284+ int quota = min(backlog_dev->quota, *budget);
2285+ struct softnet_data *queue = &__get_cpu_var(softnet_data);
2286+ unsigned long start_time = jiffies;
2287+
2288+ backlog_dev->weight = weight_p;
2289+ for (;;) {
2290+ struct sk_buff *skb;
2291+ struct net_device *dev;
2292+
2293+ local_irq_disable();
2294+ skb = __skb_dequeue(&queue->input_pkt_queue);
2295+ if (!skb)
2296+ goto job_done;
2297+ local_irq_enable();
2298+
2299+ dev = skb->dev;
2300+
2301+ netif_receive_skb(skb);
2302+
2303+ dev_put(dev);
2304+
2305+ work++;
2306+
2307+ if (work >= quota || jiffies - start_time > 1)
2308+ break;
2309+
2310+ }
2311+
2312+ backlog_dev->quota -= work;
2313+ *budget -= work;
2314+ return -1;
2315+
2316+job_done:
2317+ backlog_dev->quota -= work;
2318+ *budget -= work;
2319+
2320+ list_del(&backlog_dev->poll_list);
2321+ smp_mb__before_clear_bit();
2322+ netif_poll_enable(backlog_dev);
2323+
2324+ local_irq_enable();
2325+ return 0;
2326+}
2327+
2328+static void net_rx_action(struct softirq_action *h)
2329+{
2330+ struct softnet_data *queue = &__get_cpu_var(softnet_data);
2331+ unsigned long start_time = jiffies;
2332+ int budget = netdev_budget;
2333+ void *have;
2334+
2335+ local_irq_disable();
2336+
2337+ while (!list_empty(&queue->poll_list)) {
2338+ struct net_device *dev;
2339+
2340+ if (budget <= 0 || jiffies - start_time > 1)
2341+ goto softnet_break;
2342+
2343+ local_irq_enable();
2344+
2345+ dev = list_entry(queue->poll_list.next,
2346+ struct net_device, poll_list);
2347+ have = netpoll_poll_lock(dev);
2348+
2349+ if (dev->quota <= 0 || dev->poll(dev, &budget)) {
2350+ netpoll_poll_unlock(have);
2351+ local_irq_disable();
2352+ list_move_tail(&dev->poll_list, &queue->poll_list);
2353+ if (dev->quota < 0)
2354+ dev->quota += dev->weight;
2355+ else
2356+ dev->quota = dev->weight;
2357+ } else {
2358+ netpoll_poll_unlock(have);
2359+ dev_put(dev);
2360+ local_irq_disable();
2361+ }
2362+ }
2363+out:
2364+#ifdef CONFIG_NET_DMA
2365+ /*
2366+ * There may not be any more sk_buffs coming right now, so push
2367+ * any pending DMA copies to hardware
2368+ */
2369+ if (net_dma_client) {
2370+ struct dma_chan *chan;
2371+ rcu_read_lock();
2372+ list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
2373+ dma_async_memcpy_issue_pending(chan);
2374+ rcu_read_unlock();
2375+ }
2376+#endif
2377+ local_irq_enable();
2378+ return;
2379+
2380+softnet_break:
2381+ __get_cpu_var(netdev_rx_stat).time_squeeze++;
2382+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2383+ goto out;
2384+}
2385+
2386+static gifconf_func_t * gifconf_list [NPROTO];
2387+
2388+/**
2389+ * register_gifconf - register a SIOCGIF handler
2390+ * @family: Address family
2391+ * @gifconf: Function handler
2392+ *
2393+ * Register protocol dependent address dumping routines. The handler
2394+ * that is passed must not be freed or reused until it has been replaced
2395+ * by another handler.
2396+ */
2397+int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2398+{
2399+ if (family >= NPROTO)
2400+ return -EINVAL;
2401+ gifconf_list[family] = gifconf;
2402+ return 0;
2403+}
2404+
2405+
2406+/*
2407+ * Map an interface index to its name (SIOCGIFNAME)
2408+ */
2409+
2410+/*
2411+ * We need this ioctl for efficient implementation of the
2412+ * if_indextoname() function required by the IPv6 API. Without
2413+ * it, we would have to search all the interfaces to find a
2414+ * match. --pb
2415+ */
2416+
2417+static int dev_ifname(struct ifreq __user *arg)
2418+{
2419+ struct net_device *dev;
2420+ struct ifreq ifr;
2421+
2422+ /*
2423+ * Fetch the caller's info block.
2424+ */
2425+
2426+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2427+ return -EFAULT;
2428+
2429+ read_lock(&dev_base_lock);
2430+ dev = __dev_get_by_index(ifr.ifr_ifindex);
2431+ if (!dev) {
2432+ read_unlock(&dev_base_lock);
2433+ return -ENODEV;
2434+ }
2435+
2436+ strcpy(ifr.ifr_name, dev->name);
2437+ read_unlock(&dev_base_lock);
2438+
2439+ if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2440+ return -EFAULT;
2441+ return 0;
2442+}
2443+
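From user space this ioctl is what glibc's if_indextoname() builds on; a minimal sketch, assuming interface index 2 exists:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);	/* any socket will do */

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_ifindex = 2;				/* example index */
	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
		printf("ifindex 2 is %s\n", ifr.ifr_name);
	close(fd);
	return 0;
}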
2444+/*
2445+ * Perform a SIOCGIFCONF call. This structure will change
2446+ * size eventually, and there is nothing I can do about it.
2447+ * Thus we will need a 'compatibility mode'.
2448+ */
2449+
2450+static int dev_ifconf(char __user *arg)
2451+{
2452+ struct ifconf ifc;
2453+ struct net_device *dev;
2454+ char __user *pos;
2455+ int len;
2456+ int total;
2457+ int i;
2458+
2459+ /*
2460+ * Fetch the caller's info block.
2461+ */
2462+
2463+ if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2464+ return -EFAULT;
2465+
2466+ pos = ifc.ifc_buf;
2467+ len = ifc.ifc_len;
2468+
2469+ /*
2470+ * Loop over the interfaces, and write an info block for each.
2471+ */
2472+
2473+ total = 0;
2474+ for (dev = dev_base; dev; dev = dev->next) {
2475+ for (i = 0; i < NPROTO; i++) {
2476+ if (gifconf_list[i]) {
2477+ int done;
2478+ if (!pos)
2479+ done = gifconf_list[i](dev, NULL, 0);
2480+ else
2481+ done = gifconf_list[i](dev, pos + total,
2482+ len - total);
2483+ if (done < 0)
2484+ return -EFAULT;
2485+ total += done;
2486+ }
2487+ }
2488+ }
2489+
2490+ /*
2491+ * All done. Write the updated control block back to the caller.
2492+ */
2493+ ifc.ifc_len = total;
2494+
2495+ /*
2496+ * Both BSD and Solaris return 0 here, so we do too.
2497+ */
2498+ return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2499+}
2500+
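A matching user-space sketch for SIOCGIFCONF; note that it only reports interfaces for which a registered gifconf handler (IPv4 here) has addresses to dump, and the fixed-size array is purely for illustration.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int main(void)
{
	struct ifreq reqs[32];			/* big enough for this example */
	struct ifconf ifc;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int i, n;

	if (fd < 0)
		return 1;
	memset(&ifc, 0, sizeof(ifc));
	ifc.ifc_len = sizeof(reqs);
	ifc.ifc_req = reqs;
	if (ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
		n = ifc.ifc_len / sizeof(struct ifreq);
		for (i = 0; i < n; i++)
			printf("%s\n", reqs[i].ifr_name);
	}
	close(fd);
	return 0;
}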
2501+#ifdef CONFIG_PROC_FS
2502+/*
2503+ * This is invoked by the /proc filesystem handler to display a device
2504+ * in detail.
2505+ */
2506+static __inline__ struct net_device *dev_get_idx(loff_t pos)
2507+{
2508+ struct net_device *dev;
2509+ loff_t i;
2510+
2511+ for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
2512+
2513+ return i == pos ? dev : NULL;
2514+}
2515+
2516+void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2517+{
2518+ read_lock(&dev_base_lock);
2519+ return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
2520+}
2521+
2522+void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2523+{
2524+ ++*pos;
2525+ return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
2526+}
2527+
2528+void dev_seq_stop(struct seq_file *seq, void *v)
2529+{
2530+ read_unlock(&dev_base_lock);
2531+}
2532+
2533+static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2534+{
2535+ if (dev->get_stats) {
2536+ struct net_device_stats *stats = dev->get_stats(dev);
2537+
2538+ seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2539+ "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2540+ dev->name, stats->rx_bytes, stats->rx_packets,
2541+ stats->rx_errors,
2542+ stats->rx_dropped + stats->rx_missed_errors,
2543+ stats->rx_fifo_errors,
2544+ stats->rx_length_errors + stats->rx_over_errors +
2545+ stats->rx_crc_errors + stats->rx_frame_errors,
2546+ stats->rx_compressed, stats->multicast,
2547+ stats->tx_bytes, stats->tx_packets,
2548+ stats->tx_errors, stats->tx_dropped,
2549+ stats->tx_fifo_errors, stats->collisions,
2550+ stats->tx_carrier_errors +
2551+ stats->tx_aborted_errors +
2552+ stats->tx_window_errors +
2553+ stats->tx_heartbeat_errors,
2554+ stats->tx_compressed);
2555+ } else
2556+ seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2557+}
2558+
2559+/*
2560+ * Called from the PROCfs module. This now uses the new arbitrary sized
2561+ * /proc/net interface to create /proc/net/dev
2562+ */
2563+static int dev_seq_show(struct seq_file *seq, void *v)
2564+{
2565+ if (v == SEQ_START_TOKEN)
2566+ seq_puts(seq, "Inter-| Receive "
2567+ " | Transmit\n"
2568+ " face |bytes packets errs drop fifo frame "
2569+ "compressed multicast|bytes packets errs "
2570+ "drop fifo colls carrier compressed\n");
2571+ else
2572+ dev_seq_printf_stats(seq, v);
2573+ return 0;
2574+}
2575+
2576+static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2577+{
2578+ struct netif_rx_stats *rc = NULL;
2579+
2580+ while (*pos < NR_CPUS)
2581+ if (cpu_online(*pos)) {
2582+ rc = &per_cpu(netdev_rx_stat, *pos);
2583+ break;
2584+ } else
2585+ ++*pos;
2586+ return rc;
2587+}
2588+
2589+static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2590+{
2591+ return softnet_get_online(pos);
2592+}
2593+
2594+static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2595+{
2596+ ++*pos;
2597+ return softnet_get_online(pos);
2598+}
2599+
2600+static void softnet_seq_stop(struct seq_file *seq, void *v)
2601+{
2602+}
2603+
2604+static int softnet_seq_show(struct seq_file *seq, void *v)
2605+{
2606+ struct netif_rx_stats *s = v;
2607+
2608+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2609+ s->total, s->dropped, s->time_squeeze, 0,
2610+ 0, 0, 0, 0, /* was fastroute */
2611+ s->cpu_collision );
2612+ return 0;
2613+}
2614+
2615+static struct seq_operations dev_seq_ops = {
2616+ .start = dev_seq_start,
2617+ .next = dev_seq_next,
2618+ .stop = dev_seq_stop,
2619+ .show = dev_seq_show,
2620+};
2621+
2622+static int dev_seq_open(struct inode *inode, struct file *file)
2623+{
2624+ return seq_open(file, &dev_seq_ops);
2625+}
2626+
2627+static const struct file_operations dev_seq_fops = {
2628+ .owner = THIS_MODULE,
2629+ .open = dev_seq_open,
2630+ .read = seq_read,
2631+ .llseek = seq_lseek,
2632+ .release = seq_release,
2633+};
2634+
2635+static struct seq_operations softnet_seq_ops = {
2636+ .start = softnet_seq_start,
2637+ .next = softnet_seq_next,
2638+ .stop = softnet_seq_stop,
2639+ .show = softnet_seq_show,
2640+};
2641+
2642+static int softnet_seq_open(struct inode *inode, struct file *file)
2643+{
2644+ return seq_open(file, &softnet_seq_ops);
2645+}
2646+
2647+static const struct file_operations softnet_seq_fops = {
2648+ .owner = THIS_MODULE,
2649+ .open = softnet_seq_open,
2650+ .read = seq_read,
2651+ .llseek = seq_lseek,
2652+ .release = seq_release,
2653+};
2654+
2655+#ifdef CONFIG_WIRELESS_EXT
2656+extern int wireless_proc_init(void);
2657+#else
2658+#define wireless_proc_init() 0
2659+#endif
2660+
2661+static int __init dev_proc_init(void)
2662+{
2663+ int rc = -ENOMEM;
2664+
2665+ if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2666+ goto out;
2667+ if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2668+ goto out_dev;
2669+ if (wireless_proc_init())
2670+ goto out_softnet;
2671+ rc = 0;
2672+out:
2673+ return rc;
2674+out_softnet:
2675+ proc_net_remove("softnet_stat");
2676+out_dev:
2677+ proc_net_remove("dev");
2678+ goto out;
2679+}
2680+#else
2681+#define dev_proc_init() 0
2682+#endif /* CONFIG_PROC_FS */
2683+
2684+
2685+/**
2686+ * netdev_set_master - set up master/slave pair
2687+ * @slave: slave device
2688+ * @master: new master device
2689+ *
2690+ * Changes the master device of the slave. Pass %NULL to break the
2691+ * bonding. The caller must hold the RTNL semaphore. On a failure
2692+ * a negative errno code is returned. On success the reference counts
2693+ * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2694+ * function returns zero.
2695+ */
2696+int netdev_set_master(struct net_device *slave, struct net_device *master)
2697+{
2698+ struct net_device *old = slave->master;
2699+
2700+ ASSERT_RTNL();
2701+
2702+ if (master) {
2703+ if (old)
2704+ return -EBUSY;
2705+ dev_hold(master);
2706+ }
2707+
2708+ slave->master = master;
2709+
2710+ synchronize_net();
2711+
2712+ if (old)
2713+ dev_put(old);
2714+
2715+ if (master)
2716+ slave->flags |= IFF_SLAVE;
2717+ else
2718+ slave->flags &= ~IFF_SLAVE;
2719+
2720+ rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2721+ return 0;
2722+}
2723+
2724+/**
2725+ * dev_set_promiscuity - update promiscuity count on a device
2726+ * @dev: device
2727+ * @inc: modifier
2728+ *
2729+ * Add or remove promiscuity from a device. While the count in the device
2730+ * remains above zero the interface remains promiscuous. Once it hits zero
2731+ * the device reverts back to normal filtering operation. A negative inc
2732+ * value is used to drop promiscuity on the device.
2733+ */
2734+void dev_set_promiscuity(struct net_device *dev, int inc)
2735+{
2736+ unsigned short old_flags = dev->flags;
2737+
2738+ if ((dev->promiscuity += inc) == 0)
2739+ dev->flags &= ~IFF_PROMISC;
2740+ else
2741+ dev->flags |= IFF_PROMISC;
2742+ if (dev->flags != old_flags) {
2743+ dev_mc_upload(dev);
2744+ printk(KERN_INFO "device %s %s promiscuous mode\n",
2745+ dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2746+ "left");
2747+ audit_log(current->audit_context, GFP_ATOMIC,
2748+ AUDIT_ANOM_PROMISCUOUS,
2749+ "dev=%s prom=%d old_prom=%d auid=%u",
2750+ dev->name, (dev->flags & IFF_PROMISC),
2751+ (old_flags & IFF_PROMISC),
2752+ audit_get_loginuid(current->audit_context));
2753+ }
2754+}
2755+
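A hedged sketch of how a hypothetical in-kernel capture component might take and release a promiscuity reference; wrapping the calls in rtnl_lock()/rtnl_unlock() mirrors how the ioctl paths in this file reach dev_set_promiscuity() and is an assumption on our part.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void example_capture_start(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, 1);	/* one more reason to stay promiscuous */
	rtnl_unlock();
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);	/* drop our reference again */
	rtnl_unlock();
}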
2756+/**
2757+ * dev_set_allmulti - update allmulti count on a device
2758+ * @dev: device
2759+ * @inc: modifier
2760+ *
2761+ * Add or remove reception of all multicast frames to a device. While the
2762+ * count in the device remains above zero the interface remains listening
2763+ * to all interfaces. Once it hits zero the device reverts back to normal
2764+ * filtering operation. A negative @inc value is used to drop the counter
2765+ * when releasing a resource needing all multicasts.
2766+ */
2767+
2768+void dev_set_allmulti(struct net_device *dev, int inc)
2769+{
2770+ unsigned short old_flags = dev->flags;
2771+
2772+ dev->flags |= IFF_ALLMULTI;
2773+ if ((dev->allmulti += inc) == 0)
2774+ dev->flags &= ~IFF_ALLMULTI;
2775+ if (dev->flags ^ old_flags)
2776+ dev_mc_upload(dev);
2777+}
2778+
2779+unsigned dev_get_flags(const struct net_device *dev)
2780+{
2781+ unsigned flags;
2782+
2783+ flags = (dev->flags & ~(IFF_PROMISC |
2784+ IFF_ALLMULTI |
2785+ IFF_RUNNING |
2786+ IFF_LOWER_UP |
2787+ IFF_DORMANT)) |
2788+ (dev->gflags & (IFF_PROMISC |
2789+ IFF_ALLMULTI));
2790+
2791+ if (netif_running(dev)) {
2792+ if (netif_oper_up(dev))
2793+ flags |= IFF_RUNNING;
2794+ if (netif_carrier_ok(dev))
2795+ flags |= IFF_LOWER_UP;
2796+ if (netif_dormant(dev))
2797+ flags |= IFF_DORMANT;
2798+ }
2799+
2800+ return flags;
2801+}
2802+
2803+int dev_change_flags(struct net_device *dev, unsigned flags)
2804+{
2805+ int ret;
2806+ int old_flags = dev->flags;
2807+
2808+ /*
2809+ * Set the flags on our device.
2810+ */
2811+
2812+ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2813+ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2814+ IFF_AUTOMEDIA)) |
2815+ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2816+ IFF_ALLMULTI));
2817+
2818+ /*
2819+ * Load in the correct multicast list now the flags have changed.
2820+ */
2821+
2822+ dev_mc_upload(dev);
2823+
2824+ /*
2825+	 *	Have we downed the interface? We handle IFF_UP ourselves
2826+ * according to user attempts to set it, rather than blindly
2827+ * setting it.
2828+ */
2829+
2830+ ret = 0;
2831+ if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
2832+ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2833+
2834+ if (!ret)
2835+ dev_mc_upload(dev);
2836+ }
2837+
2838+ if (dev->flags & IFF_UP &&
2839+ ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2840+ IFF_VOLATILE)))
2841+ raw_notifier_call_chain(&netdev_chain,
2842+ NETDEV_CHANGE, dev);
2843+
2844+ if ((flags ^ dev->gflags) & IFF_PROMISC) {
2845+ int inc = (flags & IFF_PROMISC) ? +1 : -1;
2846+ dev->gflags ^= IFF_PROMISC;
2847+ dev_set_promiscuity(dev, inc);
2848+ }
2849+
2850+ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2851+	   is important. Some (broken) drivers set IFF_PROMISC when
2852+	   IFF_ALLMULTI is requested, without asking us and without reporting it.
2853+ */
2854+ if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2855+ int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2856+ dev->gflags ^= IFF_ALLMULTI;
2857+ dev_set_allmulti(dev, inc);
2858+ }
2859+
2860+ if (old_flags ^ dev->flags)
2861+ rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2862+
2863+ return ret;
2864+}
2865+
2866+int dev_set_mtu(struct net_device *dev, int new_mtu)
2867+{
2868+ int err;
2869+
2870+ if (new_mtu == dev->mtu)
2871+ return 0;
2872+
2873+ /* MTU must be positive. */
2874+ if (new_mtu < 0)
2875+ return -EINVAL;
2876+
2877+ if (!netif_device_present(dev))
2878+ return -ENODEV;
2879+
2880+ err = 0;
2881+ if (dev->change_mtu)
2882+ err = dev->change_mtu(dev, new_mtu);
2883+ else
2884+ dev->mtu = new_mtu;
2885+ if (!err && dev->flags & IFF_UP)
2886+ raw_notifier_call_chain(&netdev_chain,
2887+ NETDEV_CHANGEMTU, dev);
2888+ return err;
2889+}
2890+
2891+int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2892+{
2893+ int err;
2894+
2895+ if (!dev->set_mac_address)
2896+ return -EOPNOTSUPP;
2897+ if (sa->sa_family != dev->type)
2898+ return -EINVAL;
2899+ if (!netif_device_present(dev))
2900+ return -ENODEV;
2901+ err = dev->set_mac_address(dev, sa);
2902+ if (!err)
2903+ raw_notifier_call_chain(&netdev_chain,
2904+ NETDEV_CHANGEADDR, dev);
2905+ return err;
2906+}
2907+
2908+/*
2909+ * Perform the SIOCxIFxxx calls.
2910+ */
2911+static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2912+{
2913+ int err;
2914+ struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2915+
2916+ if (!dev)
2917+ return -ENODEV;
2918+
2919+ switch (cmd) {
2920+ case SIOCGIFFLAGS: /* Get interface flags */
2921+ ifr->ifr_flags = dev_get_flags(dev);
2922+ return 0;
2923+
2924+ case SIOCSIFFLAGS: /* Set interface flags */
2925+ return dev_change_flags(dev, ifr->ifr_flags);
2926+
2927+ case SIOCGIFMETRIC: /* Get the metric on the interface
2928+ (currently unused) */
2929+ ifr->ifr_metric = 0;
2930+ return 0;
2931+
2932+ case SIOCSIFMETRIC: /* Set the metric on the interface
2933+ (currently unused) */
2934+ return -EOPNOTSUPP;
2935+
2936+ case SIOCGIFMTU: /* Get the MTU of a device */
2937+ ifr->ifr_mtu = dev->mtu;
2938+ return 0;
2939+
2940+ case SIOCSIFMTU: /* Set the MTU of a device */
2941+ return dev_set_mtu(dev, ifr->ifr_mtu);
2942+
2943+ case SIOCGIFHWADDR:
2944+ if (!dev->addr_len)
2945+ memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2946+ else
2947+ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2948+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2949+ ifr->ifr_hwaddr.sa_family = dev->type;
2950+ return 0;
2951+
2952+ case SIOCSIFHWADDR:
2953+ return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2954+
2955+ case SIOCSIFHWBROADCAST:
2956+ if (ifr->ifr_hwaddr.sa_family != dev->type)
2957+ return -EINVAL;
2958+ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2959+ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2960+ raw_notifier_call_chain(&netdev_chain,
2961+ NETDEV_CHANGEADDR, dev);
2962+ return 0;
2963+
2964+ case SIOCGIFMAP:
2965+ ifr->ifr_map.mem_start = dev->mem_start;
2966+ ifr->ifr_map.mem_end = dev->mem_end;
2967+ ifr->ifr_map.base_addr = dev->base_addr;
2968+ ifr->ifr_map.irq = dev->irq;
2969+ ifr->ifr_map.dma = dev->dma;
2970+ ifr->ifr_map.port = dev->if_port;
2971+ return 0;
2972+
2973+ case SIOCSIFMAP:
2974+ if (dev->set_config) {
2975+ if (!netif_device_present(dev))
2976+ return -ENODEV;
2977+ return dev->set_config(dev, &ifr->ifr_map);
2978+ }
2979+ return -EOPNOTSUPP;
2980+
2981+ case SIOCADDMULTI:
2982+ if (!dev->set_multicast_list ||
2983+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2984+ return -EINVAL;
2985+ if (!netif_device_present(dev))
2986+ return -ENODEV;
2987+ return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2988+ dev->addr_len, 1);
2989+
2990+ case SIOCDELMULTI:
2991+ if (!dev->set_multicast_list ||
2992+ ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2993+ return -EINVAL;
2994+ if (!netif_device_present(dev))
2995+ return -ENODEV;
2996+ return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2997+ dev->addr_len, 1);
2998+
2999+ case SIOCGIFINDEX:
3000+ ifr->ifr_ifindex = dev->ifindex;
3001+ return 0;
3002+
3003+ case SIOCGIFTXQLEN:
3004+ ifr->ifr_qlen = dev->tx_queue_len;
3005+ return 0;
3006+
3007+ case SIOCSIFTXQLEN:
3008+ if (ifr->ifr_qlen < 0)
3009+ return -EINVAL;
3010+ dev->tx_queue_len = ifr->ifr_qlen;
3011+ return 0;
3012+
3013+ case SIOCSIFNAME:
3014+ ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3015+ return dev_change_name(dev, ifr->ifr_newname);
3016+
3017+ /*
3018+ * Unknown or private ioctl
3019+ */
3020+
3021+ default:
3022+ if ((cmd >= SIOCDEVPRIVATE &&
3023+ cmd <= SIOCDEVPRIVATE + 15) ||
3024+ cmd == SIOCBONDENSLAVE ||
3025+ cmd == SIOCBONDRELEASE ||
3026+ cmd == SIOCBONDSETHWADDR ||
3027+ cmd == SIOCBONDSLAVEINFOQUERY ||
3028+ cmd == SIOCBONDINFOQUERY ||
3029+ cmd == SIOCBONDCHANGEACTIVE ||
3030+ cmd == SIOCGMIIPHY ||
3031+ cmd == SIOCGMIIREG ||
3032+ cmd == SIOCSMIIREG ||
3033+ cmd == SIOCBRADDIF ||
3034+ cmd == SIOCBRDELIF ||
3035+ cmd == SIOCWANDEV) {
3036+ err = -EOPNOTSUPP;
3037+ if (dev->do_ioctl) {
3038+ if (netif_device_present(dev))
3039+ err = dev->do_ioctl(dev, ifr,
3040+ cmd);
3041+ else
3042+ err = -ENODEV;
3043+ }
3044+ } else
3045+ err = -EINVAL;
3046+
3047+ }
3048+ return err;
3049+}
3050+
3051+/*
3052+ * This function handles all "interface"-type I/O control requests. The actual
3053+ * 'doing' part of this is dev_ifsioc above.
3054+ */
3055+
3056+/**
3057+ * dev_ioctl - network device ioctl
3058+ * @cmd: command to issue
3059+ * @arg: pointer to a struct ifreq in user space
3060+ *
3061+ * Issue ioctl functions to devices. This is normally called by the
3062+ * user space syscall interfaces but can sometimes be useful for
3063+ * other purposes. The return value is the return from the syscall if
3064+ * positive or a negative errno code on error.
3065+ */
3066+
3067+int dev_ioctl(unsigned int cmd, void __user *arg)
3068+{
3069+ struct ifreq ifr;
3070+ int ret;
3071+ char *colon;
3072+
3073+ /* One special case: SIOCGIFCONF takes ifconf argument
3074+ and requires shared lock, because it sleeps writing
3075+ to user space.
3076+ */
3077+
3078+ if (cmd == SIOCGIFCONF) {
3079+ rtnl_lock();
3080+ ret = dev_ifconf((char __user *) arg);
3081+ rtnl_unlock();
3082+ return ret;
3083+ }
3084+ if (cmd == SIOCGIFNAME)
3085+ return dev_ifname((struct ifreq __user *)arg);
3086+
3087+ if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3088+ return -EFAULT;
3089+
3090+ ifr.ifr_name[IFNAMSIZ-1] = 0;
3091+
3092+ colon = strchr(ifr.ifr_name, ':');
3093+ if (colon)
3094+ *colon = 0;
3095+
3096+ /*
3097+ * See which interface the caller is talking about.
3098+ */
3099+
3100+ switch (cmd) {
3101+ /*
3102+ * These ioctl calls:
3103+ * - can be done by all.
3104+ * - atomic and do not require locking.
3105+ * - return a value
3106+ */
3107+ case SIOCGIFFLAGS:
3108+ case SIOCGIFMETRIC:
3109+ case SIOCGIFMTU:
3110+ case SIOCGIFHWADDR:
3111+ case SIOCGIFSLAVE:
3112+ case SIOCGIFMAP:
3113+ case SIOCGIFINDEX:
3114+ case SIOCGIFTXQLEN:
3115+ dev_load(ifr.ifr_name);
3116+ read_lock(&dev_base_lock);
3117+ ret = dev_ifsioc(&ifr, cmd);
3118+ read_unlock(&dev_base_lock);
3119+ if (!ret) {
3120+ if (colon)
3121+ *colon = ':';
3122+ if (copy_to_user(arg, &ifr,
3123+ sizeof(struct ifreq)))
3124+ ret = -EFAULT;
3125+ }
3126+ return ret;
3127+
3128+ case SIOCETHTOOL:
3129+ dev_load(ifr.ifr_name);
3130+ rtnl_lock();
3131+ ret = dev_ethtool(&ifr);
3132+ rtnl_unlock();
3133+ if (!ret) {
3134+ if (colon)
3135+ *colon = ':';
3136+ if (copy_to_user(arg, &ifr,
3137+ sizeof(struct ifreq)))
3138+ ret = -EFAULT;
3139+ }
3140+ return ret;
3141+
3142+ /*
3143+ * These ioctl calls:
3144+ * - require superuser power.
3145+ * - require strict serialization.
3146+ * - return a value
3147+ */
3148+ case SIOCGMIIPHY:
3149+ case SIOCGMIIREG:
3150+ case SIOCSIFNAME:
3151+ if (!capable(CAP_NET_ADMIN))
3152+ return -EPERM;
3153+ dev_load(ifr.ifr_name);
3154+ rtnl_lock();
3155+ ret = dev_ifsioc(&ifr, cmd);
3156+ rtnl_unlock();
3157+ if (!ret) {
3158+ if (colon)
3159+ *colon = ':';
3160+ if (copy_to_user(arg, &ifr,
3161+ sizeof(struct ifreq)))
3162+ ret = -EFAULT;
3163+ }
3164+ return ret;
3165+
3166+ /*
3167+ * These ioctl calls:
3168+ * - require superuser power.
3169+ * - require strict serialization.
3170+ * - do not return a value
3171+ */
3172+ case SIOCSIFFLAGS:
3173+ case SIOCSIFMETRIC:
3174+ case SIOCSIFMTU:
3175+ case SIOCSIFMAP:
3176+ case SIOCSIFHWADDR:
3177+ case SIOCSIFSLAVE:
3178+ case SIOCADDMULTI:
3179+ case SIOCDELMULTI:
3180+ case SIOCSIFHWBROADCAST:
3181+ case SIOCSIFTXQLEN:
3182+ case SIOCSMIIREG:
3183+ case SIOCBONDENSLAVE:
3184+ case SIOCBONDRELEASE:
3185+ case SIOCBONDSETHWADDR:
3186+ case SIOCBONDCHANGEACTIVE:
3187+ case SIOCBRADDIF:
3188+ case SIOCBRDELIF:
3189+ if (!capable(CAP_NET_ADMIN))
3190+ return -EPERM;
3191+ /* fall through */
3192+ case SIOCBONDSLAVEINFOQUERY:
3193+ case SIOCBONDINFOQUERY:
3194+ dev_load(ifr.ifr_name);
3195+ rtnl_lock();
3196+ ret = dev_ifsioc(&ifr, cmd);
3197+ rtnl_unlock();
3198+ return ret;
3199+
3200+ case SIOCGIFMEM:
3201+ /* Get the per device memory space. We can add this but
3202+ * currently do not support it */
3203+ case SIOCSIFMEM:
3204+ /* Set the per device memory buffer space.
3205+ * Not applicable in our case */
3206+ case SIOCSIFLINK:
3207+ return -EINVAL;
3208+
3209+ /*
3210+ * Unknown or private ioctl.
3211+ */
3212+ default:
3213+ if (cmd == SIOCWANDEV ||
3214+ (cmd >= SIOCDEVPRIVATE &&
3215+ cmd <= SIOCDEVPRIVATE + 15)) {
3216+ dev_load(ifr.ifr_name);
3217+ rtnl_lock();
3218+ ret = dev_ifsioc(&ifr, cmd);
3219+ rtnl_unlock();
3220+ if (!ret && copy_to_user(arg, &ifr,
3221+ sizeof(struct ifreq)))
3222+ ret = -EFAULT;
3223+ return ret;
3224+ }
3225+#ifdef CONFIG_WIRELESS_EXT
3226+ /* Take care of Wireless Extensions */
3227+ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
3228+ /* If command is `set a parameter', or
3229+ * `get the encoding parameters', check if
3230+ * the user has the right to do it */
3231+ if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE
3232+ || cmd == SIOCGIWENCODEEXT) {
3233+ if (!capable(CAP_NET_ADMIN))
3234+ return -EPERM;
3235+ }
3236+ dev_load(ifr.ifr_name);
3237+ rtnl_lock();
3238+ /* Follow me in net/core/wireless.c */
3239+ ret = wireless_process_ioctl(&ifr, cmd);
3240+ rtnl_unlock();
3241+ if (IW_IS_GET(cmd) &&
3242+ copy_to_user(arg, &ifr,
3243+ sizeof(struct ifreq)))
3244+ ret = -EFAULT;
3245+ return ret;
3246+ }
3247+#endif /* CONFIG_WIRELESS_EXT */
3248+ return -EINVAL;
3249+ }
3250+}
3251+
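For orientation, a user-space sketch that travels through dev_ioctl() -> dev_ifsioc() -> dev_change_flags(): roughly what `ifconfig eth0 up` does. "eth0" is a placeholder and CAP_NET_ADMIN is required for the SIOCSIFFLAGS step.

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

/* Bring "eth0" administratively up. */
int example_if_up(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int rc = -1;

	if (fd < 0)
		return rc;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFFLAGS, &ifr) == 0) {	/* read current flags */
		ifr.ifr_flags |= IFF_UP;
		rc = ioctl(fd, SIOCSIFFLAGS, &ifr);	/* write them back */
	}
	close(fd);
	return rc;
}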
3252+
3253+/**
3254+ * dev_new_index - allocate an ifindex
3255+ *
3256+ * Returns a suitable unique value for a new device interface
3257+ * number. The caller must hold the rtnl semaphore or the
3258+ * dev_base_lock to be sure it remains unique.
3259+ */
3260+static int dev_new_index(void)
3261+{
3262+ static int ifindex;
3263+ for (;;) {
3264+ if (++ifindex <= 0)
3265+ ifindex = 1;
3266+ if (!__dev_get_by_index(ifindex))
3267+ return ifindex;
3268+ }
3269+}
3270+
3271+static int dev_boot_phase = 1;
3272+
3273+/* Delayed registration/unregistration */
3274+static DEFINE_SPINLOCK(net_todo_list_lock);
3275+static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
3276+
3277+static inline void net_set_todo(struct net_device *dev)
3278+{
3279+ spin_lock(&net_todo_list_lock);
3280+ list_add_tail(&dev->todo_list, &net_todo_list);
3281+ spin_unlock(&net_todo_list_lock);
3282+}
3283+
3284+/**
3285+ * register_netdevice - register a network device
3286+ * @dev: device to register
3287+ *
3288+ * Take a completed network device structure and add it to the kernel
3289+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3290+ * chain. 0 is returned on success. A negative errno code is returned
3291+ * on a failure to set up the device, or if the name is a duplicate.
3292+ *
3293+ * Callers must hold the rtnl semaphore. You may want
3294+ * register_netdev() instead of this.
3295+ *
3296+ * BUGS:
3297+ * The locking appears insufficient to guarantee two parallel registers
3298+ * will not get the same name.
3299+ */
3300+
3301+int register_netdevice(struct net_device *dev)
3302+{
3303+ struct hlist_head *head;
3304+ struct hlist_node *p;
3305+ int ret;
3306+
3307+ BUG_ON(dev_boot_phase);
3308+ ASSERT_RTNL();
3309+
3310+ might_sleep();
3311+
3312+	/* When net_devices are persistent, this will be fatal. */
3313+ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3314+
3315+ spin_lock_init(&dev->queue_lock);
3316+ spin_lock_init(&dev->_xmit_lock);
3317+ dev->xmit_lock_owner = -1;
3318+#ifdef CONFIG_NET_CLS_ACT
3319+ spin_lock_init(&dev->ingress_lock);
3320+#endif
3321+
3322+ dev->iflink = -1;
3323+
3324+ /* Init, if this function is available */
3325+ if (dev->init) {
3326+ ret = dev->init(dev);
3327+ if (ret) {
3328+ if (ret > 0)
3329+ ret = -EIO;
3330+ goto out;
3331+ }
3332+ }
3333+
3334+ if (!dev_valid_name(dev->name)) {
3335+ ret = -EINVAL;
3336+ goto out;
3337+ }
3338+
3339+ dev->ifindex = dev_new_index();
3340+ if (dev->iflink == -1)
3341+ dev->iflink = dev->ifindex;
3342+
3343+ /* Check for existence of name */
3344+ head = dev_name_hash(dev->name);
3345+ hlist_for_each(p, head) {
3346+ struct net_device *d
3347+ = hlist_entry(p, struct net_device, name_hlist);
3348+ if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3349+ ret = -EEXIST;
3350+ goto out;
3351+ }
3352+ }
3353+
3354+ /* Fix illegal SG+CSUM combinations. */
3355+ if ((dev->features & NETIF_F_SG) &&
3356+ !(dev->features & NETIF_F_ALL_CSUM)) {
3357+ printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3358+ dev->name);
3359+ dev->features &= ~NETIF_F_SG;
3360+ }
3361+
3362+ /* TSO requires that SG is present as well. */
3363+ if ((dev->features & NETIF_F_TSO) &&
3364+ !(dev->features & NETIF_F_SG)) {
3365+ printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3366+ dev->name);
3367+ dev->features &= ~NETIF_F_TSO;
3368+ }
3369+ if (dev->features & NETIF_F_UFO) {
3370+ if (!(dev->features & NETIF_F_HW_CSUM)) {
3371+ printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3372+ "NETIF_F_HW_CSUM feature.\n",
3373+ dev->name);
3374+ dev->features &= ~NETIF_F_UFO;
3375+ }
3376+ if (!(dev->features & NETIF_F_SG)) {
3377+ printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3378+ "NETIF_F_SG feature.\n",
3379+ dev->name);
3380+ dev->features &= ~NETIF_F_UFO;
3381+ }
3382+ }
3383+
3384+ /*
3385+	 *	Install a nil rebuild_header routine;
3386+	 *	it should never be called and serves only as a bug trap.
3387+ */
3388+
3389+ if (!dev->rebuild_header)
3390+ dev->rebuild_header = default_rebuild_header;
3391+
3392+ ret = netdev_register_sysfs(dev);
3393+ if (ret)
3394+ goto out;
3395+ dev->reg_state = NETREG_REGISTERED;
3396+
3397+ /*
3398+	 *	Default initial state at registration is that the
3399+ * device is present.
3400+ */
3401+
3402+ set_bit(__LINK_STATE_PRESENT, &dev->state);
3403+
3404+ dev->next = NULL;
3405+ dev_init_scheduler(dev);
3406+ write_lock_bh(&dev_base_lock);
3407+ *dev_tail = dev;
3408+ dev_tail = &dev->next;
3409+ hlist_add_head(&dev->name_hlist, head);
3410+ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
3411+ dev_hold(dev);
3412+ write_unlock_bh(&dev_base_lock);
3413+
3414+ /* Notify protocols, that a new device appeared. */
3415+ raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
3416+
3417+ ret = 0;
3418+
3419+out:
3420+ return ret;
3421+}
3422+
3423+/**
3424+ * register_netdev - register a network device
3425+ * @dev: device to register
3426+ *
3427+ * Take a completed network device structure and add it to the kernel
3428+ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3429+ * chain. 0 is returned on success. A negative errno code is returned
3430+ * on a failure to set up the device, or if the name is a duplicate.
3431+ *
3432+ *	This is a wrapper around register_netdevice that takes the rtnl semaphore
3433+ * and expands the device name if you passed a format string to
3434+ * alloc_netdev.
3435+ */
3436+int register_netdev(struct net_device *dev)
3437+{
3438+ int err;
3439+
3440+ rtnl_lock();
3441+
3442+ /*
3443+ * If the name is a format string the caller wants us to do a
3444+ * name allocation.
3445+ */
3446+ if (strchr(dev->name, '%')) {
3447+ err = dev_alloc_name(dev, dev->name);
3448+ if (err < 0)
3449+ goto out;
3450+ }
3451+
3452+ err = register_netdevice(dev);
3453+out:
3454+ rtnl_unlock();
3455+ return err;
3456+}
3457+EXPORT_SYMBOL(register_netdev);
3458+
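A hedged sketch of the usual driver-side sequence (module, private struct and all names are hypothetical); ether_setup() from <linux/etherdevice.h> is assumed to supply the Ethernet defaults, and the "%d" in the name asks register_netdev() to expand it as described above.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct example_priv { int placeholder; };	/* hypothetical private data */
static struct net_device *example_dev;

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);	/* sane Ethernet defaults */
	/* a real driver would also fill in dev->open, dev->stop,
	 * dev->hard_start_xmit, ... here */
}

static int __init example_init(void)
{
	int err;

	example_dev = alloc_netdev(sizeof(struct example_priv),
				   "example%d", example_setup);
	if (!example_dev)
		return -ENOMEM;

	err = register_netdev(example_dev);
	if (err)
		free_netdev(example_dev);
	return err;
}
module_init(example_init);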
3459+/*
3460+ * netdev_wait_allrefs - wait until all references are gone.
3461+ *
3462+ * This is called when unregistering network devices.
3463+ *
3464+ * Any protocol or device that holds a reference should register
3465+ * for netdevice notification, and cleanup and put back the
3466+ * reference if they receive an UNREGISTER event.
3467+ * We can get stuck here if buggy protocols don't correctly
3468+ * call dev_put.
3469+ */
3470+static void netdev_wait_allrefs(struct net_device *dev)
3471+{
3472+ unsigned long rebroadcast_time, warning_time;
3473+
3474+ rebroadcast_time = warning_time = jiffies;
3475+ while (atomic_read(&dev->refcnt) != 0) {
3476+ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3477+ rtnl_lock();
3478+
3479+ /* Rebroadcast unregister notification */
3480+ raw_notifier_call_chain(&netdev_chain,
3481+ NETDEV_UNREGISTER, dev);
3482+
3483+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3484+ &dev->state)) {
3485+ /* We must not have linkwatch events
3486+ * pending on unregister. If this
3487+ * happens, we simply run the queue
3488+ * unscheduled, resulting in a noop
3489+ * for this device.
3490+ */
3491+ linkwatch_run_queue();
3492+ }
3493+
3494+ __rtnl_unlock();
3495+
3496+ rebroadcast_time = jiffies;
3497+ }
3498+
3499+ msleep(250);
3500+
3501+ if (time_after(jiffies, warning_time + 10 * HZ)) {
3502+ printk(KERN_EMERG "unregister_netdevice: "
3503+ "waiting for %s to become free. Usage "
3504+ "count = %d\n",
3505+ dev->name, atomic_read(&dev->refcnt));
3506+ warning_time = jiffies;
3507+ }
3508+ }
3509+}
3510+
3511+/* The sequence is:
3512+ *
3513+ * rtnl_lock();
3514+ * ...
3515+ * register_netdevice(x1);
3516+ * register_netdevice(x2);
3517+ * ...
3518+ * unregister_netdevice(y1);
3519+ * unregister_netdevice(y2);
3520+ * ...
3521+ * rtnl_unlock();
3522+ * free_netdev(y1);
3523+ * free_netdev(y2);
3524+ *
3525+ * We are invoked by rtnl_unlock() after it drops the semaphore.
3526+ * This allows us to deal with problems:
3527+ * 1) We can delete sysfs objects which invoke hotplug
3528+ * without deadlocking with linkwatch via keventd.
3529+ * 2) Since we run with the RTNL semaphore not held, we can sleep
3530+ * safely in order to wait for the netdev refcnt to drop to zero.
3531+ */
3532+static DEFINE_MUTEX(net_todo_run_mutex);
3533+void netdev_run_todo(void)
3534+{
3535+ struct list_head list;
3536+
3537+	/* Need to guard against multiple cpus getting out of order. */
3538+ mutex_lock(&net_todo_run_mutex);
3539+
3540+ /* Not safe to do outside the semaphore. We must not return
3541+ * until all unregister events invoked by the local processor
3542+ * have been completed (either by this todo run, or one on
3543+ * another cpu).
3544+ */
3545+ if (list_empty(&net_todo_list))
3546+ goto out;
3547+
3548+ /* Snapshot list, allow later requests */
3549+ spin_lock(&net_todo_list_lock);
3550+ list_replace_init(&net_todo_list, &list);
3551+ spin_unlock(&net_todo_list_lock);
3552+
3553+ while (!list_empty(&list)) {
3554+ struct net_device *dev
3555+ = list_entry(list.next, struct net_device, todo_list);
3556+ list_del(&dev->todo_list);
3557+
3558+ if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3559+ printk(KERN_ERR "network todo '%s' but state %d\n",
3560+ dev->name, dev->reg_state);
3561+ dump_stack();
3562+ continue;
3563+ }
3564+
3565+ netdev_unregister_sysfs(dev);
3566+ dev->reg_state = NETREG_UNREGISTERED;
3567+
3568+ netdev_wait_allrefs(dev);
3569+
3570+ /* paranoia */
3571+ BUG_ON(atomic_read(&dev->refcnt));
3572+ BUG_TRAP(!dev->ip_ptr);
3573+ BUG_TRAP(!dev->ip6_ptr);
3574+ BUG_TRAP(!dev->dn_ptr);
3575+
3576+ /* It must be the very last action,
3577+ * after this 'dev' may point to freed up memory.
3578+ */
3579+ if (dev->destructor)
3580+ dev->destructor(dev);
3581+ }
3582+
3583+out:
3584+ mutex_unlock(&net_todo_run_mutex);
3585+}
3586+
3587+/**
3588+ * alloc_netdev - allocate network device
3589+ * @sizeof_priv: size of private data to allocate space for
3590+ * @name: device name format string
3591+ * @setup: callback to initialize device
3592+ *
3593+ * Allocates a struct net_device with private data area for driver use
3594+ * and performs basic initialization.
3595+ */
3596+struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3597+ void (*setup)(struct net_device *))
3598+{
3599+ void *p;
3600+ struct net_device *dev;
3601+ int alloc_size;
3602+
3603+ BUG_ON(strlen(name) >= sizeof(dev->name));
3604+
3605+ /* ensure 32-byte alignment of both the device and private area */
3606+ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3607+ alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3608+
3609+ p = kzalloc(alloc_size, GFP_KERNEL);
3610+ if (!p) {
3611+ printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3612+ return NULL;
3613+ }
3614+
3615+ dev = (struct net_device *)
3616+ (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3617+ dev->padded = (char *)dev - (char *)p;
3618+
3619+ if (sizeof_priv)
3620+ dev->priv = netdev_priv(dev);
3621+
3622+ setup(dev);
3623+ strcpy(dev->name, name);
3624+ return dev;
3625+}
3626+EXPORT_SYMBOL(alloc_netdev);
3627+
3628+/**
3629+ * free_netdev - free network device
3630+ * @dev: device
3631+ *
3632+ * This function does the last stage of destroying an allocated device
3633+ * interface. The reference to the device object is released.
3634+ * If this is the last reference then it will be freed.
3635+ */
3636+void free_netdev(struct net_device *dev)
3637+{
3638+#ifdef CONFIG_SYSFS
3639+ /* Compatibility with error handling in drivers */
3640+ if (dev->reg_state == NETREG_UNINITIALIZED) {
3641+ kfree((char *)dev - dev->padded);
3642+ return;
3643+ }
3644+
3645+ BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3646+ dev->reg_state = NETREG_RELEASED;
3647+
3648+ /* will free via device release */
3649+ put_device(&dev->dev);
3650+#else
3651+ kfree((char *)dev - dev->padded);
3652+#endif
3653+}
3654+
3655+/* Synchronize with packet receive processing. */
3656+void synchronize_net(void)
3657+{
3658+ might_sleep();
3659+ synchronize_rcu();
3660+}
3661+
3662+/**
3663+ * unregister_netdevice - remove device from the kernel
3664+ * @dev: device
3665+ *
3666+ * This function shuts down a device interface and removes it
3667+ * from the kernel tables. On success 0 is returned, on a failure
3668+ * a negative errno code is returned.
3669+ *
3670+ * Callers must hold the rtnl semaphore. You may want
3671+ * unregister_netdev() instead of this.
3672+ */
3673+
3674+void unregister_netdevice(struct net_device *dev)
3675+{
3676+ struct net_device *d, **dp;
3677+
3678+ BUG_ON(dev_boot_phase);
3679+ ASSERT_RTNL();
3680+
3681+ /* Some devices call without registering for initialization unwind. */
3682+ if (dev->reg_state == NETREG_UNINITIALIZED) {
3683+ printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3684+ "was registered\n", dev->name, dev);
3685+
3686+ WARN_ON(1);
3687+ return;
3688+ }
3689+
3690+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
3691+
3692+ /* If device is running, close it first. */
3693+ if (dev->flags & IFF_UP)
3694+ dev_close(dev);
3695+
3696+ /* And unlink it from device chain. */
3697+ for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3698+ if (d == dev) {
3699+ write_lock_bh(&dev_base_lock);
3700+ hlist_del(&dev->name_hlist);
3701+ hlist_del(&dev->index_hlist);
3702+ if (dev_tail == &dev->next)
3703+ dev_tail = dp;
3704+ *dp = d->next;
3705+ write_unlock_bh(&dev_base_lock);
3706+ break;
3707+ }
3708+ }
3709+ BUG_ON(!d);
3710+
3711+ dev->reg_state = NETREG_UNREGISTERING;
3712+
3713+ synchronize_net();
3714+
3715+ /* Shutdown queueing discipline. */
3716+ dev_shutdown(dev);
3717+
3718+
3719+ /* Notify protocols, that we are about to destroy
3720+ this device. They should clean all the things.
3721+ */
3722+ raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3723+
3724+ /*
3725+ * Flush the multicast chain
3726+ */
3727+ dev_mc_discard(dev);
3728+
3729+ if (dev->uninit)
3730+ dev->uninit(dev);
3731+
3732+ /* Notifier chain MUST detach us from master device. */
3733+ BUG_TRAP(!dev->master);
3734+
3735+ /* Finish processing unregister after unlock */
3736+ net_set_todo(dev);
3737+
3738+ synchronize_net();
3739+
3740+ dev_put(dev);
3741+}
3742+
3743+/**
3744+ * unregister_netdev - remove device from the kernel
3745+ * @dev: device
3746+ *
3747+ * This function shuts down a device interface and removes it
3748+ * from the kernel tables. On success 0 is returned, on a failure
3749+ * a negative errno code is returned.
3750+ *
3751+ * This is just a wrapper for unregister_netdevice that takes
3752+ * the rtnl semaphore. In general you want to use this and not
3753+ * unregister_netdevice.
3754+ */
3755+void unregister_netdev(struct net_device *dev)
3756+{
3757+ rtnl_lock();
3758+ unregister_netdevice(dev);
3759+ rtnl_unlock();
3760+}
3761+
3762+EXPORT_SYMBOL(unregister_netdev);
3763+
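The matching teardown for the registration sketch earlier: unregister first (the rtnl_unlock() inside then runs netdev_run_todo() shown above, which waits for the remaining references), and only then free the device.

static void __exit example_exit(void)
{
	unregister_netdev(example_dev);	/* rtnl_unlock() runs netdev_run_todo() */
	free_netdev(example_dev);	/* reg_state is now NETREG_UNREGISTERED */
}
module_exit(example_exit);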
3764+static int dev_cpu_callback(struct notifier_block *nfb,
3765+ unsigned long action,
3766+ void *ocpu)
3767+{
3768+ struct sk_buff **list_skb;
3769+ struct net_device **list_net;
3770+ struct sk_buff *skb;
3771+ unsigned int cpu, oldcpu = (unsigned long)ocpu;
3772+ struct softnet_data *sd, *oldsd;
3773+
3774+ if (action != CPU_DEAD)
3775+ return NOTIFY_OK;
3776+
3777+ local_irq_disable();
3778+ cpu = smp_processor_id();
3779+ sd = &per_cpu(softnet_data, cpu);
3780+ oldsd = &per_cpu(softnet_data, oldcpu);
3781+
3782+ /* Find end of our completion_queue. */
3783+ list_skb = &sd->completion_queue;
3784+ while (*list_skb)
3785+ list_skb = &(*list_skb)->next;
3786+ /* Append completion queue from offline CPU. */
3787+ *list_skb = oldsd->completion_queue;
3788+ oldsd->completion_queue = NULL;
3789+
3790+ /* Find end of our output_queue. */
3791+ list_net = &sd->output_queue;
3792+ while (*list_net)
3793+ list_net = &(*list_net)->next_sched;
3794+ /* Append output queue from offline CPU. */
3795+ *list_net = oldsd->output_queue;
3796+ oldsd->output_queue = NULL;
3797+
3798+ raise_softirq_irqoff(NET_TX_SOFTIRQ);
3799+ local_irq_enable();
3800+
3801+ /* Process offline CPU's input_pkt_queue */
3802+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
3803+ netif_rx(skb);
3804+
3805+ return NOTIFY_OK;
3806+}
3807+
3808+#ifdef CONFIG_NET_DMA
3809+/**
3810+ * net_dma_rebalance - redistribute the available DMA channels among CPUs
3811+ * This is called when the number of channels allocated to the net_dma_client
3812+ * changes. The net_dma_client tries to have one DMA channel per CPU.
3813+ */
3814+static void net_dma_rebalance(void)
3815+{
3816+ unsigned int cpu, i, n;
3817+ struct dma_chan *chan;
3818+
3819+ if (net_dma_count == 0) {
3820+ for_each_online_cpu(cpu)
3821+ rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
3822+ return;
3823+ }
3824+
3825+ i = 0;
3826+ cpu = first_cpu(cpu_online_map);
3827+
3828+ rcu_read_lock();
3829+ list_for_each_entry(chan, &net_dma_client->channels, client_node) {
3830+ n = ((num_online_cpus() / net_dma_count)
3831+ + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
3832+
3833+ while(n) {
3834+ per_cpu(softnet_data, cpu).net_dma = chan;
3835+ cpu = next_cpu(cpu, cpu_online_map);
3836+ n--;
3837+ }
3838+ i++;
3839+ }
3840+ rcu_read_unlock();
3841+}
3842+
3843+/**
3844+ * netdev_dma_event - event callback for the net_dma_client
3845+ * @client: should always be net_dma_client
3846+ * @chan: DMA channel for the event
3847+ * @event: event type
3848+ */
3849+static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3850+ enum dma_event event)
3851+{
3852+ spin_lock(&net_dma_event_lock);
3853+ switch (event) {
3854+ case DMA_RESOURCE_ADDED:
3855+ net_dma_count++;
3856+ net_dma_rebalance();
3857+ break;
3858+ case DMA_RESOURCE_REMOVED:
3859+ net_dma_count--;
3860+ net_dma_rebalance();
3861+ break;
3862+ default:
3863+ break;
3864+ }
3865+ spin_unlock(&net_dma_event_lock);
3866+}
3867+
3868+/**
3869+ * netdev_dma_register - register the networking subsystem as a DMA client
3870+ */
3871+static int __init netdev_dma_register(void)
3872+{
3873+ spin_lock_init(&net_dma_event_lock);
3874+ net_dma_client = dma_async_client_register(netdev_dma_event);
3875+ if (net_dma_client == NULL)
3876+ return -ENOMEM;
3877+
3878+ dma_async_client_chan_request(net_dma_client, num_online_cpus());
3879+ return 0;
3880+}
3881+
3882+#else
3883+static int __init netdev_dma_register(void) { return -ENODEV; }
3884+#endif /* CONFIG_NET_DMA */
3885+
3886+/*
3887+ * Initialize the DEV module. At boot time this walks the device list and
3888+ * unhooks any devices that fail to initialise (normally hardware not
3889+ * present) and leaves us with a valid list of present and active devices.
3890+ *
3891+ */
3892+
3893+/*
3894+ * This is called single threaded during boot, so no need
3895+ * to take the rtnl semaphore.
3896+ */
3897+static int __init net_dev_init(void)
3898+{
3899+ int i, rc = -ENOMEM;
3900+
3901+ BUG_ON(!dev_boot_phase);
3902+
3903+ if (dev_proc_init())
3904+ goto out;
3905+
3906+ if (netdev_sysfs_init())
3907+ goto out;
3908+
3909+ INIT_LIST_HEAD(&ptype_all);
3910+ for (i = 0; i < 16; i++)
3911+ INIT_LIST_HEAD(&ptype_base[i]);
3912+
3913+ for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3914+ INIT_HLIST_HEAD(&dev_name_head[i]);
3915+
3916+ for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3917+ INIT_HLIST_HEAD(&dev_index_head[i]);
3918+
3919+ /*
3920+ * Initialise the packet receive queues.
3921+ */
3922+
3923+ for_each_possible_cpu(i) {
3924+ struct softnet_data *queue;
3925+
3926+ queue = &per_cpu(softnet_data, i);
3927+ skb_queue_head_init(&queue->input_pkt_queue);
3928+ queue->completion_queue = NULL;
3929+ INIT_LIST_HEAD(&queue->poll_list);
3930+ set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3931+ queue->backlog_dev.weight = weight_p;
3932+ queue->backlog_dev.poll = process_backlog;
3933+ atomic_set(&queue->backlog_dev.refcnt, 1);
3934+ }
3935+
3936+ netdev_dma_register();
3937+
3938+ dev_boot_phase = 0;
3939+
3940+ open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3941+ open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3942+
3943+ hotcpu_notifier(dev_cpu_callback, 0);
3944+ dst_init();
3945+ dev_mcast_init();
3946+ rc = 0;
3947+out:
3948+ return rc;
3949+}
3950+
3951+subsys_initcall(net_dev_init);
3952+
3953+EXPORT_SYMBOL(__dev_get_by_index);
3954+EXPORT_SYMBOL(__dev_get_by_name);
3955+EXPORT_SYMBOL(__dev_remove_pack);
3956+EXPORT_SYMBOL(dev_valid_name);
3957+EXPORT_SYMBOL(dev_add_pack);
3958+EXPORT_SYMBOL(dev_alloc_name);
3959+EXPORT_SYMBOL(dev_close);
3960+EXPORT_SYMBOL(dev_get_by_flags);
3961+EXPORT_SYMBOL(dev_get_by_index);
3962+EXPORT_SYMBOL(dev_get_by_name);
3963+EXPORT_SYMBOL(dev_open);
3964+EXPORT_SYMBOL(dev_queue_xmit);
3965+EXPORT_SYMBOL(dev_remove_pack);
3966+EXPORT_SYMBOL(dev_set_allmulti);
3967+EXPORT_SYMBOL(dev_set_promiscuity);
3968+EXPORT_SYMBOL(dev_change_flags);
3969+EXPORT_SYMBOL(dev_set_mtu);
3970+EXPORT_SYMBOL(dev_set_mac_address);
3971+EXPORT_SYMBOL(free_netdev);
3972+EXPORT_SYMBOL(netdev_boot_setup_check);
3973+EXPORT_SYMBOL(netdev_set_master);
3974+EXPORT_SYMBOL(netdev_state_change);
3975+EXPORT_SYMBOL(netif_receive_skb);
3976+EXPORT_SYMBOL(netif_rx);
3977+EXPORT_SYMBOL(register_gifconf);
3978+EXPORT_SYMBOL(register_netdevice);
3979+EXPORT_SYMBOL(register_netdevice_notifier);
3980+EXPORT_SYMBOL(skb_checksum_help);
3981+EXPORT_SYMBOL(synchronize_net);
3982+EXPORT_SYMBOL(unregister_netdevice);
3983+EXPORT_SYMBOL(unregister_netdevice_notifier);
3984+EXPORT_SYMBOL(net_enable_timestamp);
3985+EXPORT_SYMBOL(net_disable_timestamp);
3986+EXPORT_SYMBOL(dev_get_flags);
3987+
3988+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3989+EXPORT_SYMBOL(br_handle_frame_hook);
3990+EXPORT_SYMBOL(br_fdb_get_hook);
3991+EXPORT_SYMBOL(br_fdb_put_hook);
3992+#endif
3993+
3994+#ifdef CONFIG_KMOD
3995+EXPORT_SYMBOL(dev_load);
3996+#endif
3997+
3998+EXPORT_PER_CPU_SYMBOL(softnet_data);
3999diff --unified --recursive --new-file linux-2.6.21.4/net/ring/Kconfig linux-2.6.21.4-1-686-smp-ring3/net/ring/Kconfig
4000--- linux-2.6.21.4/net/ring/Kconfig 1970-01-01 00:00:00.000000000 +0000
4001+++ linux-2.6.21.4-1-686-smp-ring3/net/ring/Kconfig 2007-06-10 16:43:04.406423944 +0000
4002@@ -0,0 +1,14 @@
4003+config RING
4004+ tristate "PF_RING sockets (EXPERIMENTAL)"
4005+ depends on EXPERIMENTAL
4006+ ---help---
4007+ PF_RING socket family, optimized for packet capture.
4008+ If a PF_RING socket is bound to an adapter (via the bind() system
4009+ call), that adapter will be used in read-only mode until the socket
4010+ is destroyed. Whenever an incoming packet is received from the adapter,
4011+ it is not passed to the upper layers; instead it is copied to a ring
4012+ buffer, which in turn is exported to user-space applications via mmap.
4013+ Please refer to http://luca.ntop.org/Ring.pdf for more information.
4014+
4015+ Say N unless you know what you are doing.
4016+
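To make the help text above concrete, here is a minimal user-space sketch of the bind()-then-mmap() sequence it describes. The PF_RING family number, the use of the interface name in sa_data, and the mapping length are assumptions about conventional PF_RING usage, not something this Kconfig entry defines:

#include <sys/socket.h>
#include <sys/mman.h>
#include <string.h>
#include <unistd.h>

#ifndef PF_RING
#define PF_RING 27                 /* assumed protocol family number */
#endif

/* Open a PF_RING socket on 'ifname' and map its ring into *ring (sketch only). */
int open_ring(const char *ifname, void **ring, size_t len)
{
	struct sockaddr sa;
	int fd = socket(PF_RING, SOCK_RAW, 0);

	if (fd < 0)
		return -1;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = PF_RING;
	strncpy(sa.sa_data, ifname, sizeof(sa.sa_data) - 1);  /* adapter to capture from */

	if (bind(fd, &sa, sizeof(sa)) < 0)                    /* adapter is now read-only for us */
		goto err;

	*ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (*ring == MAP_FAILED)
		goto err;
	return fd;
err:
	close(fd);
	return -1;
}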
4017diff --unified --recursive --new-file linux-2.6.21.4/net/ring/Makefile linux-2.6.21.4-1-686-smp-ring3/net/ring/Makefile
4018--- linux-2.6.21.4/net/ring/Makefile 1970-01-01 00:00:00.000000000 +0000
4019+++ linux-2.6.21.4-1-686-smp-ring3/net/ring/Makefile 2007-06-10 16:43:04.350421521 +0000
4020@@ -0,0 +1,7 @@
4021+#
4022+# Makefile for the ring driver.
4023+#
4024+
4025+obj-m += ring.o
4026+
4027+ring-objs := ring_packet.o
4028diff --unified --recursive --new-file linux-2.6.21.4/net/ring/ring_packet.c linux-2.6.21.4-1-686-smp-ring3/net/ring/ring_packet.c
4029--- linux-2.6.21.4/net/ring/ring_packet.c 1970-01-01 00:00:00.000000000 +0000
4030+++ linux-2.6.21.4-1-686-smp-ring3/net/ring/ring_packet.c 2007-06-10 16:43:04.354421694 +0000
4031@@ -0,0 +1,4258 @@
4032+/* ***************************************************************
4033+ *
4034+ * (C) 2004-07 - Luca Deri <deri@ntop.org>
4035+ *
4036+ * This code includes contributions courtesy of
4037+ * - Jeff Randall <jrandall@nexvu.com>
4038+ * - Helmut Manck <helmut.manck@secunet.com>
4039+ * - Brad Doctor <brad@stillsecure.com>
4040+ * - Amit D. Chaudhary <amit_ml@rajgad.com>
4041+ * - Francesco Fusco <fusco@ntop.org>
4042+ * - Michael Stiller <ms@2scale.net>
4043+ *
4044+ *
4045+ * This program is free software; you can redistribute it and/or modify
4046+ * it under the terms of the GNU General Public License as published by
4047+ * the Free Software Foundation; either version 2 of the License, or
4048+ * (at your option) any later version.
4049+ *
4050+ * This program is distributed in the hope that it will be useful,
4051+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
4052+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4053+ * GNU General Public License for more details.
4054+ *
4055+ * You should have received a copy of the GNU General Public License
4056+ * along with this program; if not, write to the Free Software Foundation,
4057+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
4058+ *
4059+ */
4060+
4061+#include <linux/version.h>
4062+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19))
4063+#include <linux/autoconf.h>
4064+#else
4065+#include <linux/config.h>
4066+#endif
4067+#include <linux/module.h>
4068+#include <linux/kernel.h>
4069+#include <linux/socket.h>
4070+#include <linux/skbuff.h>
4071+#include <linux/rtnetlink.h>
4072+#include <linux/in.h>
4073+#include <linux/inet.h>
4074+#include <linux/in6.h>
4075+#include <linux/init.h>
4076+#include <linux/filter.h>
4077+#include <linux/ring.h>
4078+#include <linux/ip.h>
4079+#include <linux/tcp.h>
4080+#include <linux/udp.h>
4081+#include <linux/list.h>
4082+#include <linux/proc_fs.h>
4083+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
4084+#include <net/xfrm.h>
4085+#else
4086+#include <linux/poll.h>
4087+#endif
4088+#include <net/sock.h>
4089+#include <asm/io.h> /* needed for virt_to_phys() */
4090+#ifdef CONFIG_INET
4091+#include <net/inet_common.h>
4092+#endif
4093+
4094+/* #define RING_DEBUG */
4095+
4096+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
4097+static inline int remap_page_range(struct vm_area_struct *vma,
4098+ unsigned long uvaddr,
4099+ unsigned long paddr,
4100+ unsigned long size,
4101+ pgprot_t prot) {
4102+ return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT,
4103+ size, prot));
4104+}
4105+#endif
4106+
4107+/* ************************************************* */
4108+
4109+#define CLUSTER_LEN 8
4110+
4111+struct ring_cluster {
4112+ u_short cluster_id; /* 0 = no cluster */
4113+ u_short num_cluster_elements;
4114+ enum cluster_type hashing_mode;
4115+ u_short hashing_id;
4116+ struct sock *sk[CLUSTER_LEN];
4117+ struct ring_cluster *next; /* NULL = last element of the cluster */
4118+};
4119+
4120+/* ************************************************* */
4121+
4122+struct ring_element {
4123+ struct list_head list;
4124+ struct sock *sk;
4125+};
4126+
4127+/* ************************************************* */
4128+
4129+struct ring_opt {
4130+ struct net_device *ring_netdev;
4131+
4132+ u_short ring_pid;
4133+
4134+ /* Cluster */
4135+ u_short cluster_id; /* 0 = no cluster */
4136+
4137+ /* Reflector */
4138+ struct net_device *reflector_dev;
4139+
4140+ /* Packet buffers */
4141+ unsigned long order;
4142+
4143+ /* Ring Slots */
4144+ unsigned long ring_memory;
4145+ FlowSlotInfo *slots_info; /* Basically it points to ring_memory */
4146+ char *ring_slots; /* Basically it points to ring_memory
4147+ +sizeof(FlowSlotInfo) */
4148+
4149+ /* Packet Sampling */
4150+ u_int pktToSample, sample_rate;
4151+
4152+ /* BPF Filter */
4153+ struct sk_filter *bpfFilter;
4154+
4155+ /* Aho-Corasick */
4156+ ACSM_STRUCT2 * acsm;
4157+
4158+ /* Locks */
4159+ atomic_t num_ring_slots_waiters;
4160+ wait_queue_head_t ring_slots_waitqueue;
4161+ rwlock_t ring_index_lock;
4162+
4163+ /* Bloom Filters */
4164+ u_char bitmask_enabled;
4165+ bitmask_selector mac_bitmask, vlan_bitmask, ip_bitmask, twin_ip_bitmask,
4166+ port_bitmask, twin_port_bitmask, proto_bitmask;
4167+ u_int32_t num_mac_bitmask_add, num_mac_bitmask_remove;
4168+ u_int32_t num_vlan_bitmask_add, num_vlan_bitmask_remove;
4169+ u_int32_t num_ip_bitmask_add, num_ip_bitmask_remove;
4170+ u_int32_t num_port_bitmask_add, num_port_bitmask_remove;
4171+ u_int32_t num_proto_bitmask_add, num_proto_bitmask_remove;
4172+
4173+ /* Indexes (Internal) */
4174+ u_int insert_page_id, insert_slot_id;
4175+};
4176+
4177+/* ************************************************* */
4178+
4179+/* List of all ring sockets. */
4180+static struct list_head ring_table;
4181+static u_int ring_table_size;
4182+
4183+/* List of all clusters */
4184+static struct ring_cluster *ring_cluster_list;
4185+
4186+static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED;
4187+
4188+/* ********************************** */
4189+
4190+/* /proc entry for ring module */
4191+struct proc_dir_entry *ring_proc_dir = NULL;
4192+struct proc_dir_entry *ring_proc = NULL;
4193+
4194+static int ring_proc_get_info(char *, char **, off_t, int, int *, void *);
4195+static void ring_proc_add(struct ring_opt *pfr);
4196+static void ring_proc_remove(struct ring_opt *pfr);
4197+static void ring_proc_init(void);
4198+static void ring_proc_term(void);
4199+
4200+/* ********************************** */
4201+
4202+/* Forward */
4203+static struct proto_ops ring_ops;
4204+
4205+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
4206+static struct proto ring_proto;
4207+#endif
4208+
4209+static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet,
4210+ u_char real_skb);
4211+static int buffer_ring_handler(struct net_device *dev, char *data, int len);
4212+static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr);
4213+
4214+/* Extern */
4215+
4216+/* ********************************** */
4217+
4218+/* Defaults */
4219+static unsigned int bucket_len = 128, num_slots = 4096, sample_rate = 1,
4220+ transparent_mode = 1, enable_tx_capture = 1;
4221+
4222+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16))
4223+module_param(bucket_len, uint, 0644);
4224+module_param(num_slots, uint, 0644);
4225+module_param(sample_rate, uint, 0644);
4226+module_param(transparent_mode, uint, 0644);
4227+module_param(enable_tx_capture, uint, 0644);
4228+#else
4229+MODULE_PARM(bucket_len, "i");
4230+MODULE_PARM(num_slots, "i");
4231+MODULE_PARM(sample_rate, "i");
4232+MODULE_PARM(transparent_mode, "i");
4233+MODULE_PARM(enable_tx_capture, "i");
4234+#endif
4235+
4236+MODULE_PARM_DESC(bucket_len, "Length (in bytes) of each ring bucket");
4237+MODULE_PARM_DESC(num_slots, "Number of ring slots");
4238+MODULE_PARM_DESC(sample_rate, "Ring packet sample rate");
4239+MODULE_PARM_DESC(transparent_mode,
4240+ "Set to 1 to set transparent mode "
4241+ "(slower but backwards compatible)");
4242+
4243+MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets");
4244+
4245+/* ********************************** */
4246+
4247+#define MIN_QUEUED_PKTS 64
4248+#define MAX_QUEUE_LOOPS 64
4249+
4250+
4251+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
4252+#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk)
4253+#define ring_sk(__sk) ((__sk)->sk_protinfo)
4254+#else
4255+#define ring_sk_datatype(a) (a)
4256+#define ring_sk(__sk) ((__sk)->protinfo.pf_ring)
4257+#endif
4258+
4259+#define _rdtsc() ({ uint64_t x; asm volatile("rdtsc" : "=A" (x)); x; })
4260+
4261+/*
4262+ int dev_queue_xmit(struct sk_buff *skb)
4263+ skb->dev;
4264+ struct net_device *dev_get_by_name(const char *name)
4265+*/
4266+
4267+/* ********************************** */
4268+
4269+/*
4270+** $Id$
4271+**
4272+** acsmx2.c
4273+**
4274+** Multi-Pattern Search Engine
4275+**
4276+** Aho-Corasick State Machine - version 2.0
4277+**
4278+** Supports both Non-Deterministic and Deterministic Finite Automata
4279+**
4280+**
4281+** Reference - Efficient String matching: An Aid to Bibliographic Search
4282+** Alfred V Aho and Margaret J Corasick
4283+** Bell Laboratories
4284+** Copyright(C) 1975 Association for Computing Machinery,Inc
4285+**
4286+** +++
4287+** +++ Version 1.0 notes - Marc Norton:
4288+** +++
4289+**
4290+** Original implementation based on the 4 algorithms in the paper by Aho & Corasick,
4291+** some implementation ideas from 'Practical Algorithms in C', and some
4292+** of my own.
4293+**
4294+** 1) Finds all occurrences of all patterns within a text.
4295+**
4296+** +++
4297+** +++ Version 2.0 Notes - Marc Norton/Dan Roelker:
4298+** +++
4299+**
4300+** New implementation modifies the state table storage and access model to use
4301+** compacted sparse vector storage. Dan Roelker and I hammered this strategy out
4302+** amongst many others in order to reduce memory usage and improve caching performance.
4303+** The memory usage is greatly reduced; we use only 1/4 of what we used to. The caching
4304+** performance is better in pure benchmarking tests, but does not show overall improvement
4305+** in Snort. Unfortunately, once a pattern match test has been performed Snort moves on to doing
4306+** many other things before we get back to a pattern match test, so the cache is voided.
4307+**
4308+** This version has better caching performance characteristics, reduced memory,
4309+** more state table storage options, and requires no a priori case conversions.
4310+** It does maintain the same public interface. (Snort only used banded storage).
4311+**
4312+** 1) Supports NFA and DFA state machines, and basic keyword state machines
4313+** 2) Initial transition table uses Linked Lists
4314+** 3) Improved state table memory options. NFA and DFA state
4315+** transition tables are converted to one of 4 formats during compilation.
4316+** a) Full matrix
4317+** b) Sparse matrix
4318+** c) Banded matrix (Default-this is the only one used in snort)
4319+** d) Sparse-Banded matrix
4320+** 4) Added support for acstate_t in .h file so we can compile states as
4321+** 16, or 32 bit state values for another reduction in memory consumption,
4322+** smaller states allows more of the state table to be cached, and improves
4323+** performance on x86-P4. Your mileage may vary, especially on risc systems.
4324+** 5) Added a bool to each state transition list to indicate if there is a matching
4325+** pattern in the state. This prevents us from accessing another data array
4326+** and can improve caching/performance.
4327+** 6) The search functions are very sensitive, don't change them without extensive testing,
4328+** or you'll just spoil the caching and prefetching opportunities.
4329+**
4330+** Extras for fellow pattern matchers:
4331+** The table below explains the storage format used at each step.
4332+** You can use an NFA or DFA to match with, the NFA is slower but tiny - set the structure directly.
4333+** You can use any of the 4 storage modes above -full,sparse,banded,sparse-bands, set the structure directly.
4334+** For applications where you have lots of data and a pattern set to search, this version was up to 3x faster
4335+** than the previous version, due to caching performance. This cannot be fully realized in Snort yet,
4336+** but other applications may have better caching opportunities.
4337+** Snort only needs to use the banded or full storage.
4338+**
4339+** Transition table format at each processing stage.
4340+** -------------------------------------------------
4341+** Patterns -> Keyword State Table (List)
4342+** Keyword State Table -> NFA (List)
4343+** NFA -> DFA (List)
4344+** DFA (List)-> Sparse Rows O(m-avg # transitions per state)
4345+** -> Banded Rows O(1)
4346+** -> Sparse-Banded Rows O(nb-# bands)
4347+** -> Full Matrix O(1)
4348+**
4349+** Copyright(C) 2002,2003,2004 Marc Norton
4350+** Copyright(C) 2003,2004 Daniel Roelker
4351+** Copyright(C) 2002,2003,2004 Sourcefire,Inc.
4352+**
4353+** This program is free software; you can redistribute it and/or modify
4354+** it under the terms of the GNU General Public License as published by
4355+** the Free Software Foundation; either version 2 of the License, or
4356+** (at your option) any later version.
4357+**
4358+** This program is distributed in the hope that it will be useful,
4359+** but WITHOUT ANY WARRANTY; without even the implied warranty of
4360+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
4361+** GNU General Public License for more details.
4362+**
4363+** You should have received a copy of the GNU General Public License
4364+** along with this program; if not, write to the Free Software
4365+** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
4366+*
4367+*/
4368+
4369+/*
4370+ *
4371+ */
4372+#define MEMASSERT(p,s) if(!p){printk("ACSM-No Memory: %s!\n",s);}
4373+
4374+/*
4375+ *
4376+ */
4377+static int max_memory = 0;
4378+
4379+/*
4380+ *
4381+ */
4382+typedef struct acsm_summary_s
4383+{
4384+ unsigned num_states;
4385+ unsigned num_transitions;
4386+ ACSM_STRUCT2 acsm;
4387+
4388+}acsm_summary_t;
4389+
4390+/*
4391+ *
4392+ */
4393+static acsm_summary_t summary={0,0};
4394+
4395+/*
4396+** Case Translation Table
4397+*/
4398+static unsigned char xlatcase[256];
4399+/*
4400+ *
4401+ */
4402+
4403+inline int toupper(int ch) {
4404+ if ( (unsigned int)(ch - 'a') < 26u )
4405+ ch += 'A' - 'a';
4406+ return ch;
4407+}
4408+
4409+static void init_xlatcase(void)
4410+{
4411+ int i;
4412+ for (i = 0; i < 256; i++)
4413+ {
4414+ xlatcase[i] = toupper(i);
4415+ }
4416+}
4417+
4418+/*
4419+ * Case Conversion
4420+ */
4421+static
4422+inline
4423+void
4424+ConvertCaseEx (unsigned char *d, unsigned char *s, int m)
4425+{
4426+ int i;
4427+#ifdef XXXX
4428+ int n;
4429+ n = m & 3;
4430+ m >>= 2;
4431+
4432+ for (i = 0; i < m; i++ )
4433+ {
4434+ d[0] = xlatcase[ s[0] ];
4435+ d[2] = xlatcase[ s[2] ];
4436+ d[1] = xlatcase[ s[1] ];
4437+ d[3] = xlatcase[ s[3] ];
4438+ d+=4;
4439+ s+=4;
4440+ }
4441+
4442+ for (i=0; i < n; i++)
4443+ {
4444+ d[i] = xlatcase[ s[i] ];
4445+ }
4446+#else
4447+ for (i=0; i < m; i++)
4448+ {
4449+ d[i] = xlatcase[ s[i] ];
4450+ }
4451+
4452+#endif
4453+}
4454+
4455+
4456+/*
4457+ *
4458+ */
4459+static void *
4460+AC_MALLOC (int n)
4461+{
4462+ void *p;
4463+ p = kmalloc (n, GFP_KERNEL);
4464+ if (p)
4465+ max_memory += n;
4466+ return p;
4467+}
4468+
4469+
4470+/*
4471+ *
4472+ */
4473+static void
4474+AC_FREE (void *p)
4475+{
4476+ if (p)
4477+ kfree (p);
4478+}
4479+
4480+
4481+/*
4482+ * Simple QUEUE NODE
4483+ */
4484+typedef struct _qnode
4485+{
4486+ int state;
4487+ struct _qnode *next;
4488+}
4489+ QNODE;
4490+
4491+/*
4492+ * Simple QUEUE Structure
4493+ */
4494+typedef struct _queue
4495+{
4496+ QNODE * head, *tail;
4497+ int count;
4498+}
4499+ QUEUE;
4500+
4501+/*
4502+ * Initialize the queue
4503+ */
4504+static void
4505+queue_init (QUEUE * s)
4506+{
4507+ s->head = s->tail = 0;
4508+ s->count= 0;
4509+}
4510+
4511+/*
4512+ * Find a State in the queue
4513+ */
4514+static int
4515+queue_find (QUEUE * s, int state)
4516+{
4517+ QNODE * q;
4518+ q = s->head;
4519+ while( q )
4520+ {
4521+ if( q->state == state ) return 1;
4522+ q = q->next;
4523+ }
4524+ return 0;
4525+}
4526+
4527+/*
4528+ * Add Tail Item to queue (FiFo/LiLo)
4529+ */
4530+static void
4531+queue_add (QUEUE * s, int state)
4532+{
4533+ QNODE * q;
4534+
4535+ if( queue_find( s, state ) ) return;
4536+
4537+ if (!s->head)
4538+ {
4539+ q = s->tail = s->head = (QNODE *) AC_MALLOC (sizeof (QNODE));
4540+ MEMASSERT (q, "queue_add");
4541+ q->state = state;
4542+ q->next = 0;
4543+ }
4544+ else
4545+ {
4546+ q = (QNODE *) AC_MALLOC (sizeof (QNODE));
4547+ q->state = state;
4548+ q->next = 0;
4549+ s->tail->next = q;
4550+ s->tail = q;
4551+ }
4552+ s->count++;
4553+}
4554+
4555+
4556+/*
4557+ * Remove Head Item from queue
4558+ */
4559+static int
4560+queue_remove (QUEUE * s)
4561+{
4562+ int state = 0;
4563+ QNODE * q;
4564+ if (s->head)
4565+ {
4566+ q = s->head;
4567+ state = q->state;
4568+ s->head = s->head->next;
4569+ s->count--;
4570+
4571+ if( !s->head )
4572+ {
4573+ s->tail = 0;
4574+ s->count = 0;
4575+ }
4576+ AC_FREE (q);
4577+ }
4578+ return state;
4579+}
4580+
4581+
4582+/*
4583+ * Return items in the queue
4584+ */
4585+static int
4586+queue_count (QUEUE * s)
4587+{
4588+ return s->count;
4589+}
4590+
4591+
4592+/*
4593+ * Free the queue
4594+ */
4595+static void
4596+queue_free (QUEUE * s)
4597+{
4598+ while (queue_count (s))
4599+ {
4600+ queue_remove (s);
4601+ }
4602+}
4603+
4604+/*
4605+ * Get Next State-NFA
4606+ */
4607+static
4608+int List_GetNextState( ACSM_STRUCT2 * acsm, int state, int input )
4609+{
4610+ trans_node_t * t = acsm->acsmTransTable[state];
4611+
4612+ while( t )
4613+ {
4614+ if( t->key == input )
4615+ {
4616+ return t->next_state;
4617+ }
4618+ t=t->next;
4619+ }
4620+
4621+ if( state == 0 ) return 0;
4622+
4623+ return ACSM_FAIL_STATE2; /* Fail state ??? */
4624+}
4625+
4626+/*
4627+ * Get Next State-DFA
4628+ */
4629+static
4630+int List_GetNextState2( ACSM_STRUCT2 * acsm, int state, int input )
4631+{
4632+ trans_node_t * t = acsm->acsmTransTable[state];
4633+
4634+ while( t )
4635+ {
4636+ if( t->key == input )
4637+ {
4638+ return t->next_state;
4639+ }
4640+ t = t->next;
4641+ }
4642+
4643+ return 0; /* default state */
4644+}
4645+/*
4646+ * Put Next State - Head insertion, and transition updates
4647+ */
4648+static
4649+int List_PutNextState( ACSM_STRUCT2 * acsm, int state, int input, int next_state )
4650+{
4651+ trans_node_t * p;
4652+ trans_node_t * tnew;
4653+
4654+ // printk(" List_PutNextState: state=%d, input='%c', next_state=%d\n",state,input,next_state);
4655+
4656+
4657+ /* Check if the transition already exists, if so just update the next_state */
4658+ p = acsm->acsmTransTable[state];
4659+ while( p )
4660+ {
4661+ if( p->key == input ) /* transition already exists- reset the next state */
4662+ {
4663+ p->next_state = next_state;
4664+ return 0;
4665+ }
4666+ p=p->next;
4667+ }
4668+
4669+ /* Definitely not an existing transition - add it */
4670+ tnew = (trans_node_t*)AC_MALLOC(sizeof(trans_node_t));
4671+ if( !tnew ) return -1;
4672+
4673+ tnew->key = input;
4674+ tnew->next_state = next_state;
4675+ tnew->next = 0;
4676+
4677+ tnew->next = acsm->acsmTransTable[state];
4678+ acsm->acsmTransTable[state] = tnew;
4679+
4680+ acsm->acsmNumTrans++;
4681+
4682+ return 0;
4683+}
4684+/*
4685+ * Free the entire transition table
4686+ */
4687+static
4688+int List_FreeTransTable( ACSM_STRUCT2 * acsm )
4689+{
4690+ int i;
4691+ trans_node_t * t, *p;
4692+
4693+ if( !acsm->acsmTransTable ) return 0;
4694+
4695+ for(i=0;i< acsm->acsmMaxStates;i++)
4696+ {
4697+ t = acsm->acsmTransTable[i];
4698+
4699+ while( t )
4700+ {
4701+ p = t->next;
4702+ kfree(t);
4703+ t = p;
4704+ max_memory -= sizeof(trans_node_t);
4705+ }
4706+ }
4707+
4708+ kfree(acsm->acsmTransTable);
4709+
4710+ max_memory -= sizeof(void*) * acsm->acsmMaxStates;
4711+
4712+ acsm->acsmTransTable = 0;
4713+
4714+ return 0;
4715+}
4716+
4717+/*
4718+ *
4719+ */
4720+/*
4721+ static
4722+ int List_FreeList( trans_node_t * t )
4723+ {
4724+ int tcnt=0;
4725+
4726+ trans_node_t *p;
4727+
4728+ while( t )
4729+ {
4730+ p = t->next;
4731+ kfree(t);
4732+ t = p;
4733+ max_memory -= sizeof(trans_node_t);
4734+ tcnt++;
4735+ }
4736+
4737+ return tcnt;
4738+ }
4739+*/
4740+
4741+/*
4742+ * Converts row of states from list to a full vector format
4743+ */
4744+static
4745+int List_ConvToFull(ACSM_STRUCT2 * acsm, acstate_t state, acstate_t * full )
4746+{
4747+ int tcnt = 0;
4748+ trans_node_t * t = acsm->acsmTransTable[ state ];
4749+
4750+ memset(full,0,sizeof(acstate_t)*acsm->acsmAlphabetSize);
4751+
4752+ if( !t ) return 0;
4753+
4754+ while(t)
4755+ {
4756+ full[ t->key ] = t->next_state;
4757+ tcnt++;
4758+ t = t->next;
4759+ }
4760+ return tcnt;
4761+}
4762+
4763+/*
4764+ * Copy a Match List Entry - don't dup the pattern data
4765+ */
4766+static ACSM_PATTERN2*
4767+CopyMatchListEntry (ACSM_PATTERN2 * px)
4768+{
4769+ ACSM_PATTERN2 * p;
4770+
4771+ p = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
4772+ MEMASSERT (p, "CopyMatchListEntry");
4773+
4774+ memcpy (p, px, sizeof (ACSM_PATTERN2));
4775+
4776+ p->next = 0;
4777+
4778+ return p;
4779+}
4780+
4781+/*
4782+ * Check if a pattern is in the list already,
4783+ * validate it using the 'id' field. This must be unique
4784+ * for every pattern.
4785+ */
4786+/*
4787+ static
4788+ int FindMatchListEntry (ACSM_STRUCT2 * acsm, int state, ACSM_PATTERN2 * px)
4789+ {
4790+ ACSM_PATTERN2 * p;
4791+
4792+ p = acsm->acsmMatchList[state];
4793+ while( p )
4794+ {
4795+ if( p->id == px->id ) return 1;
4796+ p = p->next;
4797+ }
4798+
4799+ return 0;
4800+ }
4801+*/
4802+
4803+
4804+/*
4805+ * Add a pattern to the list of patterns terminated at this state.
4806+ * Insert at front of list.
4807+ */
4808+static void
4809+AddMatchListEntry (ACSM_STRUCT2 * acsm, int state, ACSM_PATTERN2 * px)
4810+{
4811+ ACSM_PATTERN2 * p;
4812+
4813+ p = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
4814+
4815+ MEMASSERT (p, "AddMatchListEntry");
4816+
4817+ memcpy (p, px, sizeof (ACSM_PATTERN2));
4818+
4819+ p->next = acsm->acsmMatchList[state];
4820+
4821+ acsm->acsmMatchList[state] = p;
4822+}
4823+
4824+
4825+static void
4826+AddPatternStates (ACSM_STRUCT2 * acsm, ACSM_PATTERN2 * p)
4827+{
4828+ int state, next, n;
4829+ unsigned char *pattern;
4830+
4831+ n = p->n;
4832+ pattern = p->patrn;
4833+ state = 0;
4834+
4835+ /*
4836+ * Match up pattern with existing states
4837+ */
4838+ for (; n > 0; pattern++, n--)
4839+ {
4840+ next = List_GetNextState(acsm,state,*pattern);
4841+ if (next == ACSM_FAIL_STATE2 || next == 0)
4842+ {
4843+ break;
4844+ }
4845+ state = next;
4846+ }
4847+
4848+ /*
4849+ * Add new states for the rest of the pattern bytes, 1 state per byte
4850+ */
4851+ for (; n > 0; pattern++, n--)
4852+ {
4853+ acsm->acsmNumStates++;
4854+ List_PutNextState(acsm,state,*pattern,acsm->acsmNumStates);
4855+ state = acsm->acsmNumStates;
4856+ }
4857+
4858+ AddMatchListEntry (acsm, state, p );
4859+}
4860+
4861+/*
4862+ * Build A Non-Deterministic Finite Automata
4863+ * The keyword state table must already be built, via AddPatternStates().
4864+ */
4865+static void
4866+Build_NFA (ACSM_STRUCT2 * acsm)
4867+{
4868+ int r, s, i;
4869+ QUEUE q, *queue = &q;
4870+ acstate_t * FailState = acsm->acsmFailState;
4871+ ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
4872+ ACSM_PATTERN2 * mlist,* px;
4873+
4874+ /* Init a Queue */
4875+ queue_init (queue);
4876+
4877+
4878+ /* Add the state 0 transitions 1st, the states at depth 1, fail to state 0 */
4879+ for (i = 0; i < acsm->acsmAlphabetSize; i++)
4880+ {
4881+ s = List_GetNextState2(acsm,0,i);
4882+ if( s )
4883+ {
4884+ queue_add (queue, s);
4885+ FailState[s] = 0;
4886+ }
4887+ }
4888+
4889+ /* Build the fail state successive layer of transitions */
4890+ while (queue_count (queue) > 0)
4891+ {
4892+ r = queue_remove (queue);
4893+
4894+ /* Find Final States for any Failure */
4895+ for (i = 0; i < acsm->acsmAlphabetSize; i++)
4896+ {
4897+ int fs, next;
4898+
4899+ s = List_GetNextState(acsm,r,i);
4900+
4901+ if( s != ACSM_FAIL_STATE2 )
4902+ {
4903+ queue_add (queue, s);
4904+
4905+ fs = FailState[r];
4906+
4907+ /*
4908+ * Locate the next valid state for 'i' starting at fs
4909+ */
4910+ while( (next=List_GetNextState(acsm,fs,i)) == ACSM_FAIL_STATE2 )
4911+ {
4912+ fs = FailState[fs];
4913+ }
4914+
4915+ /*
4916+ * Update 's' state failure state to point to the next valid state
4917+ */
4918+ FailState[s] = next;
4919+
4920+ /*
4921+ * Copy 'next'states MatchList to 's' states MatchList,
4922+ * we copy them so each list can be AC_FREE'd later,
4923+ * else we could just manipulate pointers to fake the copy.
4924+ */
4925+ for( mlist = MatchList[next];
4926+ mlist;
4927+ mlist = mlist->next)
4928+ {
4929+ px = CopyMatchListEntry (mlist);
4930+
4931+ /* Insert at front of MatchList */
4932+ px->next = MatchList[s];
4933+ MatchList[s] = px;
4934+ }
4935+ }
4936+ }
4937+ }
4938+
4939+ /* Clean up the queue */
4940+ queue_free (queue);
4941+}
4942+
4943+/*
4944+ * Build Deterministic Finite Automata from the NFA
4945+ */
4946+static void
4947+Convert_NFA_To_DFA (ACSM_STRUCT2 * acsm)
4948+{
4949+ int i, r, s, cFailState;
4950+ QUEUE q, *queue = &q;
4951+ acstate_t * FailState = acsm->acsmFailState;
4952+
4953+ /* Init a Queue */
4954+ queue_init (queue);
4955+
4956+ /* Add the state 0 transitions 1st */
4957+ for(i=0; i<acsm->acsmAlphabetSize; i++)
4958+ {
4959+ s = List_GetNextState(acsm,0,i);
4960+ if ( s != 0 )
4961+ {
4962+ queue_add (queue, s);
4963+ }
4964+ }
4965+
4966+ /* Start building the next layer of transitions */
4967+ while( queue_count(queue) > 0 )
4968+ {
4969+ r = queue_remove(queue);
4970+
4971+ /* Process this states layer */
4972+ for (i = 0; i < acsm->acsmAlphabetSize; i++)
4973+ {
4974+ s = List_GetNextState(acsm,r,i);
4975+
4976+ if( s != ACSM_FAIL_STATE2 && s!= 0)
4977+ {
4978+ queue_add (queue, s);
4979+ }
4980+ else
4981+ {
4982+ cFailState = List_GetNextState(acsm,FailState[r],i);
4983+
4984+ if( cFailState != 0 && cFailState != ACSM_FAIL_STATE2 )
4985+ {
4986+ List_PutNextState(acsm,r,i,cFailState);
4987+ }
4988+ }
4989+ }
4990+ }
4991+
4992+ /* Clean up the queue */
4993+ queue_free (queue);
4994+}
4995+
4996+/*
4997+ *
4998+ * Convert a row lists for the state table to a full vector format
4999+ *
5000+ */
5001+static int
5002+Conv_List_To_Full(ACSM_STRUCT2 * acsm)
5003+{
5004+ int tcnt, k;
5005+ acstate_t * p;
5006+ acstate_t ** NextState = acsm->acsmNextState;
5007+
5008+ for(k=0;k<acsm->acsmMaxStates;k++)
5009+ {
5010+ p = AC_MALLOC( sizeof(acstate_t) * (acsm->acsmAlphabetSize+2) );
5011+ if(!p) return -1;
5012+
5013+ tcnt = List_ConvToFull( acsm, (acstate_t)k, p+2 );
5014+
5015+ p[0] = ACF_FULL;
5016+ p[1] = 0; /* no matches yet */
5017+
5018+ NextState[k] = p; /* now we have a full format row vector */
5019+ }
5020+
5021+ return 0;
5022+}
5023+
5024+/*
5025+ * Convert DFA memory usage from list based storage to a sparse-row storage.
5026+ *
5027+ * The Sparse format allows each row to be either full or sparse formatted. If the sparse row has
5028+ * too many transitions, performance or space may dictate that we use the standard full formatting
5029+ * for the row. More than 5 or 10 transitions per state ought to really whack performance. So the
5030+ * user can specify the max state transitions per state allowed in the sparse format.
5031+ *
5032+ * Standard Full Matrix Format
5033+ * ---------------------------
5034+ * acstate_t ** NextState ( 1st index is row/state, 2nd index is column=event/input)
5035+ *
5036+ * example:
5037+ *
5038+ * events -> a b c d e f g h i j k l m n o p
5039+ * states
5040+ * N 1 7 0 0 0 3 0 0 0 0 0 0 0 0 0 0
5041+ *
5042+ * Sparse Format, each row : Words Value
5043+ * 1-1 fmt(0-full,1-sparse,2-banded,3-sparsebands)
5044+ * 2-2 bool match flag (indicates this state has pattern matches)
5045+ * 3-3 sparse state count ( # of input/next-state pairs )
5046+ * 4-3+2*cnt 'input,next-state' pairs... each sizeof(acstate_t)
5047+ *
5048+ * above example case yields:
5049+ * Full Format: 0, 1 7 0 0 0 3 0 0 0 0 0 0 0 0 0 0 ...
5050+ * Sparse format: 1, 3, 'a',1,'b',7,'f',3 - uses 2+2*ntransitions (non-default transitions)
5051+ */
5052+static int
5053+Conv_Full_DFA_To_Sparse(ACSM_STRUCT2 * acsm)
5054+{
5055+ int cnt, m, k, i;
5056+ acstate_t * p, state, maxstates=0;
5057+ acstate_t ** NextState = acsm->acsmNextState;
5058+ acstate_t full[MAX_ALPHABET_SIZE];
5059+
5060+ for(k=0;k<acsm->acsmMaxStates;k++)
5061+ {
5062+ cnt=0;
5063+
5064+ List_ConvToFull(acsm, (acstate_t)k, full );
5065+
5066+ for (i = 0; i < acsm->acsmAlphabetSize; i++)
5067+ {
5068+ state = full[i];
5069+ if( state != 0 && state != ACSM_FAIL_STATE2 ) cnt++;
5070+ }
5071+
5072+ if( cnt > 0 ) maxstates++;
5073+
5074+ if( k== 0 || cnt > acsm->acsmSparseMaxRowNodes )
5075+ {
5076+ p = AC_MALLOC(sizeof(acstate_t)*(acsm->acsmAlphabetSize+2) );
5077+ if(!p) return -1;
5078+
5079+ p[0] = ACF_FULL;
5080+ p[1] = 0;
5081+ memcpy(&p[2],full,acsm->acsmAlphabetSize*sizeof(acstate_t));
5082+ }
5083+ else
5084+ {
5085+ p = AC_MALLOC(sizeof(acstate_t)*(3+2*cnt));
5086+ if(!p) return -1;
5087+
5088+ m = 0;
5089+ p[m++] = ACF_SPARSE;
5090+ p[m++] = 0; /* no matches */
5091+ p[m++] = cnt;
5092+
5093+ for(i = 0; i < acsm->acsmAlphabetSize ; i++)
5094+ {
5095+ state = full[i];
5096+ if( state != 0 && state != ACSM_FAIL_STATE2 )
5097+ {
5098+ p[m++] = i;
5099+ p[m++] = state;
5100+ }
5101+ }
5102+ }
5103+
5104+ NextState[k] = p; /* now we are a sparse formatted state transition array */
5105+ }
5106+
5107+ return 0;
5108+}
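As a concrete illustration of the layout Conv_Full_DFA_To_Sparse() emits, here is the example row from the format comment above ('a'->1, 'b'->7, 'f'->3) written out as a C initializer; unlike the shorthand in that comment it includes the boolean match word that the code actually stores at p[1]:

/* Sparse row for the example state: format, match flag, pair count, then (input, next-state) pairs. */
acstate_t sparse_row_example[] = {
	ACF_SPARSE,   /* p[0]: storage format                */
	0,            /* p[1]: no pattern terminates here    */
	3,            /* p[2]: three explicit transitions    */
	'a', 1,  'b', 7,  'f', 3
};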
5109+/*
5110+ Convert Full matrix to Banded row format.
5111+
5112+ Word values
5113+ 1 2 -> banded
5114+ 2 n number of values
5115+ 3 i index of 1st value (0-256)
5116+ 4 - 3+n next-state values at each index
5117+
5118+*/
5119+static int
5120+Conv_Full_DFA_To_Banded(ACSM_STRUCT2 * acsm)
5121+{
5122+ int first = -1, last;
5123+ acstate_t * p, state, full[MAX_ALPHABET_SIZE];
5124+ acstate_t ** NextState = acsm->acsmNextState;
5125+ int cnt,m,k,i;
5126+
5127+ for(k=0;k<acsm->acsmMaxStates;k++)
5128+ {
5129+ cnt=0;
5130+
5131+ List_ConvToFull(acsm, (acstate_t)k, full );
5132+
5133+ first=-1;
5134+ last =-2;
5135+
5136+ for (i = 0; i < acsm->acsmAlphabetSize; i++)
5137+ {
5138+ state = full[i];
5139+
5140+ if( state !=0 && state != ACSM_FAIL_STATE2 )
5141+ {
5142+ if( first < 0 ) first = i;
5143+ last = i;
5144+ }
5145+ }
5146+
5147+ /* calc band width */
5148+ cnt= last - first + 1;
5149+
5150+ p = AC_MALLOC(sizeof(acstate_t)*(4+cnt));
5151+
5152+ if(!p) return -1;
5153+
5154+ m = 0;
5155+ p[m++] = ACF_BANDED;
5156+ p[m++] = 0; /* no matches */
5157+ p[m++] = cnt;
5158+ p[m++] = first;
5159+
5160+ for(i = first; i <= last; i++)
5161+ {
5162+ p[m++] = full[i];
5163+ }
5164+
5165+ NextState[k] = p; /* now we are a banded formatted state transition array */
5166+ }
5167+
5168+ return 0;
5169+}
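The same example row in banded form, as Conv_Full_DFA_To_Banded() would emit it: transitions exist only between 'a' and 'f', so the band starts at 'a' and spans six entries, with zeros filling the gaps:

/* Banded row for the example state; lookup for input c is
 * p[4 + c - p[3]] when p[3] <= c < p[3] + p[2], otherwise state 0. */
acstate_t banded_row_example[] = {
	ACF_BANDED,   /* p[0]: storage format                */
	0,            /* p[1]: no pattern terminates here    */
	6,            /* p[2]: number of entries in the band */
	'a',          /* p[3]: index of the first entry      */
	1, 7, 0, 0, 0, 3      /* next states for 'a'..'f'    */
};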
5170+
5171+/*
5172+ * Convert full matrix to Sparse Band row format.
5173+ *
5174+ * next - Full formatted row of next states
5175+ * asize - size of alphabet
5176+ * zcnt - max number of zeros in a run of zeros in any given band.
5177+ *
5178+ * Word Values
5179+ * 1 ACF_SPARSEBANDS
5180+ * 2 number of bands
5181+ * repeat 3 - 5+ ....once for each band in this row.
5182+ *    3   number of items in this band,   4   start index of this band
5183+ * 5- next-state values in this band...
5184+ */
5185+static
5186+int calcSparseBands( acstate_t * next, int * begin, int * end, int asize, int zmax )
5187+{
5188+ int i, nbands,zcnt,last=0;
5189+ acstate_t state;
5190+
5191+ nbands=0;
5192+ for( i=0; i<asize; i++ )
5193+ {
5194+ state = next[i];
5195+
5196+ if( state !=0 && state != ACSM_FAIL_STATE2 )
5197+ {
5198+ begin[nbands] = i;
5199+ zcnt=0;
5200+
5201+ for( ; i< asize; i++ )
5202+ {
5203+ state = next[i];
5204+ if( state ==0 || state == ACSM_FAIL_STATE2 )
5205+ {
5206+ zcnt++;
5207+ if( zcnt > zmax ) break;
5208+ }
5209+ else
5210+ {
5211+ zcnt=0;
5212+ last = i;
5213+ }
5214+ }
5215+
5216+ end[nbands++] = last;
5217+
5218+ }
5219+ }
5220+
5221+ return nbands;
5222+}
5223+
5224+
5225+/*
5226+ * Sparse Bands
5227+ *
5228+ * Row Format:
5229+ * Word
5230+ * 1 SPARSEBANDS format indicator
5231+ * 2 bool indicates a pattern match in this state
5232+ * 3 number of sparse bands
5233+ * 4 number of elements in this band
5234+ * 5 start index of this band
5235+ * 6- list of next states
5236+ *
5237+ * m number of elements in this band
5238+ * m+1 start index of this band
5239+ * m+2- list of next states
5240+ */
5241+static int
5242+Conv_Full_DFA_To_SparseBands(ACSM_STRUCT2 * acsm)
5243+{
5244+ acstate_t * p;
5245+ acstate_t ** NextState = acsm->acsmNextState;
5246+ int cnt,m,k,i,zcnt=acsm->acsmSparseMaxZcnt;
5247+
5248+ int band_begin[MAX_ALPHABET_SIZE];
5249+ int band_end[MAX_ALPHABET_SIZE];
5250+ int nbands,j;
5251+ acstate_t full[MAX_ALPHABET_SIZE];
5252+
5253+ for(k=0;k<acsm->acsmMaxStates;k++)
5254+ {
5255+ cnt=0;
5256+
5257+ List_ConvToFull(acsm, (acstate_t)k, full );
5258+
5259+ nbands = calcSparseBands( full, band_begin, band_end, acsm->acsmAlphabetSize, zcnt );
5260+
5261+ /* calc band width space*/
5262+ cnt = 3;
5263+ for(i=0;i<nbands;i++)
5264+ {
5265+ cnt += 2;
5266+ cnt += band_end[i] - band_begin[i] + 1;
5267+
5268+ /*printk("state %d: sparseband %d, first=%d, last=%d, cnt=%d\n",k,i,band_begin[i],band_end[i],band_end[i]-band_begin[i]+1); */
5269+ }
5270+
5271+ p = AC_MALLOC(sizeof(acstate_t)*(cnt));
5272+
5273+ if(!p) return -1;
5274+
5275+ m = 0;
5276+ p[m++] = ACF_SPARSEBANDS;
5277+ p[m++] = 0; /* no matches */
5278+ p[m++] = nbands;
5279+
5280+ for( i=0;i<nbands;i++ )
5281+ {
5282+ p[m++] = band_end[i] - band_begin[i] + 1; /* # states in this band */
5283+ p[m++] = band_begin[i]; /* start index */
5284+
5285+ for( j=band_begin[i]; j<=band_end[i]; j++ )
5286+ {
5287+ p[m++] = full[j]; /* some states may be state zero */
5288+ }
5289+ }
5290+
5291+ NextState[k] = p; /* now we are a sparse-banded formatted state transition array */
5292+ }
5293+
5294+ return 0;
5295+}
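And the sparse-bands form from Conv_Full_DFA_To_SparseBands(): with the default zero-run limit the three 'a'..'f' transitions still collapse into a single band, so the row carries one band header followed by its values:

/* Sparse-bands row for the example state (one band). */
acstate_t sparsebands_row_example[] = {
	ACF_SPARSEBANDS,  /* p[0]: storage format              */
	0,                /* p[1]: no pattern terminates here  */
	1,                /* p[2]: number of bands             */
	6,                /* band 0: number of entries         */
	'a',              /* band 0: start index               */
	1, 7, 0, 0, 0, 3  /* band 0: next states for 'a'..'f'  */
};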
5296+
5297+/*
5298+ *
5299+ * Convert an NFA or DFA row from sparse to full format
5300+ * and store into the 'full' buffer.
5301+ *
5302+ * returns:
5303+ * 0 - failed, no state transitions
5304+ * *p - pointer to 'full' buffer
5305+ *
5306+ */
5307+/*
5308+ static
5309+ acstate_t * acsmConvToFull(ACSM_STRUCT2 * acsm, acstate_t k, acstate_t * full )
5310+ {
5311+ int i;
5312+ acstate_t * p, n, fmt, index, nb, bmatch;
5313+ acstate_t ** NextState = acsm->acsmNextState;
5314+
5315+ p = NextState[k];
5316+
5317+ if( !p ) return 0;
5318+
5319+ fmt = *p++;
5320+
5321+ bmatch = *p++;
5322+
5323+ if( fmt ==ACF_SPARSE )
5324+ {
5325+ n = *p++;
5326+ for( ; n>0; n--, p+=2 )
5327+ {
5328+ full[ p[0] ] = p[1];
5329+ }
5330+ }
5331+ else if( fmt ==ACF_BANDED )
5332+ {
5333+
5334+ n = *p++;
5335+ index = *p++;
5336+
5337+ for( ; n>0; n--, p++ )
5338+ {
5339+ full[ index++ ] = p[0];
5340+ }
5341+ }
5342+ else if( fmt ==ACF_SPARSEBANDS )
5343+ {
5344+ nb = *p++;
5345+ for(i=0;i<nb;i++)
5346+ {
5347+ n = *p++;
5348+ index = *p++;
5349+ for( ; n>0; n--, p++ )
5350+ {
5351+ full[ index++ ] = p[0];
5352+ }
5353+ }
5354+ }
5355+ else if( fmt == ACF_FULL )
5356+ {
5357+ memcpy(full,p,acsm->acsmAlphabetSize*sizeof(acstate_t));
5358+ }
5359+
5360+ return full;
5361+ }
5362+*/
5363+
5364+/*
5365+ * Select the desired storage mode
5366+ */
5367+int acsmSelectFormat2( ACSM_STRUCT2 * acsm, int m )
5368+{
5369+ switch( m )
5370+ {
5371+ case ACF_FULL:
5372+ case ACF_SPARSE:
5373+ case ACF_BANDED:
5374+ case ACF_SPARSEBANDS:
5375+ acsm->acsmFormat = m;
5376+ break;
5377+ default:
5378+ return -1;
5379+ }
5380+
5381+ return 0;
5382+}
5383+/*
5384+ *
5385+ */
5386+void acsmSetMaxSparseBandZeros2( ACSM_STRUCT2 * acsm, int n )
5387+{
5388+ acsm->acsmSparseMaxZcnt = n;
5389+}
5390+/*
5391+ *
5392+ */
5393+void acsmSetMaxSparseElements2( ACSM_STRUCT2 * acsm, int n )
5394+{
5395+ acsm->acsmSparseMaxRowNodes = n;
5396+}
5397+/*
5398+ *
5399+ */
5400+int acsmSelectFSA2( ACSM_STRUCT2 * acsm, int m )
5401+{
5402+ switch( m )
5403+ {
5404+ case FSA_TRIE:
5405+ case FSA_NFA:
5406+ case FSA_DFA:
5407+      acsm->acsmFSA = m;
5407+      return 0;   /* without this, control falls through to the failure return below */
5408+ default:
5409+ return -1;
5410+ }
5411+}
5412+/*
5413+ *
5414+ */
5415+int acsmSetAlphabetSize2( ACSM_STRUCT2 * acsm, int n )
5416+{
5417+ if( n <= MAX_ALPHABET_SIZE )
5418+ {
5419+ acsm->acsmAlphabetSize = n;
5420+ }
5421+ else
5422+ {
5423+ return -1;
5424+ }
5425+ return 0;
5426+}
5427+/*
5428+ * Create a new AC state machine
5429+ */
5430+static ACSM_STRUCT2 * acsmNew2 (void)
5431+{
5432+ ACSM_STRUCT2 * p;
5433+
5434+ init_xlatcase ();
5435+
5436+ p = (ACSM_STRUCT2 *) AC_MALLOC(sizeof (ACSM_STRUCT2));
5437+ MEMASSERT (p, "acsmNew");
5438+
5439+ if (p)
5440+ {
5441+ memset (p, 0, sizeof (ACSM_STRUCT2));
5442+
5443+ /* Some defaults */
5444+ p->acsmFSA = FSA_DFA;
5445+ p->acsmFormat = ACF_BANDED;
5446+ p->acsmAlphabetSize = 256;
5447+ p->acsmSparseMaxRowNodes = 256;
5448+ p->acsmSparseMaxZcnt = 10;
5449+ }
5450+
5451+ return p;
5452+}
5453+/*
5454+ * Add a pattern to the list of patterns for this state machine
5455+ *
5456+ */
5457+int
5458+acsmAddPattern2 (ACSM_STRUCT2 * p, unsigned char *pat, int n, int nocase,
5459+ int offset, int depth, void * id, int iid)
5460+{
5461+ ACSM_PATTERN2 * plist;
5462+
5463+ plist = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
5464+ MEMASSERT (plist, "acsmAddPattern");
5465+
5466+ plist->patrn = (unsigned char *) AC_MALLOC ( n );
5467+ MEMASSERT (plist->patrn, "acsmAddPattern");
5468+
5469+ ConvertCaseEx(plist->patrn, pat, n);
5470+
5471+ plist->casepatrn = (unsigned char *) AC_MALLOC ( n );
5472+ MEMASSERT (plist->casepatrn, "acsmAddPattern");
5473+
5474+ memcpy (plist->casepatrn, pat, n);
5475+
5476+ plist->n = n;
5477+ plist->nocase = nocase;
5478+ plist->offset = offset;
5479+ plist->depth = depth;
5480+ plist->id = id;
5481+ plist->iid = iid;
5482+
5483+ plist->next = p->acsmPatterns;
5484+ p->acsmPatterns = plist;
5485+
5486+ return 0;
5487+}
5488+/*
5489+ * Add a Key to the list of key+data pairs
5490+ */
5491+int acsmAddKey2(ACSM_STRUCT2 * p, unsigned char *key, int klen, int nocase, void * data)
5492+{
5493+ ACSM_PATTERN2 * plist;
5494+
5495+ plist = (ACSM_PATTERN2 *) AC_MALLOC (sizeof (ACSM_PATTERN2));
5496+ MEMASSERT (plist, "acsmAddPattern");
5497+
5498+ plist->patrn = (unsigned char *) AC_MALLOC (klen);
5499+ memcpy (plist->patrn, key, klen);
5500+
5501+ plist->casepatrn = (unsigned char *) AC_MALLOC (klen);
5502+ memcpy (plist->casepatrn, key, klen);
5503+
5504+ plist->n = klen;
5505+ plist->nocase = nocase;
5506+ plist->offset = 0;
5507+ plist->depth = 0;
5508+ plist->id = 0;
5509+ plist->iid = 0;
5510+
5511+ plist->next = p->acsmPatterns;
5512+ p->acsmPatterns = plist;
5513+
5514+ return 0;
5515+}
5516+
5517+/*
5518+ * Copy a boolean match flag into the NextState table, for caching purposes.
5519+ */
5520+static
5521+void acsmUpdateMatchStates( ACSM_STRUCT2 * acsm )
5522+{
5523+ acstate_t state;
5524+ acstate_t ** NextState = acsm->acsmNextState;
5525+ ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
5526+
5527+ for( state=0; state<acsm->acsmNumStates; state++ )
5528+ {
5529+ if( MatchList[state] )
5530+ {
5531+ NextState[state][1] = 1;
5532+ }
5533+ else
5534+ {
5535+ NextState[state][1] = 0;
5536+ }
5537+ }
5538+}
5539+
5540+/*
5541+ * Compile State Machine - NFA or DFA and Full or Banded or Sparse or SparseBands
5542+ */
5543+int
5544+acsmCompile2 (ACSM_STRUCT2 * acsm)
5545+{
5546+ int k;
5547+ ACSM_PATTERN2 * plist;
5548+
5549+ /* Count number of states */
5550+ for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
5551+ {
5552+ acsm->acsmMaxStates += plist->n;
5553+ /* acsm->acsmMaxStates += plist->n*2; if we handle case in the table */
5554+ }
5555+ acsm->acsmMaxStates++; /* one extra */
5556+
5557+ /* Alloc a List based State Transition table */
5558+ acsm->acsmTransTable =(trans_node_t**) AC_MALLOC(sizeof(trans_node_t*) * acsm->acsmMaxStates );
5559+ MEMASSERT (acsm->acsmTransTable, "acsmCompile");
5560+
5561+ memset (acsm->acsmTransTable, 0, sizeof(trans_node_t*) * acsm->acsmMaxStates);
5562+
5563+ /* Alloc a failure table - this has a failure state, and a match list for each state */
5564+ acsm->acsmFailState =(acstate_t*) AC_MALLOC(sizeof(acstate_t) * acsm->acsmMaxStates );
5565+ MEMASSERT (acsm->acsmFailState, "acsmCompile");
5566+
5567+ memset (acsm->acsmFailState, 0, sizeof(acstate_t) * acsm->acsmMaxStates );
5568+
5569+  /* Alloc a MatchList table - this has a list of pattern matches for each state, if any */
5570+ acsm->acsmMatchList=(ACSM_PATTERN2**) AC_MALLOC(sizeof(ACSM_PATTERN2*) * acsm->acsmMaxStates );
5571+ MEMASSERT (acsm->acsmMatchList, "acsmCompile");
5572+
5573+ memset (acsm->acsmMatchList, 0, sizeof(ACSM_PATTERN2*) * acsm->acsmMaxStates );
5574+
5575+ /* Alloc a separate state transition table == in state 's' due to event 'k', transition to 'next' state */
5576+ acsm->acsmNextState=(acstate_t**)AC_MALLOC( acsm->acsmMaxStates * sizeof(acstate_t*) );
5577+ MEMASSERT(acsm->acsmNextState, "acsmCompile-NextState");
5578+
5579+ for (k = 0; k < acsm->acsmMaxStates; k++)
5580+ {
5581+ acsm->acsmNextState[k]=(acstate_t*)0;
5582+ }
5583+
5584+ /* Initialize state zero as a branch */
5585+ acsm->acsmNumStates = 0;
5586+
5587+ /* Add the 0'th state, */
5588+ //acsm->acsmNumStates++;
5589+
5590+ /* Add each Pattern to the State Table - This forms a keywords state table */
5591+ for (plist = acsm->acsmPatterns; plist != NULL; plist = plist->next)
5592+ {
5593+ AddPatternStates (acsm, plist);
5594+ }
5595+
5596+ acsm->acsmNumStates++;
5597+
5598+ if( acsm->acsmFSA == FSA_DFA || acsm->acsmFSA == FSA_NFA )
5599+ {
5600+ /* Build the NFA */
5601+ Build_NFA (acsm);
5602+ }
5603+
5604+ if( acsm->acsmFSA == FSA_DFA )
5605+ {
5606+ /* Convert the NFA to a DFA */
5607+ Convert_NFA_To_DFA (acsm);
5608+ }
5609+
5610+ /*
5611+ *
5612+ * Select Final Transition Table Storage Mode
5613+ *
5614+ */
5615+ if( acsm->acsmFormat == ACF_SPARSE )
5616+ {
5617+ /* Convert DFA Full matrix to a Sparse matrix */
5618+ if( Conv_Full_DFA_To_Sparse(acsm) )
5619+ return -1;
5620+ }
5621+
5622+ else if( acsm->acsmFormat == ACF_BANDED )
5623+ {
5624+      /* Convert DFA Full matrix to a Banded matrix */
5625+ if( Conv_Full_DFA_To_Banded(acsm) )
5626+ return -1;
5627+ }
5628+
5629+ else if( acsm->acsmFormat == ACF_SPARSEBANDS )
5630+ {
5631+      /* Convert DFA Full matrix to a Sparse-Banded matrix */
5632+ if( Conv_Full_DFA_To_SparseBands(acsm) )
5633+ return -1;
5634+ }
5635+ else if( acsm->acsmFormat == ACF_FULL )
5636+ {
5637+ if( Conv_List_To_Full( acsm ) )
5638+ return -1;
5639+ }
5640+
5641+ acsmUpdateMatchStates( acsm ); /* load boolean match flags into state table */
5642+
5643+ /* Free up the Table Of Transition Lists */
5644+ List_FreeTransTable( acsm );
5645+
5646+ /* For now -- show this info */
5647+ /*
5648+ * acsmPrintInfo( acsm );
5649+ */
5650+
5651+
5652+ /* Accrue Summary State Stats */
5653+ summary.num_states += acsm->acsmNumStates;
5654+ summary.num_transitions += acsm->acsmNumTrans;
5655+
5656+ memcpy( &summary.acsm, acsm, sizeof(ACSM_STRUCT2));
5657+
5658+ return 0;
5659+}
5660+
5661+/*
5662+ * Get the NextState from the NFA, all NFA storage formats use this
5663+ */
5664+inline
5665+acstate_t SparseGetNextStateNFA(acstate_t * ps, acstate_t state, unsigned input)
5666+{
5667+ acstate_t fmt;
5668+ acstate_t n;
5669+ int index;
5670+ int nb;
5671+
5672+ fmt = *ps++;
5673+
5674+ ps++; /* skip bMatchState */
5675+
5676+ switch( fmt )
5677+ {
5678+ case ACF_BANDED:
5679+ {
5680+ n = ps[0];
5681+ index = ps[1];
5682+
5683+ if( input < index )
5684+ {
5685+ if(state==0)
5686+ {
5687+ return 0;
5688+ }
5689+ else
5690+ {
5691+ return (acstate_t)ACSM_FAIL_STATE2;
5692+ }
5693+ }
5694+ if( input >= index + n )
5695+ {
5696+ if(state==0)
5697+ {
5698+ return 0;
5699+ }
5700+ else
5701+ {
5702+ return (acstate_t)ACSM_FAIL_STATE2;
5703+ }
5704+ }
5705+ if( ps[input-index] == 0 )
5706+ {
5707+ if( state != 0 )
5708+ {
5709+ return ACSM_FAIL_STATE2;
5710+ }
5711+ }
5712+
5713+ return (acstate_t) ps[input-index];
5714+ }
5715+
5716+ case ACF_SPARSE:
5717+ {
5718+ n = *ps++; /* number of sparse index-value entries */
5719+
5720+ for( ; n>0 ; n-- )
5721+ {
5722+ if( ps[0] > input ) /* cannot match the input, already a higher value than the input */
5723+ {
5724+ return (acstate_t)ACSM_FAIL_STATE2; /* default state */
5725+ }
5726+ else if( ps[0] == input )
5727+ {
5728+ return ps[1]; /* next state */
5729+ }
5730+ ps+=2;
5731+ }
5732+ if( state == 0 )
5733+ {
5734+ return 0;
5735+ }
5736+ return ACSM_FAIL_STATE2;
5737+ }
5738+
5739+ case ACF_SPARSEBANDS:
5740+ {
5741+ nb = *ps++; /* number of bands */
5742+
5743+ while( nb > 0 ) /* for each band */
5744+ {
5745+ n = *ps++; /* number of elements */
5746+ index = *ps++; /* 1st element value */
5747+
5748+ if( input < index )
5749+ {
5750+ if( state != 0 )
5751+ {
5752+ return (acstate_t)ACSM_FAIL_STATE2;
5753+ }
5754+ return (acstate_t)0;
5755+ }
5756+ if( (input >= index) && (input < (index + n)) )
5757+ {
5758+ if( ps[input-index] == 0 )
5759+ {
5760+ if( state != 0 )
5761+ {
5762+ return ACSM_FAIL_STATE2;
5763+ }
5764+ }
5765+ return (acstate_t) ps[input-index];
5766+ }
5767+ nb--;
5768+ ps += n;
5769+ }
5770+ if( state != 0 )
5771+ {
5772+ return (acstate_t)ACSM_FAIL_STATE2;
5773+ }
5774+ return (acstate_t)0;
5775+ }
5776+
5777+ case ACF_FULL:
5778+ {
5779+ if( ps[input] == 0 )
5780+ {
5781+ if( state != 0 )
5782+ {
5783+ return ACSM_FAIL_STATE2;
5784+ }
5785+ }
5786+ return ps[input];
5787+ }
5788+ }
5789+
5790+ return 0;
5791+}
5792+
5793+
5794+
5795+/*
5796+ * Get the NextState from the DFA Next State Transition table
5797+ * Full and banded are supported separately, this is for
5798+ * sparse and sparse-bands
5799+ */
5800+inline
5801+acstate_t SparseGetNextStateDFA(acstate_t * ps, acstate_t state, unsigned input)
5802+{
5803+ acstate_t n, nb;
5804+ int index;
5805+
5806+ switch( ps[0] )
5807+ {
5808+ /* BANDED */
5809+ case ACF_BANDED:
5810+ {
5811+ /* n=ps[2] : number of entries in the band */
5812+ /* index=ps[3] : index of the 1st entry, sequential thereafter */
5813+
5814+ if( input < ps[3] ) return 0;
5815+ if( input >= (ps[3]+ps[2]) ) return 0;
5816+
5817+ return ps[4+input-ps[3]];
5818+ }
5819+
5820+ /* FULL */
5821+ case ACF_FULL:
5822+ {
5823+ return ps[2+input];
5824+ }
5825+
5826+ /* SPARSE */
5827+ case ACF_SPARSE:
5828+ {
5829+ n = ps[2]; /* number of entries/ key+next pairs */
5830+
5831+ ps += 3;
5832+
5833+ for( ; n>0 ; n-- )
5834+ {
5835+ if( input < ps[0] ) /* cannot match the input, already a higher value than the input */
5836+ {
5837+ return (acstate_t)0; /* default state */
5838+ }
5839+ else if( ps[0] == input )
5840+ {
5841+ return ps[1]; /* next state */
5842+ }
5843+ ps += 2;
5844+ }
5845+ return (acstate_t)0;
5846+ }
5847+
5848+
5849+ /* SPARSEBANDS */
5850+ case ACF_SPARSEBANDS:
5851+ {
5852+ nb = ps[2]; /* number of bands */
5853+
5854+ ps += 3;
5855+
5856+ while( nb > 0 ) /* for each band */
5857+ {
5858+ n = ps[0]; /* number of elements in this band */
5859+ index = ps[1]; /* start index/char of this band */
5860+ if( input < index )
5861+ {
5862+ return (acstate_t)0;
5863+ }
5864+ if( (input < (index + n)) )
5865+ {
5866+ return (acstate_t) ps[2+input-index];
5867+ }
5868+ nb--;
5869+ ps += n;
5870+ }
5871+ return (acstate_t)0;
5872+ }
5873+ }
5874+
5875+ return 0;
5876+}
5877+/*
5878+ * Search Text or Binary Data for Pattern matches
5879+ *
5880+ * Sparse & Sparse-Banded Matrix search
5881+ */
5882+static
5883+inline
5884+int
5885+acsmSearchSparseDFA(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
5886+ int (*Match) (void * id, int index, void *data),
5887+ void *data)
5888+{
5889+ acstate_t state;
5890+ ACSM_PATTERN2 * mlist;
5891+ unsigned char * Tend;
5892+ int nfound = 0;
5893+ unsigned char * T, * Tc;
5894+ int index;
5895+ acstate_t ** NextState = acsm->acsmNextState;
5896+ ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
5897+
5898+ Tc = Tx;
5899+ T = Tx;
5900+ Tend = T + n;
5901+
5902+ for( state = 0; T < Tend; T++ )
5903+ {
5904+ state = SparseGetNextStateDFA ( NextState[state], state, xlatcase[*T] );
5905+
5906+ /* test if this state has any matching patterns */
5907+ if( NextState[state][1] )
5908+ {
5909+ for( mlist = MatchList[state];
5910+ mlist!= NULL;
5911+ mlist = mlist->next )
5912+ {
5913+ index = T - mlist->n - Tc;
5914+ if( mlist->nocase )
5915+ {
5916+ nfound++;
5917+ if (Match (mlist->id, index, data))
5918+ return nfound;
5919+ }
5920+ else
5921+ {
5922+ if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
5923+ {
5924+ nfound++;
5925+ if (Match (mlist->id, index, data))
5926+ return nfound;
5927+ }
5928+ }
5929+ }
5930+ }
5931+ }
5932+ return nfound;
5933+}
5934+/*
5935+ * Full format DFA search
5936+ *   Do not change anything here without testing; caching and prefetching
5937+ *   performance is very sensitive to any changes.
5938+ *
5939+ * Perf-Notes:
5940+ * 1) replaced ConvertCaseEx with inline xlatcase - this improves performance 5-10%
5941+ * 2) using 'nocase' improves performance again by 10-15%, since memcmp is not needed
5942+ * 3)
5943+ */
5944+static
5945+inline
5946+int
5947+acsmSearchSparseDFA_Full(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
5948+ int (*Match) (void * id, int index, void *data),
5949+ void *data)
5950+{
5951+ ACSM_PATTERN2 * mlist;
5952+ unsigned char * Tend;
5953+ unsigned char * T;
5954+ int index;
5955+ acstate_t state;
5956+ acstate_t * ps;
5957+ acstate_t sindex;
5958+ acstate_t ** NextState = acsm->acsmNextState;
5959+ ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
5960+ int nfound = 0;
5961+
5962+ T = Tx;
5963+ Tend = Tx + n;
5964+
5965+ for( state = 0; T < Tend; T++ )
5966+ {
5967+ ps = NextState[ state ];
5968+
5969+ sindex = xlatcase[ T[0] ];
5970+
5971+ /* check the current state for a pattern match */
5972+ if( ps[1] )
5973+ {
5974+ for( mlist = MatchList[state];
5975+ mlist!= NULL;
5976+ mlist = mlist->next )
5977+ {
5978+ index = T - mlist->n - Tx;
5979+
5980+
5981+ if( mlist->nocase )
5982+ {
5983+ nfound++;
5984+ if (Match (mlist->id, index, data))
5985+ return nfound;
5986+ }
5987+ else
5988+ {
5989+ if( memcmp (mlist->casepatrn, Tx + index, mlist->n ) == 0 )
5990+ {
5991+ nfound++;
5992+ if (Match (mlist->id, index, data))
5993+ return nfound;
5994+ }
5995+ }
5996+
5997+ }
5998+ }
5999+
6000+ state = ps[ 2u + sindex ];
6001+ }
6002+
6003+ /* Check the last state for a pattern match */
6004+ for( mlist = MatchList[state];
6005+ mlist!= NULL;
6006+ mlist = mlist->next )
6007+ {
6008+ index = T - mlist->n - Tx;
6009+
6010+ if( mlist->nocase )
6011+ {
6012+ nfound++;
6013+ if (Match (mlist->id, index, data))
6014+ return nfound;
6015+ }
6016+ else
6017+ {
6018+ if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6019+ {
6020+ nfound++;
6021+ if (Match (mlist->id, index, data))
6022+ return nfound;
6023+ }
6024+ }
6025+ }
6026+
6027+ return nfound;
6028+}
6029+/*
6030+ * Banded-Row format DFA search
6031+ *   Do not change anything here; caching and prefetching
6032+ *   performance is very sensitive to any changes.
6033+ *
6034+ * ps[0] = storage fmt
6035+ * ps[1] = bool match flag
6036+ * ps[2] = # elements in band
6037+ * ps[3] = index of 1st element
6038+ */
6039+static
6040+inline
6041+int
6042+acsmSearchSparseDFA_Banded(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
6043+ int (*Match) (void * id, int index, void *data),
6044+ void *data)
6045+{
6046+ acstate_t state;
6047+ unsigned char * Tend;
6048+ unsigned char * T;
6049+ int sindex;
6050+ int index;
6051+ acstate_t ** NextState = acsm->acsmNextState;
6052+ ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
6053+ ACSM_PATTERN2 * mlist;
6054+ acstate_t * ps;
6055+ int nfound = 0;
6056+
6057+ T = Tx;
6058+ Tend = T + n;
6059+
6060+ for( state = 0; T < Tend; T++ )
6061+ {
6062+ ps = NextState[state];
6063+
6064+ sindex = xlatcase[ T[0] ];
6065+
6066+ /* test if this state has any matching patterns */
6067+ if( ps[1] )
6068+ {
6069+ for( mlist = MatchList[state];
6070+ mlist!= NULL;
6071+ mlist = mlist->next )
6072+ {
6073+ index = T - mlist->n - Tx;
6074+
6075+ if( mlist->nocase )
6076+ {
6077+ nfound++;
6078+ if (Match (mlist->id, index, data))
6079+ return nfound;
6080+ }
6081+ else
6082+ {
6083+ if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6084+ {
6085+ nfound++;
6086+ if (Match (mlist->id, index, data))
6087+ return nfound;
6088+ }
6089+ }
6090+ }
6091+ }
6092+
6093+ if( sindex < ps[3] ) state = 0;
6094+ else if( sindex >= (ps[3] + ps[2]) ) state = 0;
6095+ else state = ps[ 4u + sindex - ps[3] ];
6096+ }
6097+
6098+ /* Check the last state for a pattern match */
6099+ for( mlist = MatchList[state];
6100+ mlist!= NULL;
6101+ mlist = mlist->next )
6102+ {
6103+ index = T - mlist->n - Tx;
6104+
6105+ if( mlist->nocase )
6106+ {
6107+ nfound++;
6108+ if (Match (mlist->id, index, data))
6109+ return nfound;
6110+ }
6111+ else
6112+ {
6113+ if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6114+ {
6115+ nfound++;
6116+ if (Match (mlist->id, index, data))
6117+ return nfound;
6118+ }
6119+ }
6120+ }
6121+
6122+ return nfound;
6123+}
6124+
6125+
6126+
6127+/*
6128+ * Search Text or Binary Data for Pattern matches
6129+ *
6130+ * Sparse Storage Version
6131+ */
6132+static
6133+inline
6134+int
6135+acsmSearchSparseNFA(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
6136+ int (*Match) (void * id, int index, void *data),
6137+ void *data)
6138+{
6139+ acstate_t state;
6140+ ACSM_PATTERN2 * mlist;
6141+ unsigned char * Tend;
6142+ int nfound = 0;
6143+ unsigned char * T, *Tc;
6144+ int index;
6145+ acstate_t ** NextState= acsm->acsmNextState;
6146+ acstate_t * FailState= acsm->acsmFailState;
6147+ ACSM_PATTERN2 ** MatchList = acsm->acsmMatchList;
6148+ unsigned char Tchar;
6149+
6150+ Tc = Tx;
6151+ T = Tx;
6152+ Tend = T + n;
6153+
6154+ for( state = 0; T < Tend; T++ )
6155+ {
6156+ acstate_t nstate;
6157+
6158+ Tchar = xlatcase[ *T ];
6159+
6160+ while( (nstate=SparseGetNextStateNFA(NextState[state],state,Tchar))==ACSM_FAIL_STATE2 )
6161+ state = FailState[state];
6162+
6163+ state = nstate;
6164+
6165+ for( mlist = MatchList[state];
6166+ mlist!= NULL;
6167+ mlist = mlist->next )
6168+ {
6169+ index = T - mlist->n - Tx;
6170+ if( mlist->nocase )
6171+ {
6172+ nfound++;
6173+ if (Match (mlist->id, index, data))
6174+ return nfound;
6175+ }
6176+ else
6177+ {
6178+ if( memcmp (mlist->casepatrn, Tx + index, mlist->n) == 0 )
6179+ {
6180+ nfound++;
6181+ if (Match (mlist->id, index, data))
6182+ return nfound;
6183+ }
6184+ }
6185+ }
6186+ }
6187+
6188+ return nfound;
6189+}
6190+
6191+/*
6192+ * Search Function
6193+ */
6194+int
6195+acsmSearch2(ACSM_STRUCT2 * acsm, unsigned char *Tx, int n,
6196+ int (*Match) (void * id, int index, void *data),
6197+ void *data)
6198+{
6199+
6200+ switch( acsm->acsmFSA )
6201+ {
6202+ case FSA_DFA:
6203+
6204+ if( acsm->acsmFormat == ACF_FULL )
6205+ {
6206+ return acsmSearchSparseDFA_Full( acsm, Tx, n, Match,data );
6207+ }
6208+ else if( acsm->acsmFormat == ACF_BANDED )
6209+ {
6210+ return acsmSearchSparseDFA_Banded( acsm, Tx, n, Match,data );
6211+ }
6212+ else
6213+ {
6214+ return acsmSearchSparseDFA( acsm, Tx, n, Match,data );
6215+ }
6216+
6217+ case FSA_NFA:
6218+
6219+ return acsmSearchSparseNFA( acsm, Tx, n, Match,data );
6220+
6221+ case FSA_TRIE:
6222+
6223+ return 0;
6224+ }
6225+ return 0;
6226+}
6227+
6228+
6229+/*
6230+ * Free all memory
6231+ */
6232+void
6233+acsmFree2 (ACSM_STRUCT2 * acsm)
6234+{
6235+ int i;
6236+ ACSM_PATTERN2 * mlist, *ilist;
6237+ for (i = 0; i < acsm->acsmMaxStates; i++)
6238+ {
6239+ mlist = acsm->acsmMatchList[i];
6240+
6241+ while (mlist)
6242+ {
6243+ ilist = mlist;
6244+ mlist = mlist->next;
6245+ AC_FREE (ilist);
6246+ }
6247+ AC_FREE(acsm->acsmNextState[i]);
6248+ }
6249+ AC_FREE(acsm->acsmFailState);
6250+ AC_FREE(acsm->acsmMatchList);
6251+}
6252+
6253+/* ********************************** */
6254+
6255+static void ring_sock_destruct(struct sock *sk) {
6256+
6257+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
6258+ skb_queue_purge(&sk->sk_receive_queue);
6259+
6260+ if (!sock_flag(sk, SOCK_DEAD)) {
6261+#if defined(RING_DEBUG)
6262+ printk("Attempt to release alive ring socket: %p\n", sk);
6263+#endif
6264+ return;
6265+ }
6266+
6267+ BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
6268+ BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
6269+#else
6270+
6271+ BUG_TRAP(atomic_read(&sk->rmem_alloc)==0);
6272+ BUG_TRAP(atomic_read(&sk->wmem_alloc)==0);
6273+
6274+ if (!sk->dead) {
6275+#if defined(RING_DEBUG)
6276+ printk("Attempt to release alive ring socket: %p\n", sk);
6277+#endif
6278+ return;
6279+ }
6280+#endif
6281+
6282+ kfree(ring_sk(sk));
6283+
6284+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
6285+ MOD_DEC_USE_COUNT;
6286+#endif
6287+}
6288+
6289+/* ********************************** */
6290+
6291+static void ring_proc_add(struct ring_opt *pfr) {
6292+ if(ring_proc_dir != NULL) {
6293+ char name[16];
6294+
6295+ pfr->ring_pid = current->pid;
6296+
6297+ snprintf(name, sizeof(name), "%d", pfr->ring_pid);
6298+ create_proc_read_entry(name, 0, ring_proc_dir,
6299+ ring_proc_get_info, pfr);
6300+ /* printk("PF_RING: added /proc/net/pf_ring/%s\n", name); */
6301+ }
6302+}
6303+
6304+/* ********************************** */
6305+
6306+static void ring_proc_remove(struct ring_opt *pfr) {
6307+ if(ring_proc_dir != NULL) {
6308+ char name[16];
6309+
6310+ snprintf(name, sizeof(name), "%d", pfr->ring_pid);
6311+ remove_proc_entry(name, ring_proc_dir);
6312+ /* printk("PF_RING: removed /proc/net/pf_ring/%s\n", name); */
6313+ }
6314+}
6315+
6316+/* ********************************** */
6317+
6318+static int ring_proc_get_info(char *buf, char **start, off_t offset,
6319+ int len, int *unused, void *data)
6320+{
6321+ int rlen = 0;
6322+ struct ring_opt *pfr;
6323+ FlowSlotInfo *fsi;
6324+
6325+ if(data == NULL) {
6326+ /* /proc/net/pf_ring/info */
6327+ rlen = sprintf(buf,"Version : %s\n", RING_VERSION);
6328+ rlen += sprintf(buf + rlen,"Bucket length : %d bytes\n", bucket_len);
6329+ rlen += sprintf(buf + rlen,"Ring slots : %d\n", num_slots);
6330+ rlen += sprintf(buf + rlen,"Sample rate : %d [1=no sampling]\n", sample_rate);
6331+
6332+ rlen += sprintf(buf + rlen,"Capture TX : %s\n",
6333+ enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
6334+ rlen += sprintf(buf + rlen,"Transparent mode : %s\n",
6335+ transparent_mode ? "Yes" : "No");
6336+ rlen += sprintf(buf + rlen,"Total rings : %d\n", ring_table_size);
6337+ } else {
6338+ /* detailed statistics about a PF_RING */
6339+ pfr = (struct ring_opt*)data;
6340+
6341+ if(data) {
6342+ fsi = pfr->slots_info;
6343+
6344+ if(fsi) {
6345+ rlen = sprintf(buf, "Bound Device : %s\n",
6346+ pfr->ring_netdev->name == NULL ? "<NULL>" : pfr->ring_netdev->name);
6347+ rlen += sprintf(buf + rlen,"Version : %d\n", fsi->version);
6348+ rlen += sprintf(buf + rlen,"Sampling Rate : %d\n", pfr->sample_rate);
6349+ rlen += sprintf(buf + rlen,"BPF Filtering : %s\n", pfr->bpfFilter ? "Enabled" : "Disabled");
6350+ rlen += sprintf(buf + rlen,"Bloom Filters : %s\n", pfr->bitmask_enabled ? "Enabled" : "Disabled");
6351+ rlen += sprintf(buf + rlen,"Pattern Search: %s\n", pfr->acsm ? "Enabled" : "Disabled");
6352+ rlen += sprintf(buf + rlen,"Cluster Id : %d\n", pfr->cluster_id);
6353+ rlen += sprintf(buf + rlen,"Tot Slots : %d\n", fsi->tot_slots);
6354+ rlen += sprintf(buf + rlen,"Slot Len : %d\n", fsi->slot_len);
6355+ rlen += sprintf(buf + rlen,"Data Len : %d\n", fsi->data_len);
6356+ rlen += sprintf(buf + rlen,"Tot Memory : %d\n", fsi->tot_mem);
6357+ rlen += sprintf(buf + rlen,"Tot Packets : %lu\n", (unsigned long)fsi->tot_pkts);
6358+ rlen += sprintf(buf + rlen,"Tot Pkt Lost : %lu\n", (unsigned long)fsi->tot_lost);
6359+ rlen += sprintf(buf + rlen,"Tot Insert : %lu\n", (unsigned long)fsi->tot_insert);
6360+ rlen += sprintf(buf + rlen,"Tot Read : %lu\n", (unsigned long)fsi->tot_read);
6361+
6362+ } else
6363+ rlen = sprintf(buf, "WARNING fsi == NULL\n");
6364+ } else
6365+ rlen = sprintf(buf, "WARNING data == NULL\n");
6366+ }
6367+
6368+ return rlen;
6369+}
6370+
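+/*
+ * Note: the handler above backs two kinds of proc entries. With data ==
+ * NULL it serves the global /proc/net/pf_ring/info file registered by
+ * ring_proc_init() below (module-wide settings and the number of active
+ * rings); otherwise it serves the per-socket /proc/net/pf_ring/<pid>
+ * entry created by ring_proc_add() above, reporting the ring geometry
+ * and counters taken from that ring's FlowSlotInfo.
+ */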
6371+/* ********************************** */
6372+
6373+static void ring_proc_init(void) {
6374+ ring_proc_dir = proc_mkdir("pf_ring", proc_net);
6375+
6376+ if(ring_proc_dir) {
6377+ ring_proc_dir->owner = THIS_MODULE;
6378+ ring_proc = create_proc_read_entry("info", 0, ring_proc_dir,
6379+ ring_proc_get_info, NULL);
6380+ if(!ring_proc)
6381+ printk("PF_RING: unable to register proc file\n");
6382+ else {
6383+ ring_proc->owner = THIS_MODULE;
6384+ printk("PF_RING: registered /proc/net/pf_ring/\n");
6385+ }
6386+ } else
6387+ printk("PF_RING: unable to create /proc/net/pf_ring\n");
6388+}
6389+
6390+/* ********************************** */
6391+
6392+static void ring_proc_term(void) {
6393+ if(ring_proc != NULL) {
6394+ remove_proc_entry("info", ring_proc_dir);
6395+ if(ring_proc_dir != NULL) remove_proc_entry("pf_ring", proc_net);
6396+
6397+ printk("PF_RING: deregistered /proc/net/pf_ring\n");
6398+ }
6399+}
6400+
6401+/* ********************************** */
6402+
6403+/*
6404+ * ring_insert()
6405+ *
6406+ * store the sk in a new element and add it
6407+ * to the head of the list.
6408+ */
6409+static inline void ring_insert(struct sock *sk) {
6410+ struct ring_element *next;
6411+
6412+#if defined(RING_DEBUG)
6413+ printk("RING: ring_insert()\n");
6414+#endif
6415+
6416+ next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC);
6417+ if(next != NULL) {
6418+ next->sk = sk;
6419+ write_lock_irq(&ring_mgmt_lock);
6420+ list_add(&next->list, &ring_table);
6421+ write_unlock_irq(&ring_mgmt_lock);
6422+ } else {
6423+ if(net_ratelimit())
6424+ printk("RING: could not kmalloc slot!!\n");
6425+ }
6426+
6427+ ring_table_size++;
6428+ ring_proc_add(ring_sk(sk));
6429+}
6430+
6431+/* ********************************** */
6432+
6433+/*
6434+ * ring_remove()
6435+ *
6436+ * For each of the elements in the list:
6437+ * - check if this is the element we want to delete
6438+ * - if it is, remove it from the list, and free it.
6439+ *
6440+ * stop when we find the one we're looking for (break),
6441+ * or when we reach the end of the list.
6442+ */
6443+static inline void ring_remove(struct sock *sk) {
6444+ struct list_head *ptr;
6445+ struct ring_element *entry;
6446+
6447+ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
6448+ entry = list_entry(ptr, struct ring_element, list);
6449+
6450+ if(entry->sk == sk) {
6451+ list_del(ptr);
6452+ kfree(ptr);
6453+ ring_table_size--;
6454+ break;
6455+ }
6456+ }
6457+}
6458+
6459+/* ********************************** */
6460+
6461+static u_int32_t num_queued_pkts(struct ring_opt *pfr) {
6462+
6463+ if(pfr->ring_slots != NULL) {
6464+
6465+ u_int32_t tot_insert = pfr->slots_info->insert_idx,
6466+#if defined(RING_DEBUG)
6467+ tot_read = pfr->slots_info->tot_read, tot_pkts;
6468+#else
6469+ tot_read = pfr->slots_info->tot_read;
6470+#endif
6471+
6472+ if(tot_insert >= tot_read) {
6473+#if defined(RING_DEBUG)
6474+ tot_pkts = tot_insert-tot_read;
6475+#endif
6476+ return(tot_insert-tot_read);
6477+ } else {
6478+#if defined(RING_DEBUG)
6479+ tot_pkts = ((u_int32_t)-1)+tot_insert-tot_read;
6480+#endif
6481+ return(((u_int32_t)-1)+tot_insert-tot_read);
6482+ }
6483+
6484+#if defined(RING_DEBUG)
6485+ printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n",
6486+ tot_pkts, tot_insert, tot_read);
6487+#endif
6488+
6489+ } else
6490+ return(0);
6491+}
6492+
6493+/* ********************************** */
6494+
6495+static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) {
6496+#if defined(RING_DEBUG)
6497+ printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx);
6498+#endif
6499+
6500+ if(pfr->ring_slots != NULL) {
6501+ FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx
6502+ *pfr->slots_info->slot_len]);
6503+ return(slot);
6504+ } else
6505+ return(NULL);
6506+}
6507+
6508+/* ********************************** */
6509+
6510+static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) {
6511+#if defined(RING_DEBUG)
6512+ printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx);
6513+#endif
6514+
6515+ if(pfr->ring_slots != NULL)
6516+ return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx*
6517+ pfr->slots_info->slot_len]));
6518+ else
6519+ return(NULL);
6520+}
6521+
6522+/* ******************************************************* */
6523+
6524+static int parse_pkt(struct sk_buff *skb, u_int16_t skb_displ,
6525+ u_int8_t *l3_proto, u_int16_t *eth_type,
6526+ u_int16_t *l3_offset, u_int16_t *l4_offset,
6527+ u_int16_t *vlan_id, u_int32_t *ipv4_src,
6528+ u_int32_t *ipv4_dst,
6529+ u_int16_t *l4_src_port, u_int16_t *l4_dst_port,
6530+ u_int16_t *payload_offset) {
6531+ struct iphdr *ip;
6532+ struct ethhdr *eh = (struct ethhdr*)(skb->data-skb_displ);
6533+ u_int16_t displ;
6534+
6535+ *l3_offset = *l4_offset = *l3_proto = *payload_offset = 0;
6536+ *eth_type = ntohs(eh->h_proto);
6537+
6538+ if(*eth_type == 0x8100 /* 802.1q (VLAN) */) {
6539+ (*vlan_id) = (skb->data[14] & 15)*256 + skb->data[15];
6540+ *eth_type = (skb->data[16])*256 + skb->data[17];
6541+ displ = 4;
6542+ } else {
6543+ displ = 0;
6544+ (*vlan_id) = (u_int16_t)-1;
6545+ }
6546+
6547+ if(*eth_type == 0x0800 /* IP */) {
6548+ *l3_offset = displ+sizeof(struct ethhdr);
6549+ ip = (struct iphdr*)(skb->data-skb_displ+(*l3_offset));
6550+
6551+ *ipv4_src = ntohl(ip->saddr), *ipv4_dst = ntohl(ip->daddr), *l3_proto = ip->protocol;
6552+
6553+ if((ip->protocol == IPPROTO_TCP) || (ip->protocol == IPPROTO_UDP)) {
6554+ *l4_offset = (*l3_offset)+(ip->ihl*4);
6555+
6556+ if(ip->protocol == IPPROTO_TCP) {
6557+ struct tcphdr *tcp = (struct tcphdr*)(skb->data-skb_displ+(*l4_offset));
6558+ *l4_src_port = ntohs(tcp->source), *l4_dst_port = ntohs(tcp->dest);
6559+ *payload_offset = (*l4_offset)+(tcp->doff * 4);
6560+ } else if(ip->protocol == IPPROTO_UDP) {
6561+ struct udphdr *udp = (struct udphdr*)(skb->data-skb_displ+(*l4_offset));
6562+ *l4_src_port = ntohs(udp->source), *l4_dst_port = ntohs(udp->dest);
6563+ *payload_offset = (*l4_offset)+sizeof(struct udphdr);
6564+ } else
6565+ *payload_offset = (*l4_offset);
6566+ } else
6567+ *l4_src_port = *l4_dst_port = 0;
6568+
6569+ return(1); /* IP */
6570+ } /* TODO: handle IPv6 */
6571+
6572+ return(0); /* No IP */
6573+}
6574+
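+/*
+ * Note: parse_pkt() returns 1 for IPv4 packets and 0 for everything else
+ * (IPv6 is still a TODO). All offsets are relative to the Ethernet
+ * header, i.e. to skb->data - skb_displ, and a vlan_id of (u_int16_t)-1
+ * means the frame carried no 802.1q tag.
+ */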
6575+/* **************************************************************** */
6576+
6577+static void reset_bitmask(bitmask_selector *selector)
6578+{
6579+ memset((char*)selector->bits_memory, 0, selector->num_bits/8);
6580+
6581+ while(selector->clashes != NULL) {
6582+ bitmask_counter_list *next = selector->clashes->next;
6583+ kfree(selector->clashes);
6584+ selector->clashes = next;
6585+ }
6586+}
6587+
6588+/* **************************************************************** */
6589+
6590+static void alloc_bitmask(u_int32_t tot_bits, bitmask_selector *selector)
6591+{
6592+ u_int tot_mem = tot_bits/8;
6593+
6594+ if(tot_mem <= PAGE_SIZE)
6595+ selector->order = 1;
6596+ else {
6597+ for(selector->order = 0; (PAGE_SIZE << selector->order) < tot_mem; selector->order++)
6598+ ;
6599+ }
6600+
6601+ printk("BITMASK: [order=%d][tot_mem=%d]\n", selector->order, tot_mem);
6602+
6603+ while((selector->bits_memory = __get_free_pages(GFP_ATOMIC, selector->order)) == 0)
6604+ if(selector->order-- == 0)
6605+ break;
6606+
6607+ if(selector->order == 0) {
6608+ printk("BITMASK: ERROR not enough memory for bitmask\n");
6609+ selector->num_bits = 0;
6610+ return;
6611+ }
6612+
6613+ tot_mem = PAGE_SIZE << selector->order;
6614+  printk("BITMASK: successfully allocated [tot_mem=%d][order=%d]\n",
6615+ tot_mem, selector->order);
6616+
6617+ selector->num_bits = tot_mem*8;
6618+ selector->clashes = NULL;
6619+ reset_bitmask(selector);
6620+}
6621+
6622+/* ********************************** */
6623+
6624+static void free_bitmask(bitmask_selector *selector)
6625+{
6626+ if(selector->bits_memory > 0)
6627+ free_pages(selector->bits_memory, selector->order);
6628+}
6629+
6630+/* ********************************** */
6631+
6632+static void set_bit_bitmask(bitmask_selector *selector, u_int32_t the_bit) {
6633+ u_int32_t idx = the_bit % selector->num_bits;
6634+
6635+ if(BITMASK_ISSET(idx, selector)) {
6636+ bitmask_counter_list *head = selector->clashes;
6637+
6638+ printk("BITMASK: bit %u was already set\n", the_bit);
6639+
6640+ while(head != NULL) {
6641+ if(head->bit_id == the_bit) {
6642+ head->bit_counter++;
6643+ printk("BITMASK: bit %u is now set to %d\n", the_bit, head->bit_counter);
6644+ return;
6645+ }
6646+
6647+ head = head->next;
6648+ }
6649+
6650+ head = kmalloc(sizeof(bitmask_counter_list), GFP_KERNEL);
6651+ if(head) {
6652+ head->bit_id = the_bit;
6653+ head->bit_counter = 1 /* previous value */ + 1 /* the requested set */;
6654+ head->next = selector->clashes;
6655+ selector->clashes = head;
6656+ } else {
6657+ printk("BITMASK: not enough memory\n");
6658+ return;
6659+ }
6660+ } else {
6661+ BITMASK_SET(idx, selector);
6662+ printk("BITMASK: bit %u is now set\n", the_bit);
6663+ }
6664+}
6665+
6666+/* ********************************** */
6667+
6668+static u_char is_set_bit_bitmask(bitmask_selector *selector, u_int32_t the_bit) {
6669+ u_int32_t idx = the_bit % selector->num_bits;
6670+ return(BITMASK_ISSET(idx, selector));
6671+}
6672+
6673+/* ********************************** */
6674+
6675+static void clear_bit_bitmask(bitmask_selector *selector, u_int32_t the_bit) {
6676+ u_int32_t idx = the_bit % selector->num_bits;
6677+
6678+ if(!BITMASK_ISSET(idx, selector))
6679+ printk("BITMASK: bit %u was not set\n", the_bit);
6680+ else {
6681+ bitmask_counter_list *head = selector->clashes, *prev = NULL;
6682+
6683+ while(head != NULL) {
6684+ if(head->bit_id == the_bit) {
6685+ head->bit_counter--;
6686+
6687+ printk("BITMASK: bit %u is now set to %d\n",
6688+ the_bit, head->bit_counter);
6689+
6690+ if(head->bit_counter == 1) {
6691+ /* We can now delete this entry as '1' can be
6692+ accommodated into the bitmask */
6693+
6694+ if(prev == NULL)
6695+ selector->clashes = head->next;
6696+ else
6697+ prev->next = head->next;
6698+
6699+ kfree(head);
6700+ }
6701+ return;
6702+ }
6703+
6704+ prev = head; head = head->next;
6705+ }
6706+
6707+ BITMASK_CLR(idx, selector);
6708+ printk("BITMASK: bit %u is now reset\n", the_bit);
6709+ }
6710+}
6711+
6712+/* ********************************** */
6713+
6714+/* Hash function */
6715+static u_int32_t sdb_hash(u_int32_t value) {
6716+ u_int32_t hash = 0, i;
6717+ u_int8_t str[sizeof(value)];
6718+
6719+ memcpy(str, &value, sizeof(value));
6720+
6721+ for(i = 0; i < sizeof(value); i++) {
6722+ hash = str[i] + (hash << 6) + (hash << 16) - hash;
6723+ }
6724+
6725+ return(hash);
6726+}
6727+
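+/*
+ * Note: this is the sdbm string hash applied to the four raw bytes of
+ * the value. handle_bloom_filter_rule() below runs IP addresses and
+ * ports through it to obtain a second bit position in the bitmasks,
+ * presumably the usual bloom-filter trick of using more than one bit
+ * per value to lower the false-positive rate.
+ */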
6728+/* ********************************** */
6729+
6730+static void handle_bloom_filter_rule(struct ring_opt *pfr, char *buf) {
6731+ u_int count;
6732+
6733+ if(buf == NULL)
6734+ return;
6735+ else
6736+ count = strlen(buf);
6737+
6738+ printk("PF_RING: -> handle_bloom_filter_rule(%s)\n", buf);
6739+
6740+ if((buf[count-1] == '\n') || (buf[count-1] == '\r')) buf[count-1] = '\0';
6741+
6742+ if(count > 1) {
6743+ u_int32_t the_bit;
6744+
6745+ if(!strncmp(&buf[1], "vlan=", 5)) {
6746+ sscanf(&buf[6], "%d", &the_bit);
6747+
6748+ if(buf[0] == '+')
6749+ set_bit_bitmask(&pfr->vlan_bitmask, the_bit), pfr->num_vlan_bitmask_add++;
6750+ else
6751+ clear_bit_bitmask(&pfr->vlan_bitmask, the_bit), pfr->num_vlan_bitmask_remove++;
6752+ } else if(!strncmp(&buf[1], "mac=", 4)) {
6753+ int a, b, c, d, e, f;
6754+
6755+ if(sscanf(&buf[5], "%02x:%02x:%02x:%02x:%02x:%02x:",
6756+ &a, &b, &c, &d, &e, &f) == 6) {
6757+ u_int32_t mac_addr = (a & 0xff) + (b & 0xff) + ((c & 0xff) << 24) + ((d & 0xff) << 16) + ((e & 0xff) << 8) + (f & 0xff);
6758+
6759+ /* printk("PF_RING: -> [%u][%u][%u][%u][%u][%u] -> [%u]\n", a, b, c, d, e, f, mac_addr); */
6760+
6761+ if(buf[0] == '+')
6762+ set_bit_bitmask(&pfr->mac_bitmask, mac_addr), pfr->num_mac_bitmask_add++;
6763+ else
6764+ clear_bit_bitmask(&pfr->mac_bitmask, mac_addr), pfr->num_mac_bitmask_remove++;
6765+ } else
6766+ printk("PF_RING: -> Invalid MAC address '%s'\n", &buf[5]);
6767+ } else if(!strncmp(&buf[1], "ip=", 3)) {
6768+ int a, b, c, d;
6769+
6770+ if(sscanf(&buf[4], "%d.%d.%d.%d", &a, &b, &c, &d) == 4) {
6771+ u_int32_t ip_addr = ((a & 0xff) << 24) + ((b & 0xff) << 16) + ((c & 0xff) << 8) + (d & 0xff);
6772+
6773+ if(buf[0] == '+')
6774+ set_bit_bitmask(&pfr->ip_bitmask, ip_addr), set_bit_bitmask(&pfr->ip_bitmask, sdb_hash(ip_addr)), pfr->num_ip_bitmask_add++;
6775+ else
6776+ clear_bit_bitmask(&pfr->ip_bitmask, ip_addr), clear_bit_bitmask(&pfr->twin_ip_bitmask, sdb_hash(ip_addr)), pfr->num_ip_bitmask_remove++;
6777+ } else
6778+ printk("PF_RING: -> Invalid IP address '%s'\n", &buf[4]);
6779+ } else if(!strncmp(&buf[1], "port=", 5)) {
6780+ sscanf(&buf[6], "%d", &the_bit);
6781+
6782+ if(buf[0] == '+')
6783+ set_bit_bitmask(&pfr->port_bitmask, the_bit), set_bit_bitmask(&pfr->port_bitmask, sdb_hash(the_bit)), pfr->num_port_bitmask_add++;
6784+ else
6785+ clear_bit_bitmask(&pfr->port_bitmask, the_bit), clear_bit_bitmask(&pfr->twin_port_bitmask, sdb_hash(the_bit)), pfr->num_port_bitmask_remove++;
6786+ } else if(!strncmp(&buf[1], "proto=", 6)) {
6787+ if(!strncmp(&buf[7], "tcp", 3)) the_bit = 6;
6788+ else if(!strncmp(&buf[7], "udp", 3)) the_bit = 17;
6789+ else if(!strncmp(&buf[7], "icmp", 4)) the_bit = 1;
6790+ else sscanf(&buf[7], "%d", &the_bit);
6791+
6792+ if(buf[0] == '+')
6793+ set_bit_bitmask(&pfr->proto_bitmask, the_bit);
6794+ else
6795+ clear_bit_bitmask(&pfr->proto_bitmask, the_bit);
6796+ } else
6797+ printk("PF_RING: -> Unknown rule type '%s'\n", buf);
6798+ }
6799+}
6800+
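+/*
+ * Illustrative sketch (not part of the patch): rules reach the function
+ * above as plain strings via setsockopt(SO_SET_BLOOM), handled by
+ * ring_setsockopt() below. The first character selects add ('+') or
+ * remove (any other character), followed by one of vlan=, mac=, ip=,
+ * port= or proto= (tcp, udp, icmp or a number). Assuming fd is a
+ * PF_RING socket, and taking the setsockopt level of 0 as an assumption
+ * (ring_setsockopt() ignores it for the ring-specific options), user
+ * space could do:
+ *
+ *   const char add_rule[] = "+ip=192.168.0.1";
+ *   setsockopt(fd, 0, SO_SET_BLOOM, add_rule, strlen(add_rule));
+ *
+ *   const char del_rule[] = "-port=80";
+ *   setsockopt(fd, 0, SO_SET_BLOOM, del_rule, strlen(del_rule));
+ *
+ * Filtering only takes effect once SO_TOGGLE_BLOOM_STATE has enabled the
+ * bitmask, since add_skb_to_ring() below checks pfr->bitmask_enabled.
+ */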
6801+/* ********************************** */
6802+
6803+static void reset_bloom_filters(struct ring_opt *pfr) {
6804+ reset_bitmask(&pfr->mac_bitmask);
6805+ reset_bitmask(&pfr->vlan_bitmask);
6806+ reset_bitmask(&pfr->ip_bitmask); reset_bitmask(&pfr->twin_ip_bitmask);
6807+ reset_bitmask(&pfr->port_bitmask); reset_bitmask(&pfr->twin_port_bitmask);
6808+ reset_bitmask(&pfr->proto_bitmask);
6809+
6810+ pfr->num_mac_bitmask_add = pfr->num_mac_bitmask_remove = 0;
6811+ pfr->num_vlan_bitmask_add = pfr->num_vlan_bitmask_remove = 0;
6812+ pfr->num_ip_bitmask_add = pfr->num_ip_bitmask_remove = 0;
6813+ pfr->num_port_bitmask_add = pfr->num_port_bitmask_remove = 0;
6814+ pfr->num_proto_bitmask_add = pfr->num_proto_bitmask_remove = 0;
6815+
6816+ printk("PF_RING: rules have been reset\n");
6817+}
6818+
6819+/* ********************************** */
6820+
6821+static void init_blooms(struct ring_opt *pfr) {
6822+ alloc_bitmask(4096, &pfr->mac_bitmask);
6823+ alloc_bitmask(4096, &pfr->vlan_bitmask);
6824+ alloc_bitmask(32768, &pfr->ip_bitmask); alloc_bitmask(32768, &pfr->twin_ip_bitmask);
6825+ alloc_bitmask(4096, &pfr->port_bitmask); alloc_bitmask(4096, &pfr->twin_port_bitmask);
6826+ alloc_bitmask(4096, &pfr->proto_bitmask);
6827+
6828+ pfr->num_mac_bitmask_add = pfr->num_mac_bitmask_remove = 0;
6829+ pfr->num_vlan_bitmask_add = pfr->num_vlan_bitmask_remove = 0;
6830+ pfr->num_ip_bitmask_add = pfr->num_ip_bitmask_remove = 0;
6831+ pfr->num_port_bitmask_add = pfr->num_port_bitmask_remove = 0;
6832+ pfr->num_proto_bitmask_add = pfr->num_proto_bitmask_remove = 0;
6833+
6834+ reset_bloom_filters(pfr);
6835+}
6836+
6837+/* ********************************** */
6838+
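+/*
+ * Callback handed to acsmSearch2() in add_skb_to_ring() below: returning
+ * 0 tells the search routines above to keep scanning, so the search
+ * returns the total number of pattern matches found in the payload.
+ */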
6839+inline int MatchFound (void* id, int index, void *data) { return(0); }
6840+
6841+/* ********************************** */
6842+
6843+static void add_skb_to_ring(struct sk_buff *skb,
6844+ struct ring_opt *pfr,
6845+ u_char recv_packet,
6846+ u_char real_skb /* 1=skb 0=faked skb */) {
6847+ FlowSlot *theSlot;
6848+ int idx, displ, fwd_pkt = 0;
6849+
6850+ if(recv_packet) {
6851+ /* Hack for identifying a packet received by the e1000 */
6852+ if(real_skb) {
6853+ displ = SKB_DISPLACEMENT;
6854+ } else
6855+ displ = 0; /* Received by the e1000 wrapper */
6856+ } else
6857+ displ = 0;
6858+
6859+ write_lock(&pfr->ring_index_lock);
6860+ pfr->slots_info->tot_pkts++;
6861+ write_unlock(&pfr->ring_index_lock);
6862+
6863+ /* BPF Filtering (from af_packet.c) */
6864+ if(pfr->bpfFilter != NULL) {
6865+ unsigned res = 1, len;
6866+
6867+ len = skb->len-skb->data_len;
6868+
6869+ write_lock(&pfr->ring_index_lock);
6870+ skb->data -= displ;
6871+ res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len);
6872+ skb->data += displ;
6873+ write_unlock(&pfr->ring_index_lock);
6874+
6875+ if(res == 0) {
6876+ /* Filter failed */
6877+
6878+#if defined(RING_DEBUG)
6879+ printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]"
6880+ "[insertIdx=%d][pkt_type=%d][cloned=%d]\n",
6881+ (int)skb->len, pfr->slots_info->tot_pkts,
6882+ pfr->slots_info->insert_idx,
6883+ skb->pkt_type, skb->cloned);
6884+#endif
6885+
6886+ return;
6887+ }
6888+ }
6889+
6890+ /* ************************** */
6891+
6892+ if(pfr->sample_rate > 1) {
6893+ if(pfr->pktToSample == 0) {
6894+ write_lock(&pfr->ring_index_lock);
6895+ pfr->pktToSample = pfr->sample_rate;
6896+ write_unlock(&pfr->ring_index_lock);
6897+ } else {
6898+ write_lock(&pfr->ring_index_lock);
6899+ pfr->pktToSample--;
6900+ write_unlock(&pfr->ring_index_lock);
6901+
6902+#if defined(RING_DEBUG)
6903+ printk("add_skb_to_ring(skb): sampled packet [len=%d]"
6904+ "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n",
6905+ (int)skb->len, pfr->slots_info->tot_pkts,
6906+ pfr->slots_info->insert_idx,
6907+ skb->pkt_type, skb->cloned);
6908+#endif
6909+ return;
6910+ }
6911+ }
6912+
6913+ /* ************************************* */
6914+
6915+ if((pfr->reflector_dev != NULL)
6916+ && (!netif_queue_stopped(pfr->reflector_dev))) {
6917+ int cpu = smp_processor_id();
6918+
6919+ /* increase reference counter so that this skb is not freed */
6920+ atomic_inc(&skb->users);
6921+
6922+ skb->data -= displ;
6923+
6924+ /* send it */
6925+ if (pfr->reflector_dev->xmit_lock_owner != cpu) {
6926+ /* Patch below courtesy of Matthew J. Roth <mroth@imminc.com> */
6927+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
6928+ spin_lock_bh(&pfr->reflector_dev->xmit_lock);
6929+ pfr->reflector_dev->xmit_lock_owner = cpu;
6930+ spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
6931+#else
6932+ netif_tx_lock_bh(pfr->reflector_dev);
6933+#endif
6934+ if (pfr->reflector_dev->hard_start_xmit(skb, pfr->reflector_dev) == 0) {
6935+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
6936+ spin_lock_bh(&pfr->reflector_dev->xmit_lock);
6937+ pfr->reflector_dev->xmit_lock_owner = -1;
6938+ spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
6939+#else
6940+ netif_tx_unlock_bh(pfr->reflector_dev);
6941+#endif
6942+ skb->data += displ;
6943+#if defined(RING_DEBUG)
6944+ printk("++ hard_start_xmit succeeded\n");
6945+#endif
6946+ return; /* OK */
6947+ }
6948+
6949+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18))
6950+ spin_lock_bh(&pfr->reflector_dev->xmit_lock);
6951+ pfr->reflector_dev->xmit_lock_owner = -1;
6952+ spin_unlock_bh(&pfr->reflector_dev->xmit_lock);
6953+#else
6954+ netif_tx_unlock_bh(pfr->reflector_dev);
6955+#endif
6956+ }
6957+
6958+#if defined(RING_DEBUG)
6959+ printk("++ hard_start_xmit failed\n");
6960+#endif
6961+ skb->data += displ;
6962+ return; /* -ENETDOWN */
6963+ }
6964+
6965+ /* ************************************* */
6966+
6967+#if defined(RING_DEBUG)
6968+ printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]"
6969+ "[pkt_type=%d][cloned=%d]\n",
6970+ (int)skb->len, pfr->slots_info->tot_pkts,
6971+ pfr->slots_info->insert_idx,
6972+ skb->pkt_type, skb->cloned);
6973+#endif
6974+
6975+ idx = pfr->slots_info->insert_idx;
6976+ theSlot = get_insert_slot(pfr);
6977+
6978+ if((theSlot != NULL) && (theSlot->slot_state == 0)) {
6979+ struct pcap_pkthdr *hdr;
6980+ char *bucket;
6981+ int is_ip_pkt, debug = 0;
6982+
6983+ /* Update Index */
6984+ idx++;
6985+
6986+ bucket = &theSlot->bucket;
6987+ hdr = (struct pcap_pkthdr*)bucket;
6988+
6989+ /* BD - API changed for time keeping */
6990+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
6991+ if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp);
6992+
6993+ hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec;
6994+#else
6995+ if(skb->tstamp.tv64 == 0) __net_timestamp(skb);
6996+
6997+ struct timeval tv = ktime_to_timeval(skb->tstamp);
6998+ hdr->ts.tv_sec = tv.tv_sec, hdr->ts.tv_usec = tv.tv_usec;
6999+#endif
7000+ hdr->caplen = skb->len+displ;
7001+
7002+ if(hdr->caplen > pfr->slots_info->data_len)
7003+ hdr->caplen = pfr->slots_info->data_len;
7004+
7005+ hdr->len = skb->len+displ;
7006+
7007+ /* Extensions */
7008+ is_ip_pkt = parse_pkt(skb, displ,
7009+ &hdr->l3_proto,
7010+ &hdr->eth_type,
7011+ &hdr->l3_offset,
7012+ &hdr->l4_offset,
7013+ &hdr->vlan_id,
7014+ &hdr->ipv4_src,
7015+ &hdr->ipv4_dst,
7016+ &hdr->l4_src_port,
7017+ &hdr->l4_dst_port,
7018+ &hdr->payload_offset);
7019+
7020+ if(is_ip_pkt && pfr->bitmask_enabled) {
7021+ int vlan_match = 0;
7022+
7023+ fwd_pkt = 0;
7024+
7025+ if(debug) {
7026+ if(is_ip_pkt)
7027+ printk(KERN_INFO "PF_RING: [proto=%d][vlan=%d][sport=%d][dport=%d][src=%u][dst=%u]\n",
7028+ hdr->l3_proto, hdr->vlan_id, hdr->l4_src_port, hdr->l4_dst_port, hdr->ipv4_src, hdr->ipv4_dst);
7029+ else
7030+ printk(KERN_INFO "PF_RING: [proto=%d][vlan=%d]\n", hdr->l3_proto, hdr->vlan_id);
7031+ }
7032+
7033+ if(hdr->vlan_id != (u_int16_t)-1) {
7034+ vlan_match = is_set_bit_bitmask(&pfr->vlan_bitmask, hdr->vlan_id);
7035+ } else
7036+ vlan_match = 1;
7037+
7038+ if(vlan_match) {
7039+ struct ethhdr *eh = (struct ethhdr*)(skb->data);
7040+ u_int32_t src_mac = (eh->h_source[0] & 0xff) + (eh->h_source[1] & 0xff) + ((eh->h_source[2] & 0xff) << 24)
7041+ + ((eh->h_source[3] & 0xff) << 16) + ((eh->h_source[4] & 0xff) << 8) + (eh->h_source[5] & 0xff);
7042+
7043+ if(debug) printk(KERN_INFO "PF_RING: [src_mac=%u]\n", src_mac);
7044+
7045+ fwd_pkt |= is_set_bit_bitmask(&pfr->mac_bitmask, src_mac);
7046+
7047+ if(!fwd_pkt) {
7048+ u_int32_t dst_mac = (eh->h_dest[0] & 0xff) + (eh->h_dest[1] & 0xff) + ((eh->h_dest[2] & 0xff) << 24)
7049+ + ((eh->h_dest[3] & 0xff) << 16) + ((eh->h_dest[4] & 0xff) << 8) + (eh->h_dest[5] & 0xff);
7050+
7051+ if(debug) printk(KERN_INFO "PF_RING: [dst_mac=%u]\n", dst_mac);
7052+
7053+ fwd_pkt |= is_set_bit_bitmask(&pfr->mac_bitmask, dst_mac);
7054+
7055+ if(is_ip_pkt && (!fwd_pkt)) {
7056+ fwd_pkt |= is_set_bit_bitmask(&pfr->ip_bitmask, hdr->ipv4_src);
7057+
7058+ if(!fwd_pkt) {
7059+ fwd_pkt |= is_set_bit_bitmask(&pfr->ip_bitmask, hdr->ipv4_dst);
7060+
7061+ if((!fwd_pkt) && ((hdr->l3_proto == IPPROTO_TCP)
7062+ || (hdr->l3_proto == IPPROTO_UDP))) {
7063+ fwd_pkt |= is_set_bit_bitmask(&pfr->port_bitmask, hdr->l4_src_port);
7064+ if(!fwd_pkt) fwd_pkt |= is_set_bit_bitmask(&pfr->port_bitmask, hdr->l4_dst_port);
7065+ }
7066+
7067+ if(!fwd_pkt) fwd_pkt |= is_set_bit_bitmask(&pfr->proto_bitmask, hdr->l3_proto);
7068+ }
7069+ }
7070+ }
7071+ }
7072+ } else
7073+ fwd_pkt = 1;
7074+
7075+ if(fwd_pkt && (pfr->acsm != NULL)) {
7076+ if((hdr->payload_offset > 0) && ((skb->len+skb->mac_len) > hdr->payload_offset)) {
7077+ char *payload = (skb->data-displ+hdr->payload_offset);
7078+ int payload_len = skb->len /* + skb->mac_len */ - hdr->payload_offset;
7079+
7080+ if((payload_len > 0)
7081+ && ((hdr->l4_src_port == 80) || (hdr->l4_dst_port == 80))) {
7082+ int rc;
7083+
7084+ if(0) {
7085+ char buf[1500];
7086+
7087+ memcpy(buf, payload, payload_len);
7088+ buf[payload_len] = '\0';
7089+ printk("[%s]\n", payload);
7090+ }
7091+
7092+	    /* printk("Trying to match pattern [len=%d][%s]\n", payload_len, payload); */
7093+ rc = acsmSearch2(pfr->acsm, payload, payload_len, MatchFound, (void *)0) ? 1 : 0;
7094+
7095+ // printk("Match result: %d\n", fwd_pkt);
7096+ if(rc) {
7097+ printk("Pattern matched!\n");
7098+ } else {
7099+ fwd_pkt = 0;
7100+ }
7101+ } else
7102+ fwd_pkt = 0;
7103+ } else
7104+ fwd_pkt = 0;
7105+ }
7106+
7107+ if(fwd_pkt) {
7108+ memcpy(&bucket[sizeof(struct pcap_pkthdr)], skb->data-displ, hdr->caplen);
7109+
7110+#if defined(RING_DEBUG)
7111+ {
7112+ static unsigned int lastLoss = 0;
7113+
7114+ if(pfr->slots_info->tot_lost
7115+ && (lastLoss != pfr->slots_info->tot_lost)) {
7116+ printk("add_skb_to_ring(%d): [data_len=%d]"
7117+ "[hdr.caplen=%d][skb->len=%d]"
7118+ "[pcap_pkthdr=%d][removeIdx=%d]"
7119+ "[loss=%lu][page=%u][slot=%u]\n",
7120+ idx-1, pfr->slots_info->data_len, hdr->caplen, skb->len,
7121+ sizeof(struct pcap_pkthdr),
7122+ pfr->slots_info->remove_idx,
7123+ (long unsigned int)pfr->slots_info->tot_lost,
7124+ pfr->insert_page_id, pfr->insert_slot_id);
7125+
7126+ lastLoss = pfr->slots_info->tot_lost;
7127+ }
7128+ }
7129+#endif
7130+
7131+ write_lock(&pfr->ring_index_lock);
7132+ if(idx == pfr->slots_info->tot_slots)
7133+ pfr->slots_info->insert_idx = 0;
7134+ else
7135+ pfr->slots_info->insert_idx = idx;
7136+
7137+ pfr->slots_info->tot_insert++;
7138+ theSlot->slot_state = 1;
7139+ write_unlock(&pfr->ring_index_lock);
7140+ }
7141+ } else {
7142+ write_lock(&pfr->ring_index_lock);
7143+ pfr->slots_info->tot_lost++;
7144+ write_unlock(&pfr->ring_index_lock);
7145+
7146+#if defined(RING_DEBUG)
7147+ printk("add_skb_to_ring(skb): packet lost [loss=%lu]"
7148+ "[removeIdx=%u][insertIdx=%u]\n",
7149+ (long unsigned int)pfr->slots_info->tot_lost,
7150+ pfr->slots_info->remove_idx, pfr->slots_info->insert_idx);
7151+#endif
7152+ }
7153+
7154+ if(fwd_pkt) {
7155+
7156+ /* wakeup in case of poll() */
7157+ if(waitqueue_active(&pfr->ring_slots_waitqueue))
7158+ wake_up_interruptible(&pfr->ring_slots_waitqueue);
7159+ }
7160+}
7161+
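+/*
+ * Note on the insertion path above: each packet is first run through the
+ * socket's BPF filter and sampling rate or, when a reflector device is
+ * configured, retransmitted there instead of being queued. Surviving
+ * packets are then checked against the bloom bitmasks and the
+ * Aho-Corasick pattern (when set), after which a struct pcap_pkthdr plus
+ * at most data_len bytes of packet data are copied into the slot at
+ * insert_idx, the slot is marked in use (slot_state = 1) and any process
+ * sleeping in poll() is woken up. If no free slot is available the
+ * packet is counted in tot_lost instead.
+ */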
7162+/* ********************************** */
7163+
7164+static u_int hash_skb(struct ring_cluster *cluster_ptr,
7165+ struct sk_buff *skb, u_char recv_packet) {
7166+ u_int idx;
7167+ int displ;
7168+ struct iphdr *ip;
7169+
7170+ if(cluster_ptr->hashing_mode == cluster_round_robin) {
7171+ idx = cluster_ptr->hashing_id++;
7172+ } else {
7173+ /* Per-flow clustering */
7174+ if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) {
7175+ if(recv_packet)
7176+ displ = 0;
7177+ else
7178+ displ = SKB_DISPLACEMENT;
7179+
7180+ /*
7181+ skb->data+displ
7182+
7183+	Always points to the IP part of the packet
7184+ */
7185+
7186+ ip = (struct iphdr*)(skb->data+displ);
7187+
7188+ idx = ip->saddr+ip->daddr+ip->protocol;
7189+
7190+ if(ip->protocol == IPPROTO_TCP) {
7191+ struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ
7192+ +sizeof(struct iphdr));
7193+ idx += tcp->source+tcp->dest;
7194+ } else if(ip->protocol == IPPROTO_UDP) {
7195+ struct udphdr *udp = (struct udphdr*)(skb->data+displ
7196+ +sizeof(struct iphdr));
7197+ idx += udp->source+udp->dest;
7198+ }
7199+ } else
7200+ idx = skb->len;
7201+ }
7202+
7203+ return(idx % cluster_ptr->num_cluster_elements);
7204+}
7205+
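+/*
+ * Note: in cluster_round_robin mode the target is simply an incrementing
+ * counter modulo the cluster size. In per-flow mode the index is built
+ * from saddr + daddr + protocol (plus the TCP/UDP ports when present);
+ * since the sum is symmetric, both directions of a flow land on the same
+ * cluster element.
+ */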
7206+/* ********************************** */
7207+
7208+static int skb_ring_handler(struct sk_buff *skb,
7209+ u_char recv_packet,
7210+ u_char real_skb /* 1=skb 0=faked skb */) {
7211+ struct sock *skElement;
7212+ int rc = 0;
7213+ struct list_head *ptr;
7214+ struct ring_cluster *cluster_ptr;
7215+
7216+#ifdef PROFILING
7217+ uint64_t rdt = _rdtsc(), rdt1, rdt2;
7218+#endif
7219+
7220+ if((!skb) /* Invalid skb */
7221+ || ((!enable_tx_capture) && (!recv_packet))) {
7222+ /*
7223+ An outgoing packet is about to be sent out
7224+ but we decided not to handle transmitted
7225+ packets.
7226+ */
7227+ return(0);
7228+ }
7229+
7230+#if defined(RING_DEBUG)
7231+ if(0) {
7232+ printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len,
7233+ skb->dev->name == NULL ? "<NULL>" : skb->dev->name);
7234+ }
7235+#endif
7236+
7237+#ifdef PROFILING
7238+ rdt1 = _rdtsc();
7239+#endif
7240+
7241+ /* [1] Check unclustered sockets */
7242+ for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) {
7243+ struct ring_opt *pfr;
7244+ struct ring_element *entry;
7245+
7246+ entry = list_entry(ptr, struct ring_element, list);
7247+
7248+ read_lock(&ring_mgmt_lock);
7249+ skElement = entry->sk;
7250+ pfr = ring_sk(skElement);
7251+ read_unlock(&ring_mgmt_lock);
7252+
7253+ if((pfr != NULL)
7254+ && (pfr->cluster_id == 0 /* No cluster */)
7255+ && (pfr->ring_slots != NULL)
7256+ && ((pfr->ring_netdev == skb->dev) || ((skb->dev->flags & IFF_SLAVE) && pfr->ring_netdev == skb->dev->master))) {
7257+ /* We've found the ring where the packet can be stored */
7258+ read_lock(&ring_mgmt_lock);
7259+ add_skb_to_ring(skb, pfr, recv_packet, real_skb);
7260+ read_unlock(&ring_mgmt_lock);
7261+
7262+ rc = 1; /* Ring found: we've done our job */
7263+ }
7264+ }
7265+
7266+ /* [2] Check socket clusters */
7267+ cluster_ptr = ring_cluster_list;
7268+
7269+ while(cluster_ptr != NULL) {
7270+ struct ring_opt *pfr;
7271+
7272+ if(cluster_ptr->num_cluster_elements > 0) {
7273+ u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet);
7274+
7275+ read_lock(&ring_mgmt_lock);
7276+ skElement = cluster_ptr->sk[skb_hash];
7277+ read_unlock(&ring_mgmt_lock);
7278+
7279+ if(skElement != NULL) {
7280+ pfr = ring_sk(skElement);
7281+
7282+ if((pfr != NULL)
7283+ && (pfr->ring_slots != NULL)
7284+ && ((pfr->ring_netdev == skb->dev) || ((skb->dev->flags & IFF_SLAVE) && pfr->ring_netdev == skb->dev->master))) {
7285+ /* We've found the ring where the packet can be stored */
7286+ read_lock(&ring_mgmt_lock);
7287+ add_skb_to_ring(skb, pfr, recv_packet, real_skb);
7288+ read_unlock(&ring_mgmt_lock);
7289+
7290+ rc = 1; /* Ring found: we've done our job */
7291+ }
7292+ }
7293+ }
7294+
7295+ cluster_ptr = cluster_ptr->next;
7296+ }
7297+
7298+#ifdef PROFILING
7299+ rdt1 = _rdtsc()-rdt1;
7300+#endif
7301+
7302+#ifdef PROFILING
7303+ rdt2 = _rdtsc();
7304+#endif
7305+
7306+ if(transparent_mode) rc = 0;
7307+
7308+ if((rc != 0) && real_skb)
7309+ dev_kfree_skb(skb); /* Free the skb */
7310+
7311+#ifdef PROFILING
7312+ rdt2 = _rdtsc()-rdt2;
7313+ rdt = _rdtsc()-rdt;
7314+
7315+#if defined(RING_DEBUG)
7316+ printk("# cycles: %d [lock costed %d %d%%][free costed %d %d%%]\n",
7317+ (int)rdt, rdt-rdt1,
7318+ (int)((float)((rdt-rdt1)*100)/(float)rdt),
7319+ rdt2,
7320+ (int)((float)(rdt2*100)/(float)rdt));
7321+#endif
7322+#endif
7323+
7324+ return(rc); /* 0 = packet not handled */
7325+}
7326+
7327+/* ********************************** */
7328+
7329+struct sk_buff skb;
7330+
7331+static int buffer_ring_handler(struct net_device *dev,
7332+ char *data, int len) {
7333+
7334+#if defined(RING_DEBUG)
7335+ printk("buffer_ring_handler: [dev=%s][len=%d]\n",
7336+ dev->name == NULL ? "<NULL>" : dev->name, len);
7337+#endif
7338+
7339+ /* BD - API changed for time keeping */
7340+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
7341+ skb.dev = dev, skb.len = len, skb.data = data,
7342+ skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */
7343+#else
7344+ skb.dev = dev, skb.len = len, skb.data = data,
7345+ skb.data_len = len, skb.tstamp.tv64 = 0; /* Calculate the time */
7346+#endif
7347+
7348+ skb_ring_handler(&skb, 1, 0 /* fake skb */);
7349+
7350+ return(0);
7351+}
7352+
7353+/* ********************************** */
7354+
7355+static int ring_create(struct socket *sock, int protocol) {
7356+ struct sock *sk;
7357+ struct ring_opt *pfr;
7358+ int err;
7359+
7360+#if defined(RING_DEBUG)
7361+ printk("RING: ring_create()\n");
7362+#endif
7363+
7364+  /* Is the caller privileged (CAP_NET_ADMIN)? */
7365+ if(!capable(CAP_NET_ADMIN))
7366+ return -EPERM;
7367+
7368+ if(sock->type != SOCK_RAW)
7369+ return -ESOCKTNOSUPPORT;
7370+
7371+ if(protocol != htons(ETH_P_ALL))
7372+ return -EPROTONOSUPPORT;
7373+
7374+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
7375+ MOD_INC_USE_COUNT;
7376+#endif
7377+
7378+ err = -ENOMEM;
7379+
7380+ // BD: -- broke this out to keep it more simple and clear as to what the
7381+ // options are.
7382+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7383+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
7384+ sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);
7385+#else
7386+ // BD: API changed in 2.6.12, ref:
7387+ // http://svn.clkao.org/svnweb/linux/revision/?rev=28201
7388+ sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);
7389+#endif
7390+#else
7391+ /* Kernel 2.4 */
7392+ sk = sk_alloc(PF_RING, GFP_KERNEL, 1);
7393+#endif
7394+
7395+ if (sk == NULL)
7396+ goto out;
7397+
7398+ sock->ops = &ring_ops;
7399+ sock_init_data(sock, sk);
7400+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7401+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
7402+ sk_set_owner(sk, THIS_MODULE);
7403+#endif
7404+#endif
7405+
7406+ err = -ENOMEM;
7407+ ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL));
7408+
7409+ if (!(pfr = ring_sk(sk))) {
7410+ sk_free(sk);
7411+ goto out;
7412+ }
7413+ memset(pfr, 0, sizeof(*pfr));
7414+ init_waitqueue_head(&pfr->ring_slots_waitqueue);
7415+ pfr->ring_index_lock = RW_LOCK_UNLOCKED;
7416+ atomic_set(&pfr->num_ring_slots_waiters, 0);
7417+ init_blooms(pfr);
7418+ pfr->acsm = NULL;
7419+
7420+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7421+ sk->sk_family = PF_RING;
7422+ sk->sk_destruct = ring_sock_destruct;
7423+#else
7424+ sk->family = PF_RING;
7425+ sk->destruct = ring_sock_destruct;
7426+ sk->num = protocol;
7427+#endif
7428+
7429+ ring_insert(sk);
7430+
7431+#if defined(RING_DEBUG)
7432+ printk("RING: ring_create() - created\n");
7433+#endif
7434+
7435+ return(0);
7436+ out:
7437+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
7438+ MOD_DEC_USE_COUNT;
7439+#endif
7440+ return err;
7441+}
7442+
7443+/* *********************************************** */
7444+
7445+static int ring_release(struct socket *sock)
7446+{
7447+ struct sock *sk = sock->sk;
7448+ struct ring_opt *pfr = ring_sk(sk);
7449+
7450+ if(!sk) return 0;
7451+
7452+#if defined(RING_DEBUG)
7453+ printk("RING: called ring_release\n");
7454+#endif
7455+
7456+#if defined(RING_DEBUG)
7457+ printk("RING: ring_release entered\n");
7458+#endif
7459+
7460+ /*
7461+ The calls below must be placed outside the
7462+ write_lock_irq...write_unlock_irq block.
7463+ */
7464+ sock_orphan(sk);
7465+ ring_proc_remove(ring_sk(sk));
7466+
7467+ write_lock_irq(&ring_mgmt_lock);
7468+ ring_remove(sk);
7469+ sock->sk = NULL;
7470+
7471+ /* Free the ring buffer */
7472+ if(pfr->ring_memory) {
7473+ struct page *page, *page_end;
7474+
7475+ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1);
7476+ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++)
7477+ ClearPageReserved(page);
7478+
7479+ free_pages(pfr->ring_memory, pfr->order);
7480+ }
7481+
7482+ free_bitmask(&pfr->mac_bitmask);
7483+ free_bitmask(&pfr->vlan_bitmask);
7484+ free_bitmask(&pfr->ip_bitmask); free_bitmask(&pfr->twin_ip_bitmask);
7485+ free_bitmask(&pfr->port_bitmask); free_bitmask(&pfr->twin_port_bitmask);
7486+ free_bitmask(&pfr->proto_bitmask);
7487+
7488+ if(pfr->acsm != NULL) acsmFree2(pfr->acsm);
7489+
7490+ kfree(pfr);
7491+ ring_sk(sk) = NULL;
7492+
7493+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7494+ skb_queue_purge(&sk->sk_write_queue);
7495+#endif
7496+
7497+ sock_put(sk);
7498+ write_unlock_irq(&ring_mgmt_lock);
7499+
7500+#if defined(RING_DEBUG)
7501+ printk("RING: ring_release leaving\n");
7502+#endif
7503+
7504+ return 0;
7505+}
7506+
7507+/* ********************************** */
7508+/*
7509+ * We create a ring for this socket and bind it to the specified device
7510+ */
7511+static int packet_ring_bind(struct sock *sk, struct net_device *dev)
7512+{
7513+ u_int the_slot_len;
7514+ u_int32_t tot_mem;
7515+ struct ring_opt *pfr = ring_sk(sk);
7516+ struct page *page, *page_end;
7517+
7518+ if(!dev) return(-1);
7519+
7520+#if defined(RING_DEBUG)
7521+ printk("RING: packet_ring_bind(%s) called\n", dev->name);
7522+#endif
7523+
7524+ /* **********************************************
7525+
7526+ *************************************
7527+ * *
7528+ * FlowSlotInfo *
7529+ * *
7530+ ************************************* <-+
7531+ * FlowSlot * |
7532+ ************************************* |
7533+ * FlowSlot * |
7534+ ************************************* +- num_slots
7535+ * FlowSlot * |
7536+ ************************************* |
7537+ * FlowSlot * |
7538+ ************************************* <-+
7539+
7540+ ********************************************** */
7541+
7542+ the_slot_len = sizeof(u_char) /* flowSlot.slot_state */
7543+#ifdef RING_MAGIC
7544+ + sizeof(u_char)
7545+#endif
7546+ + sizeof(struct pcap_pkthdr)
7547+ + bucket_len /* flowSlot.bucket */;
7548+
7549+ tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len;
7550+
7551+ /*
7552+ Calculate the value of the order parameter used later.
7553+ See http://www.linuxjournal.com/article.php?sid=1133
7554+ */
7555+ for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++) ;
7556+
7557+ /*
7558+ We now try to allocate the memory as required. If we fail
7559+    we try to allocate a smaller amount of memory (hence a
7560+ smaller ring).
7561+ */
7562+ while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0)
7563+ if(pfr->order-- == 0)
7564+ break;
7565+
7566+ if(pfr->order == 0) {
7567+ printk("RING: ERROR not enough memory for ring\n");
7568+ return(-1);
7569+ } else {
7570+    printk("RING: successfully allocated %lu KB [tot_mem=%d][order=%ld]\n",
7571+ PAGE_SIZE >> (10 - pfr->order), tot_mem, pfr->order);
7572+ }
7573+
7574+ tot_mem = PAGE_SIZE << pfr->order;
7575+ memset((char*)pfr->ring_memory, 0, tot_mem);
7576+
7577+ /* Now we need to reserve the pages */
7578+ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1);
7579+ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++)
7580+ SetPageReserved(page);
7581+
7582+ pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory;
7583+ pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo));
7584+
7585+ pfr->slots_info->version = RING_FLOWSLOT_VERSION;
7586+ pfr->slots_info->slot_len = the_slot_len;
7587+ pfr->slots_info->data_len = bucket_len;
7588+ pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len;
7589+ pfr->slots_info->tot_mem = tot_mem;
7590+ pfr->slots_info->sample_rate = sample_rate;
7591+
7592+ printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n",
7593+ pfr->slots_info->tot_slots, pfr->slots_info->slot_len,
7594+ pfr->slots_info->tot_mem);
7595+
7596+#ifdef RING_MAGIC
7597+ {
7598+ int i;
7599+
7600+ for(i=0; i<pfr->slots_info->tot_slots; i++) {
7601+ unsigned long idx = i*pfr->slots_info->slot_len;
7602+ FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx];
7603+ slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0;
7604+ }
7605+ }
7606+#endif
7607+
7608+ pfr->insert_page_id = 1, pfr->insert_slot_id = 0;
7609+
7610+ /*
7611+ IMPORTANT
7612+    Leave this statement here as the last one: once
7613+    ring_netdev != NULL the socket is ready to be used.
7614+ */
7615+ pfr->ring_netdev = dev;
7616+
7617+ return(0);
7618+}
7619+
7620+/* ************************************* */
7621+
7622+/* Bind to a device */
7623+static int ring_bind(struct socket *sock,
7624+ struct sockaddr *sa, int addr_len)
7625+{
7626+ struct sock *sk=sock->sk;
7627+ struct net_device *dev = NULL;
7628+
7629+#if defined(RING_DEBUG)
7630+ printk("RING: ring_bind() called\n");
7631+#endif
7632+
7633+ /*
7634+ * Check legality
7635+ */
7636+ if (addr_len != sizeof(struct sockaddr))
7637+ return -EINVAL;
7638+ if (sa->sa_family != PF_RING)
7639+ return -EINVAL;
7640+
7641+ /* Safety check: add trailing zero if missing */
7642+ sa->sa_data[sizeof(sa->sa_data)-1] = '\0';
7643+
7644+#if defined(RING_DEBUG)
7645+ printk("RING: searching device %s\n", sa->sa_data);
7646+#endif
7647+
7648+ if((dev = __dev_get_by_name(sa->sa_data)) == NULL) {
7649+#if defined(RING_DEBUG)
7650+ printk("RING: search failed\n");
7651+#endif
7652+ return(-EINVAL);
7653+ } else
7654+ return(packet_ring_bind(sk, dev));
7655+}
7656+
7657+/* ************************************* */
7658+
7659+static int ring_mmap(struct file *file,
7660+ struct socket *sock,
7661+ struct vm_area_struct *vma)
7662+{
7663+ struct sock *sk = sock->sk;
7664+ struct ring_opt *pfr = ring_sk(sk);
7665+ unsigned long size, start;
7666+ u_int pagesToMap;
7667+ char *ptr;
7668+
7669+#if defined(RING_DEBUG)
7670+ printk("RING: ring_mmap() called\n");
7671+#endif
7672+
7673+ if(pfr->ring_memory == 0) {
7674+#if defined(RING_DEBUG)
7675+ printk("RING: ring_mmap() failed: mapping area to an unbound socket\n");
7676+#endif
7677+ return -EINVAL;
7678+ }
7679+
7680+ size = (unsigned long)(vma->vm_end-vma->vm_start);
7681+
7682+ if(size % PAGE_SIZE) {
7683+#if defined(RING_DEBUG)
7684+ printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n");
7685+#endif
7686+ return(-EINVAL);
7687+ }
7688+
7689+ /* if userspace tries to mmap beyond end of our buffer, fail */
7690+ if(size > pfr->slots_info->tot_mem) {
7691+#if defined(RING_DEBUG)
7692+ printk("proc_mmap() failed: area too large [%ld > %d]\n", size, pfr->slots_info->tot_mem);
7693+#endif
7694+ return(-EINVAL);
7695+ }
7696+
7697+ pagesToMap = size/PAGE_SIZE;
7698+
7699+#if defined(RING_DEBUG)
7700+ printk("RING: ring_mmap() called. %d pages to map\n", pagesToMap);
7701+#endif
7702+
7703+#if defined(RING_DEBUG)
7704+ printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n",
7705+ pfr->slots_info->slot_len, pfr->slots_info->tot_slots,
7706+ pfr->ring_netdev->name);
7707+#endif
7708+
7709+ /* we do not want to have this area swapped out, lock it */
7710+ vma->vm_flags |= VM_LOCKED;
7711+ start = vma->vm_start;
7712+
7713+ /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */
7714+ ptr = (char*)(start+PAGE_SIZE);
7715+
7716+ if(remap_page_range(
7717+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7718+ vma,
7719+#endif
7720+ start,
7721+ __pa(pfr->ring_memory),
7722+ PAGE_SIZE*pagesToMap, vma->vm_page_prot)) {
7723+#if defined(RING_DEBUG)
7724+ printk("remap_page_range() failed\n");
7725+#endif
7726+ return(-EAGAIN);
7727+ }
7728+
7729+#if defined(RING_DEBUG)
7730+ printk("proc_mmap(pagesToMap=%d): success.\n", pagesToMap);
7731+#endif
7732+
7733+ return 0;
7734+}
7735+
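+/*
+ * Illustrative user-space sketch (an outline under stated assumptions,
+ * not part of the patch): a capture application exercises ring_create(),
+ * ring_bind() and ring_mmap() above roughly as follows. The PF_RING
+ * constant is taken from this patch's headers, "eth0" and the mapping
+ * length len are made up, and <sys/socket.h>, <sys/mman.h>, <string.h>
+ * and <linux/if_ether.h> (for ETH_P_ALL) are assumed.
+ *
+ *   int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL)); // needs CAP_NET_ADMIN
+ *
+ *   struct sockaddr sa;
+ *   memset(&sa, 0, sizeof(sa));
+ *   sa.sa_family = PF_RING;
+ *   strncpy(sa.sa_data, "eth0", sizeof(sa.sa_data) - 1);
+ *   bind(fd, &sa, sizeof(sa));            // reaches packet_ring_bind()
+ *
+ *   // len must be a multiple of PAGE_SIZE and no larger than the ring's
+ *   // tot_mem; the mapping starts with FlowSlotInfo, followed by the slots.
+ *   char *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ */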
7736+/* ************************************* */
7737+
7738+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7739+static int ring_recvmsg(struct kiocb *iocb, struct socket *sock,
7740+ struct msghdr *msg, size_t len, int flags)
7741+#else
7742+ static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len,
7743+ int flags, struct scm_cookie *scm)
7744+#endif
7745+{
7746+ FlowSlot* slot;
7747+ struct ring_opt *pfr = ring_sk(sock->sk);
7748+ u_int32_t queued_pkts, num_loops = 0;
7749+
7750+#if defined(RING_DEBUG)
7751+ printk("ring_recvmsg called\n");
7752+#endif
7753+
7754+ slot = get_remove_slot(pfr);
7755+
7756+ while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) {
7757+ wait_event_interruptible(pfr->ring_slots_waitqueue, 1);
7758+
7759+#if defined(RING_DEBUG)
7760+ printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n",
7761+ slot->slot_state, queued_pkts, num_loops);
7762+#endif
7763+
7764+ if(queued_pkts > 0) {
7765+ if(num_loops++ > MAX_QUEUE_LOOPS)
7766+ break;
7767+ }
7768+ }
7769+
7770+#if defined(RING_DEBUG)
7771+ if(slot != NULL)
7772+ printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n",
7773+ queued_pkts, num_loops);
7774+#endif
7775+
7776+ return(queued_pkts);
7777+}
7778+
7779+/* ************************************* */
7780+
7781+unsigned int ring_poll(struct file * file,
7782+ struct socket *sock, poll_table *wait)
7783+{
7784+ FlowSlot* slot;
7785+ struct ring_opt *pfr = ring_sk(sock->sk);
7786+
7787+#if defined(RING_DEBUG)
7788+ printk("poll called\n");
7789+#endif
7790+
7791+ slot = get_remove_slot(pfr);
7792+
7793+ if((slot != NULL) && (slot->slot_state == 0))
7794+ poll_wait(file, &pfr->ring_slots_waitqueue, wait);
7795+
7796+#if defined(RING_DEBUG)
7797+ printk("poll returning %d\n", slot->slot_state);
7798+#endif
7799+
7800+ if((slot != NULL) && (slot->slot_state == 1))
7801+ return(POLLIN | POLLRDNORM);
7802+ else
7803+ return(0);
7804+}
7805+
7806+/* ************************************* */
7807+
7808+int add_to_cluster_list(struct ring_cluster *el,
7809+ struct sock *sock) {
7810+
7811+ if(el->num_cluster_elements == CLUSTER_LEN)
7812+ return(-1); /* Cluster full */
7813+
7814+ ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id;
7815+ el->sk[el->num_cluster_elements] = sock;
7816+ el->num_cluster_elements++;
7817+ return(0);
7818+}
7819+
7820+/* ************************************* */
7821+
7822+int remove_from_cluster_list(struct ring_cluster *el,
7823+ struct sock *sock) {
7824+ int i, j;
7825+
7826+ for(i=0; i<CLUSTER_LEN; i++)
7827+ if(el->sk[i] == sock) {
7828+ el->num_cluster_elements--;
7829+
7830+ if(el->num_cluster_elements > 0) {
7831+ /* The cluster contains other elements */
7832+ for(j=i; j<CLUSTER_LEN-1; j++)
7833+ el->sk[j] = el->sk[j+1];
7834+
7835+ el->sk[CLUSTER_LEN-1] = NULL;
7836+ } else {
7837+ /* Empty cluster */
7838+ memset(el->sk, 0, sizeof(el->sk));
7839+ }
7840+
7841+ return(0);
7842+ }
7843+
7844+ return(-1); /* Not found */
7845+}
7846+
7847+/* ************************************* */
7848+
7849+static int remove_from_cluster(struct sock *sock,
7850+ struct ring_opt *pfr)
7851+{
7852+ struct ring_cluster *el;
7853+
7854+#if defined(RING_DEBUG)
7855+ printk("--> remove_from_cluster(%d)\n", pfr->cluster_id);
7856+#endif
7857+
7858+ if(pfr->cluster_id == 0 /* 0 = No Cluster */)
7859+    return(0); /* Nothing to do */
7860+
7861+ el = ring_cluster_list;
7862+
7863+ while(el != NULL) {
7864+ if(el->cluster_id == pfr->cluster_id) {
7865+ return(remove_from_cluster_list(el, sock));
7866+ } else
7867+ el = el->next;
7868+ }
7869+
7870+ return(-EINVAL); /* Not found */
7871+}
7872+
7873+/* ************************************* */
7874+
7875+static int add_to_cluster(struct sock *sock,
7876+ struct ring_opt *pfr,
7877+ u_short cluster_id)
7878+{
7879+ struct ring_cluster *el;
7880+
7881+#if defined(RING_DEBUG)
7882+ printk("--> add_to_cluster(%d)\n", cluster_id);
7883+#endif
7884+
7885+ if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL);
7886+
7887+ if(pfr->cluster_id != 0)
7888+ remove_from_cluster(sock, pfr);
7889+
7890+ el = ring_cluster_list;
7891+
7892+ while(el != NULL) {
7893+ if(el->cluster_id == cluster_id) {
7894+ return(add_to_cluster_list(el, sock));
7895+ } else
7896+ el = el->next;
7897+ }
7898+
7899+ /* There's no existing cluster. We need to create one */
7900+ if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL)
7901+ return(-ENOMEM);
7902+
7903+ el->cluster_id = cluster_id;
7904+ el->num_cluster_elements = 1;
7905+ el->hashing_mode = cluster_per_flow; /* Default */
7906+ el->hashing_id = 0;
7907+
7908+ memset(el->sk, 0, sizeof(el->sk));
7909+ el->sk[0] = sock;
7910+ el->next = ring_cluster_list;
7911+ ring_cluster_list = el;
7912+ pfr->cluster_id = cluster_id;
7913+
7914+ return(0); /* 0 = OK */
7915+}
7916+
7917+/* ************************************* */
7918+
7919+/* Code taken/inspired from core/sock.c */
7920+static int ring_setsockopt(struct socket *sock,
7921+ int level, int optname,
7922+ char *optval, int optlen)
7923+{
7924+ struct ring_opt *pfr = ring_sk(sock->sk);
7925+ int val, found, ret = 0;
7926+ u_int cluster_id, do_enable;
7927+ char devName[8], bloom_filter[256], aho_pattern[256];
7928+
7929+ if(pfr == NULL) return(-EINVAL);
7930+
7931+ if (get_user(val, (int *)optval))
7932+ return -EFAULT;
7933+
7934+ found = 1;
7935+
7936+ switch(optname)
7937+ {
7938+ case SO_ATTACH_FILTER:
7939+ ret = -EINVAL;
7940+ if (optlen == sizeof(struct sock_fprog)) {
7941+ unsigned int fsize;
7942+ struct sock_fprog fprog;
7943+ struct sk_filter *filter;
7944+
7945+ ret = -EFAULT;
7946+
7947+ /*
7948+ NOTE
7949+
7950+ Do not call copy_from_user within a held
7951+	  spinlock (e.g. ring_mgmt_lock) as this caused
7952+ problems when certain debugging was enabled under
7953+ 2.6.5 -- including hard lockups of the machine.
7954+ */
7955+ if(copy_from_user(&fprog, optval, sizeof(fprog)))
7956+ break;
7957+
7958+ fsize = sizeof(struct sock_filter) * fprog.len;
7959+ filter = kmalloc(fsize, GFP_KERNEL);
7960+
7961+ if(filter == NULL) {
7962+ ret = -ENOMEM;
7963+ break;
7964+ }
7965+
7966+ if(copy_from_user(filter->insns, fprog.filter, fsize))
7967+ break;
7968+
7969+ filter->len = fprog.len;
7970+
7971+ if(sk_chk_filter(filter->insns, filter->len) != 0) {
7972+ /* Bad filter specified */
7973+ kfree(filter);
7974+ pfr->bpfFilter = NULL;
7975+ break;
7976+ }
7977+
7978+ /* get the lock, set the filter, release the lock */
7979+ write_lock(&ring_mgmt_lock);
7980+ pfr->bpfFilter = filter;
7981+ write_unlock(&ring_mgmt_lock);
7982+ ret = 0;
7983+ }
7984+ break;
7985+
7986+ case SO_DETACH_FILTER:
7987+ write_lock(&ring_mgmt_lock);
7988+ found = 1;
7989+ if(pfr->bpfFilter != NULL) {
7990+ kfree(pfr->bpfFilter);
7991+ pfr->bpfFilter = NULL;
7992+ write_unlock(&ring_mgmt_lock);
7993+ break;
7994+ }
7995+ ret = -ENONET;
7996+ break;
7997+
7998+ case SO_ADD_TO_CLUSTER:
7999+ if (optlen!=sizeof(val))
8000+ return -EINVAL;
8001+
8002+ if (copy_from_user(&cluster_id, optval, sizeof(cluster_id)))
8003+ return -EFAULT;
8004+
8005+ write_lock(&ring_mgmt_lock);
8006+ ret = add_to_cluster(sock->sk, pfr, cluster_id);
8007+ write_unlock(&ring_mgmt_lock);
8008+ break;
8009+
8010+ case SO_REMOVE_FROM_CLUSTER:
8011+ write_lock(&ring_mgmt_lock);
8012+ ret = remove_from_cluster(sock->sk, pfr);
8013+ write_unlock(&ring_mgmt_lock);
8014+ break;
8015+
8016+ case SO_SET_REFLECTOR:
8017+ if(optlen >= (sizeof(devName)-1))
8018+ return -EINVAL;
8019+
8020+ if(optlen > 0) {
8021+ if(copy_from_user(devName, optval, optlen))
8022+ return -EFAULT;
8023+ }
8024+
8025+ devName[optlen] = '\0';
8026+
8027+#if defined(RING_DEBUG)
8028+ printk("+++ SO_SET_REFLECTOR(%s)\n", devName);
8029+#endif
8030+
8031+ write_lock(&ring_mgmt_lock);
8032+ pfr->reflector_dev = dev_get_by_name(devName);
8033+ write_unlock(&ring_mgmt_lock);
8034+
8035+#if defined(RING_DEBUG)
8036+ if(pfr->reflector_dev != NULL)
8037+	  printk("SO_SET_REFLECTOR(%s): succeeded\n", devName);
8038+ else
8039+ printk("SO_SET_REFLECTOR(%s): device unknown\n", devName);
8040+#endif
8041+ break;
8042+
8043+ case SO_SET_BLOOM:
8044+ if(optlen >= (sizeof(bloom_filter)-1))
8045+ return -EINVAL;
8046+
8047+ if(optlen > 0) {
8048+ if(copy_from_user(bloom_filter, optval, optlen))
8049+ return -EFAULT;
8050+ }
8051+
8052+ bloom_filter[optlen] = '\0';
8053+
8054+ write_lock(&ring_mgmt_lock);
8055+ handle_bloom_filter_rule(pfr, bloom_filter);
8056+ write_unlock(&ring_mgmt_lock);
8057+ break;
8058+
8059+ case SO_SET_STRING:
8060+ if(optlen >= (sizeof(aho_pattern)-1))
8061+ return -EINVAL;
8062+
8063+ if(optlen > 0) {
8064+ if(copy_from_user(aho_pattern, optval, optlen))
8065+ return -EFAULT;
8066+ }
8067+
8068+ aho_pattern[optlen] = '\0';
8069+
8070+ write_lock(&ring_mgmt_lock);
8071+ if(pfr->acsm != NULL) acsmFree2(pfr->acsm);
8072+ if(optlen > 0) {
8073+#if 1
8074+ if((pfr->acsm = acsmNew2()) != NULL) {
8075+ int nc=1 /* case sensitive */, i = 0;
8076+
8077+ pfr->acsm->acsmFormat = ACF_BANDED;
8078+ acsmAddPattern2(pfr->acsm, (unsigned char*)aho_pattern,
8079+ (int)strlen(aho_pattern), nc, 0, 0,(void*)aho_pattern, i);
8080+ acsmCompile2(pfr->acsm);
8081+ }
8082+#else
8083+ pfr->acsm = kmalloc (10, GFP_KERNEL); /* TEST */
8084+#endif
8085+ }
8086+ write_unlock(&ring_mgmt_lock);
8087+ break;
8088+
8089+ case SO_TOGGLE_BLOOM_STATE:
8090+ if(optlen >= (sizeof(bloom_filter)-1))
8091+ return -EINVAL;
8092+
8093+      if(optlen > 0) {
+	/* cap the copy so a large optlen cannot overrun do_enable */
8094+	if(copy_from_user(&do_enable, optval,
+			  optlen > sizeof(do_enable) ? sizeof(do_enable) : optlen))
8095+	  return -EFAULT;
8096+      }
8097+
8098+ write_lock(&ring_mgmt_lock);
8099+ if(do_enable)
8100+ pfr->bitmask_enabled = 1;
8101+ else
8102+ pfr->bitmask_enabled = 0;
8103+ write_unlock(&ring_mgmt_lock);
8104+ printk("SO_TOGGLE_BLOOM_STATE: bloom bitmask %s\n",
8105+ pfr->bitmask_enabled ? "enabled" : "disabled");
8106+ break;
8107+
8108+ case SO_RESET_BLOOM_FILTERS:
8109+ if(optlen >= (sizeof(bloom_filter)-1))
8110+ return -EINVAL;
8111+
8112+      if(optlen > 0) {
+	/* same guard as above: never copy more than sizeof(do_enable) */
8113+	if(copy_from_user(&do_enable, optval,
+			  optlen > sizeof(do_enable) ? sizeof(do_enable) : optlen))
8114+	  return -EFAULT;
8115+      }
8116+
8117+ write_lock(&ring_mgmt_lock);
8118+ reset_bloom_filters(pfr);
8119+ write_unlock(&ring_mgmt_lock);
8120+ break;
8121+
8122+ default:
8123+ found = 0;
8124+ break;
8125+ }
8126+
8127+ if(found)
8128+ return(ret);
8129+ else
8130+ return(sock_setsockopt(sock, level, optname, optval, optlen));
8131+}
8132+
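+/*
+  Illustrative userspace usage (a sketch, not part of the kernel code;
+  the socket type/protocol values are assumptions based on how PF_RING
+  applications typically open a ring):
+
+    int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));
+    u_int32_t cluster_id = 5;
+    setsockopt(fd, 0, SO_ADD_TO_CLUSTER, &cluster_id, sizeof(cluster_id));
+
+  Options not handled above fall through to sock_setsockopt(), so the
+  standard SOL_SOCKET options keep working on ring sockets.
+*/
+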
8133+/* ************************************* */
8134+
8135+static int ring_ioctl(struct socket *sock,
8136+ unsigned int cmd, unsigned long arg)
8137+{
8138+ switch(cmd)
8139+ {
8140+#ifdef CONFIG_INET
8141+ case SIOCGIFFLAGS:
8142+ case SIOCSIFFLAGS:
8143+ case SIOCGIFCONF:
8144+ case SIOCGIFMETRIC:
8145+ case SIOCSIFMETRIC:
8146+ case SIOCGIFMEM:
8147+ case SIOCSIFMEM:
8148+ case SIOCGIFMTU:
8149+ case SIOCSIFMTU:
8150+ case SIOCSIFLINK:
8151+ case SIOCGIFHWADDR:
8152+ case SIOCSIFHWADDR:
8153+ case SIOCSIFMAP:
8154+ case SIOCGIFMAP:
8155+ case SIOCSIFSLAVE:
8156+ case SIOCGIFSLAVE:
8157+ case SIOCGIFINDEX:
8158+ case SIOCGIFNAME:
8159+ case SIOCGIFCOUNT:
8160+ case SIOCSIFHWBROADCAST:
8161+ return(inet_dgram_ops.ioctl(sock, cmd, arg));
8162+#endif
8163+
8164+ default:
8165+ return -ENOIOCTLCMD;
8166+ }
8167+
8168+ return 0;
8169+}
8170+
8171+/* ************************************* */
8172+
8173+static struct proto_ops ring_ops = {
8174+ .family = PF_RING,
8175+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
8176+ .owner = THIS_MODULE,
8177+#endif
8178+
8179+ /* Operations that make no sense on ring sockets. */
8180+ .connect = sock_no_connect,
8181+ .socketpair = sock_no_socketpair,
8182+ .accept = sock_no_accept,
8183+ .getname = sock_no_getname,
8184+ .listen = sock_no_listen,
8185+ .shutdown = sock_no_shutdown,
8186+ .sendpage = sock_no_sendpage,
8187+ .sendmsg = sock_no_sendmsg,
8188+ .getsockopt = sock_no_getsockopt,
8189+
8190+ /* Now the operations that really occur. */
8191+ .release = ring_release,
8192+ .bind = ring_bind,
8193+ .mmap = ring_mmap,
8194+ .poll = ring_poll,
8195+ .setsockopt = ring_setsockopt,
8196+ .ioctl = ring_ioctl,
8197+ .recvmsg = ring_recvmsg,
8198+};
8199+
8200+/* ************************************ */
8201+
8202+static struct net_proto_family ring_family_ops = {
8203+ .family = PF_RING,
8204+ .create = ring_create,
8205+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
8206+ .owner = THIS_MODULE,
8207+#endif
8208+};
8209+
8210+// BD: API changed in 2.6.12, ref:
8211+// http://svn.clkao.org/svnweb/linux/revision/?rev=28201
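+// On those newer kernels sk_alloc() takes a struct proto and sizes the
+// socket allocation from .obj_size, hence the stub protocol below.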
8212+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
8213+static struct proto ring_proto = {
8214+ .name = "PF_RING",
8215+ .owner = THIS_MODULE,
8216+ .obj_size = sizeof(struct sock),
8217+};
8218+#endif
8219+
8220+/* ************************************ */
8221+
8222+static void __exit ring_exit(void)
8223+{
8224+  struct list_head *ptr, *next;
8225+  struct ring_element *entry;
8226+
+  /* entries are freed inside the loop, so save the next pointer first */
8227+  for(ptr = ring_table.next; ptr != &ring_table; ptr = next) {
+    next = ptr->next;
8228+    entry = list_entry(ptr, struct ring_element, list);
8229+    kfree(entry);
8230+  }
8231+
8232+ while(ring_cluster_list != NULL) {
8233+ struct ring_cluster *next = ring_cluster_list->next;
8234+ kfree(ring_cluster_list);
8235+ ring_cluster_list = next;
8236+ }
8237+
8238+ set_skb_ring_handler(NULL);
8239+ set_buffer_ring_handler(NULL);
8240+ sock_unregister(PF_RING);
8241+ ring_proc_term();
8242+ printk("PF_RING shut down.\n");
8243+}
8244+
8245+/* ************************************ */
8246+
8247+static int __init ring_init(void)
8248+{
8249+ printk("Welcome to PF_RING %s\n(C) 2004-07 L.Deri <deri@ntop.org>\n",
8250+ RING_VERSION);
8251+
8252+ INIT_LIST_HEAD(&ring_table);
8253+ ring_cluster_list = NULL;
8254+
8255+ sock_register(&ring_family_ops);
8256+
8257+ set_skb_ring_handler(skb_ring_handler);
8258+ set_buffer_ring_handler(buffer_ring_handler);
8259+
8260+ if(get_buffer_ring_handler() != buffer_ring_handler) {
8261+ printk("PF_RING: set_buffer_ring_handler FAILED\n");
8262+
8263+ set_skb_ring_handler(NULL);
8264+ set_buffer_ring_handler(NULL);
8265+ sock_unregister(PF_RING);
8266+ return -1;
8267+ } else {
8268+ printk("PF_RING: bucket length %d bytes\n", bucket_len);
8269+ printk("PF_RING: ring slots %d\n", num_slots);
8270+ printk("PF_RING: sample rate %d [1=no sampling]\n", sample_rate);
8271+ printk("PF_RING: capture TX %s\n",
8272+ enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
8273+ printk("PF_RING: transparent mode %s\n",
8274+ transparent_mode ? "Yes" : "No");
8275+
8276+ printk("PF_RING initialized correctly.\n");
8277+
8278+ ring_proc_init();
8279+ return 0;
8280+ }
8281+}
8282+
8283+module_init(ring_init);
8284+module_exit(ring_exit);
8285+MODULE_LICENSE("GPL");
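+/* Module metadata; the author string follows the banner printed in ring_init() */
+MODULE_AUTHOR("Luca Deri <deri@ntop.org>");
+MODULE_DESCRIPTION("Packet capture ring (PF_RING)");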
8286+
8287+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
8288+MODULE_ALIAS_NETPROTO(PF_RING);
8289+#endif