1 diff --unified --recursive --new-file linux-2.6.30/include/linux/ring.h linux-2.6.30-1-686-smp-PF_RING/include/linux/ring.h
2 --- linux-2.6.30/include/linux/ring.h 1970-01-01 01:00:00.000000000 +0100
3 +++ linux-2.6.30-1-686-smp-PF_RING/include/linux/ring.h 2009-07-21 04:40:31.308485480 +0200
6 + * Definitions for packet ring
8 + * 2004-09 Luca Deri <deri@ntop.org>
14 +#define INCLUDE_MAC_INFO
16 +#ifdef INCLUDE_MAC_INFO
17 +#define SKB_DISPLACEMENT 14 /* Include MAC address information */
19 +#define SKB_DISPLACEMENT 0 /* Do NOT include MAC address information */
23 +#define RING_MAGIC_VALUE 0x88
24 +#define RING_FLOWSLOT_VERSION 9
26 +#define DEFAULT_BUCKET_LEN 128
27 +#define MAX_NUM_DEVICES 256
30 +#define RING_VERSION "3.9.5"
31 +#define RING_VERSION_NUM 0x030905
34 +#define SO_ADD_TO_CLUSTER 99
35 +#define SO_REMOVE_FROM_CLUSTER 100
36 +#define SO_SET_REFLECTOR 101
37 +#define SO_SET_STRING 102
38 +#define SO_ADD_FILTERING_RULE 103
39 +#define SO_REMOVE_FILTERING_RULE 104
40 +#define SO_TOGGLE_FILTER_POLICY 105
41 +#define SO_SET_SAMPLING_RATE 106
42 +#define SO_ACTIVATE_RING 107
43 +#define SO_RING_BUCKET_LEN 108
44 +#define SO_SET_CHANNEL_ID 109
45 +#define SO_PURGE_IDLE_HASH_RULES 110 /* inactivity (sec) */
46 +#define SO_SET_APPL_NAME 111
49 +#define SO_GET_RING_VERSION 120
50 +#define SO_GET_FILTERING_RULE_STATS 121
51 +#define SO_GET_HASH_FILTERING_RULE_STATS 122
52 +#define SO_GET_MAPPED_DNA_DEVICE 123
55 +#define SO_MAP_DNA_DEVICE 130
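These options are consumed by the module's setsockopt() hook on PF_RING sockets. A minimal userland sketch, assuming a setsockopt level of 0 (a value this header does not pin down) and a loaded pf_ring module:

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

#define PF_RING           27   /* as defined later in this header */
#define SO_ADD_TO_CLUSTER 99
#define SO_ACTIVATE_RING  107

int main(void) {
  u_short cluster_id = 1;
  char dummy = 0;
  int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));

  if (fd < 0) { perror("socket (pf_ring loaded?)"); return 1; }

  /* Join cluster #1 so several sockets share one logical ring */
  if (setsockopt(fd, 0, SO_ADD_TO_CLUSTER, &cluster_id, sizeof(cluster_id)) < 0)
    perror("SO_ADD_TO_CLUSTER");

  /* Start capturing on this ring */
  if (setsockopt(fd, 0, SO_ACTIVATE_RING, &dummy, sizeof(dummy)) < 0)
    perror("SO_ACTIVATE_RING");

  return 0;
}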
57 +/* **************** regexp.h ******************* */
60 +http://www.opensource.apple.com/darwinsource/10.3/expect-1/expect/expect.h ,
61 +which contains a version of this library, says:
64 + * NSUBEXP must be at least 10, and no greater than 117 or the parser
65 + * will not work properly.
68 +However, it looks rather like this library is limited to 10. If you think
69 +otherwise, let us know.
73 +typedef struct regexp {
74 + char *startp[NSUBEXP];
75 + char *endp[NSUBEXP];
76 + char regstart; /* Internal use only. */
77 + char reganch; /* Internal use only. */
78 + char *regmust; /* Internal use only. */
79 + int regmlen; /* Internal use only. */
80 + char program[1]; /* Unwarranted chumminess with compiler. */
83 +regexp * regcomp(char *exp, int *patternsize);
84 +int regexec(regexp *prog, char *string);
85 +void regsub(regexp *prog, char *source, char *dest);
86 +void regerror(char *s);
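A minimal in-kernel sketch of this API (not part of the patch). Note that this regcomp() is the embedded variant above, which reports the size of the compiled program through patternsize, not the POSIX function; that the result is kfree()-able is an assumption.

static int payload_matches(char *payload)
{
  static char pattern[] = "GET /[a-z]+";
  int patternsize, match = 0;
  regexp *re = regcomp(pattern, &patternsize);

  if (re != NULL) {
    match = regexec(re, payload);  /* 1 = match, 0 = no match */
    kfree(re);                     /* assumption: kmalloc()'d by regcomp() */
  }
  return match;
}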
89 + * The first byte of the regexp internal "program" is actually this magic
90 + * number; the start node begins in the second byte.
94 +/* *********************************** */
96 +struct pkt_aggregation_info {
97 + u_int32_t num_pkts, num_bytes;
98 + struct timeval first_seen, last_seen;
102 + Note that as offsets *can* be negative,
103 + please do not change them to unsigned
106 + int16_t eth_offset; /* This offset *must* be added to all offsets below */
107 + int16_t vlan_offset;
110 + int16_t payload_offset;
113 +struct pkt_parsing_info {
114 + /* Core fields (also used by NetFlow) */
115 + u_int16_t eth_type; /* Ethernet type */
116 + u_int16_t vlan_id; /* VLAN Id or NO_VLAN */
117 + u_int8_t l3_proto, ipv4_tos; /* Layer 3 protocol/TOS */
118 + u_int32_t ipv4_src, ipv4_dst; /* IPv4 src/dst IP addresses */
119 + u_int16_t l4_src_port, l4_dst_port; /* Layer 4 src/dst ports */
120 + u_int8_t tcp_flags; /* TCP flags (0 if not available) */
122 + u_int16_t last_matched_plugin_id; /* If > 0, identifies the plugin that matched the packet */
124 + struct pkt_offset offset; /* Offsets of L3/L4/payload elements */
125 + struct pkt_aggregation_info aggregation; /* Future or plugin use */
129 +struct pfring_pkthdr {
130 + struct timeval ts; /* time stamp */
131 + u_int32_t caplen; /* length of portion present */
132 + u_int32_t len; /* length this packet (off wire) */
133 + struct pkt_parsing_info parsed_pkt; /* packet parsing info */
134 + u_int16_t parsed_header_len; /* Extra parsing data before packet */
137 +/* *********************************** */
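For illustration (a userland sketch, not in the patch), a consumer can read the pre-parsed 5-tuple straight from this header and skip dissecting the packet a second time:

void print_flow(struct pfring_pkthdr *hdr)
{
  struct pkt_parsing_info *p = &hdr->parsed_pkt;

  /* IPv4 addresses and ports are stored as plain integers here */
  printf("l3_proto=%u %08x:%u -> %08x:%u vlan=%u\n",
         p->l3_proto,
         p->ipv4_src, p->l4_src_port,
         p->ipv4_dst, p->l4_dst_port,
         p->vlan_id);
}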
139 +#define MAX_PLUGIN_ID 64
140 +#define MAX_PLUGIN_FIELDS 32
142 +/* ************************************************* */
145 + u_int8_t proto; /* Use 0 for 'any' protocol */
146 + u_int16_t vlan_id; /* Use '0' for any vlan */
147 + u_int32_t host_low, host_high; /* Use '0' for any host. This is applied to both source
148 + and destination. */
149 + u_int16_t port_low, port_high; /* All ports between port_low...port_high
150 + 0 means 'any' port. This is applied to both source
151 + and destination. This means that
152 + (proto, sip, sport, dip, dport) matches the rule if
153 + one of "sip & sport", "sip & dport", "dip & sport"
155 +} filtering_rule_core_fields;
157 +/* ************************************************* */
159 +#define FILTER_PLUGIN_DATA_LEN 256
162 + char payload_pattern[32]; /* If strlen(payload_pattern) > 0, the packet payload
163 + must match the specified pattern */
164 + u_int16_t filter_plugin_id; /* If > 0 identifies a plugin to which the datastructure
165 + below will be passed for matching */
166 + char filter_plugin_data[FILTER_PLUGIN_DATA_LEN];
167 + /* Opaque datastructure that is interpreted by the
168 + specified plugin and that specifies a filtering
169 + criteria to be checked for match. Usually this data
170 + is recast into a more meaningful datastructure
172 +} filtering_rule_extended_fields;
174 +/* ************************************************* */
177 + /* Plugin Action */
178 + u_int16_t plugin_id; /* ('0'=no plugin) id of the plugin associated with this rule */
179 +} filtering_rule_plugin_action;
182 + forward_packet_and_stop_rule_evaluation = 0,
183 + dont_forward_packet_and_stop_rule_evaluation,
184 + execute_action_and_continue_rule_evaluation,
185 + forward_packet_add_rule_and_stop_rule_evaluation
186 +} rule_action_behaviour;
189 + forward_packet = 100,
190 + dont_forward_packet,
191 + use_rule_forward_policy
192 +} packet_action_behaviour;
195 + u_int16_t rule_id; /* Rules are processed in order from lowest to highest id */
196 + rule_action_behaviour rule_action; /* What to do in case of match */
197 + u_int8_t balance_id, balance_pool; /* If balance_pool > 0, then pass the packet above only if the
198 + (hash(proto, sip, sport, dip, dport) % balance_pool)
200 + filtering_rule_core_fields core_fields;
201 + filtering_rule_extended_fields extended_fields;
202 + filtering_rule_plugin_action plugin_action;
203 + unsigned long jiffies_last_match; /* Jiffies of the last rule match (updated by pf_ring) */
206 +/* *********************************** */
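A hedged userland sketch of installing one of these rules, reusing the fd from the earlier socket sketch (setsockopt level 0 again assumed): drop all TCP traffic on port 80, evaluated at position 5.

filtering_rule rule;

memset(&rule, 0, sizeof(rule));
rule.rule_id     = 5;   /* rules run in ascending rule_id order */
rule.rule_action = dont_forward_packet_and_stop_rule_evaluation;
rule.core_fields.proto    = 6;   /* TCP */
rule.core_fields.port_low = rule.core_fields.port_high = 80;

if (setsockopt(fd, 0, SO_ADD_FILTERING_RULE, &rule, sizeof(rule)) < 0)
  perror("SO_ADD_FILTERING_RULE");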
208 +/* Hash size used for precise packet matching */
209 +#define DEFAULT_RING_HASH_SIZE 4096
212 + * The hashtable contains only perfect matches: no
213 + * wildcards or the like are accepted.
218 + u_int32_t host_peer_a, host_peer_b;
219 + u_int16_t port_peer_a, port_peer_b;
221 + rule_action_behaviour rule_action; /* What to do in case of match */
222 + filtering_rule_plugin_action plugin_action;
223 + unsigned long jiffies_last_match; /* Jiffies of the last rule match (updated by pf_ring) */
224 +} hash_filtering_rule;
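Sketch of an exact-match entry for a single flow, using only the fields shown above; that addresses are given in host byte order, and that SO_ADD_FILTERING_RULE distinguishes the two rule types by payload size, are assumptions.

hash_filtering_rule hrule;

memset(&hrule, 0, sizeof(hrule));
hrule.host_peer_a = ntohl(inet_addr("10.0.0.1"));
hrule.host_peer_b = ntohl(inet_addr("10.0.0.2"));
hrule.port_peer_a = 1234;
hrule.port_peer_b = 80;
hrule.rule_action = forward_packet_and_stop_rule_evaluation;

if (setsockopt(fd, 0, SO_ADD_FILTERING_RULE, &hrule, sizeof(hrule)) < 0)
  perror("hash rule");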
226 +/* ************************************************* */
228 +typedef struct _filtering_hash_bucket {
229 + hash_filtering_rule rule;
230 + void *plugin_data_ptr; /* ptr to a *contiguous* memory area
231 + allocated by the plugin */
232 + u_int16_t plugin_data_ptr_len;
233 + struct _filtering_hash_bucket *next;
234 +} filtering_hash_bucket;
236 +/* *********************************** */
238 +#define RING_MIN_SLOT_SIZE (60+sizeof(struct pfring_pkthdr))
239 +#define RING_MAX_SLOT_SIZE (1514+sizeof(struct pfring_pkthdr))
242 +#define min(a,b) (((a) < (b)) ? (a) : (b))
245 +/* *********************************** */
246 +/* False sharing reference: http://en.wikipedia.org/wiki/False_sharing */
248 +typedef struct flowSlotInfo {
249 + u_int16_t version, sample_rate;
250 + u_int32_t tot_slots, slot_len, data_len, tot_mem;
251 + u_int64_t tot_pkts, tot_lost, tot_insert, tot_read;
252 + u_int32_t insert_idx;
253 + u_int8_t padding[72]; /* Used to avoid false sharing */
254 + u_int32_t remove_idx;
255 + u_int32_t padding2[31]; /* Used to avoid false sharing */
258 +/* *********************************** */
260 +typedef struct flowSlot {
262 + u_char magic; /* It must always be zero */
264 + u_char slot_state; /* 0=empty, 1=full */
265 + u_char bucket; /* bucket[bucketLen] */
268 +/* *********************************** */
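Putting flowSlotInfo and flowSlot together, a userland consumer loop over the mmap()ed area might look like the sketch below. The slot memory layout and wrap-around arithmetic are assumptions inferred from the fields above; FlowSlot is the assumed typedef name for struct flowSlot, and process_packet() is a hypothetical callback.

FlowSlotInfo *info  = (FlowSlotInfo *) ring_memory;
char         *slots = (char *) ring_memory + sizeof(FlowSlotInfo);

while (info->tot_read < info->tot_insert) {
  FlowSlot *slot = (FlowSlot *) &slots[info->remove_idx * info->slot_len];

  if (slot->slot_state == 1) {                  /* 1 = full */
    process_packet((struct pfring_pkthdr *) &slot->bucket);
    slot->slot_state = 0;                       /* hand the slot back */
    info->remove_idx = (info->remove_idx + 1) % info->tot_slots;
    info->tot_read++;
  }
}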
272 +FlowSlotInfo* getRingPtr(void);
273 +int allocateRing(char *deviceName, u_int numSlots,
274 + u_int bucketLen, u_int sampleRate);
275 +unsigned int pollRing(struct file *fp, struct poll_table_struct * wait);
276 +void deallocateRing(void);
278 +/* ************************* */
280 +#endif /* __KERNEL__ */
282 +/* *********************************** */
284 +#define PF_RING 27 /* Packet Ring */
285 +#define SOCK_RING PF_RING
288 +#define SIORINGPOLL 0x8888
290 +/* ************************************************* */
292 +typedef int (*dna_wait_packet)(void *adapter, int mode);
295 + add_device_mapping = 0, remove_device_mapping
296 +} dna_device_operation;
299 + intel_e1000 = 0, intel_igb, intel_ixgbe
303 + unsigned long packet_memory; /* Invalid in userland */
304 + u_int packet_memory_num_slots;
305 + u_int packet_memory_slot_len;
306 + u_int packet_memory_tot_len;
307 + void *descr_packet_memory; /* Invalid in userland */
308 + u_int descr_packet_memory_num_slots;
309 + u_int descr_packet_memory_slot_len;
310 + u_int descr_packet_memory_tot_len;
312 + char *phys_card_memory; /* Invalid in userland */
313 + u_int phys_card_memory_len;
314 + struct net_device *netdev; /* Invalid in userland */
315 + dna_device_model device_model;
317 + wait_queue_head_t *packet_waitqueue;
319 + void *packet_waitqueue;
321 + u_int8_t *interrupt_received, in_use;
323 + dna_wait_packet wait_packet_function_ptr;
327 + dna_device_operation operation;
328 + char device_name[8];
329 + int32_t channel_id;
330 +} dna_device_mapping;
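A hedged userland sketch of requesting a DNA mapping through SO_MAP_DNA_DEVICE (setsockopt level 0 once more assumed):

dna_device_mapping mapping;

memset(&mapping, 0, sizeof(mapping));
mapping.operation  = add_device_mapping;
mapping.channel_id = 0;
strncpy(mapping.device_name, "eth1", sizeof(mapping.device_name) - 1);

if (setsockopt(fd, 0, SO_MAP_DNA_DEVICE, &mapping, sizeof(mapping)) < 0)
  perror("SO_MAP_DNA_DEVICE");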
332 +/* ************************************************* */
337 + cluster_per_flow = 0,
338 + cluster_round_robin
341 +#define CLUSTER_LEN 8
344 + * A ring cluster is used to group together rings used by various applications
345 + * so that they look, from the PF_RING point of view, as a single ring.
346 + * This means that developers can use clusters for sharing packets across
347 + * applications using various policies as specified in the hashing_mode
350 +struct ring_cluster {
351 + u_short cluster_id; /* 0 = no cluster */
352 + u_short num_cluster_elements;
353 + enum cluster_type hashing_mode;
354 + u_short hashing_id;
355 + struct sock *sk[CLUSTER_LEN];
359 + * Linked-list of ring clusters.
362 + struct ring_cluster cluster;
363 + struct list_head list;
364 +} ring_cluster_element;
368 + struct list_head list;
371 +/* ************************************************* */
374 + * Linked-list of ring sockets.
376 +struct ring_element {
377 + struct list_head list;
381 +/* ************************************************* */
383 +struct ring_opt; /* Forward declaration */
385 +typedef int (*do_handle_filtering_hash_bucket)(struct ring_opt *pfr,
386 + filtering_hash_bucket* rule,
389 +/* ************************************************* */
391 +#define RING_ANY_CHANNEL -1
397 + u_int8_t ring_active;
398 + struct net_device *ring_netdev;
401 + char *appl_name; /* String that identifies the application bound to the socket */
403 + /* Direct NIC Access */
404 + u_int8_t mmap_count;
405 + dna_device *dna_device;
408 + u_short cluster_id; /* 0 = no cluster */
411 + int32_t channel_id; /* -1 = any channel */
414 + struct net_device *reflector_dev; /* Reflector device */
416 + /* Packet buffers */
417 + unsigned long order;
420 + void * ring_memory;
421 + u_int32_t bucket_len;
422 + FlowSlotInfo *slots_info; /* Points to ring_memory */
423 + char *ring_slots; /* Points to ring_memory+sizeof(FlowSlotInfo) */
425 + /* Packet Sampling */
426 + u_int32_t pktToSample, sample_rate;
429 + struct sk_filter *bpfFilter;
431 + /* Filtering Rules */
432 + filtering_hash_bucket **filtering_hash;
433 + u_int16_t num_filtering_rules;
434 + u_int8_t rules_default_accept_policy; /* 1=default policy is accept, drop otherwise */
435 + struct list_head rules;
438 + atomic_t num_ring_users;
439 + wait_queue_head_t ring_slots_waitqueue;
440 + rwlock_t ring_index_lock, ring_rules_lock;
442 + /* Indexes (Internal) */
443 + u_int insert_page_id, insert_slot_id;
445 + /* Function pointer */
446 + do_handle_filtering_hash_bucket handle_hash_rule;
449 +/* **************************************** */
452 + * Linked-list of device rings
455 + struct ring_opt *the_ring;
456 + struct list_head list;
457 +} device_ring_list_element;
459 +/* **************************************** */
462 + filtering_rule rule;
464 + struct list_head list;
466 + /* Plugin action */
467 + void *plugin_data_ptr; /* ptr to a *contiguous* memory area allocated by the plugin */
468 +} filtering_rule_element;
470 +struct parse_buffer {
475 +/* **************************************** */
478 +/* Execute an action (e.g. update rule stats) */
479 +typedef int (*plugin_handle_skb)(struct ring_opt *the_ring,
480 + filtering_rule_element *rule, /* In case the match is on the list */
481 + filtering_hash_bucket *hash_bucket, /* In case the match is on the hash */
482 + struct pfring_pkthdr *hdr,
483 + struct sk_buff *skb,
484 + u_int16_t filter_plugin_id,
485 + struct parse_buffer **filter_rule_memory_storage,
486 + packet_action_behaviour *behaviour);
487 +/* Return 1/0 in case of match/no match for the given skb */
488 +typedef int (*plugin_filter_skb)(struct ring_opt *the_ring,
489 + filtering_rule_element *rule,
490 + struct pfring_pkthdr *hdr,
491 + struct sk_buff *skb,
492 + struct parse_buffer **filter_rule_memory_storage);
493 +/* Get stats about the rule */
494 +typedef int (*plugin_get_stats)(struct ring_opt *pfr,
495 + filtering_rule_element *rule,
496 + filtering_hash_bucket *hash_bucket,
497 + u_char* stats_buffer, u_int stats_buffer_len);
499 +/* Called when a ring is disposed */
500 +typedef void (*plugin_free_ring_mem)(filtering_rule_element *rule);
502 +struct pfring_plugin_registration {
503 + u_int16_t plugin_id;
504 + char name[16]; /* Unique plugin name (e.g. sip, udp) */
505 + char description[64]; /* Short plugin description */
506 + plugin_filter_skb pfring_plugin_filter_skb; /* Filter skb: 1=match, 0=no match */
507 + plugin_handle_skb pfring_plugin_handle_skb;
508 + plugin_get_stats pfring_plugin_get_stats;
509 + plugin_free_ring_mem pfring_plugin_free_ring_mem;
512 +typedef int (*register_pfring_plugin)(struct pfring_plugin_registration
514 +typedef int (*unregister_pfring_plugin)(u_int16_t pfring_plugin_id);
515 +typedef u_int (*read_device_pfring_free_slots)(int ifindex);
516 +typedef void (*handle_ring_dna_device)(dna_device_operation operation,
517 + unsigned long packet_memory,
518 + u_int packet_memory_num_slots,
519 + u_int packet_memory_slot_len,
520 + u_int packet_memory_tot_len,
521 + void *descr_packet_memory,
522 + u_int descr_packet_memory_num_slots,
523 + u_int descr_packet_memory_slot_len,
524 + u_int descr_packet_memory_tot_len,
526 + void *phys_card_memory,
527 + u_int phys_card_memory_len,
528 + struct net_device *netdev,
529 + dna_device_model device_model,
530 + wait_queue_head_t *packet_waitqueue,
531 + u_int8_t *interrupt_received,
533 + dna_wait_packet wait_packet_function_ptr);
535 +extern register_pfring_plugin get_register_pfring_plugin(void);
536 +extern unregister_pfring_plugin get_unregister_pfring_plugin(void);
537 +extern read_device_pfring_free_slots get_read_device_pfring_free_slots(void);
539 +extern void set_register_pfring_plugin(register_pfring_plugin the_handler);
540 +extern void set_unregister_pfring_plugin(unregister_pfring_plugin the_handler);
541 +extern void set_read_device_pfring_free_slots(read_device_pfring_free_slots the_handler);
543 +extern int do_register_pfring_plugin(struct pfring_plugin_registration *reg);
544 +extern int do_unregister_pfring_plugin(u_int16_t pfring_plugin_id);
545 +extern int do_read_device_pfring_free_slots(int deviceidx);
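Taken together, a filtering plugin is a separate module that fills in a pfring_plugin_registration and hands it to do_register_pfring_plugin() at init time. A stub sketch, where dummy_filter_skb and the plugin id are made-up examples:

static int dummy_filter_skb(struct ring_opt *the_ring,
                            filtering_rule_element *rule,
                            struct pfring_pkthdr *hdr,
                            struct sk_buff *skb,
                            struct parse_buffer **mem)
{
  return 1;  /* 1 = every packet matches, 0 = no match */
}

static struct pfring_plugin_registration reg = {
  .plugin_id                = 7,        /* unique, below MAX_PLUGIN_ID */
  .name                     = "dummy",
  .description              = "match-everything demo",
  .pfring_plugin_filter_skb = dummy_filter_skb,
};

static int __init dummy_plugin_init(void)
{
  return do_register_pfring_plugin(&reg);
}

static void __exit dummy_plugin_exit(void)
{
  do_unregister_pfring_plugin(reg.plugin_id);
}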
547 +extern handle_ring_dna_device get_ring_dna_device_handler(void);
548 +extern void set_ring_dna_device_handler(handle_ring_dna_device
549 + the_dna_device_handler);
550 +extern void do_ring_dna_device_handler(dna_device_operation operation,
551 + unsigned long packet_memory,
552 + u_int packet_memory_num_slots,
553 + u_int packet_memory_slot_len,
554 + u_int packet_memory_tot_len,
555 + void *descr_packet_memory,
556 + u_int descr_packet_memory_num_slots,
557 + u_int descr_packet_memory_slot_len,
558 + u_int descr_packet_memory_tot_len,
560 + void *phys_card_memory,
561 + u_int phys_card_memory_len,
562 + struct net_device *netdev,
563 + dna_device_model device_model,
564 + wait_queue_head_t *packet_waitqueue,
565 + u_int8_t *interrupt_received,
567 + dna_wait_packet wait_packet_function_ptr);
569 +typedef int (*handle_ring_skb)(struct sk_buff *skb, u_char recv_packet,
570 + u_char real_skb, short channel_id);
571 +extern handle_ring_skb get_skb_ring_handler(void);
572 +extern void set_skb_ring_handler(handle_ring_skb the_handler);
573 +extern void do_skb_ring_handler(struct sk_buff *skb,
574 + u_char recv_packet, u_char real_skb);
576 +typedef int (*handle_ring_buffer)(struct net_device *dev,
577 + char *data, int len);
578 +extern handle_ring_buffer get_buffer_ring_handler(void);
579 +extern void set_buffer_ring_handler(handle_ring_buffer the_handler);
580 +extern int do_buffer_ring_handler(struct net_device *dev,
581 + char *data, int len);
583 +typedef int (*handle_add_hdr_to_ring)(struct ring_opt *pfr,
584 + struct pfring_pkthdr *hdr);
585 +extern handle_add_hdr_to_ring get_add_hdr_to_ring(void);
586 +extern void set_add_hdr_to_ring(handle_add_hdr_to_ring the_handler);
587 +extern int do_add_hdr_to_ring(struct ring_opt *pfr, struct pfring_pkthdr *hdr);
589 +#endif /* __KERNEL__ */
592 +/* *********************************** */
594 +#endif /* __RING_H */
595 diff --unified --recursive --new-file linux-2.6.30/net/core/dev.c linux-2.6.30-1-686-smp-PF_RING/net/core/dev.c
596 --- linux-2.6.30/net/core/dev.c 2009-06-10 05:05:27.000000000 +0200
597 +++ linux-2.6.30-1-686-smp-PF_RING/net/core/dev.c 2009-07-21 04:40:31.365770966 +0200
598 @@ -129,6 +129,196 @@
600 #include "net-sysfs.h"
602 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
604 +/* #define RING_DEBUG */
606 +#include <linux/ring.h>
607 +#include <linux/version.h>
609 +/* ************************************************ */
611 +static handle_ring_skb ring_handler = NULL;
613 +handle_ring_skb get_skb_ring_handler() { return(ring_handler); }
615 +void set_skb_ring_handler(handle_ring_skb the_handler) {
616 + ring_handler = the_handler;
619 +void do_skb_ring_handler(struct sk_buff *skb,
620 + u_char recv_packet, u_char real_skb) {
622 + ring_handler(skb, recv_packet, real_skb, -1 /* Unknown channel */);
625 +/* ************************************************ */
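The get/set/do triple above is the glue between the static kernel and the loadable ring module: dev.c owns the function pointer, the module installs it. A minimal sketch of the module side (my_ring_handler is a hypothetical name):

static int my_ring_handler(struct sk_buff *skb, u_char recv_packet,
                           u_char real_skb, short channel_id)
{
  /* copy the skb into the matching ring(s); return 1 once consumed */
  return 0;
}

static int __init ring_init(void)
{
  set_skb_ring_handler(my_ring_handler);
  return 0;
}

static void __exit ring_exit(void)
{
  set_skb_ring_handler(NULL);   /* detach before the module text goes away */
}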
627 +static handle_ring_buffer buffer_ring_handler = NULL;
629 +handle_ring_buffer get_buffer_ring_handler() { return(buffer_ring_handler); }
631 +void set_buffer_ring_handler(handle_ring_buffer the_handler) {
632 + buffer_ring_handler = the_handler;
635 +int do_buffer_ring_handler(struct net_device *dev, char *data, int len) {
636 + if(buffer_ring_handler) {
637 + buffer_ring_handler(dev, data, len);
643 +/* ******************* */
645 +static handle_add_hdr_to_ring buffer_add_hdr_to_ring = NULL;
647 +handle_add_hdr_to_ring get_add_hdr_to_ring() { return(buffer_add_hdr_to_ring); }
649 +void set_add_hdr_to_ring(handle_add_hdr_to_ring the_handler) {
650 + buffer_add_hdr_to_ring = the_handler;
653 +int do_add_hdr_to_ring(struct ring_opt *pfr, struct pfring_pkthdr *hdr) {
654 + if(buffer_add_hdr_to_ring) {
655 + buffer_add_hdr_to_ring(pfr, hdr);
661 +/* ************************************************ */
663 +static register_pfring_plugin pfring_registration = NULL;
665 +register_pfring_plugin get_register_pfring_plugin() { return(pfring_registration); }
667 +void set_register_pfring_plugin(register_pfring_plugin the_handler) {
668 + pfring_registration = the_handler;
671 +int do_register_pfring_plugin(struct pfring_plugin_registration *reg) {
672 + if(pfring_registration) {
673 + pfring_registration(reg);
679 +/* ************************************************ */
681 +static unregister_pfring_plugin pfring_unregistration = NULL;
683 +unregister_pfring_plugin get_unregister_pfring_plugin() { return(pfring_unregistration); }
685 +void set_unregister_pfring_plugin(unregister_pfring_plugin the_handler) {
686 + pfring_unregistration = the_handler;
689 +int do_unregister_pfring_plugin(u_int16_t pfring_plugin_id) {
690 + if(pfring_unregistration) {
691 + pfring_unregistration(pfring_plugin_id);
697 +/* ************************************************ */
699 +static handle_ring_dna_device ring_dna_device_handler = NULL;
701 +handle_ring_dna_device get_ring_dna_device_handler() { return(ring_dna_device_handler); }
703 +void set_ring_dna_device_handler(handle_ring_dna_device the_dna_device_handler) {
704 + ring_dna_device_handler = the_dna_device_handler;
707 +void do_ring_dna_device_handler(dna_device_operation operation,
708 + unsigned long packet_memory,
709 + u_int packet_memory_num_slots,
710 + u_int packet_memory_slot_len,
711 + u_int packet_memory_tot_len,
712 + void *descr_packet_memory,
713 + u_int descr_packet_memory_num_slots,
714 + u_int descr_packet_memory_slot_len,
715 + u_int descr_packet_memory_tot_len,
717 + void *phys_card_memory,
718 + u_int phys_card_memory_len,
719 + struct net_device *netdev,
720 + dna_device_model device_model,
721 + wait_queue_head_t *packet_waitqueue,
722 + u_int8_t *interrupt_received,
724 + dna_wait_packet wait_packet_function_ptr) {
725 + if(ring_dna_device_handler)
726 + ring_dna_device_handler(operation,
728 + packet_memory_num_slots,
729 + packet_memory_slot_len,
730 + packet_memory_tot_len,
731 + descr_packet_memory,
732 + descr_packet_memory_num_slots,
733 + descr_packet_memory_slot_len,
734 + descr_packet_memory_tot_len, channel_id,
735 + phys_card_memory, phys_card_memory_len,
736 + netdev, device_model, packet_waitqueue,
737 + interrupt_received, adapter_ptr,
738 + wait_packet_function_ptr);
741 +/* ************************************************ */
743 +static read_device_pfring_free_slots pfring_free_device_slots = NULL;
745 +read_device_pfring_free_slots get_read_device_pfring_free_slots() { return(pfring_free_device_slots); }
747 +void set_read_device_pfring_free_slots(read_device_pfring_free_slots the_handler) {
748 + pfring_free_device_slots = the_handler;
751 +int do_read_device_pfring_free_slots(int deviceidx) {
752 + if(pfring_free_device_slots) {
753 + return(pfring_free_device_slots(deviceidx));
758 +/* ************************************************ */
760 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
761 +EXPORT_SYMBOL(get_skb_ring_handler);
762 +EXPORT_SYMBOL(set_skb_ring_handler);
763 +EXPORT_SYMBOL(do_skb_ring_handler);
765 +EXPORT_SYMBOL(get_buffer_ring_handler);
766 +EXPORT_SYMBOL(set_buffer_ring_handler);
767 +EXPORT_SYMBOL(do_buffer_ring_handler);
769 +EXPORT_SYMBOL(get_add_hdr_to_ring);
770 +EXPORT_SYMBOL(set_add_hdr_to_ring);
771 +EXPORT_SYMBOL(do_add_hdr_to_ring);
773 +EXPORT_SYMBOL(get_register_pfring_plugin);
774 +EXPORT_SYMBOL(set_register_pfring_plugin);
775 +EXPORT_SYMBOL(do_register_pfring_plugin);
777 +EXPORT_SYMBOL(get_unregister_pfring_plugin);
778 +EXPORT_SYMBOL(set_unregister_pfring_plugin);
779 +EXPORT_SYMBOL(do_unregister_pfring_plugin);
781 +EXPORT_SYMBOL(get_ring_dna_device_handler);
782 +EXPORT_SYMBOL(set_ring_dna_device_handler);
783 +EXPORT_SYMBOL(do_ring_dna_device_handler);
785 +EXPORT_SYMBOL(get_read_device_pfring_free_slots);
786 +EXPORT_SYMBOL(set_read_device_pfring_free_slots);
787 +EXPORT_SYMBOL(do_read_device_pfring_free_slots);
792 /* Instead of increasing this, you should create a hash table. */
793 #define MAX_GRO_SKBS 8
795 @@ -1839,6 +2029,12 @@
797 spinlock_t *root_lock = qdisc_lock(q);
799 + /* This TX patch applies to all drivers */
800 + #if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
801 + if(ring_handler) ring_handler(skb, 0, 1, -1 /* Unknown channel */);
802 + #endif /* CONFIG_RING */
805 spin_lock(root_lock);
807 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
808 @@ -1936,6 +2132,16 @@
811 /* if netpoll wants it, pretend we never saw it */
812 +/* This RX patch applies only to non-NAPI drivers */
814 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
815 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
816 +if(ring_handler && ring_handler(skb, 1, 1, -1 /* Unknown channel */)) {
817 + /* The packet has been copied into a ring */
818 + return(NET_RX_SUCCESS);
821 +#endif /* CONFIG_RING */
825 @@ -2220,6 +2426,16 @@
826 struct net_device *orig_dev;
827 struct net_device *null_or_orig;
828 int ret = NET_RX_DROP;
830 + This RX patch applies to both non-NAPI drivers (as netif_receive_skb
831 + is eventually called by netif_rx) and NAPI drivers.
833 +#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE)
834 +if(ring_handler && ring_handler(skb, 1, 1, -1 /* Unknown channel */)) {
835 + /* The packet has been copied into a ring */
836 + return(NET_RX_SUCCESS);
838 +#endif /* CONFIG_RING */
841 if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
842 diff --unified --recursive --new-file linux-2.6.30/net/core/dev.c.ORG linux-2.6.30-1-686-smp-PF_RING/net/core/dev.c.ORG
843 --- linux-2.6.30/net/core/dev.c.ORG 1970-01-01 01:00:00.000000000 +0100
844 +++ linux-2.6.30-1-686-smp-PF_RING/net/core/dev.c.ORG 2009-07-21 04:40:31.319103951 +0200
847 + * NET3 Protocol independent device support routines.
849 + * This program is free software; you can redistribute it and/or
850 + * modify it under the terms of the GNU General Public License
851 + * as published by the Free Software Foundation; either version
852 + * 2 of the License, or (at your option) any later version.
854 + * Derived from the non IP parts of dev.c 1.0.19
855 + * Authors: Ross Biro
856 + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
857 + * Mark Evans, <evansmp@uhura.aston.ac.uk>
859 + * Additional Authors:
860 + * Florian la Roche <rzsfl@rz.uni-sb.de>
861 + * Alan Cox <gw4pts@gw4pts.ampr.org>
862 + * David Hinds <dahinds@users.sourceforge.net>
863 + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
864 + * Adam Sulmicki <adam@cfar.umd.edu>
865 + * Pekka Riikonen <priikone@poesidon.pspt.fi>
868 + * D.J. Barrow : Fixed bug where dev->refcnt gets set
869 + * to 2 if register_netdev gets called
870 + * before net_dev_init & also removed a
871 + * few lines of code in the process.
872 + * Alan Cox : device private ioctl copies fields back.
873 + * Alan Cox : Transmit queue code does relevant
874 + * stunts to keep the queue safe.
875 + * Alan Cox : Fixed double lock.
876 + * Alan Cox : Fixed promisc NULL pointer trap
877 + * ???????? : Support the full private ioctl range
878 + * Alan Cox : Moved ioctl permission check into
880 + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
881 + * Alan Cox : 100 backlog just doesn't cut it when
882 + * you start doing multicast video 8)
883 + * Alan Cox : Rewrote net_bh and list manager.
884 + * Alan Cox : Fix ETH_P_ALL echoback lengths.
885 + * Alan Cox : Took out transmit every packet pass
886 + * Saved a few bytes in the ioctl handler
887 + * Alan Cox : Network driver sets packet type before
888 + * calling netif_rx. Saves a function
890 + * Alan Cox : Hashed net_bh()
891 + * Richard Kooijman: Timestamp fixes.
892 + * Alan Cox : Wrong field in SIOCGIFDSTADDR
893 + * Alan Cox : Device lock protection.
894 + * Alan Cox : Fixed nasty side effect of device close
896 + * Rudi Cilibrasi : Pass the right thing to
897 + * set_mac_address()
898 + * Dave Miller : 32bit quantity for the device lock to
899 + * make it work out on a Sparc.
900 + * Bjorn Ekwall : Added KERNELD hack.
901 + * Alan Cox : Cleaned up the backlog initialise.
902 + * Craig Metz : SIOCGIFCONF fix if space for under
904 + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
905 + * is no device open function.
906 + * Andi Kleen : Fix error reporting for SIOCGIFCONF
907 + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
908 + * Cyrus Durgin : Cleaned for KMOD
909 + * Adam Sulmicki : Bug Fix : Network Device Unload
910 + * A network device unload needs to purge
911 + * the backlog queue.
912 + * Paul Rusty Russell : SIOCSIFNAME
913 + * Pekka Riikonen : Netdev boot-time settings code
914 + * Andrew Morton : Make unregister_netdevice wait
915 + * indefinitely on dev->refcnt
916 + * J Hadi Salim : - Backlog queue sampling
917 + * - netif_rx() feedback
920 +#include <asm/uaccess.h>
921 +#include <asm/system.h>
922 +#include <linux/bitops.h>
923 +#include <linux/capability.h>
924 +#include <linux/cpu.h>
925 +#include <linux/types.h>
926 +#include <linux/kernel.h>
927 +#include <linux/sched.h>
928 +#include <linux/mutex.h>
929 +#include <linux/string.h>
930 +#include <linux/mm.h>
931 +#include <linux/socket.h>
932 +#include <linux/sockios.h>
933 +#include <linux/errno.h>
934 +#include <linux/interrupt.h>
935 +#include <linux/if_ether.h>
936 +#include <linux/netdevice.h>
937 +#include <linux/etherdevice.h>
938 +#include <linux/ethtool.h>
939 +#include <linux/notifier.h>
940 +#include <linux/skbuff.h>
941 +#include <net/net_namespace.h>
942 +#include <net/sock.h>
943 +#include <linux/rtnetlink.h>
944 +#include <linux/proc_fs.h>
945 +#include <linux/seq_file.h>
946 +#include <linux/stat.h>
947 +#include <linux/if_bridge.h>
948 +#include <linux/if_macvlan.h>
949 +#include <net/dst.h>
950 +#include <net/pkt_sched.h>
951 +#include <net/checksum.h>
952 +#include <linux/highmem.h>
953 +#include <linux/init.h>
954 +#include <linux/kmod.h>
955 +#include <linux/module.h>
956 +#include <linux/netpoll.h>
957 +#include <linux/rcupdate.h>
958 +#include <linux/delay.h>
959 +#include <net/wext.h>
960 +#include <net/iw_handler.h>
961 +#include <asm/current.h>
962 +#include <linux/audit.h>
963 +#include <linux/dmaengine.h>
964 +#include <linux/err.h>
965 +#include <linux/ctype.h>
966 +#include <linux/if_arp.h>
967 +#include <linux/if_vlan.h>
968 +#include <linux/ip.h>
970 +#include <linux/ipv6.h>
971 +#include <linux/in.h>
972 +#include <linux/jhash.h>
973 +#include <linux/random.h>
975 +#include "net-sysfs.h"
977 +/* Instead of increasing this, you should create a hash table. */
978 +#define MAX_GRO_SKBS 8
980 +/* This should be increased if a protocol with a bigger head is added. */
981 +#define GRO_MAX_HEAD (MAX_HEADER + 128)
984 + * The list of packet types we will receive (as opposed to discard)
985 + * and the routines to invoke.
987 + * Why 16. Because with 16 the only overlap we get on a hash of the
988 + * low nibble of the protocol value is RARP/SNAP/X.25.
990 + * NOTE: That is no longer true with the addition of VLAN tags. Not
991 + * sure which should go first, but I bet it won't make much
992 + * difference if we are running VLANs. The good news is that
993 + * this protocol won't be in the list unless compiled in, so
994 + * the average user (w/out VLANs) will not be adversely affected.
1011 +#define PTYPE_HASH_SIZE (16)
1012 +#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
1014 +static DEFINE_SPINLOCK(ptype_lock);
1015 +static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
1016 +static struct list_head ptype_all __read_mostly; /* Taps */
1019 + * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1022 + * Pure readers hold dev_base_lock for reading.
1024 + * Writers must hold the rtnl semaphore while they loop through the
1025 + * dev_base_head list, and hold dev_base_lock for writing when they do the
1026 + * actual updates. This allows pure readers to access the list even
1027 + * while a writer is preparing to update it.
1029 + * To put it another way, dev_base_lock is held for writing only to
1030 + * protect against pure readers; the rtnl semaphore provides the
1031 + * protection against other writers.
1033 + * See, for example usages, register_netdevice() and
1034 + * unregister_netdevice(), which must be called with the rtnl
1037 +DEFINE_RWLOCK(dev_base_lock);
1039 +EXPORT_SYMBOL(dev_base_lock);
1041 +#define NETDEV_HASHBITS 8
1042 +#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
1044 +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1046 + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
1047 + return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
1050 +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1052 + return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
1055 +/* Device list insertion */
1056 +static int list_netdevice(struct net_device *dev)
1058 + struct net *net = dev_net(dev);
1062 + write_lock_bh(&dev_base_lock);
1063 + list_add_tail(&dev->dev_list, &net->dev_base_head);
1064 + hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
1065 + hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
1066 + write_unlock_bh(&dev_base_lock);
1070 +/* Device list removal */
1071 +static void unlist_netdevice(struct net_device *dev)
1075 + /* Unlink dev from the device chain */
1076 + write_lock_bh(&dev_base_lock);
1077 + list_del(&dev->dev_list);
1078 + hlist_del(&dev->name_hlist);
1079 + hlist_del(&dev->index_hlist);
1080 + write_unlock_bh(&dev_base_lock);
1084 + * Our notifier list
1087 +static RAW_NOTIFIER_HEAD(netdev_chain);
1090 + * Device drivers call our routines to queue packets here. We empty the
1091 + * queue in the local softnet handler.
1094 +DEFINE_PER_CPU(struct softnet_data, softnet_data);
1096 +#ifdef CONFIG_LOCKDEP
1098 + * register_netdevice() inits txq->_xmit_lock and sets lockdep class
1099 + * according to dev->type
1101 +static const unsigned short netdev_lock_type[] =
1102 + {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
1103 + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
1104 + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
1105 + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
1106 + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
1107 + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
1108 + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
1109 + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
1110 + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
1111 + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
1112 + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
1113 + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
1114 + ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
1115 + ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
1116 + ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
1118 +static const char *netdev_lock_name[] =
1119 + {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
1120 + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
1121 + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
1122 + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
1123 + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
1124 + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
1125 + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
1126 + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
1127 + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
1128 + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
1129 + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
1130 + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
1131 + "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
1132 + "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
1133 + "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
1135 +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
1136 +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
1138 +static inline unsigned short netdev_lock_pos(unsigned short dev_type)
1142 + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
1143 + if (netdev_lock_type[i] == dev_type)
1145 + /* the last key is used by default */
1146 + return ARRAY_SIZE(netdev_lock_type) - 1;
1149 +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
1150 + unsigned short dev_type)
1154 + i = netdev_lock_pos(dev_type);
1155 + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
1156 + netdev_lock_name[i]);
1159 +static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
1163 + i = netdev_lock_pos(dev->type);
1164 + lockdep_set_class_and_name(&dev->addr_list_lock,
1165 + &netdev_addr_lock_key[i],
1166 + netdev_lock_name[i]);
1169 +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
1170 + unsigned short dev_type)
1173 +static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
1178 +/*******************************************************************************
1180 + Protocol management and registration routines
1182 +*******************************************************************************/
1185 + * Add a protocol ID to the list. Now that the input handler is
1186 + * smarter we can dispense with all the messy stuff that used to be
1189 + * BEWARE!!! Protocol handlers, mangling input packets,
1190 + * MUST BE last in hash buckets and checking protocol handlers
1191 + * MUST start from promiscuous ptype_all chain in net_bh.
1192 + * It is true now, do not change it.
1193 + * Explanation follows: if protocol handler, mangling packet, will
1194 + * be the first on list, it is not able to sense, that packet
1195 + * is cloned and should be copied-on-write, so that it will
1196 + * change it and subsequent readers will get broken packet.
1201 + * dev_add_pack - add packet handler
1202 + * @pt: packet type declaration
1204 + * Add a protocol handler to the networking stack. The passed &packet_type
1205 + * is linked into kernel lists and may not be freed until it has been
1206 + * removed from the kernel lists.
1208 + * This call does not sleep therefore it can not
1209 + * guarantee all CPU's that are in middle of receiving packets
1210 + * will see the new packet type (until the next received packet).
1213 +void dev_add_pack(struct packet_type *pt)
1217 + spin_lock_bh(&ptype_lock);
1218 + if (pt->type == htons(ETH_P_ALL))
1219 + list_add_rcu(&pt->list, &ptype_all);
1221 + hash = ntohs(pt->type) & PTYPE_HASH_MASK;
1222 + list_add_rcu(&pt->list, &ptype_base[hash]);
1224 + spin_unlock_bh(&ptype_lock);
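For reference, a typical caller (not part of this patch) registers a tap for every Ethernet frame like so:

static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
                   struct packet_type *pt, struct net_device *orig_dev)
{
  printk(KERN_DEBUG "tap: %u bytes on %s\n", skb->len, dev->name);
  kfree_skb(skb);
  return NET_RX_SUCCESS;
}

static struct packet_type tap_pt = {
  .type = __constant_htons(ETH_P_ALL),   /* goes on the ptype_all chain */
  .func = tap_rcv,
};

/* module init: dev_add_pack(&tap_pt);  module exit: dev_remove_pack(&tap_pt); */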
1228 + * __dev_remove_pack - remove packet handler
1229 + * @pt: packet type declaration
1231 + * Remove a protocol handler that was previously added to the kernel
1232 + * protocol handlers by dev_add_pack(). The passed &packet_type is removed
1233 + * from the kernel lists and can be freed or reused once this function
1236 + * The packet type might still be in use by receivers
1237 + * and must not be freed until after all the CPU's have gone
1238 + * through a quiescent state.
1240 +void __dev_remove_pack(struct packet_type *pt)
1242 + struct list_head *head;
1243 + struct packet_type *pt1;
1245 + spin_lock_bh(&ptype_lock);
1247 + if (pt->type == htons(ETH_P_ALL))
1248 + head = &ptype_all;
1250 + head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
1252 + list_for_each_entry(pt1, head, list) {
1254 + list_del_rcu(&pt->list);
1259 + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
1261 + spin_unlock_bh(&ptype_lock);
1264 + * dev_remove_pack - remove packet handler
1265 + * @pt: packet type declaration
1267 + * Remove a protocol handler that was previously added to the kernel
1268 + * protocol handlers by dev_add_pack(). The passed &packet_type is removed
1269 + * from the kernel lists and can be freed or reused once this function
1272 + * This call sleeps to guarantee that no CPU is looking at the packet
1273 + * type after return.
1275 +void dev_remove_pack(struct packet_type *pt)
1277 + __dev_remove_pack(pt);
1279 + synchronize_net();
1282 +/******************************************************************************
1284 + Device Boot-time Settings Routines
1286 +*******************************************************************************/
1288 +/* Boot time configuration table */
1289 +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
1292 + * netdev_boot_setup_add - add new setup entry
1293 + * @name: name of the device
1294 + * @map: configured settings for the device
1296 + * Adds new setup entry to the dev_boot_setup list. The function
1297 + * returns 0 on error and 1 on success. This is a generic routine to
1300 +static int netdev_boot_setup_add(char *name, struct ifmap *map)
1302 + struct netdev_boot_setup *s;
1305 + s = dev_boot_setup;
1306 + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
1307 + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
1308 + memset(s[i].name, 0, sizeof(s[i].name));
1309 + strlcpy(s[i].name, name, IFNAMSIZ);
1310 + memcpy(&s[i].map, map, sizeof(s[i].map));
1315 + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
1319 + * netdev_boot_setup_check - check boot time settings
1320 + * @dev: the netdevice
1322 + * Check boot time settings for the device.
1323 + * The found settings are set for the device to be used
1324 + * later in the device probing.
1325 + * Returns 0 if no settings found, 1 if they are.
1327 +int netdev_boot_setup_check(struct net_device *dev)
1329 + struct netdev_boot_setup *s = dev_boot_setup;
1332 + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
1333 + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
1334 + !strcmp(dev->name, s[i].name)) {
1335 + dev->irq = s[i].map.irq;
1336 + dev->base_addr = s[i].map.base_addr;
1337 + dev->mem_start = s[i].map.mem_start;
1338 + dev->mem_end = s[i].map.mem_end;
1347 + * netdev_boot_base - get address from boot time settings
1348 + * @prefix: prefix for network device
1349 + * @unit: id for network device
1351 + * Check boot time settings for the base address of device.
1352 + * The found settings are set for the device to be used
1353 + * later in the device probing.
1354 + * Returns 0 if no settings found.
1356 +unsigned long netdev_boot_base(const char *prefix, int unit)
1358 + const struct netdev_boot_setup *s = dev_boot_setup;
1359 + char name[IFNAMSIZ];
1362 + sprintf(name, "%s%d", prefix, unit);
1365 + * If device already registered then return base of 1
1366 + * to indicate not to probe for this interface
1368 + if (__dev_get_by_name(&init_net, name))
1371 + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
1372 + if (!strcmp(name, s[i].name))
1373 + return s[i].map.base_addr;
1378 + * Saves at boot time configured settings for any netdevice.
1380 +int __init netdev_boot_setup(char *str)
1385 + str = get_options(str, ARRAY_SIZE(ints), ints);
1386 + if (!str || !*str)
1389 + /* Save settings */
1390 + memset(&map, 0, sizeof(map));
1392 + map.irq = ints[1];
1394 + map.base_addr = ints[2];
1396 + map.mem_start = ints[3];
1398 + map.mem_end = ints[4];
1400 + /* Add new entry to the list */
1401 + return netdev_boot_setup_add(str, &map);
1404 +__setup("netdev=", netdev_boot_setup);
1406 +/*******************************************************************************
1408 + Device Interface Subroutines
1410 +*******************************************************************************/
1413 + * __dev_get_by_name - find a device by its name
1414 + * @net: the applicable net namespace
1415 + * @name: name to find
1417 + * Find an interface by name. Must be called under RTNL semaphore
1418 + * or @dev_base_lock. If the name is found a pointer to the device
1419 + * is returned. If the name is not found then %NULL is returned. The
1420 + * reference counters are not incremented so the caller must be
1421 + * careful with locks.
1424 +struct net_device *__dev_get_by_name(struct net *net, const char *name)
1426 + struct hlist_node *p;
1428 + hlist_for_each(p, dev_name_hash(net, name)) {
1429 + struct net_device *dev
1430 + = hlist_entry(p, struct net_device, name_hlist);
1431 + if (!strncmp(dev->name, name, IFNAMSIZ))
1438 + * dev_get_by_name - find a device by its name
1439 + * @net: the applicable net namespace
1440 + * @name: name to find
1442 + * Find an interface by name. This can be called from any
1443 + * context and does its own locking. The returned handle has
1444 + * the usage count incremented and the caller must use dev_put() to
1445 + * release it when it is no longer needed. %NULL is returned if no
1446 + * matching device is found.
1449 +struct net_device *dev_get_by_name(struct net *net, const char *name)
1451 + struct net_device *dev;
1453 + read_lock(&dev_base_lock);
1454 + dev = __dev_get_by_name(net, name);
1457 + read_unlock(&dev_base_lock);
1462 + * __dev_get_by_index - find a device by its ifindex
1463 + * @net: the applicable net namespace
1464 + * @ifindex: index of device
1466 + * Search for an interface by index. Returns %NULL if the device
1467 + * is not found or a pointer to the device. The device has not
1468 + * had its reference counter increased so the caller must be careful
1469 + * about locking. The caller must hold either the RTNL semaphore
1470 + * or @dev_base_lock.
1473 +struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1475 + struct hlist_node *p;
1477 + hlist_for_each(p, dev_index_hash(net, ifindex)) {
1478 + struct net_device *dev
1479 + = hlist_entry(p, struct net_device, index_hlist);
1480 + if (dev->ifindex == ifindex)
1488 + * dev_get_by_index - find a device by its ifindex
1489 + * @net: the applicable net namespace
1490 + * @ifindex: index of device
1492 + * Search for an interface by index. Returns NULL if the device
1493 + * is not found or a pointer to the device. The device returned has
1494 + * had a reference added and the pointer is safe until the user calls
1495 + * dev_put to indicate they have finished with it.
1498 +struct net_device *dev_get_by_index(struct net *net, int ifindex)
1500 + struct net_device *dev;
1502 + read_lock(&dev_base_lock);
1503 + dev = __dev_get_by_index(net, ifindex);
1506 + read_unlock(&dev_base_lock);
1511 + * dev_getbyhwaddr - find a device by its hardware address
1512 + * @net: the applicable net namespace
1513 + * @type: media type of device
1514 + * @ha: hardware address
1516 + * Search for an interface by MAC address. Returns NULL if the device
1517 + * is not found or a pointer to the device. The caller must hold the
1518 + * rtnl semaphore. The returned device has not had its ref count increased
1519 + * and the caller must therefore be careful about locking
1522 + * If the API was consistent this would be __dev_get_by_hwaddr
1525 +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
1527 + struct net_device *dev;
1531 + for_each_netdev(net, dev)
1532 + if (dev->type == type &&
1533 + !memcmp(dev->dev_addr, ha, dev->addr_len))
1539 +EXPORT_SYMBOL(dev_getbyhwaddr);
1541 +struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1543 + struct net_device *dev;
1546 + for_each_netdev(net, dev)
1547 + if (dev->type == type)
1553 +EXPORT_SYMBOL(__dev_getfirstbyhwtype);
1555 +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1557 + struct net_device *dev;
1560 + dev = __dev_getfirstbyhwtype(net, type);
1567 +EXPORT_SYMBOL(dev_getfirstbyhwtype);
1570 + * dev_get_by_flags - find any device with given flags
1571 + * @net: the applicable net namespace
1572 + * @if_flags: IFF_* values
1573 + * @mask: bitmask of bits in if_flags to check
1575 + * Search for any interface with the given flags. Returns NULL if a device
1576 + * is not found or a pointer to the device. The device returned has
1577 + * had a reference added and the pointer is safe until the user calls
1578 + * dev_put to indicate they have finished with it.
1581 +struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
1583 + struct net_device *dev, *ret;
1586 + read_lock(&dev_base_lock);
1587 + for_each_netdev(net, dev) {
1588 + if (((dev->flags ^ if_flags) & mask) == 0) {
1594 + read_unlock(&dev_base_lock);
1599 + * dev_valid_name - check if name is okay for network device
1600 + * @name: name string
1602 + * Network device names need to be valid file names
1603 + * to allow sysfs to work. We also disallow any kind of
1606 +int dev_valid_name(const char *name)
1608 + if (*name == '\0')
1610 + if (strlen(name) >= IFNAMSIZ)
1612 + if (!strcmp(name, ".") || !strcmp(name, ".."))
1616 + if (*name == '/' || isspace(*name))
1624 + * __dev_alloc_name - allocate a name for a device
1625 + * @net: network namespace to allocate the device name in
1626 + * @name: name format string
1627 + * @buf: scratch buffer and result name string
1629 + * Passed a format string - eg "lt%d" it will try and find a suitable
1630 + * id. It scans list of devices to build up a free map, then chooses
1631 + * the first empty slot. The caller must hold the dev_base or rtnl lock
1632 + * while allocating the name and adding the device in order to avoid
1634 + * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1635 + * Returns the number of the unit assigned or a negative errno code.
1638 +static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1642 + const int max_netdevices = 8*PAGE_SIZE;
1643 + unsigned long *inuse;
1644 + struct net_device *d;
1646 + p = strnchr(name, IFNAMSIZ-1, '%');
1649 + * Verify the string as this thing may have come from
1650 + * the user. There must be either one "%d" and no other "%"
1653 + if (p[1] != 'd' || strchr(p + 2, '%'))
1656 + /* Use one page as a bit array of possible slots */
1657 + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1661 + for_each_netdev(net, d) {
1662 + if (!sscanf(d->name, name, &i))
1664 + if (i < 0 || i >= max_netdevices)
1667 + /* avoid cases where sscanf is not exact inverse of printf */
1668 + snprintf(buf, IFNAMSIZ, name, i);
1669 + if (!strncmp(buf, d->name, IFNAMSIZ))
1670 + set_bit(i, inuse);
1673 + i = find_first_zero_bit(inuse, max_netdevices);
1674 + free_page((unsigned long) inuse);
1677 + snprintf(buf, IFNAMSIZ, name, i);
1678 + if (!__dev_get_by_name(net, buf))
1681 + /* It is possible to run out of possible slots
1682 + * when the name is long and there isn't enough space left
1683 + * for the digits, or if all bits are used.
1689 + * dev_alloc_name - allocate a name for a device
1691 + * @name: name format string
1693 + * Passed a format string - eg "lt%d" it will try and find a suitable
1694 + * id. It scans list of devices to build up a free map, then chooses
1695 + * the first empty slot. The caller must hold the dev_base or rtnl lock
1696 + * while allocating the name and adding the device in order to avoid
1698 + * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1699 + * Returns the number of the unit assigned or a negative errno code.
1702 +int dev_alloc_name(struct net_device *dev, const char *name)
1704 + char buf[IFNAMSIZ];
1708 + BUG_ON(!dev_net(dev));
1709 + net = dev_net(dev);
1710 + ret = __dev_alloc_name(net, name, buf);
1712 + strlcpy(dev->name, buf, IFNAMSIZ);
1718 + * dev_change_name - change name of a device
1720 + * @newname: name (or format string) must be at least IFNAMSIZ
1722 + * Change name of a device, can pass format strings "eth%d".
1723 + * for wildcarding.
1725 +int dev_change_name(struct net_device *dev, const char *newname)
1727 + char oldname[IFNAMSIZ];
1733 + BUG_ON(!dev_net(dev));
1735 + net = dev_net(dev);
1736 + if (dev->flags & IFF_UP)
1739 + if (!dev_valid_name(newname))
1742 + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1745 + memcpy(oldname, dev->name, IFNAMSIZ);
1747 + if (strchr(newname, '%')) {
1748 + err = dev_alloc_name(dev, newname);
1752 + else if (__dev_get_by_name(net, newname))
1755 + strlcpy(dev->name, newname, IFNAMSIZ);
1758 + /* For now only devices in the initial network namespace
1761 + if (net == &init_net) {
1762 + ret = device_rename(&dev->dev, dev->name);
1764 + memcpy(dev->name, oldname, IFNAMSIZ);
1769 + write_lock_bh(&dev_base_lock);
1770 + hlist_del(&dev->name_hlist);
1771 + hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
1772 + write_unlock_bh(&dev_base_lock);
1774 + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1775 + ret = notifier_to_errno(ret);
1780 + "%s: name change rollback failed: %d.\n",
1784 + memcpy(dev->name, oldname, IFNAMSIZ);
1793 + * dev_set_alias - change ifalias of a device
1795 + * @alias: name up to IFALIASZ
1796 + * @len: limit of bytes to copy from info
1798 + * Set ifalias for a device,
1800 +int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1804 + if (len >= IFALIASZ)
1808 + if (dev->ifalias) {
1809 + kfree(dev->ifalias);
1810 + dev->ifalias = NULL;
1815 + dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
1816 + if (!dev->ifalias)
1819 + strlcpy(dev->ifalias, alias, len+1);
1825 + * netdev_features_change - device changes features
1826 + * @dev: device to cause notification
1828 + * Called to indicate a device has changed features.
1830 +void netdev_features_change(struct net_device *dev)
1832 + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1834 +EXPORT_SYMBOL(netdev_features_change);
1837 + * netdev_state_change - device changes state
1838 + * @dev: device to cause notification
1840 + * Called to indicate a device has changed state. This function calls
1841 + * the notifier chains for netdev_chain and sends a NEWLINK message
1842 + * to the routing socket.
1844 +void netdev_state_change(struct net_device *dev)
1846 + if (dev->flags & IFF_UP) {
1847 + call_netdevice_notifiers(NETDEV_CHANGE, dev);
1848 + rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1852 +void netdev_bonding_change(struct net_device *dev)
1854 + call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1856 +EXPORT_SYMBOL(netdev_bonding_change);
1859 + * dev_load - load a network module
1860 + * @net: the applicable net namespace
1861 + * @name: name of interface
1863 + * If a network interface is not present and the process has suitable
1864 + * privileges this function loads the module. If module loading is not
1865 + * available in this kernel then it becomes a nop.
1868 +void dev_load(struct net *net, const char *name)
1870 + struct net_device *dev;
1872 + read_lock(&dev_base_lock);
1873 + dev = __dev_get_by_name(net, name);
1874 + read_unlock(&dev_base_lock);
1876 + if (!dev && capable(CAP_SYS_MODULE))
1877 + request_module("%s", name);
1881 + * dev_open - prepare an interface for use.
1882 + * @dev: device to open
1884 + * Takes a device from down to up state. The device's private open
1885 + * function is invoked and then the multicast lists are loaded. Finally
1886 + * the device is moved into the up state and a %NETDEV_UP message is
1887 + * sent to the netdev notifier chain.
1889 + * Calling this function on an active interface is a nop. On a failure
1890 + * a negative errno code is returned.
1892 +int dev_open(struct net_device *dev)
1894 + const struct net_device_ops *ops = dev->netdev_ops;
1900 + * Is it already up?
1903 + if (dev->flags & IFF_UP)
1907 + * Is it even present?
1909 + if (!netif_device_present(dev))
1913 + * Call device private open method
1915 + set_bit(__LINK_STATE_START, &dev->state);
1917 + if (ops->ndo_validate_addr)
1918 + ret = ops->ndo_validate_addr(dev);
1920 + if (!ret && ops->ndo_open)
1921 + ret = ops->ndo_open(dev);
1924 + * If it went open OK then:
1928 + clear_bit(__LINK_STATE_START, &dev->state);
1933 + dev->flags |= IFF_UP;
1938 + net_dmaengine_get();
1941 + * Initialize multicasting status
1943 + dev_set_rx_mode(dev);
1946 + * Wakeup transmit queue engine
1948 + dev_activate(dev);
1951 + * ... and announce new interface.
1953 + call_netdevice_notifiers(NETDEV_UP, dev);
1960 + * dev_close - shutdown an interface.
1961 + * @dev: device to shutdown
1963 + * This function moves an active device into down state. A
1964 + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1965 + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1968 +int dev_close(struct net_device *dev)
1970 + const struct net_device_ops *ops = dev->netdev_ops;
1975 + if (!(dev->flags & IFF_UP))
1979 + * Tell people we are going down, so that they can
1980 + * prepare to death, when device is still operating.
1982 + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1984 + clear_bit(__LINK_STATE_START, &dev->state);
1986 + /* Synchronize to scheduled poll. We cannot touch poll list,
1987 + * it can be even on different cpu. So just clear netif_running().
1989 + * dev->stop() will invoke napi_disable() on all of its
1990 + * napi_struct instances on this device.
1992 + smp_mb__after_clear_bit(); /* Commit netif_running(). */
1994 + dev_deactivate(dev);
1997 + * Call the device specific close. This cannot fail.
1998 + * Only if device is UP
2000 + * We allow it to be called even after a DETACH hot-plug
2003 + if (ops->ndo_stop)
2004 + ops->ndo_stop(dev);
2007 + * Device is now down.
2010 + dev->flags &= ~IFF_UP;
2013 + * Tell people we are down
2015 + call_netdevice_notifiers(NETDEV_DOWN, dev);
2018 + * Shutdown NET_DMA
2020 + net_dmaengine_put();
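As a usage illustration (editorial, not part of the patch), a caller holding the RTNL lock could cycle an interface through these two helpers; example_cycle_device and its error handling are invented for the sketch:

static int example_cycle_device(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* nop if the device is already IFF_UP */
	if (!err)
		err = dev_close(dev);	/* emits NETDEV_GOING_DOWN, then NETDEV_DOWN */
	rtnl_unlock();
	return err;
}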
2027 + * dev_disable_lro - disable Large Receive Offload on a device
2030 + * Disable Large Receive Offload (LRO) on a net device. Must be
2031 + * called under RTNL. This is needed if received packets may be
2032 + * forwarded to another interface.
2034 +void dev_disable_lro(struct net_device *dev)
2036 + if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
2037 + dev->ethtool_ops->set_flags) {
2038 + u32 flags = dev->ethtool_ops->get_flags(dev);
2039 + if (flags & ETH_FLAG_LRO) {
2040 + flags &= ~ETH_FLAG_LRO;
2041 + dev->ethtool_ops->set_flags(dev, flags);
2044 + WARN_ON(dev->features & NETIF_F_LRO);
2046 +EXPORT_SYMBOL(dev_disable_lro);
2049 +static int dev_boot_phase = 1;
2052 + * Device change register/unregister. These are not inline or static
2053 + * as we export them to the world.
2057 + * register_netdevice_notifier - register a network notifier block
2060 + * Register a notifier to be called when network device events occur.
2061 + * The notifier passed is linked into the kernel structures and must
2062 + * not be reused until it has been unregistered. A negative errno code
2063 + * is returned on a failure.
2065 + * When registered, all registration and up events are replayed
2066 + * to the new notifier to give it a race-free
2067 + * view of the network device list.
2070 +int register_netdevice_notifier(struct notifier_block *nb)
2072 + struct net_device *dev;
2073 + struct net_device *last;
2078 + err = raw_notifier_chain_register(&netdev_chain, nb);
2081 + if (dev_boot_phase)
2083 + for_each_net(net) {
2084 + for_each_netdev(net, dev) {
2085 + err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
2086 + err = notifier_to_errno(err);
2090 + if (!(dev->flags & IFF_UP))
2093 + nb->notifier_call(nb, NETDEV_UP, dev);
2103 + for_each_net(net) {
2104 + for_each_netdev(net, dev) {
2108 + if (dev->flags & IFF_UP) {
2109 + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
2110 + nb->notifier_call(nb, NETDEV_DOWN, dev);
2112 + nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
2116 + raw_notifier_chain_unregister(&netdev_chain, nb);
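For illustration, a minimal notifier client might look like the sketch below; the example_* names are hypothetical, and note that in this kernel the notifier's ptr argument is the struct net_device itself:

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP || event == NETDEV_DOWN)
		printk(KERN_INFO "example: %s went %s\n", dev->name,
		       event == NETDEV_UP ? "up" : "down");
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

/* typically from module init: register_netdevice_notifier(&example_nb); */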
2121 + * unregister_netdevice_notifier - unregister a network notifier block
2124 + * Unregister a notifier previously registered by
2125 + * register_netdevice_notifier(). The notifier is unlinked from the
2126 + * kernel structures and may then be reused. A negative errno code
2127 + * is returned on a failure.
2130 +int unregister_netdevice_notifier(struct notifier_block *nb)
2135 + err = raw_notifier_chain_unregister(&netdev_chain, nb);
2141 + * call_netdevice_notifiers - call all network notifier blocks
2142 + * @val: value passed unmodified to notifier function
2143 + * @dev: net_device pointer passed unmodified to notifier function
2145 + * Call all network notifier blocks. Parameters and return value
2146 + * are as for raw_notifier_call_chain().
2149 +int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2151 + return raw_notifier_call_chain(&netdev_chain, val, dev);
2154 +/* When > 0 there are consumers of rx skb time stamps */
2155 +static atomic_t netstamp_needed = ATOMIC_INIT(0);
2157 +void net_enable_timestamp(void)
2159 + atomic_inc(&netstamp_needed);
2162 +void net_disable_timestamp(void)
2164 + atomic_dec(&netstamp_needed);
2167 +static inline void net_timestamp(struct sk_buff *skb)
2169 + if (atomic_read(&netstamp_needed))
2170 + __net_timestamp(skb);
2172 + skb->tstamp.tv64 = 0;
2176 + * Support routine. Sends outgoing frames to any network
2177 + * taps currently in use.
2180 +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2182 + struct packet_type *ptype;
2184 +#ifdef CONFIG_NET_CLS_ACT
2185 + if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
2186 + net_timestamp(skb);
2188 + net_timestamp(skb);
2192 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
2193 + /* Never send packets back to the socket
2194 + * they originated from - MvS (miquels@drinkel.ow.org)
2196 + if ((ptype->dev == dev || !ptype->dev) &&
2197 + (ptype->af_packet_priv == NULL ||
2198 + (struct sock *)ptype->af_packet_priv != skb->sk)) {
2199 + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
2203 + /* skb->nh should be correctly
2204 + set by sender, so that the second statement is
2205 + just protection against buggy protocols.
2207 + skb_reset_mac_header(skb2);
2209 + if (skb_network_header(skb2) < skb2->data ||
2210 + skb2->network_header > skb2->tail) {
2211 + if (net_ratelimit())
2212 + printk(KERN_CRIT "protocol %04x is "
2213 + "buggy, dev %s\n",
2214 + skb2->protocol, dev->name);
2215 + skb_reset_network_header(skb2);
2218 + skb2->transport_header = skb2->network_header;
2219 + skb2->pkt_type = PACKET_OUTGOING;
2220 + ptype->func(skb2, skb->dev, ptype, skb->dev);
2223 + rcu_read_unlock();
2227 +static inline void __netif_reschedule(struct Qdisc *q)
2229 + struct softnet_data *sd;
2230 + unsigned long flags;
2232 + local_irq_save(flags);
2233 + sd = &__get_cpu_var(softnet_data);
2234 + q->next_sched = sd->output_queue;
2235 + sd->output_queue = q;
2236 + raise_softirq_irqoff(NET_TX_SOFTIRQ);
2237 + local_irq_restore(flags);
2240 +void __netif_schedule(struct Qdisc *q)
2242 + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2243 + __netif_reschedule(q);
2245 +EXPORT_SYMBOL(__netif_schedule);
2247 +void dev_kfree_skb_irq(struct sk_buff *skb)
2249 + if (atomic_dec_and_test(&skb->users)) {
2250 + struct softnet_data *sd;
2251 + unsigned long flags;
2253 + local_irq_save(flags);
2254 + sd = &__get_cpu_var(softnet_data);
2255 + skb->next = sd->completion_queue;
2256 + sd->completion_queue = skb;
2257 + raise_softirq_irqoff(NET_TX_SOFTIRQ);
2258 + local_irq_restore(flags);
2261 +EXPORT_SYMBOL(dev_kfree_skb_irq);
2263 +void dev_kfree_skb_any(struct sk_buff *skb)
2265 + if (in_irq() || irqs_disabled())
2266 + dev_kfree_skb_irq(skb);
2268 + dev_kfree_skb(skb);
2270 +EXPORT_SYMBOL(dev_kfree_skb_any);
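A sketch of the intended caller (editorial; the ISR and the skb-as-cookie convention are invented for illustration): a driver's TX-done interrupt frees a transmitted skb with the _irq variant, deferring the actual free to net_tx_action() via the completion queue:

static irqreturn_t example_tx_done_isr(int irq, void *dev_id)
{
	struct sk_buff *skb = dev_id;	/* hypothetical: skb stashed as the IRQ cookie */

	dev_kfree_skb_irq(skb);		/* safe in hard-IRQ context; freed in NET_TX softirq */
	return IRQ_HANDLED;
}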
2274 + * netif_device_detach - mark device as removed
2275 + * @dev: network device
2277 + * Mark device as removed from the system and therefore no longer available.
2279 +void netif_device_detach(struct net_device *dev)
2281 + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2282 + netif_running(dev)) {
2283 + netif_tx_stop_all_queues(dev);
2286 +EXPORT_SYMBOL(netif_device_detach);
2289 + * netif_device_attach - mark device as attached
2290 + * @dev: network device
2292 + * Mark device as attached to the system and restart it if needed.
2294 +void netif_device_attach(struct net_device *dev)
2296 + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2297 + netif_running(dev)) {
2298 + netif_tx_wake_all_queues(dev);
2299 + __netdev_watchdog_up(dev);
2302 +EXPORT_SYMBOL(netif_device_attach);
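These two form the usual suspend/resume pair; a hedged sketch with hypothetical example_* driver callbacks (hardware handling reduced to comments):

static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if the device was running */
	/* power the hardware down here */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* restore hardware state here */
	netif_device_attach(dev);	/* wakes the queues and restarts the watchdog */
	return 0;
}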
2304 +static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2306 + return ((features & NETIF_F_GEN_CSUM) ||
2307 + ((features & NETIF_F_IP_CSUM) &&
2308 + protocol == htons(ETH_P_IP)) ||
2309 + ((features & NETIF_F_IPV6_CSUM) &&
2310 + protocol == htons(ETH_P_IPV6)) ||
2311 + ((features & NETIF_F_FCOE_CRC) &&
2312 + protocol == htons(ETH_P_FCOE)));
2315 +static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
2317 + if (can_checksum_protocol(dev->features, skb->protocol))
2320 + if (skb->protocol == htons(ETH_P_8021Q)) {
2321 + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2322 + if (can_checksum_protocol(dev->features & dev->vlan_features,
2323 + veh->h_vlan_encapsulated_proto))
2331 + * Invalidate hardware checksum when packet is to be mangled, and
2332 + * complete checksum manually on outgoing path.
2334 +int skb_checksum_help(struct sk_buff *skb)
2337 + int ret = 0, offset;
2339 + if (skb->ip_summed == CHECKSUM_COMPLETE)
2340 + goto out_set_summed;
2342 + if (unlikely(skb_shinfo(skb)->gso_size)) {
2343 + /* Let GSO fix up the checksum. */
2344 + goto out_set_summed;
2347 + offset = skb->csum_start - skb_headroom(skb);
2348 + BUG_ON(offset >= skb_headlen(skb));
2349 + csum = skb_checksum(skb, offset, skb->len - offset, 0);
2351 + offset += skb->csum_offset;
2352 + BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2354 + if (skb_cloned(skb) &&
2355 + !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2356 + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2361 + *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2363 + skb->ip_summed = CHECKSUM_NONE;
2369 + * skb_gso_segment - Perform segmentation on skb.
2370 + * @skb: buffer to segment
2371 + * @features: features for the output path (see dev->features)
2373 + * This function segments the given skb and returns a list of segments.
2375 + * It may return NULL if the skb requires no segmentation. This is
2376 + * only possible when GSO is used for verifying header integrity.
2378 +struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
2380 + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2381 + struct packet_type *ptype;
2382 + __be16 type = skb->protocol;
2385 + skb_reset_mac_header(skb);
2386 + skb->mac_len = skb->network_header - skb->mac_header;
2387 + __skb_pull(skb, skb->mac_len);
2389 + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2390 + struct net_device *dev = skb->dev;
2391 + struct ethtool_drvinfo info = {};
2393 + if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
2394 + dev->ethtool_ops->get_drvinfo(dev, &info);
2396 + WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
2398 + info.driver, dev ? dev->features : 0L,
2399 + skb->sk ? skb->sk->sk_route_caps : 0L,
2400 + skb->len, skb->data_len, skb->ip_summed);
2402 + if (skb_header_cloned(skb) &&
2403 + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2404 + return ERR_PTR(err);
2408 + list_for_each_entry_rcu(ptype,
2409 + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2410 + if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2411 + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2412 + err = ptype->gso_send_check(skb);
2413 + segs = ERR_PTR(err);
2414 + if (err || skb_gso_ok(skb, features))
2416 + __skb_push(skb, (skb->data -
2417 + skb_network_header(skb)));
2419 + segs = ptype->gso_segment(skb, features);
2423 + rcu_read_unlock();
2425 + __skb_push(skb, skb->data - skb_mac_header(skb));
2430 +EXPORT_SYMBOL(skb_gso_segment);
2432 +/* Take action when hardware reception checksum errors are detected. */
2434 +void netdev_rx_csum_fault(struct net_device *dev)
2436 + if (net_ratelimit()) {
2437 + printk(KERN_ERR "%s: hw csum failure.\n",
2438 + dev ? dev->name : "<unknown>");
2442 +EXPORT_SYMBOL(netdev_rx_csum_fault);
2445 +/* Actually, we should eliminate this check as soon as we know, that:
2446 + * 1. IOMMU is present and allows to map all the memory.
2447 + * 2. No high memory really exists on this machine.
2450 +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2452 +#ifdef CONFIG_HIGHMEM
2455 + if (dev->features & NETIF_F_HIGHDMA)
2458 + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
2459 + if (PageHighMem(skb_shinfo(skb)->frags[i].page))
2466 +struct dev_gso_cb {
2467 + void (*destructor)(struct sk_buff *skb);
2470 +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2472 +static void dev_gso_skb_destructor(struct sk_buff *skb)
2474 + struct dev_gso_cb *cb;
2477 + struct sk_buff *nskb = skb->next;
2479 + skb->next = nskb->next;
2480 + nskb->next = NULL;
2482 + } while (skb->next);
2484 + cb = DEV_GSO_CB(skb);
2485 + if (cb->destructor)
2486 + cb->destructor(skb);
2490 + * dev_gso_segment - Perform emulated hardware segmentation on skb.
2491 + * @skb: buffer to segment
2493 + * This function segments the given skb and stores the list of segments
2496 +static int dev_gso_segment(struct sk_buff *skb)
2498 + struct net_device *dev = skb->dev;
2499 + struct sk_buff *segs;
2500 + int features = dev->features & ~(illegal_highdma(dev, skb) ?
2503 + segs = skb_gso_segment(skb, features);
2505 + /* Verifying header integrity only. */
2510 + return PTR_ERR(segs);
2513 + DEV_GSO_CB(skb)->destructor = skb->destructor;
2514 + skb->destructor = dev_gso_skb_destructor;
2519 +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2520 + struct netdev_queue *txq)
2522 + const struct net_device_ops *ops = dev->netdev_ops;
2525 + if (likely(!skb->next)) {
2526 + if (!list_empty(&ptype_all))
2527 + dev_queue_xmit_nit(skb, dev);
2529 + if (netif_needs_gso(dev, skb)) {
2530 + if (unlikely(dev_gso_segment(skb)))
2531 + goto out_kfree_skb;
2536 + rc = ops->ndo_start_xmit(skb, dev);
2538 + * TODO: if skb_orphan() was called by
2539 + * dev->hard_start_xmit() (for example, the unmodified
2540 + * igb driver does that; bnx2 doesn't), then
2541 + * skb_tx_software_timestamp() will be unable to send
2542 + * back the time stamp.
2544 + * How can this be prevented? Always create another
2545 + * reference to the socket before calling
2546 + * dev->hard_start_xmit()? Prevent that skb_orphan()
2547 + * does anything in dev->hard_start_xmit() by clearing
2548 + * the skb destructor before the call and restoring it
2549 + * afterwards, then doing the skb_orphan() ourselves?
2556 + struct sk_buff *nskb = skb->next;
2558 + skb->next = nskb->next;
2559 + nskb->next = NULL;
2560 + rc = ops->ndo_start_xmit(nskb, dev);
2561 + if (unlikely(rc)) {
2562 + nskb->next = skb->next;
2566 + if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2567 + return NETDEV_TX_BUSY;
2568 + } while (skb->next);
2570 + skb->destructor = DEV_GSO_CB(skb)->destructor;
2577 +static u32 skb_tx_hashrnd;
2579 +u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
2583 + if (skb_rx_queue_recorded(skb))
2584 + return skb_get_rx_queue(skb) % dev->real_num_tx_queues;
2586 + if (skb->sk && skb->sk->sk_hash)
2587 + hash = skb->sk->sk_hash;
2589 + hash = skb->protocol;
2591 + hash = jhash_1word(hash, skb_tx_hashrnd);
2593 + return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2595 +EXPORT_SYMBOL(skb_tx_hash);
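Editorial worked example of the multiply-shift above: rather than computing hash % n, the 32-bit hash is scaled by ((u64)hash * real_num_tx_queues) >> 32, which always lands in [0, real_num_tx_queues). For instance, hash = 0x9e3779b9 (2654435769) on a 4-queue device yields (2654435769 * 4) >> 32 = 2, i.e. the packet is mapped to queue 2.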
2597 +static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2598 + struct sk_buff *skb)
2600 + const struct net_device_ops *ops = dev->netdev_ops;
2601 + u16 queue_index = 0;
2603 + if (ops->ndo_select_queue)
2604 + queue_index = ops->ndo_select_queue(dev, skb);
2605 + else if (dev->real_num_tx_queues > 1)
2606 + queue_index = skb_tx_hash(dev, skb);
2608 + skb_set_queue_mapping(skb, queue_index);
2609 + return netdev_get_tx_queue(dev, queue_index);
2613 + * dev_queue_xmit - transmit a buffer
2614 + * @skb: buffer to transmit
2616 + * Queue a buffer for transmission to a network device. The caller must
2617 + * have set the device and priority and built the buffer before calling
2618 + * this function. The function can be called from an interrupt.
2620 + * A negative errno code is returned on a failure. A success does not
2621 + * guarantee the frame will be transmitted as it may be dropped due
2622 + * to congestion or traffic shaping.
2624 + * -----------------------------------------------------------------------------------
2625 + * I notice this method can also return errors from the queue disciplines,
2626 + * including NET_XMIT_DROP, which is a positive value. So, errors can also be positive.
2629 + * Regardless of the return value, the skb is consumed, so it is currently
2630 + * difficult to retry a send to this method. (You can bump the ref count
2631 + * before sending to hold a reference for retry if you are careful.)
2633 + * When calling this method, interrupts MUST be enabled. This is because
2634 + * the BH enable code must have IRQs enabled so that it will not deadlock.
2637 +int dev_queue_xmit(struct sk_buff *skb)
2639 + struct net_device *dev = skb->dev;
2640 + struct netdev_queue *txq;
2644 + /* GSO will handle the following emulations directly. */
2645 + if (netif_needs_gso(dev, skb))
2648 + if (skb_shinfo(skb)->frag_list &&
2649 + !(dev->features & NETIF_F_FRAGLIST) &&
2650 + __skb_linearize(skb))
2651 + goto out_kfree_skb;
2653 + /* Fragmented skb is linearized if device does not support SG,
2654 + * or if at least one of fragments is in highmem and device
2655 + * does not support DMA from it.
2657 + if (skb_shinfo(skb)->nr_frags &&
2658 + (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
2659 + __skb_linearize(skb))
2660 + goto out_kfree_skb;
2662 + /* If packet is not checksummed and device does not support
2663 + * checksumming for this protocol, complete checksumming here.
2665 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
2666 + skb_set_transport_header(skb, skb->csum_start -
2667 + skb_headroom(skb));
2668 + if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2669 + goto out_kfree_skb;
2673 + /* Disable soft irqs for various locks below. Also
2674 + * stops preemption for RCU.
2676 + rcu_read_lock_bh();
2678 + txq = dev_pick_tx(dev, skb);
2679 + q = rcu_dereference(txq->qdisc);
2681 +#ifdef CONFIG_NET_CLS_ACT
2682 + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2685 + spinlock_t *root_lock = qdisc_lock(q);
2687 + spin_lock(root_lock);
2689 + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2691 + rc = NET_XMIT_DROP;
2693 + rc = qdisc_enqueue_root(skb, q);
2696 + spin_unlock(root_lock);
2701 + /* The device has no queue. Common case for software devices:
2702 + loopback, all the sorts of tunnels...
2704 + Really, it is unlikely that netif_tx_lock protection is necessary
2705 + here. (e.g. loopback and IP tunnels are clean, ignoring statistics
2707 + However, it is possible that they rely on protection
2710 + Check this and take the lock; it is not prone to deadlocks.
2711 + Taking the noqueue qdisc's lock is even simpler 8)
2713 + if (dev->flags & IFF_UP) {
2714 + int cpu = smp_processor_id(); /* ok because BHs are off */
2716 + if (txq->xmit_lock_owner != cpu) {
2718 + HARD_TX_LOCK(dev, txq, cpu);
2720 + if (!netif_tx_queue_stopped(txq)) {
2722 + if (!dev_hard_start_xmit(skb, dev, txq)) {
2723 + HARD_TX_UNLOCK(dev, txq);
2727 + HARD_TX_UNLOCK(dev, txq);
2728 + if (net_ratelimit())
2729 + printk(KERN_CRIT "Virtual device %s asks to "
2730 + "queue packet!\n", dev->name);
2732 + /* Recursion is detected! It is possible,
2733 + * unfortunately */
2734 + if (net_ratelimit())
2735 + printk(KERN_CRIT "Dead loop on virtual device "
2736 + "%s, fix it urgently!\n", dev->name);
2741 + rcu_read_unlock_bh();
2747 + rcu_read_unlock_bh();
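A minimal caller sketch (editorial; example_xmit, the raw payload copy, and the omitted protocol/priority setup are assumptions) honouring the contract documented above, in particular that the skb is consumed even on error:

static int example_xmit(struct net_device *dev, const void *data,
			unsigned int len)
{
	struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len,
					GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), data, len);
	skb->dev = dev;			/* real callers also set protocol/priority */
	return dev_queue_xmit(skb);	/* consumes skb; do not touch it afterwards */
}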
2752 +/*=======================================================================
2754 + =======================================================================*/
2756 +int netdev_max_backlog __read_mostly = 1000;
2757 +int netdev_budget __read_mostly = 300;
2758 +int weight_p __read_mostly = 64; /* old backlog weight */
2760 +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2764 + * netif_rx - post buffer to the network code
2765 + * @skb: buffer to post
2767 + * This function receives a packet from a device driver and queues it for
2768 + * the upper (protocol) levels to process. It always succeeds. The buffer
2769 + * may be dropped during processing for congestion control or by the
2770 + * protocol layers.
2773 + * NET_RX_SUCCESS (no congestion)
2774 + * NET_RX_DROP (packet was dropped)
2778 +int netif_rx(struct sk_buff *skb)
2780 + struct softnet_data *queue;
2781 + unsigned long flags;
2783 + /* if netpoll wants it, pretend we never saw it */
2784 + if (netpoll_rx(skb))
2785 + return NET_RX_DROP;
2787 + if (!skb->tstamp.tv64)
2788 + net_timestamp(skb);
2791 + * The code is rearranged so that the path is shortest
2792 + * when the CPU is congested but still operating.
2794 + local_irq_save(flags);
2795 + queue = &__get_cpu_var(softnet_data);
2797 + __get_cpu_var(netdev_rx_stat).total++;
2798 + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2799 + if (queue->input_pkt_queue.qlen) {
2801 + __skb_queue_tail(&queue->input_pkt_queue, skb);
2802 + local_irq_restore(flags);
2803 + return NET_RX_SUCCESS;
2806 + napi_schedule(&queue->backlog);
2810 + __get_cpu_var(netdev_rx_stat).dropped++;
2811 + local_irq_restore(flags);
2814 + return NET_RX_DROP;
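For contrast with the NAPI path further down, a non-NAPI driver would hand frames to the stack roughly like this sketch (editorial; example_rx and the flat data buffer are assumptions):

static void example_rx(struct net_device *dev, const void *data,
		       unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

	if (!skb)
		return;				/* frame is silently dropped */
	skb_reserve(skb, NET_IP_ALIGN);		/* align the IP header */
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* queues on this CPU's backlog */
}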
2817 +int netif_rx_ni(struct sk_buff *skb)
2821 + preempt_disable();
2822 + err = netif_rx(skb);
2823 + if (local_softirq_pending())
2830 +EXPORT_SYMBOL(netif_rx_ni);
2832 +static void net_tx_action(struct softirq_action *h)
2834 + struct softnet_data *sd = &__get_cpu_var(softnet_data);
2836 + if (sd->completion_queue) {
2837 + struct sk_buff *clist;
2839 + local_irq_disable();
2840 + clist = sd->completion_queue;
2841 + sd->completion_queue = NULL;
2842 + local_irq_enable();
2845 + struct sk_buff *skb = clist;
2846 + clist = clist->next;
2848 + WARN_ON(atomic_read(&skb->users));
2853 + if (sd->output_queue) {
2854 + struct Qdisc *head;
2856 + local_irq_disable();
2857 + head = sd->output_queue;
2858 + sd->output_queue = NULL;
2859 + local_irq_enable();
2862 + struct Qdisc *q = head;
2863 + spinlock_t *root_lock;
2865 + head = head->next_sched;
2867 + root_lock = qdisc_lock(q);
2868 + if (spin_trylock(root_lock)) {
2869 + smp_mb__before_clear_bit();
2870 + clear_bit(__QDISC_STATE_SCHED,
2873 + spin_unlock(root_lock);
2875 + if (!test_bit(__QDISC_STATE_DEACTIVATED,
2877 + __netif_reschedule(q);
2879 + smp_mb__before_clear_bit();
2880 + clear_bit(__QDISC_STATE_SCHED,
2888 +static inline int deliver_skb(struct sk_buff *skb,
2889 + struct packet_type *pt_prev,
2890 + struct net_device *orig_dev)
2892 + atomic_inc(&skb->users);
2893 + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2896 +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2897 +/* These hooks defined here for ATM */
2899 +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2900 + unsigned char *addr);
2901 +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2904 + * If bridge module is loaded call bridging hook.
2905 + * returns NULL if packet was consumed.
2907 +struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2908 + struct sk_buff *skb) __read_mostly;
2909 +static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2910 + struct packet_type **pt_prev, int *ret,
2911 + struct net_device *orig_dev)
2913 + struct net_bridge_port *port;
2915 + if (skb->pkt_type == PACKET_LOOPBACK ||
2916 + (port = rcu_dereference(skb->dev->br_port)) == NULL)
2920 + *ret = deliver_skb(skb, *pt_prev, orig_dev);
2924 + return br_handle_frame_hook(port, skb);
2927 +#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
2930 +#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2931 +struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2932 +EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2934 +static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2935 + struct packet_type **pt_prev,
2937 + struct net_device *orig_dev)
2939 + if (skb->dev->macvlan_port == NULL)
2943 + *ret = deliver_skb(skb, *pt_prev, orig_dev);
2946 + return macvlan_handle_frame_hook(skb);
2949 +#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2952 +#ifdef CONFIG_NET_CLS_ACT
2953 +/* TODO: Maybe we should just force sch_ingress to be compiled in
2954 + * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
2955 + * instructions (a compare and two extra stores) when the ingress
2956 + * scheduler is off but CONFIG_NET_CLS_ACT is on.
2957 + * NOTE: This doesn't remove any functionality; without the ingress
2958 + * scheduler, you just can't add policies on ingress.
2961 +static int ing_filter(struct sk_buff *skb)
2963 + struct net_device *dev = skb->dev;
2964 + u32 ttl = G_TC_RTTL(skb->tc_verd);
2965 + struct netdev_queue *rxq;
2966 + int result = TC_ACT_OK;
2969 + if (MAX_RED_LOOP < ttl++) {
2970 + printk(KERN_WARNING
2971 + "Redir loop detected Dropping packet (%d->%d)\n",
2972 + skb->iif, dev->ifindex);
2973 + return TC_ACT_SHOT;
2976 + skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2977 + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2979 + rxq = &dev->rx_queue;
2982 + if (q != &noop_qdisc) {
2983 + spin_lock(qdisc_lock(q));
2984 + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2985 + result = qdisc_enqueue_root(skb, q);
2986 + spin_unlock(qdisc_lock(q));
2992 +static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2993 + struct packet_type **pt_prev,
2994 + int *ret, struct net_device *orig_dev)
2996 + if (skb->dev->rx_queue.qdisc == &noop_qdisc)
3000 + *ret = deliver_skb(skb, *pt_prev, orig_dev);
3003 + /* Huh? Why does turning on AF_PACKET affect this? */
3004 + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
3007 + switch (ing_filter(skb)) {
3009 + case TC_ACT_STOLEN:
3021 + * netif_nit_deliver - deliver received packets to network taps
3024 + * This function is used to deliver incoming packets to network
3025 + * taps. It should be used when the normal netif_receive_skb path
3026 + * is bypassed, for example because of VLAN acceleration.
3028 +void netif_nit_deliver(struct sk_buff *skb)
3030 + struct packet_type *ptype;
3032 + if (list_empty(&ptype_all))
3035 + skb_reset_network_header(skb);
3036 + skb_reset_transport_header(skb);
3037 + skb->mac_len = skb->network_header - skb->mac_header;
3040 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
3041 + if (!ptype->dev || ptype->dev == skb->dev)
3042 + deliver_skb(skb, ptype, skb->dev);
3044 + rcu_read_unlock();
3048 + * netif_receive_skb - process receive buffer from network
3049 + * @skb: buffer to process
3051 + * netif_receive_skb() is the main receive data processing function.
3052 + * It always succeeds. The buffer may be dropped during processing
3053 + * for congestion control or by the protocol layers.
3055 + * This function may only be called from softirq context and interrupts
3056 + * should be enabled.
3058 + * Return values (usually ignored):
3059 + * NET_RX_SUCCESS: no congestion
3060 + * NET_RX_DROP: packet was dropped
3062 +int netif_receive_skb(struct sk_buff *skb)
3064 + struct packet_type *ptype, *pt_prev;
3065 + struct net_device *orig_dev;
3066 + struct net_device *null_or_orig;
3067 + int ret = NET_RX_DROP;
3070 + if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
3071 + return NET_RX_SUCCESS;
3073 + /* if we've gotten here through NAPI, check netpoll */
3074 + if (netpoll_receive_skb(skb))
3075 + return NET_RX_DROP;
3077 + if (!skb->tstamp.tv64)
3078 + net_timestamp(skb);
3081 + skb->iif = skb->dev->ifindex;
3083 + null_or_orig = NULL;
3084 + orig_dev = skb->dev;
3085 + if (orig_dev->master) {
3086 + if (skb_bond_should_drop(skb))
3087 + null_or_orig = orig_dev; /* deliver only exact match */
3089 + skb->dev = orig_dev->master;
3092 + __get_cpu_var(netdev_rx_stat).total++;
3094 + skb_reset_network_header(skb);
3095 + skb_reset_transport_header(skb);
3096 + skb->mac_len = skb->network_header - skb->mac_header;
3102 +#ifdef CONFIG_NET_CLS_ACT
3103 + if (skb->tc_verd & TC_NCLS) {
3104 + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3109 + list_for_each_entry_rcu(ptype, &ptype_all, list) {
3110 + if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3111 + ptype->dev == orig_dev) {
3113 + ret = deliver_skb(skb, pt_prev, orig_dev);
3118 +#ifdef CONFIG_NET_CLS_ACT
3119 + skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3125 + skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
3128 + skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
3134 + type = skb->protocol;
3135 + list_for_each_entry_rcu(ptype,
3136 + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3137 + if (ptype->type == type &&
3138 + (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3139 + ptype->dev == orig_dev)) {
3141 + ret = deliver_skb(skb, pt_prev, orig_dev);
3147 + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3150 + /* Jamal, now you will not be able to escape explaining
3151 + * to me how you were going to use this. :-)
3153 + ret = NET_RX_DROP;
3157 + rcu_read_unlock();
3161 +/* Network device is going away, flush any packets still pending */
3162 +static void flush_backlog(void *arg)
3164 + struct net_device *dev = arg;
3165 + struct softnet_data *queue = &__get_cpu_var(softnet_data);
3166 + struct sk_buff *skb, *tmp;
3168 + skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
3169 + if (skb->dev == dev) {
3170 + __skb_unlink(skb, &queue->input_pkt_queue);
3175 +static int napi_gro_complete(struct sk_buff *skb)
3177 + struct packet_type *ptype;
3178 + __be16 type = skb->protocol;
3179 + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3180 + int err = -ENOENT;
3182 + if (NAPI_GRO_CB(skb)->count == 1) {
3183 + skb_shinfo(skb)->gso_size = 0;
3188 + list_for_each_entry_rcu(ptype, head, list) {
3189 + if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3192 + err = ptype->gro_complete(skb);
3195 + rcu_read_unlock();
3198 + WARN_ON(&ptype->list == head);
3200 + return NET_RX_SUCCESS;
3204 + return netif_receive_skb(skb);
3207 +void napi_gro_flush(struct napi_struct *napi)
3209 + struct sk_buff *skb, *next;
3211 + for (skb = napi->gro_list; skb; skb = next) {
3214 + napi_gro_complete(skb);
3217 + napi->gro_count = 0;
3218 + napi->gro_list = NULL;
3220 +EXPORT_SYMBOL(napi_gro_flush);
3222 +void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
3224 + unsigned int offset = skb_gro_offset(skb);
3227 + if (hlen <= skb_headlen(skb))
3228 + return skb->data + offset;
3230 + if (unlikely(!skb_shinfo(skb)->nr_frags ||
3231 + skb_shinfo(skb)->frags[0].size <=
3232 + hlen - skb_headlen(skb) ||
3233 + PageHighMem(skb_shinfo(skb)->frags[0].page)))
3234 + return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
3236 + return page_address(skb_shinfo(skb)->frags[0].page) +
3237 + skb_shinfo(skb)->frags[0].page_offset +
3238 + offset - skb_headlen(skb);
3240 +EXPORT_SYMBOL(skb_gro_header);
3242 +int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3244 + struct sk_buff **pp = NULL;
3245 + struct packet_type *ptype;
3246 + __be16 type = skb->protocol;
3247 + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3252 + if (!(skb->dev->features & NETIF_F_GRO))
3255 + if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
3259 + list_for_each_entry_rcu(ptype, head, list) {
3260 + if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3263 + skb_set_network_header(skb, skb_gro_offset(skb));
3264 + mac_len = skb->network_header - skb->mac_header;
3265 + skb->mac_len = mac_len;
3266 + NAPI_GRO_CB(skb)->same_flow = 0;
3267 + NAPI_GRO_CB(skb)->flush = 0;
3268 + NAPI_GRO_CB(skb)->free = 0;
3270 + pp = ptype->gro_receive(&napi->gro_list, skb);
3273 + rcu_read_unlock();
3275 + if (&ptype->list == head)
3278 + same_flow = NAPI_GRO_CB(skb)->same_flow;
3279 + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3282 + struct sk_buff *nskb = *pp;
3285 + nskb->next = NULL;
3286 + napi_gro_complete(nskb);
3287 + napi->gro_count--;
3293 + if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3296 + napi->gro_count++;
3297 + NAPI_GRO_CB(skb)->count = 1;
3298 + skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3299 + skb->next = napi->gro_list;
3300 + napi->gro_list = skb;
3304 + if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
3305 + if (napi->gro_list == skb)
3306 + napi->gro_list = skb->next;
3317 +EXPORT_SYMBOL(dev_gro_receive);
3319 +static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3321 + struct sk_buff *p;
3323 + if (netpoll_rx_on(skb))
3324 + return GRO_NORMAL;
3326 + for (p = napi->gro_list; p; p = p->next) {
3327 + NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
3328 + && !compare_ether_header(skb_mac_header(p),
3329 + skb_gro_mac_header(skb));
3330 + NAPI_GRO_CB(p)->flush = 0;
3333 + return dev_gro_receive(napi, skb);
3336 +int napi_skb_finish(int ret, struct sk_buff *skb)
3338 + int err = NET_RX_SUCCESS;
3342 + return netif_receive_skb(skb);
3345 + err = NET_RX_DROP;
3346 + /* fall through */
3348 + case GRO_MERGED_FREE:
3355 +EXPORT_SYMBOL(napi_skb_finish);
3357 +int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3359 + skb_gro_reset_offset(skb);
3361 + return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3363 +EXPORT_SYMBOL(napi_gro_receive);
3365 +void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3367 + __skb_pull(skb, skb_headlen(skb));
3368 + skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3372 +EXPORT_SYMBOL(napi_reuse_skb);
3374 +struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
3375 + struct napi_gro_fraginfo *info)
3377 + struct net_device *dev = napi->dev;
3378 + struct sk_buff *skb = napi->skb;
3379 + struct ethhdr *eth;
3386 + skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
3390 + skb_reserve(skb, NET_IP_ALIGN);
3393 + BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
3394 + frag = info->frags;
3396 + for (i = 0; i < info->nr_frags; i++) {
3397 + skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
3401 + skb_shinfo(skb)->nr_frags = info->nr_frags;
3403 + skb->data_len = info->len;
3404 + skb->len += info->len;
3405 + skb->truesize += info->len;
3407 + skb_reset_mac_header(skb);
3408 + skb_gro_reset_offset(skb);
3410 + eth = skb_gro_header(skb, sizeof(*eth));
3412 + napi_reuse_skb(napi, skb);
3417 + skb_gro_pull(skb, sizeof(*eth));
3420 + * This works because the only protocols we care about don't require
3421 + * special handling. We'll fix it up properly at the end.
3423 + skb->protocol = eth->h_proto;
3425 + skb->ip_summed = info->ip_summed;
3426 + skb->csum = info->csum;
3431 +EXPORT_SYMBOL(napi_fraginfo_skb);
3433 +int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
3435 + int err = NET_RX_SUCCESS;
3440 + skb->protocol = eth_type_trans(skb, napi->dev);
3442 + if (ret == GRO_NORMAL)
3443 + return netif_receive_skb(skb);
3445 + skb_gro_pull(skb, -ETH_HLEN);
3449 + err = NET_RX_DROP;
3450 + /* fall through */
3452 + case GRO_MERGED_FREE:
3453 + napi_reuse_skb(napi, skb);
3459 +EXPORT_SYMBOL(napi_frags_finish);
3461 +int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
3463 + struct sk_buff *skb = napi_fraginfo_skb(napi, info);
3466 + return NET_RX_DROP;
3468 + return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3470 +EXPORT_SYMBOL(napi_gro_frags);
3472 +static int process_backlog(struct napi_struct *napi, int quota)
3475 + struct softnet_data *queue = &__get_cpu_var(softnet_data);
3476 + unsigned long start_time = jiffies;
3478 + napi->weight = weight_p;
3480 + struct sk_buff *skb;
3482 + local_irq_disable();
3483 + skb = __skb_dequeue(&queue->input_pkt_queue);
3485 + __napi_complete(napi);
3486 + local_irq_enable();
3489 + local_irq_enable();
3491 + netif_receive_skb(skb);
3492 + } while (++work < quota && jiffies == start_time);
3498 + * __napi_schedule - schedule for receive
3499 + * @n: entry to schedule
3501 + * The entry's receive function will be scheduled to run
3503 +void __napi_schedule(struct napi_struct *n)
3505 + unsigned long flags;
3507 + local_irq_save(flags);
3508 + list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
3509 + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3510 + local_irq_restore(flags);
3512 +EXPORT_SYMBOL(__napi_schedule);
3514 +void __napi_complete(struct napi_struct *n)
3516 + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3517 + BUG_ON(n->gro_list);
3519 + list_del(&n->poll_list);
3520 + smp_mb__before_clear_bit();
3521 + clear_bit(NAPI_STATE_SCHED, &n->state);
3523 +EXPORT_SYMBOL(__napi_complete);
3525 +void napi_complete(struct napi_struct *n)
3527 + unsigned long flags;
3530 + * don't let napi dequeue from the cpu poll list
3531 + * just in case it's running on a different cpu
3533 + if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3536 + napi_gro_flush(n);
3537 + local_irq_save(flags);
3538 + __napi_complete(n);
3539 + local_irq_restore(flags);
3541 +EXPORT_SYMBOL(napi_complete);
3543 +void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3544 + int (*poll)(struct napi_struct *, int), int weight)
3546 + INIT_LIST_HEAD(&napi->poll_list);
3547 + napi->gro_count = 0;
3548 + napi->gro_list = NULL;
3550 + napi->poll = poll;
3551 + napi->weight = weight;
3552 + list_add(&napi->dev_list, &dev->napi_list);
3554 +#ifdef CONFIG_NETPOLL
3555 + spin_lock_init(&napi->poll_lock);
3556 + napi->poll_owner = -1;
3558 + set_bit(NAPI_STATE_SCHED, &napi->state);
3560 +EXPORT_SYMBOL(netif_napi_add);
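The driver side of this contract, sketched with hypothetical example_* helpers (not kernel APIs): poll up to budget packets, and call napi_complete() only when the budget was not exhausted:

static struct sk_buff *example_hw_next_rx(struct napi_struct *napi)
{
	return NULL;	/* hypothetical: would pull the next completed RX frame */
}

static int example_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = example_hw_next_rx(napi)) != NULL) {
		netif_receive_skb(skb);
		work++;
	}
	if (work < budget)
		napi_complete(napi);	/* then re-enable the device's RX interrupt */
	return work;
}

/* registered once at probe time, e.g.:
 *	netif_napi_add(dev, &priv->napi, example_poll, 64);
 */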
3562 +void netif_napi_del(struct napi_struct *napi)
3564 + struct sk_buff *skb, *next;
3566 + list_del_init(&napi->dev_list);
3567 + kfree_skb(napi->skb);
3569 + for (skb = napi->gro_list; skb; skb = next) {
3575 + napi->gro_list = NULL;
3576 + napi->gro_count = 0;
3578 +EXPORT_SYMBOL(netif_napi_del);
3581 +static void net_rx_action(struct softirq_action *h)
3583 + struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
3584 + unsigned long time_limit = jiffies + 2;
3585 + int budget = netdev_budget;
3588 + local_irq_disable();
3590 + while (!list_empty(list)) {
3591 + struct napi_struct *n;
3594 + /* If the softirq window is exhausted then punt.
3595 + * Allow this to run for 2 jiffies, which allows
3596 + * an average latency of 1.5/HZ.
3598 + if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3599 + goto softnet_break;
3601 + local_irq_enable();
3603 + /* Even though interrupts have been re-enabled, this
3604 + * access is safe because interrupts can only add new
3605 + * entries to the tail of this list, and only ->poll()
3606 + * calls can remove this head entry from the list.
3608 + n = list_entry(list->next, struct napi_struct, poll_list);
3610 + have = netpoll_poll_lock(n);
3612 + weight = n->weight;
3614 + /* This NAPI_STATE_SCHED test is for avoiding a race
3615 + * with netpoll's poll_napi(). Only the entity which
3616 + * obtains the lock and sees NAPI_STATE_SCHED set will
3617 + * actually make the ->poll() call. Therefore we avoid
3618 + * accidentally calling ->poll() when NAPI is not scheduled.
3621 + if (test_bit(NAPI_STATE_SCHED, &n->state))
3622 + work = n->poll(n, weight);
3624 + WARN_ON_ONCE(work > weight);
3628 + local_irq_disable();
3630 + /* Drivers must not modify the NAPI state if they
3631 + * consume the entire weight. In such cases this code
3632 + * still "owns" the NAPI instance and therefore can
3633 + * move the instance around on the list at-will.
3635 + if (unlikely(work == weight)) {
3636 + if (unlikely(napi_disable_pending(n)))
3637 + __napi_complete(n);
3639 + list_move_tail(&n->poll_list, list);
3642 + netpoll_poll_unlock(have);
3645 + local_irq_enable();
3647 +#ifdef CONFIG_NET_DMA
3649 + * There may not be any more sk_buffs coming right now, so push
3650 + * any pending DMA copies to hardware
3652 + dma_issue_pending_all();
3658 + __get_cpu_var(netdev_rx_stat).time_squeeze++;
3659 + __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3663 +static gifconf_func_t *gifconf_list[NPROTO];
3666 + * register_gifconf - register a SIOCGIF handler
3667 + * @family: Address family
3668 + * @gifconf: Function handler
3670 + * Register protocol dependent address dumping routines. The handler
3671 + * that is passed must not be freed or reused until it has been replaced
3672 + * by another handler.
3674 +int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3676 + if (family >= NPROTO)
3678 + gifconf_list[family] = gifconf;
3684 + * Map an interface index to its name (SIOCGIFNAME)
3688 + * We need this ioctl for efficient implementation of the
3689 + * if_indextoname() function required by the IPv6 API. Without
3690 + * it, we would have to search all the interfaces to find a match.
3694 +static int dev_ifname(struct net *net, struct ifreq __user *arg)
3696 + struct net_device *dev;
3700 + * Fetch the caller's info block.
3703 + if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3706 + read_lock(&dev_base_lock);
3707 + dev = __dev_get_by_index(net, ifr.ifr_ifindex);
3709 + read_unlock(&dev_base_lock);
3713 + strcpy(ifr.ifr_name, dev->name);
3714 + read_unlock(&dev_base_lock);
3716 + if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
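From userspace this is what if_indextoname() reduces to; a hedged sketch (the example_index_to_name name is invented, error handling trimmed):

#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>

static int example_index_to_name(int sock_fd, int ifindex,
				 char name[IFNAMSIZ])
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_ifindex = ifindex;
	if (ioctl(sock_fd, SIOCGIFNAME, &ifr) < 0)
		return -1;		/* no device with that index */
	memcpy(name, ifr.ifr_name, IFNAMSIZ);
	return 0;
}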
3722 + * Perform a SIOCGIFCONF call. This structure will change
3723 + * size eventually, and there is nothing I can do about it.
3724 + * Thus we will need a 'compatibility mode'.
3727 +static int dev_ifconf(struct net *net, char __user *arg)
3729 + struct ifconf ifc;
3730 + struct net_device *dev;
3737 + * Fetch the caller's info block.
3740 + if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3743 + pos = ifc.ifc_buf;
3744 + len = ifc.ifc_len;
3747 + * Loop over the interfaces, and write an info block for each.
3751 + for_each_netdev(net, dev) {
3752 + for (i = 0; i < NPROTO; i++) {
3753 + if (gifconf_list[i]) {
3756 + done = gifconf_list[i](dev, NULL, 0);
3758 + done = gifconf_list[i](dev, pos + total,
3768 + * All done. Write the updated control block back to the caller.
3770 + ifc.ifc_len = total;
3773 + * Both BSD and Solaris return 0 here, so we do too.
3775 + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3778 +#ifdef CONFIG_PROC_FS
3780 + * This is invoked by the /proc filesystem handler to display a device
3783 +void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3784 + __acquires(dev_base_lock)
3786 + struct net *net = seq_file_net(seq);
3788 + struct net_device *dev;
3790 + read_lock(&dev_base_lock);
3792 + return SEQ_START_TOKEN;
3795 + for_each_netdev(net, dev)
3796 + if (off++ == *pos)
3802 +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3804 + struct net *net = seq_file_net(seq);
3806 + return v == SEQ_START_TOKEN ?
3807 + first_net_device(net) : next_net_device((struct net_device *)v);
3810 +void dev_seq_stop(struct seq_file *seq, void *v)
3811 + __releases(dev_base_lock)
3813 + read_unlock(&dev_base_lock);
3816 +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3818 + const struct net_device_stats *stats = dev_get_stats(dev);
3820 + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3821 + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3822 + dev->name, stats->rx_bytes, stats->rx_packets,
3824 + stats->rx_dropped + stats->rx_missed_errors,
3825 + stats->rx_fifo_errors,
3826 + stats->rx_length_errors + stats->rx_over_errors +
3827 + stats->rx_crc_errors + stats->rx_frame_errors,
3828 + stats->rx_compressed, stats->multicast,
3829 + stats->tx_bytes, stats->tx_packets,
3830 + stats->tx_errors, stats->tx_dropped,
3831 + stats->tx_fifo_errors, stats->collisions,
3832 + stats->tx_carrier_errors +
3833 + stats->tx_aborted_errors +
3834 + stats->tx_window_errors +
3835 + stats->tx_heartbeat_errors,
3836 + stats->tx_compressed);
3840 + * Called from the PROCfs module. This now uses the new arbitrary sized
3841 + * /proc/net interface to create /proc/net/dev
3843 +static int dev_seq_show(struct seq_file *seq, void *v)
3845 + if (v == SEQ_START_TOKEN)
3846 + seq_puts(seq, "Inter-| Receive "
3848 + " face |bytes packets errs drop fifo frame "
3849 + "compressed multicast|bytes packets errs "
3850 + "drop fifo colls carrier compressed\n");
3852 + dev_seq_printf_stats(seq, v);
3856 +static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3858 + struct netif_rx_stats *rc = NULL;
3860 + while (*pos < nr_cpu_ids)
3861 + if (cpu_online(*pos)) {
3862 + rc = &per_cpu(netdev_rx_stat, *pos);
3869 +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3871 + return softnet_get_online(pos);
3874 +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3877 + return softnet_get_online(pos);
3880 +static void softnet_seq_stop(struct seq_file *seq, void *v)
3884 +static int softnet_seq_show(struct seq_file *seq, void *v)
3886 + struct netif_rx_stats *s = v;
3888 + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3889 + s->total, s->dropped, s->time_squeeze, 0,
3890 + 0, 0, 0, 0, /* was fastroute */
3891 + s->cpu_collision);
3895 +static const struct seq_operations dev_seq_ops = {
3896 + .start = dev_seq_start,
3897 + .next = dev_seq_next,
3898 + .stop = dev_seq_stop,
3899 + .show = dev_seq_show,
3902 +static int dev_seq_open(struct inode *inode, struct file *file)
3904 + return seq_open_net(inode, file, &dev_seq_ops,
3905 + sizeof(struct seq_net_private));
3908 +static const struct file_operations dev_seq_fops = {
3909 + .owner = THIS_MODULE,
3910 + .open = dev_seq_open,
3912 + .llseek = seq_lseek,
3913 + .release = seq_release_net,
3916 +static const struct seq_operations softnet_seq_ops = {
3917 + .start = softnet_seq_start,
3918 + .next = softnet_seq_next,
3919 + .stop = softnet_seq_stop,
3920 + .show = softnet_seq_show,
3923 +static int softnet_seq_open(struct inode *inode, struct file *file)
3925 + return seq_open(file, &softnet_seq_ops);
3928 +static const struct file_operations softnet_seq_fops = {
3929 + .owner = THIS_MODULE,
3930 + .open = softnet_seq_open,
3932 + .llseek = seq_lseek,
3933 + .release = seq_release,
3936 +static void *ptype_get_idx(loff_t pos)
3938 + struct packet_type *pt = NULL;
3942 + list_for_each_entry_rcu(pt, &ptype_all, list) {
3948 + for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3949 + list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3958 +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3962 + return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3965 +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3967 + struct packet_type *pt;
3968 + struct list_head *nxt;
3972 + if (v == SEQ_START_TOKEN)
3973 + return ptype_get_idx(0);
3976 + nxt = pt->list.next;
3977 + if (pt->type == htons(ETH_P_ALL)) {
3978 + if (nxt != &ptype_all)
3981 + nxt = ptype_base[0].next;
3983 + hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3985 + while (nxt == &ptype_base[hash]) {
3986 + if (++hash >= PTYPE_HASH_SIZE)
3988 + nxt = ptype_base[hash].next;
3991 + return list_entry(nxt, struct packet_type, list);
3994 +static void ptype_seq_stop(struct seq_file *seq, void *v)
3997 + rcu_read_unlock();
4000 +static int ptype_seq_show(struct seq_file *seq, void *v)
4002 + struct packet_type *pt = v;
4004 + if (v == SEQ_START_TOKEN)
4005 + seq_puts(seq, "Type Device Function\n");
4006 + else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4007 + if (pt->type == htons(ETH_P_ALL))
4008 + seq_puts(seq, "ALL ");
4010 + seq_printf(seq, "%04x", ntohs(pt->type));
4012 + seq_printf(seq, " %-8s %pF\n",
4013 + pt->dev ? pt->dev->name : "", pt->func);
4019 +static const struct seq_operations ptype_seq_ops = {
4020 + .start = ptype_seq_start,
4021 + .next = ptype_seq_next,
4022 + .stop = ptype_seq_stop,
4023 + .show = ptype_seq_show,
4026 +static int ptype_seq_open(struct inode *inode, struct file *file)
4028 + return seq_open_net(inode, file, &ptype_seq_ops,
4029 + sizeof(struct seq_net_private));
4032 +static const struct file_operations ptype_seq_fops = {
4033 + .owner = THIS_MODULE,
4034 + .open = ptype_seq_open,
4036 + .llseek = seq_lseek,
4037 + .release = seq_release_net,
4041 +static int __net_init dev_proc_net_init(struct net *net)
4045 + if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4047 + if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4049 + if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4052 + if (wext_proc_init(net))
4058 + proc_net_remove(net, "ptype");
4060 + proc_net_remove(net, "softnet_stat");
4062 + proc_net_remove(net, "dev");
4066 +static void __net_exit dev_proc_net_exit(struct net *net)
4068 + wext_proc_exit(net);
4070 + proc_net_remove(net, "ptype");
4071 + proc_net_remove(net, "softnet_stat");
4072 + proc_net_remove(net, "dev");
4075 +static struct pernet_operations __net_initdata dev_proc_ops = {
4076 + .init = dev_proc_net_init,
4077 + .exit = dev_proc_net_exit,
4080 +static int __init dev_proc_init(void)
4082 + return register_pernet_subsys(&dev_proc_ops);
4085 +#define dev_proc_init() 0
4086 +#endif /* CONFIG_PROC_FS */
4090 + * netdev_set_master - set up master/slave pair
4091 + * @slave: slave device
4092 + * @master: new master device
4094 + * Changes the master device of the slave. Pass %NULL to break the
4095 + * bonding. The caller must hold the RTNL semaphore. On a failure
4096 + * a negative errno code is returned. On success the reference counts
4097 + * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4098 + * function returns zero.
4100 +int netdev_set_master(struct net_device *slave, struct net_device *master)
4102 + struct net_device *old = slave->master;
4112 + slave->master = master;
4114 + synchronize_net();
4120 + slave->flags |= IFF_SLAVE;
4122 + slave->flags &= ~IFF_SLAVE;
4124 + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4128 +static void dev_change_rx_flags(struct net_device *dev, int flags)
4130 + const struct net_device_ops *ops = dev->netdev_ops;
4132 + if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4133 + ops->ndo_change_rx_flags(dev, flags);
4136 +static int __dev_set_promiscuity(struct net_device *dev, int inc)
4138 + unsigned short old_flags = dev->flags;
4144 + dev->flags |= IFF_PROMISC;
4145 + dev->promiscuity += inc;
4146 + if (dev->promiscuity == 0) {
4149 + * If inc causes overflow, untouch promisc and return error.
4152 + dev->flags &= ~IFF_PROMISC;
4154 + dev->promiscuity -= inc;
4155 + printk(KERN_WARNING "%s: promiscuity touches roof, "
4156 + "set promiscuity failed, promiscuity feature "
4157 + "of device might be broken.\n", dev->name);
4158 + return -EOVERFLOW;
4161 + if (dev->flags != old_flags) {
4162 + printk(KERN_INFO "device %s %s promiscuous mode\n",
4163 + dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4165 + if (audit_enabled) {
4166 + current_uid_gid(&uid, &gid);
4167 + audit_log(current->audit_context, GFP_ATOMIC,
4168 + AUDIT_ANOM_PROMISCUOUS,
4169 + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4170 + dev->name, (dev->flags & IFF_PROMISC),
4171 + (old_flags & IFF_PROMISC),
4172 + audit_get_loginuid(current),
4174 + audit_get_sessionid(current));
4177 + dev_change_rx_flags(dev, IFF_PROMISC);
4183 + * dev_set_promiscuity - update promiscuity count on a device
4187 + * Add or remove promiscuity from a device. While the count in the device
4188 + * remains above zero the interface remains promiscuous. Once it hits zero
4189 + * the device reverts back to normal filtering operation. A negative inc
4190 + * value is used to drop promiscuity on the device.
4191 + * Return 0 if successful or a negative errno code on error.
4193 +int dev_set_promiscuity(struct net_device *dev, int inc)
4195 + unsigned short old_flags = dev->flags;
4198 + err = __dev_set_promiscuity(dev, inc);
4201 + if (dev->flags != old_flags)
4202 + dev_set_rx_mode(dev);
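Since promiscuity is a counter rather than a flag, every increment must eventually be balanced; a sketch of a hypothetical packet-tap client (names invented, caller holds RTNL):

static int example_tap_attach(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);	/* count > 0: IFF_PROMISC stays set */
}

static void example_tap_detach(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);		/* at 0 the device leaves promisc mode */
}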
4207 + * dev_set_allmulti - update allmulti count on a device
4211 + * Add or remove reception of all multicast frames to a device. While the
4212 + * count in the device remains above zero the interface remains listening
4213 + * to all interfaces. Once it hits zero the device reverts back to normal
4214 + * filtering operation. A negative @inc value is used to drop the counter
4215 + * when releasing a resource needing all multicasts.
4216 + * Return 0 if successful or a negative errno code on error.
4219 +int dev_set_allmulti(struct net_device *dev, int inc)
4221 + unsigned short old_flags = dev->flags;
4225 + dev->flags |= IFF_ALLMULTI;
4226 + dev->allmulti += inc;
4227 + if (dev->allmulti == 0) {
4230 + * If inc causes overflow, untouch allmulti and return error.
4233 + dev->flags &= ~IFF_ALLMULTI;
4235 + dev->allmulti -= inc;
4236 + printk(KERN_WARNING "%s: allmulti touches roof, "
4237 + "set allmulti failed, allmulti feature of "
4238 + "device might be broken.\n", dev->name);
4239 + return -EOVERFLOW;
4242 + if (dev->flags ^ old_flags) {
4243 + dev_change_rx_flags(dev, IFF_ALLMULTI);
4244 + dev_set_rx_mode(dev);
4250 + * Upload unicast and multicast address lists to device and
4251 + * configure RX filtering. When the device doesn't support unicast
4252 + * filtering it is put in promiscuous mode while unicast addresses are added.
4255 +void __dev_set_rx_mode(struct net_device *dev)
4257 + const struct net_device_ops *ops = dev->netdev_ops;
4259 + /* dev_open will call this function so the list will stay sane. */
4260 + if (!(dev->flags&IFF_UP))
4263 + if (!netif_device_present(dev))
4266 + if (ops->ndo_set_rx_mode)
4267 + ops->ndo_set_rx_mode(dev);
4269 + /* Unicast addresses changes may only happen under the rtnl,
4270 + * therefore calling __dev_set_promiscuity here is safe.
4272 + if (dev->uc_count > 0 && !dev->uc_promisc) {
4273 + __dev_set_promiscuity(dev, 1);
4274 + dev->uc_promisc = 1;
4275 + } else if (dev->uc_count == 0 && dev->uc_promisc) {
4276 + __dev_set_promiscuity(dev, -1);
4277 + dev->uc_promisc = 0;
4280 + if (ops->ndo_set_multicast_list)
4281 + ops->ndo_set_multicast_list(dev);
4285 +void dev_set_rx_mode(struct net_device *dev)
4287 + netif_addr_lock_bh(dev);
4288 + __dev_set_rx_mode(dev);
4289 + netif_addr_unlock_bh(dev);
4292 +int __dev_addr_delete(struct dev_addr_list **list, int *count,
4293 + void *addr, int alen, int glbl)
4295 + struct dev_addr_list *da;
4297 + for (; (da = *list) != NULL; list = &da->next) {
4298 + if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4299 + alen == da->da_addrlen) {
4301 + int old_glbl = da->da_gusers;
4302 + da->da_gusers = 0;
4303 + if (old_glbl == 0)
4306 + if (--da->da_users)
4318 +int __dev_addr_add(struct dev_addr_list **list, int *count,
4319 + void *addr, int alen, int glbl)
4321 + struct dev_addr_list *da;
4323 + for (da = *list; da != NULL; da = da->next) {
4324 + if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4325 + da->da_addrlen == alen) {
4327 + int old_glbl = da->da_gusers;
4328 + da->da_gusers = 1;
4337 + da = kzalloc(sizeof(*da), GFP_ATOMIC);
4340 + memcpy(da->da_addr, addr, alen);
4341 + da->da_addrlen = alen;
4343 + da->da_gusers = glbl ? 1 : 0;
4351 + * dev_unicast_delete - Release secondary unicast address.
4353 + * @addr: address to delete
4354 + * @alen: length of @addr
4356 + * Release reference to a secondary unicast address and remove it
4357 + * from the device if the reference count drops to zero.
4359 + * The caller must hold the rtnl_mutex.
4361 +int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
4367 + netif_addr_lock_bh(dev);
4368 + err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
4370 + __dev_set_rx_mode(dev);
4371 + netif_addr_unlock_bh(dev);
4374 +EXPORT_SYMBOL(dev_unicast_delete);
4377 + * dev_unicast_add - add a secondary unicast address
4379 + * @addr: address to add
4380 + * @alen: length of @addr
4382 + * Add a secondary unicast address to the device or increase
4383 + * the reference count if it already exists.
4385 + * The caller must hold the rtnl_mutex.
4387 +int dev_unicast_add(struct net_device *dev, void *addr, int alen)
4393 + netif_addr_lock_bh(dev);
4394 + err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
4396 + __dev_set_rx_mode(dev);
4397 + netif_addr_unlock_bh(dev);
4400 +EXPORT_SYMBOL(dev_unicast_add);
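As an illustration (editorial; the locally administered MAC and example_* names are invented), a layered-device driver holding rtnl_mutex could claim an extra unicast address on its lower device like this:

static const u8 example_extra_mac[ETH_ALEN] = {
	0x02, 0x00, 0x00, 0x00, 0x00, 0x01	/* locally administered */
};

static int example_claim_secondary(struct net_device *lower)
{
	/* refcounted: adding the same address again just bumps da_users */
	return dev_unicast_add(lower, (void *)example_extra_mac, ETH_ALEN);
}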
4402 +int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4403 + struct dev_addr_list **from, int *from_count)
4405 + struct dev_addr_list *da, *next;
4409 + while (da != NULL) {
4411 + if (!da->da_synced) {
4412 + err = __dev_addr_add(to, to_count,
4413 + da->da_addr, da->da_addrlen, 0);
4416 + da->da_synced = 1;
4418 + } else if (da->da_users == 1) {
4419 + __dev_addr_delete(to, to_count,
4420 + da->da_addr, da->da_addrlen, 0);
4421 + __dev_addr_delete(from, from_count,
4422 + da->da_addr, da->da_addrlen, 0);
4429 +void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4430 + struct dev_addr_list **from, int *from_count)
4432 + struct dev_addr_list *da, *next;
4435 + while (da != NULL) {
4437 + if (da->da_synced) {
4438 + __dev_addr_delete(to, to_count,
4439 + da->da_addr, da->da_addrlen, 0);
4440 + da->da_synced = 0;
4441 + __dev_addr_delete(from, from_count,
4442 + da->da_addr, da->da_addrlen, 0);
4449 + * dev_unicast_sync - Synchronize device's unicast list to another device
4450 + * @to: destination device
4451 + * @from: source device
4453 + * Add newly added addresses to the destination device and release
4454 + * addresses that have no users left. The source device must be
4455 + * locked by netif_tx_lock_bh.
4457 + * This function is intended to be called from the dev->set_rx_mode
4458 + * function of layered software devices.
4460 +int dev_unicast_sync(struct net_device *to, struct net_device *from)
4464 + netif_addr_lock_bh(to);
4465 + err = __dev_addr_sync(&to->uc_list, &to->uc_count,
4466 + &from->uc_list, &from->uc_count);
4468 + __dev_set_rx_mode(to);
4469 + netif_addr_unlock_bh(to);
4472 +EXPORT_SYMBOL(dev_unicast_sync);
4475 + * dev_unicast_unsync - Remove synchronized addresses from the destination device
4476 + * @to: destination device
4477 + * @from: source device
4479 + * Remove all addresses that were added to the destination device by
4480 + * dev_unicast_sync(). This function is intended to be called from the
4481 + * dev->stop function of layered software devices.
4483 +void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4485 + netif_addr_lock_bh(from);
4486 + netif_addr_lock(to);
4488 + __dev_addr_unsync(&to->uc_list, &to->uc_count,
4489 + &from->uc_list, &from->uc_count);
4490 + __dev_set_rx_mode(to);
4492 + netif_addr_unlock(to);
4493 + netif_addr_unlock_bh(from);
4495 +EXPORT_SYMBOL(dev_unicast_unsync);
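As an illustration (not from the patch itself): the kdoc above says dev_unicast_sync()/dev_unicast_unsync() are meant for layered software devices, which mirror their secondary unicast list onto the lower device from set_rx_mode and remove it again on stop. A minimal sketch, assuming a hypothetical stacked driver whose private data holds the lower device:

#include <linux/netdevice.h>

struct stacked_priv {
	struct net_device *lowerdev;	/* the real device underneath */
};

static void stacked_set_rx_mode(struct net_device *dev)
{
	struct stacked_priv *priv = netdev_priv(dev);

	/* Push newly added unicast addresses down; drop unused ones. */
	dev_unicast_sync(priv->lowerdev, dev);
}

static int stacked_stop(struct net_device *dev)
{
	struct stacked_priv *priv = netdev_priv(dev);

	/* Remove every address this device synced onto the lower one. */
	dev_unicast_unsync(priv->lowerdev, dev);
	netif_stop_queue(dev);
	return 0;
}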
4497 +static void __dev_addr_discard(struct dev_addr_list **list)
4499 + struct dev_addr_list *tmp;
4501 + while (*list != NULL) {
4503 + *list = tmp->next;
4504 + if (tmp->da_users > tmp->da_gusers)
4505 + printk("__dev_addr_discard: address leakage! "
4506 + "da_users=%d\n", tmp->da_users);
4511 +static void dev_addr_discard(struct net_device *dev)
4513 + netif_addr_lock_bh(dev);
4515 + __dev_addr_discard(&dev->uc_list);
4516 + dev->uc_count = 0;
4518 + __dev_addr_discard(&dev->mc_list);
4519 + dev->mc_count = 0;
4521 + netif_addr_unlock_bh(dev);
4525 + * dev_get_flags - get flags reported to userspace
4528 + * Get the combination of flag bits exported through APIs to userspace.
4530 +unsigned dev_get_flags(const struct net_device *dev)
4534 + flags = (dev->flags & ~(IFF_PROMISC |
4539 + (dev->gflags & (IFF_PROMISC |
4542 + if (netif_running(dev)) {
4543 + if (netif_oper_up(dev))
4544 + flags |= IFF_RUNNING;
4545 + if (netif_carrier_ok(dev))
4546 + flags |= IFF_LOWER_UP;
4547 + if (netif_dormant(dev))
4548 + flags |= IFF_DORMANT;
4555 + * dev_change_flags - change device settings
4557 + * @flags: device state flags
4559 + * Change settings on a device based on the passed state flags. The flags are
4560 + * in the userspace exported format.
4562 +int dev_change_flags(struct net_device *dev, unsigned flags)
4565 + int old_flags = dev->flags;
4570 + * Set the flags on our device.
4573 + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4574 + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4576 + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4580 + * Load in the correct multicast list now the flags have changed.
4583 + if ((old_flags ^ flags) & IFF_MULTICAST)
4584 + dev_change_rx_flags(dev, IFF_MULTICAST);
4586 + dev_set_rx_mode(dev);
4589 + * Have we downed the interface. We handle IFF_UP ourselves
4590 + * according to user attempts to set it, rather than blindly
4595 + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4596 + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4599 + dev_set_rx_mode(dev);
4602 + if (dev->flags & IFF_UP &&
4603 + ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4605 + call_netdevice_notifiers(NETDEV_CHANGE, dev);
4607 + if ((flags ^ dev->gflags) & IFF_PROMISC) {
4608 + int inc = (flags & IFF_PROMISC) ? +1 : -1;
4609 + dev->gflags ^= IFF_PROMISC;
4610 + dev_set_promiscuity(dev, inc);
4613 + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4614 + is important. Some (broken) drivers set IFF_PROMISC, when
4615 +   IFF_ALLMULTI is requested, without asking us and without reporting it.
4617 + if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4618 + int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
4619 + dev->gflags ^= IFF_ALLMULTI;
4620 + dev_set_allmulti(dev, inc);
4623 + /* Exclude state transition flags, already notified */
4624 + changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4626 + rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
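A usage sketch, not part of the patch: because dev_change_flags() maintains the gflags reference counts for IFF_PROMISC/IFF_ALLMULTI seen above, in-kernel callers toggle these bits through it under the RTNL lock rather than writing dev->flags directly, mirroring what the SIOCSIFFLAGS ioctl does for user space. All names are illustrative:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/* Turn on promiscuous mode via the reference-counted flags path. */
static int example_enable_promisc(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_PROMISC);
	rtnl_unlock();
	return err;
}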
4632 + * dev_set_mtu - Change maximum transfer unit
4634 + * @new_mtu: new transfer unit
4636 + * Change the maximum transfer size of the network device.
4638 +int dev_set_mtu(struct net_device *dev, int new_mtu)
4640 + const struct net_device_ops *ops = dev->netdev_ops;
4643 + if (new_mtu == dev->mtu)
4646 + /* MTU must be positive. */
4650 + if (!netif_device_present(dev))
4654 + if (ops->ndo_change_mtu)
4655 + err = ops->ndo_change_mtu(dev, new_mtu);
4657 + dev->mtu = new_mtu;
4659 + if (!err && dev->flags & IFF_UP)
4660 + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
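Similarly (illustrative sketch, same headers as above): dev_set_mtu() is the in-kernel counterpart of SIOCSIFMTU and likewise runs under RTNL, so the NETDEV_CHANGEMTU notification stays serialized with other device changes. The MTU value 9000 is just an example:

static int example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}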
4665 + * dev_set_mac_address - Change Media Access Control Address
4667 + * @sa: new address
4669 + * Change the hardware (MAC) address of the device
4671 +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4673 + const struct net_device_ops *ops = dev->netdev_ops;
4676 + if (!ops->ndo_set_mac_address)
4677 + return -EOPNOTSUPP;
4678 + if (sa->sa_family != dev->type)
4680 + if (!netif_device_present(dev))
4682 + err = ops->ndo_set_mac_address(dev, sa);
4684 + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4689 + * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4691 +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4694 + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4700 + case SIOCGIFFLAGS: /* Get interface flags */
4701 + ifr->ifr_flags = dev_get_flags(dev);
4704 + case SIOCGIFMETRIC: /* Get the metric on the interface
4705 + (currently unused) */
4706 + ifr->ifr_metric = 0;
4709 + case SIOCGIFMTU: /* Get the MTU of a device */
4710 + ifr->ifr_mtu = dev->mtu;
4713 + case SIOCGIFHWADDR:
4714 + if (!dev->addr_len)
4715 + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4717 + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4718 + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4719 + ifr->ifr_hwaddr.sa_family = dev->type;
4722 + case SIOCGIFSLAVE:
4727 + ifr->ifr_map.mem_start = dev->mem_start;
4728 + ifr->ifr_map.mem_end = dev->mem_end;
4729 + ifr->ifr_map.base_addr = dev->base_addr;
4730 + ifr->ifr_map.irq = dev->irq;
4731 + ifr->ifr_map.dma = dev->dma;
4732 + ifr->ifr_map.port = dev->if_port;
4735 + case SIOCGIFINDEX:
4736 + ifr->ifr_ifindex = dev->ifindex;
4739 + case SIOCGIFTXQLEN:
4740 + ifr->ifr_qlen = dev->tx_queue_len;
4744 + /* dev_ioctl() should ensure this case
4745 + * is never reached
4756 + * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4758 +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4761 + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4762 + const struct net_device_ops *ops;
4767 + ops = dev->netdev_ops;
4770 + case SIOCSIFFLAGS: /* Set interface flags */
4771 + return dev_change_flags(dev, ifr->ifr_flags);
4773 + case SIOCSIFMETRIC: /* Set the metric on the interface
4774 + (currently unused) */
4775 + return -EOPNOTSUPP;
4777 + case SIOCSIFMTU: /* Set the MTU of a device */
4778 + return dev_set_mtu(dev, ifr->ifr_mtu);
4780 + case SIOCSIFHWADDR:
4781 + return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4783 + case SIOCSIFHWBROADCAST:
4784 + if (ifr->ifr_hwaddr.sa_family != dev->type)
4786 + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4787 + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4788 + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4792 + if (ops->ndo_set_config) {
4793 + if (!netif_device_present(dev))
4795 + return ops->ndo_set_config(dev, &ifr->ifr_map);
4797 + return -EOPNOTSUPP;
4799 + case SIOCADDMULTI:
4800 + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4801 + ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4803 + if (!netif_device_present(dev))
4805 + return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4806 + dev->addr_len, 1);
4808 + case SIOCDELMULTI:
4809 + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4810 + ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4812 + if (!netif_device_present(dev))
4814 + return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4815 + dev->addr_len, 1);
4817 + case SIOCSIFTXQLEN:
4818 + if (ifr->ifr_qlen < 0)
4820 + dev->tx_queue_len = ifr->ifr_qlen;
4824 + ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4825 + return dev_change_name(dev, ifr->ifr_newname);
4828 + * Unknown or private ioctl
4832 + if ((cmd >= SIOCDEVPRIVATE &&
4833 + cmd <= SIOCDEVPRIVATE + 15) ||
4834 + cmd == SIOCBONDENSLAVE ||
4835 + cmd == SIOCBONDRELEASE ||
4836 + cmd == SIOCBONDSETHWADDR ||
4837 + cmd == SIOCBONDSLAVEINFOQUERY ||
4838 + cmd == SIOCBONDINFOQUERY ||
4839 + cmd == SIOCBONDCHANGEACTIVE ||
4840 + cmd == SIOCGMIIPHY ||
4841 + cmd == SIOCGMIIREG ||
4842 + cmd == SIOCSMIIREG ||
4843 + cmd == SIOCBRADDIF ||
4844 + cmd == SIOCBRDELIF ||
4845 + cmd == SIOCSHWTSTAMP ||
4846 + cmd == SIOCWANDEV) {
4847 + err = -EOPNOTSUPP;
4848 + if (ops->ndo_do_ioctl) {
4849 + if (netif_device_present(dev))
4850 + err = ops->ndo_do_ioctl(dev, ifr, cmd);
4862 + * This function handles all "interface"-type I/O control requests. The actual
4863 + * 'doing' part of this is dev_ifsioc above.
4867 + * dev_ioctl - network device ioctl
4868 + * @net: the applicable net namespace
4869 + * @cmd: command to issue
4870 + * @arg: pointer to a struct ifreq in user space
4872 + * Issue ioctl functions to devices. This is normally called by the
4873 + * user space syscall interfaces but can sometimes be useful for
4874 + * other purposes. The return value is the return from the syscall if
4875 + * positive or a negative errno code on error.
4878 +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4884 + /* One special case: SIOCGIFCONF takes ifconf argument
4885 + and requires shared lock, because it sleeps writing
4889 + if (cmd == SIOCGIFCONF) {
4891 + ret = dev_ifconf(net, (char __user *) arg);
4895 + if (cmd == SIOCGIFNAME)
4896 + return dev_ifname(net, (struct ifreq __user *)arg);
4898 + if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4901 + ifr.ifr_name[IFNAMSIZ-1] = 0;
4903 + colon = strchr(ifr.ifr_name, ':');
4908 + * See which interface the caller is talking about.
4913 + * These ioctl calls:
4914 + * - can be done by all.
4915 + * - atomic and do not require locking.
4916 + * - return a value
4918 + case SIOCGIFFLAGS:
4919 + case SIOCGIFMETRIC:
4921 + case SIOCGIFHWADDR:
4922 + case SIOCGIFSLAVE:
4924 + case SIOCGIFINDEX:
4925 + case SIOCGIFTXQLEN:
4926 + dev_load(net, ifr.ifr_name);
4927 + read_lock(&dev_base_lock);
4928 + ret = dev_ifsioc_locked(net, &ifr, cmd);
4929 + read_unlock(&dev_base_lock);
4933 + if (copy_to_user(arg, &ifr,
4934 + sizeof(struct ifreq)))
4940 + dev_load(net, ifr.ifr_name);
4942 + ret = dev_ethtool(net, &ifr);
4947 + if (copy_to_user(arg, &ifr,
4948 + sizeof(struct ifreq)))
4954 + * These ioctl calls:
4955 + * - require superuser power.
4956 + * - require strict serialization.
4957 + * - return a value
4962 + if (!capable(CAP_NET_ADMIN))
4964 + dev_load(net, ifr.ifr_name);
4966 + ret = dev_ifsioc(net, &ifr, cmd);
4971 + if (copy_to_user(arg, &ifr,
4972 + sizeof(struct ifreq)))
4978 + * These ioctl calls:
4979 + * - require superuser power.
4980 + * - require strict serialization.
4981 + * - do not return a value
4983 + case SIOCSIFFLAGS:
4984 + case SIOCSIFMETRIC:
4987 + case SIOCSIFHWADDR:
4988 + case SIOCSIFSLAVE:
4989 + case SIOCADDMULTI:
4990 + case SIOCDELMULTI:
4991 + case SIOCSIFHWBROADCAST:
4992 + case SIOCSIFTXQLEN:
4994 + case SIOCBONDENSLAVE:
4995 + case SIOCBONDRELEASE:
4996 + case SIOCBONDSETHWADDR:
4997 + case SIOCBONDCHANGEACTIVE:
5000 + case SIOCSHWTSTAMP:
5001 + if (!capable(CAP_NET_ADMIN))
5003 + /* fall through */
5004 + case SIOCBONDSLAVEINFOQUERY:
5005 + case SIOCBONDINFOQUERY:
5006 + dev_load(net, ifr.ifr_name);
5008 + ret = dev_ifsioc(net, &ifr, cmd);
5013 + /* Get the per device memory space. We can add this but
5014 + * currently do not support it */
5016 + /* Set the per device memory buffer space.
5017 + * Not applicable in our case */
5022 + * Unknown or private ioctl.
5025 + if (cmd == SIOCWANDEV ||
5026 + (cmd >= SIOCDEVPRIVATE &&
5027 + cmd <= SIOCDEVPRIVATE + 15)) {
5028 + dev_load(net, ifr.ifr_name);
5030 + ret = dev_ifsioc(net, &ifr, cmd);
5032 + if (!ret && copy_to_user(arg, &ifr,
5033 + sizeof(struct ifreq)))
5037 + /* Take care of Wireless Extensions */
5038 + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5039 + return wext_handle_ioctl(net, &ifr, cmd, arg);
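From user space, the path above is entered through an ordinary socket ioctl; any socket will do. A self-contained sketch exercising two of the SIOCGIFxxx branches handled by dev_ifsioc_locked(); the interface name "eth0" is only an example:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
		printf("mtu: %d\n", ifr.ifr_mtu);
	if (ioctl(fd, SIOCGIFFLAGS, &ifr) == 0)
		printf("up: %s\n", (ifr.ifr_flags & IFF_UP) ? "yes" : "no");

	close(fd);
	return 0;
}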
5046 + * dev_new_index - allocate an ifindex
5047 + * @net: the applicable net namespace
5049 + * Returns a suitable unique value for a new device interface
5050 + * number. The caller must hold the rtnl semaphore or the
5051 + * dev_base_lock to be sure it remains unique.
5053 +static int dev_new_index(struct net *net)
5055 + static int ifindex;
5057 + if (++ifindex <= 0)
5059 + if (!__dev_get_by_index(net, ifindex))
5064 +/* Delayed registration/unregistration */
5065 +static LIST_HEAD(net_todo_list);
5067 +static void net_set_todo(struct net_device *dev)
5069 + list_add_tail(&dev->todo_list, &net_todo_list);
5072 +static void rollback_registered(struct net_device *dev)
5074 + BUG_ON(dev_boot_phase);
5077 + /* Some devices call without registering for initialization unwind. */
5078 + if (dev->reg_state == NETREG_UNINITIALIZED) {
5079 + printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
5080 + "was registered\n", dev->name, dev);
5086 + BUG_ON(dev->reg_state != NETREG_REGISTERED);
5088 + /* If device is running, close it first. */
5091 + /* And unlink it from device chain. */
5092 + unlist_netdevice(dev);
5094 + dev->reg_state = NETREG_UNREGISTERING;
5096 + synchronize_net();
5098 + /* Shutdown queueing discipline. */
5099 + dev_shutdown(dev);
5102 +	/* Notify protocols that we are about to destroy
5103 + this device. They should clean all the things.
5105 + call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5108 + * Flush the unicast and multicast chains
5110 + dev_addr_discard(dev);
5112 + if (dev->netdev_ops->ndo_uninit)
5113 + dev->netdev_ops->ndo_uninit(dev);
5115 + /* Notifier chain MUST detach us from master device. */
5116 + WARN_ON(dev->master);
5118 + /* Remove entries from kobject tree */
5119 + netdev_unregister_kobject(dev);
5121 + synchronize_net();
5126 +static void __netdev_init_queue_locks_one(struct net_device *dev,
5127 + struct netdev_queue *dev_queue,
5130 + spin_lock_init(&dev_queue->_xmit_lock);
5131 + netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
5132 + dev_queue->xmit_lock_owner = -1;
5135 +static void netdev_init_queue_locks(struct net_device *dev)
5137 + netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
5138 + __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
5141 +unsigned long netdev_fix_features(unsigned long features, const char *name)
5143 + /* Fix illegal SG+CSUM combinations. */
5144 + if ((features & NETIF_F_SG) &&
5145 + !(features & NETIF_F_ALL_CSUM)) {
5147 + printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5148 + "checksum feature.\n", name);
5149 + features &= ~NETIF_F_SG;
5152 + /* TSO requires that SG is present as well. */
5153 + if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5155 + printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5156 + "SG feature.\n", name);
5157 + features &= ~NETIF_F_TSO;
5160 + if (features & NETIF_F_UFO) {
5161 + if (!(features & NETIF_F_GEN_CSUM)) {
5163 + printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5164 + "since no NETIF_F_HW_CSUM feature.\n",
5166 + features &= ~NETIF_F_UFO;
5169 + if (!(features & NETIF_F_SG)) {
5171 + printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5172 + "since no NETIF_F_SG feature.\n", name);
5173 + features &= ~NETIF_F_UFO;
5179 +EXPORT_SYMBOL(netdev_fix_features);
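To make the dependency rules above concrete (a hedged sketch, names assumed): a driver requesting TSO without scatter/gather gets TSO masked back off, with a notice in the log, because NETIF_F_SG is a prerequisite for NETIF_F_TSO:

/* Sanitize a feature word before applying it to the device. */
static void example_apply_features(struct net_device *dev)
{
	/* TSO requested, but NETIF_F_SG (its prerequisite) is missing. */
	unsigned long wanted = NETIF_F_TSO | NETIF_F_HW_CSUM;

	/* netdev_fix_features() clears NETIF_F_TSO here and logs a notice. */
	dev->features = netdev_fix_features(wanted, dev->name);
}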
5181 +/* Some devices need to (re-)set their netdev_ops inside
5182 + * ->init() or similar. If that happens, we have to setup
5183 + * the compat pointers again.
5185 +void netdev_resync_ops(struct net_device *dev)
5187 +#ifdef CONFIG_COMPAT_NET_DEV_OPS
5188 + const struct net_device_ops *ops = dev->netdev_ops;
5190 + dev->init = ops->ndo_init;
5191 + dev->uninit = ops->ndo_uninit;
5192 + dev->open = ops->ndo_open;
5193 + dev->change_rx_flags = ops->ndo_change_rx_flags;
5194 + dev->set_rx_mode = ops->ndo_set_rx_mode;
5195 + dev->set_multicast_list = ops->ndo_set_multicast_list;
5196 + dev->set_mac_address = ops->ndo_set_mac_address;
5197 + dev->validate_addr = ops->ndo_validate_addr;
5198 + dev->do_ioctl = ops->ndo_do_ioctl;
5199 + dev->set_config = ops->ndo_set_config;
5200 + dev->change_mtu = ops->ndo_change_mtu;
5201 + dev->neigh_setup = ops->ndo_neigh_setup;
5202 + dev->tx_timeout = ops->ndo_tx_timeout;
5203 + dev->get_stats = ops->ndo_get_stats;
5204 + dev->vlan_rx_register = ops->ndo_vlan_rx_register;
5205 + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
5206 + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
5207 +#ifdef CONFIG_NET_POLL_CONTROLLER
5208 + dev->poll_controller = ops->ndo_poll_controller;
5212 +EXPORT_SYMBOL(netdev_resync_ops);
5215 + * register_netdevice - register a network device
5216 + * @dev: device to register
5218 + * Take a completed network device structure and add it to the kernel
5219 + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5220 + * chain. 0 is returned on success. A negative errno code is returned
5221 + * on a failure to set up the device, or if the name is a duplicate.
5223 + * Callers must hold the rtnl semaphore. You may want
5224 + * register_netdev() instead of this.
5227 + * The locking appears insufficient to guarantee two parallel registers
5228 + * will not get the same name.
5231 +int register_netdevice(struct net_device *dev)
5233 + struct hlist_head *head;
5234 + struct hlist_node *p;
5236 + struct net *net = dev_net(dev);
5238 + BUG_ON(dev_boot_phase);
5243 + /* When net_device's are persistent, this will be fatal. */
5244 + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5247 + spin_lock_init(&dev->addr_list_lock);
5248 + netdev_set_addr_lockdep_class(dev);
5249 + netdev_init_queue_locks(dev);
5253 +#ifdef CONFIG_COMPAT_NET_DEV_OPS
5254 + /* Netdevice_ops API compatibility support.
5255 + * This is temporary until all network devices are converted.
5257 + if (dev->netdev_ops) {
5258 + netdev_resync_ops(dev);
5260 + char drivername[64];
5261 + pr_info("%s (%s): not using net_device_ops yet\n",
5262 + dev->name, netdev_drivername(dev, drivername, 64));
5264 + /* This works only because net_device_ops and the
5265 + compatibility structure are the same. */
5266 + dev->netdev_ops = (void *) &(dev->init);
5270 + /* Init, if this function is available */
5271 + if (dev->netdev_ops->ndo_init) {
5272 + ret = dev->netdev_ops->ndo_init(dev);
5280 + if (!dev_valid_name(dev->name)) {
5285 + dev->ifindex = dev_new_index(net);
5286 + if (dev->iflink == -1)
5287 + dev->iflink = dev->ifindex;
5289 + /* Check for existence of name */
5290 + head = dev_name_hash(net, dev->name);
5291 + hlist_for_each(p, head) {
5292 + struct net_device *d
5293 + = hlist_entry(p, struct net_device, name_hlist);
5294 + if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
5300 + /* Fix illegal checksum combinations */
5301 + if ((dev->features & NETIF_F_HW_CSUM) &&
5302 + (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5303 + printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5305 + dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5308 + if ((dev->features & NETIF_F_NO_CSUM) &&
5309 + (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5310 + printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5312 + dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5315 + dev->features = netdev_fix_features(dev->features, dev->name);
5317 + /* Enable software GSO if SG is supported. */
5318 + if (dev->features & NETIF_F_SG)
5319 + dev->features |= NETIF_F_GSO;
5321 + netdev_initialize_kobject(dev);
5322 + ret = netdev_register_kobject(dev);
5325 + dev->reg_state = NETREG_REGISTERED;
5328 + * Default initial state at registry is that the
5329 + * device is present.
5332 + set_bit(__LINK_STATE_PRESENT, &dev->state);
5334 + dev_init_scheduler(dev);
5336 + list_netdevice(dev);
5338 +	/* Notify protocols that a new device appeared. */
5339 + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5340 + ret = notifier_to_errno(ret);
5342 + rollback_registered(dev);
5343 + dev->reg_state = NETREG_UNREGISTERED;
5350 + if (dev->netdev_ops->ndo_uninit)
5351 + dev->netdev_ops->ndo_uninit(dev);
5356 + * init_dummy_netdev - init a dummy network device for NAPI
5357 + * @dev: device to init
5359 + * This takes a network device structure and initializes the minimum
5360 + * number of fields so it can be used to schedule NAPI polls without
5361 + * registering a full blown interface. This is to be used by drivers
5362 + * that need to tie several hardware interfaces to a single NAPI
5363 + * poll scheduler due to HW limitations.
5365 +int init_dummy_netdev(struct net_device *dev)
5367 + /* Clear everything. Note we don't initialize spinlocks
5368 +	 * as they aren't supposed to be taken by any of the
5369 + * NAPI code and this dummy netdev is supposed to be
5370 + * only ever used for NAPI polls
5372 + memset(dev, 0, sizeof(struct net_device));
5374 + /* make sure we BUG if trying to hit standard
5375 + * register/unregister code path
5377 + dev->reg_state = NETREG_DUMMY;
5379 + /* initialize the ref count */
5380 + atomic_set(&dev->refcnt, 1);
5382 + /* NAPI wants this */
5383 + INIT_LIST_HEAD(&dev->napi_list);
5385 + /* a dummy interface is started by default */
5386 + set_bit(__LINK_STATE_PRESENT, &dev->state);
5387 + set_bit(__LINK_STATE_START, &dev->state);
5391 +EXPORT_SYMBOL_GPL(init_dummy_netdev);
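A typical consumer of init_dummy_netdev(), sketched under assumptions (the names and the weight of 64 are illustrative): a driver whose hardware exposes several channels behind one registered netdev attaches its extra NAPI contexts to a never-registered dummy device:

#include <linux/netdevice.h>

static struct net_device backing_dev;	/* NAPI anchor only, never registered */
static struct napi_struct chan_napi;

static int example_poll(struct napi_struct *napi, int budget)
{
	int done = 0;
	/* ... process up to @budget packets, counting them in done ... */
	if (done < budget)
		napi_complete(napi);
	return done;
}

static void example_setup_napi(void)
{
	init_dummy_netdev(&backing_dev);
	netif_napi_add(&backing_dev, &chan_napi, example_poll, 64);
	napi_enable(&chan_napi);
}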
5395 + * register_netdev - register a network device
5396 + * @dev: device to register
5398 + * Take a completed network device structure and add it to the kernel
5399 + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5400 + * chain. 0 is returned on success. A negative errno code is returned
5401 + * on a failure to set up the device, or if the name is a duplicate.
5403 + * This is a wrapper around register_netdevice that takes the rtnl semaphore
5404 + * and expands the device name if you passed a format string to
5407 +int register_netdev(struct net_device *dev)
5414 + * If the name is a format string the caller wants us to do a
5415 + * name allocation.
5417 + if (strchr(dev->name, '%')) {
5418 + err = dev_alloc_name(dev, dev->name);
5423 + err = register_netdevice(dev);
5428 +EXPORT_SYMBOL(register_netdev);
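A minimal driver lifecycle sketch around these calls, assuming the usual alloc_netdev() single-queue wrapper for alloc_netdev_mq() and the standard ether_setup() initializer; every other name is illustrative:

#include <linux/etherdevice.h>
#include <linux/module.h>

static struct net_device *example_dev;

static int __init example_init(void)
{
	int err;

	/* "%d" in the name asks the core to pick a free unit number. */
	example_dev = alloc_netdev(0, "example%d", ether_setup);
	if (!example_dev)
		return -ENOMEM;

	err = register_netdev(example_dev);	/* takes the RTNL itself */
	if (err)
		free_netdev(example_dev);
	return err;
}

static void __exit example_exit(void)
{
	unregister_netdev(example_dev);	/* takes the RTNL itself */
	free_netdev(example_dev);	/* releases the last reference */
}

module_init(example_init);
module_exit(example_exit);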
5431 + * netdev_wait_allrefs - wait until all references are gone.
5433 + * This is called when unregistering network devices.
5435 + * Any protocol or device that holds a reference should register
5436 + * for netdevice notification, and cleanup and put back the
5437 + * reference if they receive an UNREGISTER event.
5438 + * We can get stuck here if buggy protocols don't correctly
5441 +static void netdev_wait_allrefs(struct net_device *dev)
5443 + unsigned long rebroadcast_time, warning_time;
5445 + rebroadcast_time = warning_time = jiffies;
5446 + while (atomic_read(&dev->refcnt) != 0) {
5447 + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5450 + /* Rebroadcast unregister notification */
5451 + call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5453 + if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5455 + /* We must not have linkwatch events
5456 + * pending on unregister. If this
5457 + * happens, we simply run the queue
5458 + * unscheduled, resulting in a noop
5459 + * for this device.
5461 + linkwatch_run_queue();
5466 + rebroadcast_time = jiffies;
5471 + if (time_after(jiffies, warning_time + 10 * HZ)) {
5472 + printk(KERN_EMERG "unregister_netdevice: "
5473 + "waiting for %s to become free. Usage "
5475 + dev->name, atomic_read(&dev->refcnt));
5476 + warning_time = jiffies;
5481 +/* The sequence is:
5485 + * register_netdevice(x1);
5486 + * register_netdevice(x2);
5488 + * unregister_netdevice(y1);
5489 + * unregister_netdevice(y2);
5492 + * free_netdev(y1);
5493 + * free_netdev(y2);
5495 + * We are invoked by rtnl_unlock().
5496 + * This allows us to deal with problems:
5497 + * 1) We can delete sysfs objects which invoke hotplug
5498 + * without deadlocking with linkwatch via keventd.
5499 + * 2) Since we run with the RTNL semaphore not held, we can sleep
5500 + * safely in order to wait for the netdev refcnt to drop to zero.
5502 + * We must not return until all unregister events added during
5503 + * the interval the lock was held have been completed.
5505 +void netdev_run_todo(void)
5507 + struct list_head list;
5509 + /* Snapshot list, allow later requests */
5510 + list_replace_init(&net_todo_list, &list);
5514 + while (!list_empty(&list)) {
5515 + struct net_device *dev
5516 + = list_entry(list.next, struct net_device, todo_list);
5517 + list_del(&dev->todo_list);
5519 + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5520 + printk(KERN_ERR "network todo '%s' but state %d\n",
5521 + dev->name, dev->reg_state);
5526 + dev->reg_state = NETREG_UNREGISTERED;
5528 + on_each_cpu(flush_backlog, dev, 1);
5530 + netdev_wait_allrefs(dev);
5533 + BUG_ON(atomic_read(&dev->refcnt));
5534 + WARN_ON(dev->ip_ptr);
5535 + WARN_ON(dev->ip6_ptr);
5536 + WARN_ON(dev->dn_ptr);
5538 + if (dev->destructor)
5539 + dev->destructor(dev);
5541 + /* Free network device */
5542 + kobject_put(&dev->dev.kobj);
5547 + * dev_get_stats - get network device statistics
5548 + * @dev: device to get statistics from
5550 + * Get network statistics from device. The device driver may provide
5551 + * its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
5552 + * the internal statistics structure is used.
5554 +const struct net_device_stats *dev_get_stats(struct net_device *dev)
5556 + const struct net_device_ops *ops = dev->netdev_ops;
5558 + if (ops->ndo_get_stats)
5559 + return ops->ndo_get_stats(dev);
5561 + return &dev->stats;
5563 +EXPORT_SYMBOL(dev_get_stats);
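A hedged sketch of the two options above: a driver that keeps counters in private state supplies ndo_get_stats, while drivers content with dev->stats simply update it in their fast paths and provide nothing. Assumed names throughout:

struct example_priv {
	struct net_device_stats stats;	/* updated from the driver's RX/TX paths */
};

static struct net_device_stats *example_get_stats(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	return &priv->stats;	/* dev_get_stats() hands this back */
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_get_stats	= example_get_stats,
};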
5565 +static void netdev_init_one_queue(struct net_device *dev,
5566 + struct netdev_queue *queue,
5572 +static void netdev_init_queues(struct net_device *dev)
5574 + netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5575 + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5576 + spin_lock_init(&dev->tx_global_lock);
5580 + * alloc_netdev_mq - allocate network device
5581 + * @sizeof_priv: size of private data to allocate space for
5582 + * @name: device name format string
5583 + * @setup: callback to initialize device
5584 + * @queue_count: the number of subqueues to allocate
5586 + * Allocates a struct net_device with private data area for driver use
5587 + * and performs basic initialization. Also allocates subqueue structs
5588 + * for each queue on the device at the end of the netdevice.
5590 +struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5591 + void (*setup)(struct net_device *), unsigned int queue_count)
5593 + struct netdev_queue *tx;
5594 + struct net_device *dev;
5595 + size_t alloc_size;
5598 + BUG_ON(strlen(name) >= sizeof(dev->name));
5600 + alloc_size = sizeof(struct net_device);
5601 + if (sizeof_priv) {
5602 + /* ensure 32-byte alignment of private area */
5603 + alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
5604 + alloc_size += sizeof_priv;
5606 + /* ensure 32-byte alignment of whole construct */
5607 + alloc_size += NETDEV_ALIGN_CONST;
5609 + p = kzalloc(alloc_size, GFP_KERNEL);
5611 + printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5615 + tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5617 + printk(KERN_ERR "alloc_netdev: Unable to allocate "
5623 + dev = (struct net_device *)
5624 + (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
5625 + dev->padded = (char *)dev - (char *)p;
5626 + dev_net_set(dev, &init_net);
5629 + dev->num_tx_queues = queue_count;
5630 + dev->real_num_tx_queues = queue_count;
5632 + dev->gso_max_size = GSO_MAX_SIZE;
5634 + netdev_init_queues(dev);
5636 + INIT_LIST_HEAD(&dev->napi_list);
5638 + strcpy(dev->name, name);
5641 +EXPORT_SYMBOL(alloc_netdev_mq);
5644 + * free_netdev - free network device
5647 + * This function does the last stage of destroying an allocated device
5648 + * interface. The reference to the device object is released.
5649 + * If this is the last reference then it will be freed.
5651 +void free_netdev(struct net_device *dev)
5653 + struct napi_struct *p, *n;
5655 + release_net(dev_net(dev));
5659 + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5660 + netif_napi_del(p);
5662 + /* Compatibility with error handling in drivers */
5663 + if (dev->reg_state == NETREG_UNINITIALIZED) {
5664 + kfree((char *)dev - dev->padded);
5668 + BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5669 + dev->reg_state = NETREG_RELEASED;
5671 + /* will free via device release */
5672 + put_device(&dev->dev);
5676 + * synchronize_net - Synchronize with packet receive processing
5678 + * Wait for packets currently being received to be done.
5679 + * Does not block later packets from starting.
5681 +void synchronize_net(void)
5684 + synchronize_rcu();
5688 + * unregister_netdevice - remove device from the kernel
5691 + * This function shuts down a device interface and removes it
5692 + * from the kernel tables.
5694 + * Callers must hold the rtnl semaphore. You may want
5695 + * unregister_netdev() instead of this.
5698 +void unregister_netdevice(struct net_device *dev)
5702 + rollback_registered(dev);
5703 + /* Finish processing unregister after unlock */
5704 + net_set_todo(dev);
5708 + * unregister_netdev - remove device from the kernel
5711 + * This function shuts down a device interface and removes it
5712 + * from the kernel tables.
5714 + * This is just a wrapper for unregister_netdevice that takes
5715 + * the rtnl semaphore. In general you want to use this and not
5716 + * unregister_netdevice.
5718 +void unregister_netdev(struct net_device *dev)
5721 + unregister_netdevice(dev);
5725 +EXPORT_SYMBOL(unregister_netdev);
5728 + * dev_change_net_namespace - move device to a different network namespace
5730 + * @net: network namespace
5731 + * @pat: If not NULL name pattern to try if the current device name
5732 + * is already taken in the destination network namespace.
5734 + * This function shuts down a device interface and moves it
5735 + * to a new network namespace. On success 0 is returned, on
5736 + * a failure a negative errno code is returned.
5738 + * Callers must hold the rtnl semaphore.
5741 +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5743 + char buf[IFNAMSIZ];
5744 + const char *destname;
5749 + /* Don't allow namespace local devices to be moved. */
5751 + if (dev->features & NETIF_F_NETNS_LOCAL)
5754 +#ifdef CONFIG_SYSFS
5755 + /* Don't allow real devices to be moved when sysfs
5759 + if (dev->dev.parent)
5763 +	/* Ensure the device has been registered */
5765 + if (dev->reg_state != NETREG_REGISTERED)
5768 +	/* Get out if there is nothing to do */
5770 + if (net_eq(dev_net(dev), net))
5773 + /* Pick the destination device name, and ensure
5774 + * we can use it in the destination network namespace.
5777 + destname = dev->name;
5778 + if (__dev_get_by_name(net, destname)) {
5779 + /* We get here if we can't use the current device name */
5782 + if (!dev_valid_name(pat))
5784 + if (strchr(pat, '%')) {
5785 + if (__dev_alloc_name(net, pat, buf) < 0)
5790 + if (__dev_get_by_name(net, destname))
5795 +	 * And now a mini version of register_netdevice and unregister_netdevice.
5798 + /* If device is running close it first. */
5801 + /* And unlink it from device chain */
5803 + unlist_netdevice(dev);
5805 + synchronize_net();
5807 + /* Shutdown queueing discipline. */
5808 + dev_shutdown(dev);
5810 +	/* Notify protocols that we are about to destroy
5811 + this device. They should clean all the things.
5813 + call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5816 + * Flush the unicast and multicast chains
5818 + dev_addr_discard(dev);
5820 + netdev_unregister_kobject(dev);
5822 + /* Actually switch the network namespace */
5823 + dev_net_set(dev, net);
5825 + /* Assign the new device name */
5826 + if (destname != dev->name)
5827 + strcpy(dev->name, destname);
5829 + /* If there is an ifindex conflict assign a new one */
5830 + if (__dev_get_by_index(net, dev->ifindex)) {
5831 + int iflink = (dev->iflink == dev->ifindex);
5832 + dev->ifindex = dev_new_index(net);
5834 + dev->iflink = dev->ifindex;
5837 + /* Fixup kobjects */
5838 + err = netdev_register_kobject(dev);
5841 + /* Add the device back in the hashes */
5842 + list_netdevice(dev);
5844 +	/* Notify protocols that a new device appeared. */
5845 + call_netdevice_notifiers(NETDEV_REGISTER, dev);
5847 + synchronize_net();
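A usage sketch (not from the patch): a caller moving a device between namespaces does so under RTNL and usually supplies a rename pattern so a name collision in the target namespace does not abort the move. target_net is an assumed, already-valid namespace reference:

static int example_move_dev(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	/* "eth%d" lets the kernel pick a fresh name on collision;
	 * NULL would make the move fail instead. */
	err = dev_change_net_namespace(dev, target_net, "eth%d");
	rtnl_unlock();
	return err;
}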
5853 +static int dev_cpu_callback(struct notifier_block *nfb,
5854 + unsigned long action,
5857 + struct sk_buff **list_skb;
5858 + struct Qdisc **list_net;
5859 + struct sk_buff *skb;
5860 + unsigned int cpu, oldcpu = (unsigned long)ocpu;
5861 + struct softnet_data *sd, *oldsd;
5863 + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5866 + local_irq_disable();
5867 + cpu = smp_processor_id();
5868 + sd = &per_cpu(softnet_data, cpu);
5869 + oldsd = &per_cpu(softnet_data, oldcpu);
5871 + /* Find end of our completion_queue. */
5872 + list_skb = &sd->completion_queue;
5874 + list_skb = &(*list_skb)->next;
5875 + /* Append completion queue from offline CPU. */
5876 + *list_skb = oldsd->completion_queue;
5877 + oldsd->completion_queue = NULL;
5879 + /* Find end of our output_queue. */
5880 + list_net = &sd->output_queue;
5882 + list_net = &(*list_net)->next_sched;
5883 + /* Append output queue from offline CPU. */
5884 + *list_net = oldsd->output_queue;
5885 + oldsd->output_queue = NULL;
5887 + raise_softirq_irqoff(NET_TX_SOFTIRQ);
5888 + local_irq_enable();
5890 + /* Process offline CPU's input_pkt_queue */
5891 + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5899 + * netdev_increment_features - increment feature set by one
5900 + * @all: current feature set
5901 + * @one: new feature set
5902 + * @mask: mask feature set
5904 + * Computes a new feature set after adding a device with feature set
5905 + * @one to the master device with current feature set @all. Will not
5906 + * enable anything that is off in @mask. Returns the new feature set.
5908 +unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5909 + unsigned long mask)
5911 + /* If device needs checksumming, downgrade to it. */
5912 + if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5913 + all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5914 + else if (mask & NETIF_F_ALL_CSUM) {
5915 + /* If one device supports v4/v6 checksumming, set for all. */
5916 + if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5917 + !(all & NETIF_F_GEN_CSUM)) {
5918 + all &= ~NETIF_F_ALL_CSUM;
5919 + all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5922 + /* If one device supports hw checksumming, set for all. */
5923 + if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5924 + all &= ~NETIF_F_ALL_CSUM;
5925 + all |= NETIF_F_HW_CSUM;
5929 + one |= NETIF_F_ALL_CSUM;
5931 + one |= all & NETIF_F_ONE_FOR_ALL;
5932 + all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5933 + all |= one & mask & NETIF_F_ONE_FOR_ALL;
5937 +EXPORT_SYMBOL(netdev_increment_features);
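A sketch of the intended caller pattern, with assumed names: a master device (bridge/bonding style) folds each lower device's features into its own advertised set, then sanitizes the result:

/* Recompute a master's features from its slaves. */
static void example_compute_features(struct net_device *master,
				     struct net_device **slaves, int nslaves)
{
	unsigned long features = master->features;
	int i;

	for (i = 0; i < nslaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     NETIF_F_ONE_FOR_ALL);

	master->features = netdev_fix_features(features, master->name);
}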
5939 +static struct hlist_head *netdev_create_hash(void)
5942 + struct hlist_head *hash;
5944 + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5946 + for (i = 0; i < NETDEV_HASHENTRIES; i++)
5947 + INIT_HLIST_HEAD(&hash[i]);
5952 +/* Initialize per network namespace state */
5953 +static int __net_init netdev_init(struct net *net)
5955 + INIT_LIST_HEAD(&net->dev_base_head);
5957 + net->dev_name_head = netdev_create_hash();
5958 + if (net->dev_name_head == NULL)
5961 + net->dev_index_head = netdev_create_hash();
5962 + if (net->dev_index_head == NULL)
5968 + kfree(net->dev_name_head);
5974 + * netdev_drivername - network driver for the device
5975 + * @dev: network device
5976 + * @buffer: buffer for resulting name
5977 + * @len: size of buffer
5979 + * Determine network driver for device.
5981 +char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5983 + const struct device_driver *driver;
5984 + const struct device *parent;
5986 + if (len <= 0 || !buffer)
5990 + parent = dev->dev.parent;
5995 + driver = parent->driver;
5996 + if (driver && driver->name)
5997 + strlcpy(buffer, driver->name, len);
6001 +static void __net_exit netdev_exit(struct net *net)
6003 + kfree(net->dev_name_head);
6004 + kfree(net->dev_index_head);
6007 +static struct pernet_operations __net_initdata netdev_net_ops = {
6008 + .init = netdev_init,
6009 + .exit = netdev_exit,
6012 +static void __net_exit default_device_exit(struct net *net)
6014 + struct net_device *dev;
6016 +	 * Push all migratable network devices back to the
6017 + * initial network namespace
6021 + for_each_netdev(net, dev) {
6023 + char fb_name[IFNAMSIZ];
6025 + /* Ignore unmoveable devices (i.e. loopback) */
6026 + if (dev->features & NETIF_F_NETNS_LOCAL)
6029 + /* Delete virtual devices */
6030 + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
6031 + dev->rtnl_link_ops->dellink(dev);
6035 +		/* Push remaining network devices to init_net */
6036 + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6037 + err = dev_change_net_namespace(dev, &init_net, fb_name);
6039 + printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6040 + __func__, dev->name, err);
6048 +static struct pernet_operations __net_initdata default_device_ops = {
6049 + .exit = default_device_exit,
6053 + * Initialize the DEV module. At boot time this walks the device list and
6054 + * unhooks any devices that fail to initialise (normally hardware not
6055 + * present) and leaves us with a valid list of present and active devices.
6060 + * This is called single threaded during boot, so no need
6061 + * to take the rtnl semaphore.
6063 +static int __init net_dev_init(void)
6065 + int i, rc = -ENOMEM;
6067 + BUG_ON(!dev_boot_phase);
6069 + if (dev_proc_init())
6072 + if (netdev_kobject_init())
6075 + INIT_LIST_HEAD(&ptype_all);
6076 + for (i = 0; i < PTYPE_HASH_SIZE; i++)
6077 + INIT_LIST_HEAD(&ptype_base[i]);
6079 + if (register_pernet_subsys(&netdev_net_ops))
6083 + * Initialise the packet receive queues.
6086 + for_each_possible_cpu(i) {
6087 + struct softnet_data *queue;
6089 + queue = &per_cpu(softnet_data, i);
6090 + skb_queue_head_init(&queue->input_pkt_queue);
6091 + queue->completion_queue = NULL;
6092 + INIT_LIST_HEAD(&queue->poll_list);
6094 + queue->backlog.poll = process_backlog;
6095 + queue->backlog.weight = weight_p;
6096 + queue->backlog.gro_list = NULL;
6097 + queue->backlog.gro_count = 0;
6100 + dev_boot_phase = 0;
6102 +	/* The loopback device is special: if any other network device
6103 +	 * is present in a network namespace, the loopback device must
6104 +	 * be present too. Since we now dynamically allocate and free the
6105 +	 * loopback device, ensure this invariant is maintained by
6106 +	 * keeping the loopback device as the first device on the
6107 +	 * list of network devices, so that it is the first device
6108 +	 * that appears and the last network device
6109 +	 * that disappears.
6111 + if (register_pernet_device(&loopback_net_ops))
6114 + if (register_pernet_device(&default_device_ops))
6117 + open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6118 + open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6120 + hotcpu_notifier(dev_cpu_callback, 0);
6128 +subsys_initcall(net_dev_init);
6130 +static int __init initialize_hashrnd(void)
6132 + get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
6136 +late_initcall_sync(initialize_hashrnd);
6138 +EXPORT_SYMBOL(__dev_get_by_index);
6139 +EXPORT_SYMBOL(__dev_get_by_name);
6140 +EXPORT_SYMBOL(__dev_remove_pack);
6141 +EXPORT_SYMBOL(dev_valid_name);
6142 +EXPORT_SYMBOL(dev_add_pack);
6143 +EXPORT_SYMBOL(dev_alloc_name);
6144 +EXPORT_SYMBOL(dev_close);
6145 +EXPORT_SYMBOL(dev_get_by_flags);
6146 +EXPORT_SYMBOL(dev_get_by_index);
6147 +EXPORT_SYMBOL(dev_get_by_name);
6148 +EXPORT_SYMBOL(dev_open);
6149 +EXPORT_SYMBOL(dev_queue_xmit);
6150 +EXPORT_SYMBOL(dev_remove_pack);
6151 +EXPORT_SYMBOL(dev_set_allmulti);
6152 +EXPORT_SYMBOL(dev_set_promiscuity);
6153 +EXPORT_SYMBOL(dev_change_flags);
6154 +EXPORT_SYMBOL(dev_set_mtu);
6155 +EXPORT_SYMBOL(dev_set_mac_address);
6156 +EXPORT_SYMBOL(free_netdev);
6157 +EXPORT_SYMBOL(netdev_boot_setup_check);
6158 +EXPORT_SYMBOL(netdev_set_master);
6159 +EXPORT_SYMBOL(netdev_state_change);
6160 +EXPORT_SYMBOL(netif_receive_skb);
6161 +EXPORT_SYMBOL(netif_rx);
6162 +EXPORT_SYMBOL(register_gifconf);
6163 +EXPORT_SYMBOL(register_netdevice);
6164 +EXPORT_SYMBOL(register_netdevice_notifier);
6165 +EXPORT_SYMBOL(skb_checksum_help);
6166 +EXPORT_SYMBOL(synchronize_net);
6167 +EXPORT_SYMBOL(unregister_netdevice);
6168 +EXPORT_SYMBOL(unregister_netdevice_notifier);
6169 +EXPORT_SYMBOL(net_enable_timestamp);
6170 +EXPORT_SYMBOL(net_disable_timestamp);
6171 +EXPORT_SYMBOL(dev_get_flags);
6173 +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
6174 +EXPORT_SYMBOL(br_handle_frame_hook);
6175 +EXPORT_SYMBOL(br_fdb_get_hook);
6176 +EXPORT_SYMBOL(br_fdb_put_hook);
6179 +EXPORT_SYMBOL(dev_load);
6181 +EXPORT_PER_CPU_SYMBOL(softnet_data);
6182 diff --unified --recursive --new-file linux-2.6.30/net/Kconfig linux-2.6.30-1-686-smp-PF_RING/net/Kconfig
6183 --- linux-2.6.30/net/Kconfig 2009-06-10 05:05:27.000000000 +0200
6184 +++ linux-2.6.30-1-686-smp-PF_RING/net/Kconfig 2009-07-21 04:40:31.395512101 +0200
6186 source "net/xfrm/Kconfig"
6187 source "net/iucv/Kconfig"
6189 +source "net/ring/Kconfig"
6191 bool "TCP/IP networking"
6193 diff --unified --recursive --new-file linux-2.6.30/net/Makefile linux-2.6.30-1-686-smp-PF_RING/net/Makefile
6194 --- linux-2.6.30/net/Makefile 2009-06-10 05:05:27.000000000 +0200
6195 +++ linux-2.6.30-1-686-smp-PF_RING/net/Makefile 2009-07-21 04:40:31.378997724 +0200
6197 obj-$(CONFIG_PHONET) += phonet/
6198 ifneq ($(CONFIG_VLAN_8021Q),)
6200 +obj-$(CONFIG_RING) += ring/
6202 obj-$(CONFIG_IP_DCCP) += dccp/
6203 obj-$(CONFIG_IP_SCTP) += sctp/
6204 diff --unified --recursive --new-file linux-2.6.30/net/Makefile.ORG linux-2.6.30-1-686-smp-PF_RING/net/Makefile.ORG
6205 --- linux-2.6.30/net/Makefile.ORG 1970-01-01 01:00:00.000000000 +0100
6206 +++ linux-2.6.30-1-686-smp-PF_RING/net/Makefile.ORG 2009-07-21 04:40:31.369103612 +0200
6209 +# Makefile for the linux networking.
6211 +# 2 Sep 2000, Christoph Hellwig <hch@infradead.org>
6212 +# Rewritten to use lists instead of if-statements.
6217 +obj-$(CONFIG_NET) := socket.o core/
6219 +tmp-$(CONFIG_COMPAT) := compat.o
6220 +obj-$(CONFIG_NET) += $(tmp-y)
6222 +# LLC has to be linked before the files in net/802/
6223 +obj-$(CONFIG_LLC) += llc/
6224 +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
6225 +obj-$(CONFIG_NETFILTER) += netfilter/
6226 +obj-$(CONFIG_INET) += ipv4/
6227 +obj-$(CONFIG_XFRM) += xfrm/
6228 +obj-$(CONFIG_UNIX) += unix/
6229 +ifneq ($(CONFIG_IPV6),)
6232 +obj-$(CONFIG_PACKET) += packet/
6233 +obj-$(CONFIG_NET_KEY) += key/
6234 +obj-$(CONFIG_NET_SCHED) += sched/
6235 +obj-$(CONFIG_BRIDGE) += bridge/
6236 +obj-$(CONFIG_NET_DSA) += dsa/
6237 +obj-$(CONFIG_IPX) += ipx/
6238 +obj-$(CONFIG_ATALK) += appletalk/
6239 +obj-$(CONFIG_WAN_ROUTER) += wanrouter/
6240 +obj-$(CONFIG_X25) += x25/
6241 +obj-$(CONFIG_LAPB) += lapb/
6242 +obj-$(CONFIG_NETROM) += netrom/
6243 +obj-$(CONFIG_ROSE) += rose/
6244 +obj-$(CONFIG_AX25) += ax25/
6245 +obj-$(CONFIG_CAN) += can/
6246 +obj-$(CONFIG_IRDA) += irda/
6247 +obj-$(CONFIG_BT) += bluetooth/
6248 +obj-$(CONFIG_SUNRPC) += sunrpc/
6249 +obj-$(CONFIG_AF_RXRPC) += rxrpc/
6250 +obj-$(CONFIG_ATM) += atm/
6251 +obj-$(CONFIG_DECNET) += decnet/
6252 +obj-$(CONFIG_ECONET) += econet/
6253 +obj-$(CONFIG_PHONET) += phonet/
6254 +ifneq ($(CONFIG_VLAN_8021Q),)
6257 +obj-$(CONFIG_IP_DCCP) += dccp/
6258 +obj-$(CONFIG_IP_SCTP) += sctp/
6259 +obj-$(CONFIG_RDS) += rds/
6261 +obj-$(CONFIG_MAC80211) += mac80211/
6262 +obj-$(CONFIG_TIPC) += tipc/
6263 +obj-$(CONFIG_NETLABEL) += netlabel/
6264 +obj-$(CONFIG_IUCV) += iucv/
6265 +obj-$(CONFIG_RFKILL) += rfkill/
6266 +obj-$(CONFIG_NET_9P) += 9p/
6267 +ifneq ($(CONFIG_DCB),)
6271 +ifeq ($(CONFIG_NET),y)
6272 +obj-$(CONFIG_SYSCTL) += sysctl_net.o
6274 +obj-$(CONFIG_WIMAX) += wimax/
6275 diff --unified --recursive --new-file linux-2.6.30/net/ring/Kconfig linux-2.6.30-1-686-smp-PF_RING/net/ring/Kconfig
6276 --- linux-2.6.30/net/ring/Kconfig 1970-01-01 01:00:00.000000000 +0100
6277 +++ linux-2.6.30-1-686-smp-PF_RING/net/ring/Kconfig 2009-07-21 04:40:31.399104158 +0200
6280 + tristate "PF_RING sockets (EXPERIMENTAL)"
6281 + depends on EXPERIMENTAL
6283 + PF_RING socket family, optimized for packet capture.
6284 + If a PF_RING socket is bound to an adapter (via the bind() system
6285 +	  call), that adapter will be used in read-only mode until the socket
6286 +	  is destroyed. Whenever an incoming packet is received from the adapter,
6287 +	  it is not passed to the upper layers; instead it is copied to a ring
6288 +	  buffer, which in turn is exported to user space applications via mmap.
6289 + Please refer to http://luca.ntop.org/Ring.pdf for more.
6291 + Say N unless you know what you are doing.
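Purely to illustrate the help text above (not part of the patch): a raw PF_RING reader follows the socket/bind/mmap sequence described there. The numeric PF_RING constant and the ring layout come from the patched headers, so everything below is a hedged sketch; a real reader would learn the full ring size from the header in the first mapped page and remap accordingly:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

#ifndef PF_RING
#define PF_RING 27	/* assumption: use the value from the patched headers */
#endif

int main(void)
{
	struct sockaddr sa;
	void *ring;
	int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0)
		return 1;

	/* Bind the ring to one adapter; the device name goes in sa_data. */
	memset(&sa, 0, sizeof(sa));
	sa.sa_family = PF_RING;
	strncpy(sa.sa_data, "eth0", sizeof(sa.sa_data) - 1);
	if (bind(fd, &sa, sizeof(sa)) < 0)
		return 1;

	/* Map the first page of the ring exported by the kernel module. */
	ring = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return 1;

	/* ... consume packets from the memory-mapped ring ... */
	munmap(ring, 4096);
	close(fd);
	return 0;
}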
6293 diff --unified --recursive --new-file linux-2.6.30/net/ring/Makefile linux-2.6.30-1-686-smp-PF_RING/net/ring/Makefile
6294 --- linux-2.6.30/net/ring/Makefile 1970-01-01 01:00:00.000000000 +0100
6295 +++ linux-2.6.30-1-686-smp-PF_RING/net/ring/Makefile 2009-07-21 04:40:31.315770393 +0200
6298 +# Makefile for the ring driver.
6303 +ring-objs := ring_packet.o
6304 diff --unified --recursive --new-file linux-2.6.30/net/ring/ring_packet.c linux-2.6.30-1-686-smp-PF_RING/net/ring/ring_packet.c
6305 --- linux-2.6.30/net/ring/ring_packet.c 1970-01-01 01:00:00.000000000 +0100
6306 +++ linux-2.6.30-1-686-smp-PF_RING/net/ring/ring_packet.c 2009-07-21 04:40:31.315770393 +0200
6308 +/* ***************************************************************
6310 + * (C) 2004-09 - Luca Deri <deri@ntop.org>
6312 + * This code includes contributions courtesy of
6313 + * - Amit D. Chaudhary <amit_ml@rajgad.com>
6314 + * - Andrew Gallatin <gallatyn@myri.com>
6315 + * - Brad Doctor <brad@stillsecure.com>
6316 + * - Felipe Huici <felipe.huici@nw.neclab.eu>
6317 + * - Francesco Fusco <fusco@ntop.org> (IP defrag)
6318 + * - Helmut Manck <helmut.manck@secunet.com>
6319 + * - Hitoshi Irino <irino@sfc.wide.ad.jp>
6320 + * - Jakov Haron <jyh@cabel.net>
6321 + * - Jeff Randall <jrandall@nexvu.com>
6322 + * - Kevin Wormington <kworm@sofnet.com>
6323 + * - Mahdi Dashtbozorgi <rdfm2000@gmail.com>
6324 + * - Marketakis Yannis <marketak@ics.forth.gr>
6325 + * - Matthew J. Roth <mroth@imminc.com>
6326 + * - Michael Stiller <ms@2scale.net> (VM memory support)
6327 + * - Noam Dev <noamdev@gmail.com>
6328 + * - Siva Kollipara <siva@cs.arizona.edu>
6329 + * - Vincent Carrier <vicarrier@wanadoo.fr>
6330 + * - Eugene Bogush <b_eugene@ukr.net>
6331 + * - Samir Chang <coobyhb@gmail.com>
6333 + * This program is free software; you can redistribute it and/or modify
6334 + * it under the terms of the GNU General Public License as published by
6335 + * the Free Software Foundation; either version 2 of the License, or
6336 + * (at your option) any later version.
6338 + * This program is distributed in the hope that it will be useful,
6339 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
6340 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
6341 + * GNU General Public License for more details.
6343 + * You should have received a copy of the GNU General Public License
6344 + * along with this program; if not, write to the Free Software Foundation,
6345 + * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
6349 +#include <linux/version.h>
6350 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19))
6351 +#include <linux/autoconf.h>
6353 +#include <linux/config.h>
6355 +#include <linux/module.h>
6356 +#include <linux/kernel.h>
6357 +#include <linux/socket.h>
6358 +#include <linux/skbuff.h>
6359 +#include <linux/rtnetlink.h>
6360 +#include <linux/in.h>
6361 +#include <linux/inet.h>
6362 +#include <linux/in6.h>
6363 +#include <linux/init.h>
6364 +#include <linux/filter.h>
6365 +#include <linux/ring.h>
6366 +#include <linux/ip.h>
6367 +#include <linux/tcp.h>
6368 +#include <linux/udp.h>
6369 +#include <linux/list.h>
6370 +#include <linux/netdevice.h>
6371 +#include <linux/proc_fs.h>
6373 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
6374 +#include <net/xfrm.h>
6376 +#include <linux/poll.h>
6378 +#include <net/sock.h>
6379 +#include <asm/io.h> /* needed for virt_to_phys() */
6381 +#include <net/inet_common.h>
6383 +#include <net/ip.h>
6385 +/* ================================================== */
6388 + * regcomp and regexec -- regsub and regerror are elsewhere
6389 + * @(#)regexp.c 1.3 of 18 April 87
6391 + * Copyright (c) 1986 by University of Toronto.
6392 + * Written by Henry Spencer. Not derived from licensed software.
6394 + * Permission is granted to anyone to use this software for any
6395 + * purpose on any computer system, and to redistribute it freely,
6396 + * subject to the following restrictions:
6398 + * 1. The author is not responsible for the consequences of use of
6399 + * this software, no matter how awful, even if they arise
6400 + * from defects in it.
6402 + * 2. The origin of this software must not be misrepresented, either
6403 + * by explicit claim or by omission.
6405 + * 3. Altered versions must be plainly marked as such, and must not
6406 + * be misrepresented as being the original software.
6408 + * Beware that some of this code is subtly aware of the way operator
6409 + * precedence is structured in regular expressions. Serious changes in
6410 + * regular-expression syntax might require a total rethink.
6412 + * This code was modified by Ethan Sommer to work within the kernel
6413 + * (it now uses kmalloc etc..)
6415 + * Modified slightly by Matthew Strait to use more modern C.
6418 +/* added by ethan and matt. Lets it work in both kernel and user space.
6419 + (So iptables can use it, for instance.) Yea, it goes both ways... */
6421 +#define malloc(foo) kmalloc(foo,GFP_ATOMIC)
6423 +#define printk(format,args...) printf(format,##args)
6426 +void regerror(char * s)
6428 + printk("<3>Regexp: %s\n", s);
6433 + * The "internal use only" fields in regexp.h are present to pass info from
6434 + * compile to execute that permits the execute phase to run lots faster on
6435 + * simple cases. They are:
6437 + * regstart char that must begin a match; '\0' if none obvious
6438 + * reganch is the match anchored (at beginning-of-line only)?
6439 + * regmust string (pointer into program) that match must include, or NULL
6440 + * regmlen length of regmust string
6442 + * Regstart and reganch permit very fast decisions on suitable starting points
6443 + * for a match, cutting down the work a lot. Regmust permits fast rejection
6444 + * of lines that cannot possibly match. The regmust tests are costly enough
6445 + * that regcomp() supplies a regmust only if the r.e. contains something
6446 + * potentially expensive (at present, the only such thing detected is * or +
6447 + * at the start of the r.e., which can involve a lot of backup). Regmlen is
6448 + * supplied because the test in regexec() needs it and regcomp() is computing
6453 + * Structure for regexp "program". This is essentially a linear encoding
6454 + * of a nondeterministic finite-state machine (aka syntax charts or
6455 + * "railroad normal form" in parsing technology). Each node is an opcode
6456 + * plus a "next" pointer, possibly plus an operand. "Next" pointers of
6457 + * all nodes except BRANCH implement concatenation; a "next" pointer with
6458 + * a BRANCH on both ends of it is connecting two alternatives. (Here we
6459 + * have one of the subtle syntax dependencies: an individual BRANCH (as
6460 + * opposed to a collection of them) is never concatenated with anything
6461 + * because of operator precedence.) The operand of some types of node is
6462 + * a literal string; for others, it is a node leading into a sub-FSM. In
6463 + * particular, the operand of a BRANCH node is the first node of the branch.
6464 + * (NB this is *not* a tree structure: the tail of the branch connects
6465 + * to the thing following the set of BRANCHes.) The opcodes are:
6468 +/* definition number opnd? meaning */
6469 +#define END 0 /* no End of program. */
6470 +#define BOL 1 /* no Match "" at beginning of line. */
6471 +#define EOL 2 /* no Match "" at end of line. */
6472 +#define ANY 3 /* no Match any one character. */
6473 +#define ANYOF 4 /* str Match any character in this string. */
6474 +#define ANYBUT 5 /* str Match any character not in this string. */
6475 +#define BRANCH 6 /* node Match this alternative, or the next... */
6476 +#define BACK 7 /* no Match "", "next" ptr points backward. */
6477 +#define EXACTLY 8 /* str Match this string. */
6478 +#define NOTHING 9 /* no Match empty string. */
6479 +#define STAR 10 /* node Match this (simple) thing 0 or more times. */
6480 +#define PLUS 11 /* node Match this (simple) thing 1 or more times. */
6481 +#define OPEN 20 /* no Mark this point in input as start of #n. */
6482 + /* OPEN+1 is number 1, etc. */
6483 +#define CLOSE 30 /* no Analogous to OPEN. */
6488 + * BRANCH The set of branches constituting a single choice are hooked
6489 + * together with their "next" pointers, since precedence prevents
6490 + * anything being concatenated to any individual branch. The
6491 + * "next" pointer of the last BRANCH in a choice points to the
6492 + * thing following the whole choice. This is also where the
6493 + * final "next" pointer of each individual branch points; each
6494 + * branch starts with the operand node of a BRANCH node.
6496 + * BACK Normal "next" pointers all implicitly point forward; BACK
6497 + * exists to make loop structures possible.
6499 + * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
6500 + * BRANCH structures using BACK. Simple cases (one character
6501 + * per match) are implemented with STAR and PLUS for speed
6502 + * and to minimize recursive plunges.
6504 + * OPEN,CLOSE ...are numbered at compile time.
6508 + * A node is one char of opcode followed by two chars of "next" pointer.
6509 + * "Next" pointers are stored as two 8-bit pieces, high order first. The
6510 + * value is a positive offset from the opcode of the node containing it.
6511 + * An operand, if any, simply follows the node. (Note that much of the
6512 + * code generation knows about this implicit relationship.)
6514 + * Using two bytes for the "next" pointer is vast overkill for most things,
6515 + * but allows patterns to get big without disasters.
6517 +#define OP(p) (*(p))
6518 +#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
6519 +#define OPERAND(p) ((p) + 3)
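Concretely: a node is one opcode byte followed by a 16-bit big-endian forward offset, so walking the program is plain pointer arithmetic. A sketch of what regnext() (defined later in this file) does, minus its handling of the size-counting first pass:

static char *example_next_node(char *p)
{
	int offset = NEXT(p);

	if (offset == 0)	/* 0 means the "next" pointer is not set yet */
		return NULL;

	/* BACK nodes point backward to build loops; everything else forward. */
	return (OP(p) == BACK) ? p - offset : p + offset;
}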
6522 + * See regmagic.h for one further detail of program structure.
6527 + * Utility definitions.
6530 +#define UCHARAT(p) ((int)*(unsigned char *)(p))
6532 +#define UCHARAT(p) ((int)*(p)&CHARBITS)
6535 +#define FAIL(m) { regerror(m); return(NULL); }
6536 +#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
6537 +#define META "^$.[()|?+*\\"
6540 + * Flags to be passed up and down.
6542 +#define HASWIDTH 01 /* Known never to match null string. */
6543 +#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
6544 +#define SPSTART 04 /* Starts with * or +. */
6545 +#define WORST 0 /* Worst case. */
6548 + * Global work variables for regcomp().
6550 +struct match_globals {
6551 + char *reginput; /* String-input pointer. */
6552 + char *regbol; /* Beginning of input, for ^ check. */
6553 + char **regstartp; /* Pointer to startp array. */
6554 + char **regendp; /* Ditto for endp. */
6555 + char *regparse; /* Input-scan pointer. */
6556 + int regnpar; /* () count. */
6558 +	char *regcode;		/* Code-emit pointer; &regdummy = don't. */
6559 + long regsize; /* Code size. */
6563 + * Forward declarations for regcomp()'s friends.
6566 +#define STATIC static
6568 +STATIC char *reg(struct match_globals *g, int paren,int *flagp);
6569 +STATIC char *regbranch(struct match_globals *g, int *flagp);
6570 +STATIC char *regpiece(struct match_globals *g, int *flagp);
6571 +STATIC char *regatom(struct match_globals *g, int *flagp);
6572 +STATIC char *regnode(struct match_globals *g, char op);
6573 +STATIC char *regnext(struct match_globals *g, char *p);
6574 +STATIC void regc(struct match_globals *g, char b);
6575 +STATIC void reginsert(struct match_globals *g, char op, char *opnd);
6576 +STATIC void regtail(struct match_globals *g, char *p, char *val);
6577 +STATIC void regoptail(struct match_globals *g, char *p, char *val);
6579 +static u_int8_t case_insensitive = 1;
6581 +__kernel_size_t my_strcspn(const char *s1,const char *s2)
6588 + for (scan1 = (char *)s1; *scan1 != '\0'; scan1++) {
6589 + for (scan2 = (char *)s2; *scan2 != '\0';) /* ++ moved down. */
6590 + if (*scan1 == *scan2++)
6597 +/* ********************************************** */
6599 +inline char tolower(char c) {
6600 + return ((c >= 'A' && c <= 'Z') ? c + 32: c);
6603 +inline void string2lower(char* str, int str_len) {
6606 + for(i=0; i<str_len; i++) str[i] = tolower(str[i]);
6609 +/* ********************************************** */
6612 + - regcomp - compile a regular expression into internal code
6614 + * We can't allocate space until we know how big the compiled form will be,
6615 + * but we can't compile it (and thus know how big it is) until we've got a
6616 + * place to put the code. So we cheat: we compile it twice, once with code
6617 + * generation turned off and size counting turned on, and once "for real".
6618 + * This also means that we don't allocate space until we are sure that the
6619 + * thing really will compile successfully, and we never have to move the
6620 + * code and thus invalidate pointers into it. (Note that it has to be in
6621 + * one piece because free() must be able to free it all.)
6623 + * Beware that the optimization-preparation code in here knows about some
6624 + * of the structure of the compiled regexp.
6627 +regcomp(char *exp,int *patternsize)
6629 + register regexp *r;
6630 + register char *scan;
6631 + register char *longest;
6634 + struct match_globals g;
6636 + /* commented out by ethan
6637 + extern char *malloc();
6641 + FAIL("NULL argument");
6643 + if(case_insensitive) string2lower(exp, strlen(exp));
6645 + /* First pass: determine size, legality. */
6649 + g.regcode = &g.regdummy;
6651 + if (reg(&g, 0, &flags) == NULL)
6654 + /* Small enough for pointer-storage convention? */
6655 + if (g.regsize >= 32767L) /* Probably could be 65535L. */
6656 + FAIL("regexp too big");
6658 + /* Allocate space. */
6659 + *patternsize=sizeof(regexp) + (unsigned)g.regsize;
6660 + r = (regexp *)malloc(sizeof(regexp) + (unsigned)g.regsize);
6662 + FAIL("out of space");
6664 + /* Second pass: emit code. */
6667 + g.regcode = r->program;
6669 + if (reg(&g, 0, &flags) == NULL)
6672 + /* Dig out information for optimizations. */
6673 + r->regstart = '\0'; /* Worst-case defaults. */
6675 + r->regmust = NULL;
6677 + scan = r->program+1; /* First BRANCH. */
6678 + if (OP(regnext(&g, scan)) == END) { /* Only one top-level choice. */
6679 + scan = OPERAND(scan);
6681 + /* Starting-point info. */
6682 + if (OP(scan) == EXACTLY)
6683 + r->regstart = *OPERAND(scan);
6684 + else if (OP(scan) == BOL)
6688 + * If there's something expensive in the r.e., find the
6689 + * longest literal string that must appear and make it the
6690 + * regmust. Resolve ties in favor of later strings, since
6691 + * the regstart check works with the beginning of the r.e.
6692 + * and avoiding duplication strengthens checking. Not a
6693 + * strong reason, but sufficient in the absence of others.
6695 + if (flags&SPSTART) {
6698 + for (; scan != NULL; scan = regnext(&g, scan))
6699 + if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
6700 + longest = OPERAND(scan);
6701 + len = strlen(OPERAND(scan));
6703 + r->regmust = longest;
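+/* Editorial sketch (not part of the original patch): the "compile twice"
+ * idiom described in the comment above regcomp(), in miniature.  The first
+ * pass counts bytes with emission disabled, then one exact-size buffer is
+ * allocated and the second pass really emits.  The emit()/compile() helpers
+ * here are hypothetical stand-ins for regc()/reg(). */
+#if 0 /* illustration only */
+#include <stdio.h>
+#include <stdlib.h>
+static size_t emitted;
+static char *out;            /* NULL on the sizing pass */
+static void emit(char b) { if (out) out[emitted] = b; emitted++; }
+static void compile(const char *src) { while (*src) emit(*src++); }
+int main(void) {
+	emitted = 0; out = NULL; compile("abc");   /* pass 1: size only  */
+	out = malloc(emitted);                     /* exact allocation   */
+	emitted = 0; compile("abc");               /* pass 2: real emit  */
+	printf("compiled %zu bytes\n", emitted);   /* -> compiled 3 bytes */
+	free(out);
+	return 0;
+}
+#endif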
6712 + - reg - regular expression, i.e. main body or parenthesized thing
6714 + * Caller must absorb opening parenthesis.
6716 + * Combining parenthesis handling with the base level of regular expression
6717 + * is a trifle forced, but the need to tie the tails of the branches to what
6718 + * follows makes it hard to avoid.
6721 +reg(struct match_globals *g, int paren, int *flagp /* Parenthesized? */ )
6723 + register char *ret;
6724 + register char *br;
6725 + register char *ender;
6726 + register int parno = 0; /* 0 makes gcc happy */
6729 + *flagp = HASWIDTH; /* Tentatively. */
6731 + /* Make an OPEN node, if parenthesized. */
6733 + if (g->regnpar >= NSUBEXP)
6734 + FAIL("too many ()");
6735 + parno = g->regnpar;
6737 + ret = regnode(g, OPEN+parno);
6741 + /* Pick up the branches, linking them together. */
6742 + br = regbranch(g, &flags);
6746 + regtail(g, ret, br); /* OPEN -> first. */
6749 + if (!(flags&HASWIDTH))
6750 + *flagp &= ~HASWIDTH;
6751 + *flagp |= flags&SPSTART;
6752 + while (*g->regparse == '|') {
6754 + br = regbranch(g, &flags);
6757 + regtail(g, ret, br); /* BRANCH -> BRANCH. */
6758 + if (!(flags&HASWIDTH))
6759 + *flagp &= ~HASWIDTH;
6760 + *flagp |= flags&SPSTART;
6763 + /* Make a closing node, and hook it on the end. */
6764 + ender = regnode(g, (paren) ? CLOSE+parno : END);
6765 + regtail(g, ret, ender);
6767 + /* Hook the tails of the branches to the closing node. */
6768 + for (br = ret; br != NULL; br = regnext(g, br))
6769 + regoptail(g, br, ender);
6771 + /* Check for proper termination. */
6772 + if (paren && *g->regparse++ != ')') {
6773 + FAIL("unmatched ()");
6774 + } else if (!paren && *g->regparse != '\0') {
6775 + if (*g->regparse == ')') {
6776 + FAIL("unmatched ()");
6778 + FAIL("junk on end"); /* "Can't happen". */
6786 + - regbranch - one alternative of an | operator
6788 + * Implements the concatenation operator.
6791 +regbranch(struct match_globals *g, int *flagp)
6793 + register char *ret;
6794 + register char *chain;
6795 + register char *latest;
6798 + *flagp = WORST; /* Tentatively. */
6800 + ret = regnode(g, BRANCH);
6802 + while (*g->regparse != '\0' && *g->regparse != '|' && *g->regparse != ')') {
6803 + latest = regpiece(g, &flags);
6804 + if (latest == NULL)
6806 + *flagp |= flags&HASWIDTH;
6807 + if (chain == NULL) /* First piece. */
6808 + *flagp |= flags&SPSTART;
6810 + regtail(g, chain, latest);
6813 + if (chain == NULL) /* Loop ran zero times. */
6814 + (void) regnode(g, NOTHING);
6820 + - regpiece - something followed by possible [*+?]
6822 + * Note that the branching code sequences used for ? and the general cases
6823 + * of * and + are somewhat optimized: they use the same NOTHING node as
6824 + * both the endmarker for their branch list and the body of the last branch.
6825 + * It might seem that this node could be dispensed with entirely, but the
6826 + * endmarker role is not redundant.
6829 +regpiece(struct match_globals *g, int *flagp)
6831 + register char *ret;
6833 + register char *next;
6836 + ret = regatom(g, &flags);
6840 + op = *g->regparse;
6841 + if (!ISMULT(op)) {
6846 + if (!(flags&HASWIDTH) && op != '?')
6847 + FAIL("*+ operand could be empty");
6848 + *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
6850 + if (op == '*' && (flags&SIMPLE))
6851 + reginsert(g, STAR, ret);
6852 + else if (op == '*') {
6853 + /* Emit x* as (x&|), where & means "self". */
6854 + reginsert(g, BRANCH, ret); /* Either x */
6855 + regoptail(g, ret, regnode(g, BACK)); /* and loop */
6856 + regoptail(g, ret, ret); /* back */
6857 + regtail(g, ret, regnode(g, BRANCH)); /* or */
6858 + regtail(g, ret, regnode(g, NOTHING)); /* null. */
6859 + } else if (op == '+' && (flags&SIMPLE))
6860 + reginsert(g, PLUS, ret);
6861 + else if (op == '+') {
6862 + /* Emit x+ as x(&|), where & means "self". */
6863 + next = regnode(g, BRANCH); /* Either */
6864 + regtail(g, ret, next);
6865 + regtail(g, regnode(g, BACK), ret); /* loop back */
6866 + regtail(g, next, regnode(g, BRANCH)); /* or */
6867 + regtail(g, ret, regnode(g, NOTHING)); /* null. */
6868 + } else if (op == '?') {
6869 + /* Emit x? as (x|) */
6870 + reginsert(g, BRANCH, ret); /* Either x */
6871 + regtail(g, ret, regnode(g, BRANCH)); /* or */
6872 + next = regnode(g, NOTHING); /* null. */
6873 + regtail(g, ret, next);
6874 + regoptail(g, ret, next);
6877 + if (ISMULT(*g->regparse))
6878 + FAIL("nested *?+");
6884 + - regatom - the lowest level
6886 + * Optimization: gobbles an entire sequence of ordinary characters so that
6887 + * it can turn them into a single node, which is smaller to store and
6888 + * faster to run. Backslashed characters are exceptions, each becoming a
6889 + * separate node; the code is simpler that way and it's not worth fixing.
6892 +regatom(struct match_globals *g, int *flagp)
6894 + register char *ret;
6897 + *flagp = WORST; /* Tentatively. */
6899 + switch (*g->regparse++) {
6901 + ret = regnode(g, BOL);
6904 + ret = regnode(g, EOL);
6907 + ret = regnode(g, ANY);
6908 + *flagp |= HASWIDTH|SIMPLE;
6911 + register int class;
6912 + register int classend;
6914 + if (*g->regparse == '^') { /* Complement of range. */
6915 + ret = regnode(g, ANYBUT);
6918 + ret = regnode(g, ANYOF);
6919 + if (*g->regparse == ']' || *g->regparse == '-')
6920 + regc(g, *g->regparse++);
6921 + while (*g->regparse != '\0' && *g->regparse != ']') {
6922 + if (*g->regparse == '-') {
6924 + if (*g->regparse == ']' || *g->regparse == '\0')
6927 + class = UCHARAT(g->regparse-2)+1;
6928 + classend = UCHARAT(g->regparse);
6929 + if (class > classend+1)
6930 + FAIL("invalid [] range");
6931 + for (; class <= classend; class++)
6936 + regc(g, *g->regparse++);
6939 + if (*g->regparse != ']')
6940 + FAIL("unmatched []");
6942 + *flagp |= HASWIDTH|SIMPLE;
6946 + ret = reg(g, 1, &flags);
6949 + *flagp |= flags&(HASWIDTH|SPSTART);
6954 + FAIL("internal urp"); /* Supposed to be caught earlier. */
6959 + FAIL("?+* follows nothing");
6962 + if (*g->regparse == '\0')
6963 + FAIL("trailing \\");
6964 + ret = regnode(g, EXACTLY);
6965 + regc(g, *g->regparse++);
6967 + *flagp |= HASWIDTH|SIMPLE;
6971 + register char ender;
6974 + len = my_strcspn((const char *)g->regparse, (const char *)META);
6976 + FAIL("internal disaster");
6977 + ender = *(g->regparse+len);
6978 + if (len > 1 && ISMULT(ender))
6979 + len--; /* Back off clear of ?+* operand. */
6980 + *flagp |= HASWIDTH;
6983 + ret = regnode(g, EXACTLY);
6985 + regc(g, *g->regparse++);
6997 + - regnode - emit a node
6999 +static char * /* Location. */
7000 +regnode(struct match_globals *g, char op)
7002 + register char *ret;
7003 + register char *ptr;
7006 + if (ret == &g->regdummy) {
7013 + *ptr++ = '\0'; /* Null "next" pointer. */
7021 + - regc - emit (if appropriate) a byte of code
7024 +regc(struct match_globals *g, char b)
7026 + if (g->regcode != &g->regdummy)
7027 + *g->regcode++ = b;
7033 + - reginsert - insert an operator in front of already-emitted operand
7035 + * Means relocating the operand.
7038 +reginsert(struct match_globals *g, char op, char* opnd)
7040 + register char *src;
7041 + register char *dst;
7042 + register char *place;
7044 + if (g->regcode == &g->regdummy) {
7052 + while (src > opnd)
7055 + place = opnd; /* Op node, where operand used to be. */
7062 + - regtail - set the next-pointer at the end of a node chain
7065 +regtail(struct match_globals *g, char *p, char *val)
7067 + register char *scan;
7068 + register char *temp;
7069 + register int offset;
7071 + if (p == &g->regdummy)
7074 + /* Find last node. */
7077 + temp = regnext(g, scan);
7083 + if (OP(scan) == BACK)
7084 + offset = scan - val;
7086 + offset = val - scan;
7087 + *(scan+1) = (offset>>8)&0377;
7088 + *(scan+2) = offset&0377;
7092 + - regoptail - regtail on operand of first argument; nop if operandless
7095 +regoptail(struct match_globals *g, char *p, char *val)
7097 + /* "Operandless" and "op != BRANCH" are synonymous in practice. */
7098 + if (p == NULL || p == &g->regdummy || OP(p) != BRANCH)
7100 + regtail(g, OPERAND(p), val);
7104 + * regexec and friends
7111 +STATIC int regtry(struct match_globals *g, regexp *prog, char *string);
7112 +STATIC int regmatch(struct match_globals *g, char *prog);
7113 +STATIC int regrepeat(struct match_globals *g, char *p);
7116 + - regexec - match a regexp against a string
7119 +regexec(regexp *prog, char *string)
7122 + struct match_globals g;
7124 + if(case_insensitive) string2lower(string, strlen(string));
7126 + /* Be paranoid... */
7127 + if (prog == NULL || string == NULL) {
7128 + printk("<3>Regexp: NULL parameter\n");
7132 + /* Check validity of program. */
7133 + if (UCHARAT(prog->program) != MAGIC) {
7134 + printk("<3>Regexp: corrupted program\n");
7138 + /* If there is a "must appear" string, look for it. */
7139 + if (prog->regmust != NULL) {
7141 + while ((s = strchr(s, prog->regmust[0])) != NULL) {
7142 + if (strncmp(s, prog->regmust, prog->regmlen) == 0)
7143 + break; /* Found it. */
7146 + if (s == NULL) /* Not present. */
7150 + /* Mark beginning of line for ^ . */
7151 + g.regbol = string;
7153 + /* Simplest case: anchored match need be tried only once. */
7154 + if (prog->reganch)
7155 + return(regtry(&g, prog, string));
7157 + /* Messy cases: unanchored match. */
7159 + if (prog->regstart != '\0')
7160 + /* We know what char it must start with. */
7161 + while ((s = strchr(s, prog->regstart)) != NULL) {
7162 + if (regtry(&g, prog, s))
7167 + /* We don't -- general case. */
7169 + if (regtry(&g, prog, s))
7171 + } while (*s++ != '\0');
7178 + - regtry - try match at specific point
7180 +static int /* 0 failure, 1 success */
7181 +regtry(struct match_globals *g, regexp *prog, char *string)
7184 + register char **sp;
7185 + register char **ep;
7187 + g->reginput = string;
7188 + g->regstartp = prog->startp;
7189 + g->regendp = prog->endp;
7191 + sp = prog->startp;
7193 + for (i = NSUBEXP; i > 0; i--) {
7197 + if (regmatch(g, prog->program + 1)) {
7198 + prog->startp[0] = string;
7199 + prog->endp[0] = g->reginput;
7206 + - regmatch - main matching routine
7208 + * Conceptually the strategy is simple: check to see whether the current
7209 + * node matches, call self recursively to see whether the rest matches,
7210 + * and then act accordingly. In practice we make some effort to avoid
7211 + * recursion, in particular by going through "ordinary" nodes (that don't
7212 + * need to know whether the rest of the match failed) by a loop instead of
7215 +static int /* 0 failure, 1 success */
7216 +regmatch(struct match_globals *g, char *prog)
7218 + register char *scan = prog; /* Current node. */
7219 + char *next; /* Next node. */
7222 + if (scan != NULL && regnarrate)
7223 + printk("%s(\n", regprop(scan));
7225 + while (scan != NULL) {
7228 + printk("%s...\n", regprop(scan));
7230 + next = regnext(g, scan);
7232 + switch (OP(scan)) {
7234 + if (g->reginput != g->regbol)
7238 + if (*g->reginput != '\0')
7242 + if (*g->reginput == '\0')
7248 + register char *opnd;
7250 + opnd = OPERAND(scan);
7251 + /* Inline the first character, for speed. */
7252 + if (*opnd != *g->reginput)
7254 + len = strlen(opnd);
7255 + if (len > 1 && strncmp(opnd, g->reginput, len) != 0)
7257 + g->reginput += len;
7261 + if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) == NULL)
7266 + if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) != NULL)
7283 + register char *save;
7285 + no = OP(scan) - OPEN;
7286 + save = g->reginput;
7288 + if (regmatch(g, next)) {
7290 + * Don't set startp if some later
7291 + * invocation of the same parentheses
7294 + if (g->regstartp[no] == NULL)
7295 + g->regstartp[no] = save;
7312 + register char *save;
7314 + no = OP(scan) - CLOSE;
7315 + save = g->reginput;
7317 + if (regmatch(g, next)) {
7319 + * Don't set endp if some later
7320 + * invocation of the same parentheses
7323 + if (g->regendp[no] == NULL)
7324 + g->regendp[no] = save;
7331 + register char *save;
7333 + if (OP(next) != BRANCH) /* No choice. */
7334 + next = OPERAND(scan); /* Avoid recursion. */
7337 + save = g->reginput;
7338 + if (regmatch(g, OPERAND(scan)))
7340 + g->reginput = save;
7341 + scan = regnext(g, scan);
7342 + } while (scan != NULL && OP(scan) == BRANCH);
7350 + register char nextch;
7352 + register char *save;
7356 + * Lookahead to avoid useless match attempts
7357 + * when we know what character comes next.
7360 + if (OP(next) == EXACTLY)
7361 + nextch = *OPERAND(next);
7362 + min = (OP(scan) == STAR) ? 0 : 1;
7363 + save = g->reginput;
7364 + no = regrepeat(g, OPERAND(scan));
7365 + while (no >= min) {
7366 + /* If it could work, try it. */
7367 + if (nextch == '\0' || *g->reginput == nextch)
7368 + if (regmatch(g, next))
7370 + /* Couldn't or didn't -- back up. */
7372 + g->reginput = save + no;
7378 + return(1); /* Success! */
7381 + printk("<3>Regexp: memory corruption\n");
7390 + * We get here only if there's trouble -- normally "case END" is
7391 + * the terminating point.
7393 + printk("<3>Regexp: corrupted pointers\n");
7398 + - regrepeat - repeatedly match something simple, report how many
7401 +regrepeat(struct match_globals *g, char *p)
7403 + register int count = 0;
7404 + register char *scan;
7405 + register char *opnd;
7407 + scan = g->reginput;
7408 + opnd = OPERAND(p);
7411 + count = strlen(scan);
7415 + while (*opnd == *scan) {
7421 + while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
7427 + while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
7432 + default: /* Oh dear. Called inappropriately. */
7433 + printk("<3>Regexp: internal foulup\n");
7434 + count = 0; /* Best compromise. */
7437 + g->reginput = scan;
7443 + - regnext - dig the "next" pointer out of a node
7446 +regnext(struct match_globals *g, char *p)
7448 + register int offset;
7450 + if (p == &g->regdummy)
7457 + if (OP(p) == BACK)
7463 +/* ================================================== */
7465 +/* #define RING_DEBUG */
7467 +/* ************************************************* */
7469 +#define TH_FIN_MULTIPLIER 0x01
7470 +#define TH_SYN_MULTIPLIER 0x02
7471 +#define TH_RST_MULTIPLIER 0x04
7472 +#define TH_PUSH_MULTIPLIER 0x08
7473 +#define TH_ACK_MULTIPLIER 0x10
7474 +#define TH_URG_MULTIPLIER 0x20
7476 +/* ************************************************* */
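+/* Editorial sketch (not part of the original patch): how the *_MULTIPLIER
+ * constants above pack the six TCP flag bits into parsed_pkt.tcp_flags;
+ * a SYN+ACK segment, for example, yields 0x12. */
+#if 0 /* illustration only */
+#include <stdio.h>
+int main(void) {
+	int fin = 0, syn = 1, rst = 0, psh = 0, ack = 1, urg = 0;
+	int tcp_flags = fin * 0x01 + syn * 0x02 + rst * 0x04
+	              + psh * 0x08 + ack * 0x10 + urg * 0x20;
+	printf("tcp_flags = 0x%02x\n", tcp_flags); /* -> 0x12 */
+	return 0;
+}
+#endif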
7478 +#define PROC_INFO "info"
7479 +#define PROC_PLUGINS_INFO "plugins_info"
7481 +/* ************************************************* */
7483 +/* List of all ring sockets. */
7484 +static struct list_head ring_table;
7485 +static u_int ring_table_size;
7488 + For each device, pf_ring keeps a list with the number of
7489 + available ring socket slots, so that a caller knows in advance
7490 + whether any ring bound to that device has a slot
7491 + that can potentially host the packet
7493 +static struct list_head device_ring_list[MAX_NUM_DEVICES];
7495 +/* List of all clusters */
7496 +static struct list_head ring_cluster_list;
7498 +/* List of all DNA (direct NIC access) devices */
7499 +static struct list_head ring_dna_devices_list;
7500 +static u_int dna_devices_list_size = 0;
7502 +/* List of all plugins */
7503 +static u_int plugin_registration_size = 0;
7504 +static struct pfring_plugin_registration *plugin_registration[MAX_PLUGIN_ID] = { NULL };
7505 +static u_short max_registered_plugin_id = 0;
7506 +static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED;
7508 +/* ********************************** */
7510 +/* /proc entry for ring module */
7511 +struct proc_dir_entry *ring_proc_dir = NULL;
7512 +struct proc_dir_entry *ring_proc = NULL;
7513 +struct proc_dir_entry *ring_proc_plugins_info = NULL;
7515 +static int ring_proc_get_info(char *, char **, off_t, int, int *, void *);
7516 +static int ring_proc_get_plugin_info(char *, char **, off_t, int, int *, void *);
7517 +static void ring_proc_add(struct ring_opt *pfr, struct net_device *dev);
7518 +static void ring_proc_remove(struct ring_opt *pfr);
7519 +static void ring_proc_init(void);
7520 +static void ring_proc_term(void);
7524 + [http://lists.metaprl.org/pipermail/cs134-labs/2002-October/000025.html]
7526 + GFP_ATOMIC means roughly "make the allocation operation atomic". This
7527 + means that the kernel will try to find the memory using a pile of free
7528 + memory set aside for urgent allocation. If that pile doesn't have
7529 + enough free pages, the operation will fail. This flag is useful for
7530 + allocation within interrupt handlers.
7532 + GFP_KERNEL will try a little harder to find memory. There's a
7533 + possibility that the call to kmalloc() will sleep while the kernel is
7534 + trying to find memory (thus making it unsuitable for interrupt
7535 + handlers). It's much more rare for an allocation with GFP_KERNEL to
7536 + fail than with GFP_ATOMIC.
7538 + In all cases, kmalloc() should only be used for allocating small amounts of
7539 + memory (a few KB). vmalloc() is better for larger amounts.
7541 + Also note that in lab 1 and lab 2, it would have been arguably better to
7542 + use GFP_KERNEL instead of GFP_ATOMIC. GFP_ATOMIC should be saved for
7543 + those instances in which a sleep would be totally unacceptable.
7545 +/* ********************************** */
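+/* Editorial sketch (not part of the original patch): the GFP_ATOMIC vs
+ * GFP_KERNEL distinction quoted above, in code form.  Kernel-style
+ * pseudocode; the two wrapper functions here are hypothetical. */
+#if 0 /* illustration only */
+static void *alloc_in_irq_context(size_t len)
+{
+	/* May not sleep: draw from the emergency pool and accept failure. */
+	return kmalloc(len, GFP_ATOMIC);
+}
+static void *alloc_in_process_context(size_t len)
+{
+	/* May sleep while the kernel reclaims memory, so it rarely fails. */
+	return kmalloc(len, GFP_KERNEL);
+}
+#endif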
7548 +static struct proto_ops ring_ops;
7550 +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
7551 +static struct proto ring_proto;
7554 +static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet,
7555 + u_char real_skb, short channel_id);
7556 +static int buffer_ring_handler(struct net_device *dev, char *data, int len);
7557 +static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr);
7561 +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23))
7566 +ip_defrag(struct sk_buff *skb, u32 user);
7568 +/* ********************************** */
7571 +static unsigned int num_slots = 4096;
7572 +static unsigned int enable_tx_capture = 1;
7573 +static unsigned int enable_ip_defrag = 0;
7574 +static unsigned int transparent_mode = 1;
7575 +static u_int32_t ring_id_serial = 0;
7577 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16))
7578 +module_param(num_slots, uint, 0644);
7579 +module_param(transparent_mode, uint, 0644);
7580 +module_param(enable_tx_capture, uint, 0644);
7581 +module_param(enable_ip_defrag, uint, 0644);
7583 +MODULE_PARM(num_slots, "i");
7584 +MODULE_PARM(transparent_mode, "i");
7585 +MODULE_PARM(enable_tx_capture, "i");
7586 +MODULE_PARM(enable_ip_defrag, "i");
7589 +MODULE_PARM_DESC(num_slots, "Number of ring slots");
7590 +MODULE_PARM_DESC(transparent_mode,
7591 + "Set to 1 to enable transparent mode "
7592 + "(slower but backwards compatible)");
7593 +MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets");
7594 +MODULE_PARM_DESC(enable_ip_defrag,
7595 + "Set to 1 to enable IP defragmentation "
7596 + "(only RX traffic is defragmented)");
7598 +/* ********************************** */
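+/* Editorial note (not part of the original patch): once built, the four
+ * parameters above can be set at load time in the usual way, e.g.
+ *
+ *   insmod ./pf_ring.ko num_slots=8192 transparent_mode=1 \
+ *          enable_tx_capture=0 enable_ip_defrag=0
+ *
+ * and, being registered with mode 0644, they can be re-read under
+ * /sys/module/pf_ring/parameters/ on >= 2.6.16 kernels.  The module file
+ * name used here is an assumption. */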
7600 +#define MIN_QUEUED_PKTS 64
7601 +#define MAX_QUEUE_LOOPS 64
7604 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7605 +#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk)
7606 +#define ring_sk(__sk) ((__sk)->sk_protinfo)
7608 +#define ring_sk_datatype(a) (a)
7609 +#define ring_sk(__sk) ((__sk)->protinfo.pf_ring)
7612 +#define _rdtsc() ({ uint64_t x; asm volatile("rdtsc" : "=A" (x)); x; })
7614 +/* ***************** Legacy code ************************ */
7616 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22))
7617 +static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
7619 + return (struct iphdr *)skb->nh.iph;
7622 +static inline void skb_set_network_header(struct sk_buff *skb,
7625 + skb->nh.iph = (struct iphdr*)(skb->data + offset); /* offset is in bytes */
7628 +static inline void skb_reset_network_header(struct sk_buff *skb)
7633 +static inline void skb_reset_transport_header(struct sk_buff *skb)
7639 +/* ***** Code taken from other kernel modules ******** */
7642 + * rvmalloc copied from usbvideo.c
7644 +static void *rvmalloc(unsigned long size)
7647 + unsigned long adr;
7648 + unsigned long pages = 0;
7650 +#if defined(RING_DEBUG)
7651 + printk("[PF_RING] rvmalloc: %lu bytes\n", size);
7654 + size = PAGE_ALIGN(size);
7655 + mem = vmalloc_32(size);
7659 + memset(mem, 0, size); /* Clear the ram out, no junk to the user */
7660 + adr = (unsigned long) mem;
7661 + while (size > 0) {
7662 + SetPageReserved(vmalloc_to_page((void *)adr));
7665 + size -= PAGE_SIZE;
7668 +#if defined(RING_DEBUG)
7669 + printk("[PF_RING] rvmalloc: %lu pages\n", pages);
7674 +/* ************************************************** */
7677 + * rvfree copied from usbvideo.c
7679 +static void rvfree(void *mem, unsigned long size)
7681 + unsigned long adr;
7682 + unsigned long pages = 0;
7684 +#if defined(RING_DEBUG)
7685 + printk("[PF_RING] rvfree: %lu bytes\n", size);
7691 + adr = (unsigned long) mem;
7692 + while ((long) size > 0) {
7693 + ClearPageReserved(vmalloc_to_page((void *)adr));
7696 + size -= PAGE_SIZE;
7698 +#if defined(RING_DEBUG)
7699 + printk("[PF_RING] rvfree: %lu pages\n", pages);
7700 + printk("[PF_RING] rvfree: calling vfree....\n");
7703 +#if defined(RING_DEBUG)
7704 + printk("[PF_RING] rvfree: after vfree....\n");
7708 +/* ********************************** */
7710 +#define IP_DEFRAG_RING 1234
7712 +/* Returns new sk_buff, or NULL */
7713 +static struct sk_buff *ring_gather_frags(struct sk_buff *skb)
7715 +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23))
7716 + skb = ip_defrag(skb, IP_DEFRAG_RING);
7719 + ip_send_check(ip_hdr(skb));
7721 + if(ip_defrag(skb, IP_DEFRAG_RING))
7724 + ip_send_check(ip_hdr(skb));
7730 +/* ********************************** */
7732 +static void ring_sock_destruct(struct sock *sk)
7734 + struct ring_opt *pfr;
7736 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
7737 + skb_queue_purge(&sk->sk_receive_queue);
7739 + if (!sock_flag(sk, SOCK_DEAD)) {
7740 +#if defined(RING_DEBUG)
7741 + printk("[PF_RING] Attempt to release alive ring socket: %p\n", sk);
7747 +#if defined(RING_DEBUG)
7748 + printk("[PF_RING] Attempt to release alive ring socket: %p\n", sk);
7754 + pfr = ring_sk(sk);
7756 + if(pfr) kfree(pfr);
7758 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
7759 + MOD_DEC_USE_COUNT;
7763 +/* ********************************** */
7765 +static void ring_proc_add(struct ring_opt *pfr, struct net_device *dev)
7767 + if(ring_proc_dir != NULL) {
7770 + pfr->ring_pid = current->pid;
7771 + pfr->ring_id = ring_id_serial++;
7774 + snprintf(name, sizeof(name), "%d-%s.%d", pfr->ring_pid, dev->name, pfr->ring_id);
7776 + snprintf(name, sizeof(name), "%d.%d", pfr->ring_pid, pfr->ring_id);
7778 + create_proc_read_entry(name, 0, ring_proc_dir, ring_proc_get_info, pfr);
7779 + /* printk("[PF_RING] added /proc/net/pf_ring/%s\n", name); */
7780 + /* printk("[PF_RING] %s has index %d\n", dev->name, dev->ifindex); */
7784 +/* ********************************** */
7786 +static void ring_proc_remove(struct ring_opt *pfr)
7788 + if(ring_proc_dir != NULL) {
7791 + if (pfr->ring_netdev && pfr->ring_netdev->name)
7792 + snprintf(name, sizeof(name), "%d-%s.%d",
7793 + pfr->ring_pid, pfr->ring_netdev->name, pfr->ring_id);
7795 + snprintf(name, sizeof(name), "%d.%d", pfr->ring_pid, pfr->ring_id);
7797 + remove_proc_entry(name, ring_proc_dir);
7798 + printk("[PF_RING] removed /proc/net/pf_ring/%s\n", name);
7802 +/* ********************************** */
7804 +static u_int32_t num_queued_pkts(struct ring_opt *pfr)
7806 + if(pfr->ring_slots != NULL) {
7807 + u_int32_t tot_insert = pfr->slots_info->tot_insert, tot_read = pfr->slots_info->tot_read;
7809 + if(tot_insert >= tot_read) {
7810 + return(tot_insert-tot_read);
7812 + return(((u_int32_t)-1)+tot_insert-tot_read);
7815 +#if defined(RING_DEBUG)
7816 + printk("[PF_RING] -> [tot_insert=%d][tot_read=%d]\n",
7817 + tot_insert, tot_read);
7823 +/* ************************************* */
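+/* Editorial sketch (not part of the original patch): the unsigned wrap-around
+ * branch of num_queued_pkts() above.  When tot_insert has wrapped past 2^32
+ * while tot_read has not, ((u_int32_t)-1)+tot_insert-tot_read recovers the
+ * queue depth; note it comes out one less than the exact modular difference. */
+#if 0 /* illustration only */
+#include <stdio.h>
+#include <stdint.h>
+int main(void) {
+	uint32_t tot_insert = 5;            /* wrapped: really 2^32 + 5 */
+	uint32_t tot_read   = 4294967290u;  /* 2^32 - 6                 */
+	uint32_t depth = (uint32_t)-1 + tot_insert - tot_read;
+	printf("queued = %u\n", depth);     /* -> 10 (exact depth is 11) */
+	return 0;
+}
+#endif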
7825 +inline u_int get_num_ring_free_slots(struct ring_opt *pfr) {
7826 + return(pfr->slots_info->tot_slots - num_queued_pkts(pfr));
7829 +/* ********************************** */
7831 +static int ring_proc_get_info(char *buf, char **start, off_t offset,
7832 + int len, int *unused, void *data)
7835 + struct ring_opt *pfr;
7836 + FlowSlotInfo *fsi;
7838 + if(data == NULL) {
7839 + /* /proc/net/pf_ring/info */
7840 + rlen = sprintf(buf, "Version : %s\n", RING_VERSION);
7841 + rlen += sprintf(buf + rlen, "Ring slots : %d\n", num_slots);
7842 + rlen += sprintf(buf + rlen, "Slot version : %d\n", RING_FLOWSLOT_VERSION);
7843 + rlen += sprintf(buf + rlen, "Capture TX : %s\n",
7844 + enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
7845 + rlen += sprintf(buf + rlen, "IP Defragment : %s\n", enable_ip_defrag ? "Yes" : "No");
7846 + rlen += sprintf(buf + rlen, "Transparent mode : %s\n",
7847 + transparent_mode ? "Yes" : "No");
7848 + rlen += sprintf(buf + rlen, "Total rings : %d\n", ring_table_size);
7849 + rlen += sprintf(buf + rlen, "Total plugins : %d\n", plugin_registration_size);
7851 + /* detailed statistics about a PF_RING */
7852 + pfr = (struct ring_opt*)data;
7855 + fsi = pfr->slots_info;
7858 + rlen = sprintf(buf, "Bound Device : %s\n",
7859 + pfr->ring_netdev->name == NULL ? "<NULL>" : pfr->ring_netdev->name);
7860 + rlen += sprintf(buf + rlen, "Version : %d\n", fsi->version);
7861 + rlen += sprintf(buf + rlen, "Sampling Rate : %d\n", pfr->sample_rate);
7862 + rlen += sprintf(buf + rlen, "Appl. Name : %s\n", pfr->appl_name ? pfr->appl_name : "<unknown>");
7863 + rlen += sprintf(buf + rlen, "IP Defragment : %s\n", enable_ip_defrag ? "Yes" : "No");
7864 + rlen += sprintf(buf + rlen, "BPF Filtering : %s\n", pfr->bpfFilter ? "Enabled" : "Disabled");
7865 + rlen += sprintf(buf + rlen, "# Filt. Rules : %d\n", pfr->num_filtering_rules);
7866 + rlen += sprintf(buf + rlen, "Cluster Id : %d\n", pfr->cluster_id);
7867 + rlen += sprintf(buf + rlen, "Channel Id : %d\n", pfr->channel_id);
7868 + rlen += sprintf(buf + rlen, "Tot Slots : %d\n", fsi->tot_slots);
7869 + rlen += sprintf(buf + rlen, "Bucket Len : %d\n", fsi->data_len);
7870 + rlen += sprintf(buf + rlen, "Slot Len : %d [bucket+header]\n", fsi->slot_len);
7871 + rlen += sprintf(buf + rlen, "Tot Memory : %d\n", fsi->tot_mem);
7872 + rlen += sprintf(buf + rlen, "Tot Packets : %lu\n", (unsigned long)fsi->tot_pkts);
7873 + rlen += sprintf(buf + rlen, "Tot Pkt Lost : %lu\n", (unsigned long)fsi->tot_lost);
7874 + rlen += sprintf(buf + rlen, "Tot Insert : %lu\n", (unsigned long)fsi->tot_insert);
7875 + rlen += sprintf(buf + rlen, "Tot Read : %lu\n", (unsigned long)fsi->tot_read);
7876 + rlen += sprintf(buf + rlen, "Num Free Slots: %u\n", get_num_ring_free_slots(pfr));
7878 + rlen = sprintf(buf, "WARNING fsi == NULL\n");
7880 + rlen = sprintf(buf, "WARNING data == NULL\n");
7886 +/* ********************************** */
7888 +static int ring_proc_get_plugin_info(char *buf, char **start, off_t offset,
7889 + int len, int *unused, void *data)
7891 + int rlen = 0, i = 0;
7892 + struct pfring_plugin_registration* tmp = NULL;
7894 + /* FIXME: I should know the number of plugins registered */
7895 + if (!plugin_registration_size) return rlen;
7897 + /* plugins_info */
7899 + rlen += sprintf(buf + rlen , "ID\tPlugin\n");
7901 + for(i = 0; i < MAX_PLUGIN_ID; i++) {
7902 + tmp = plugin_registration[i];
7904 + rlen += sprintf(buf + rlen , "%d\t%s [%s]\n",
7905 + tmp->plugin_id, tmp->name, tmp->description);
7912 +/* ********************************** */
7914 +static void ring_proc_init(void)
7916 + ring_proc_dir = proc_mkdir("pf_ring",
7917 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))
7922 + if(ring_proc_dir) {
7923 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30))
7924 + ring_proc_dir->owner = THIS_MODULE;
7926 + ring_proc = create_proc_read_entry(PROC_INFO, 0,
7928 + ring_proc_get_info,
7930 + ring_proc_plugins_info = create_proc_read_entry(PROC_PLUGINS_INFO, 0,
7932 + ring_proc_get_plugin_info,
7934 + if(!ring_proc || !ring_proc_plugins_info)
7935 + printk("[PF_RING] unable to register proc file\n");
7937 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30))
7938 + ring_proc->owner = THIS_MODULE;
7939 + ring_proc_plugins_info->owner = THIS_MODULE;
7941 + printk("[PF_RING] registered /proc/net/pf_ring/\n");
7944 + printk("[PF_RING] unable to create /proc/net/pf_ring\n");
7947 +/* ********************************** */
7949 +static void ring_proc_term(void)
7951 + if(ring_proc != NULL) {
7952 + remove_proc_entry(PROC_INFO, ring_proc_dir);
7953 + printk("[PF_RING] removed /proc/net/pf_ring/%s\n", PROC_INFO);
7955 + remove_proc_entry(PROC_PLUGINS_INFO, ring_proc_dir);
7956 + printk("[PF_RING] removed /proc/net/pf_ring/%s\n", PROC_PLUGINS_INFO);
7958 + if(ring_proc_dir != NULL) {
7959 + remove_proc_entry("pf_ring",
7960 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))
7964 + printk("[PF_RING] deregistered /proc/net/pf_ring\n");
7969 +/* ********************************** */
7974 + * store the sk in a new element and add it
7975 + * to the head of the list.
7977 +static inline void ring_insert(struct sock *sk)
7979 + struct ring_element *next;
7980 + struct ring_opt *pfr;
7982 +#if defined(RING_DEBUG)
7983 + printk("[PF_RING] ring_insert()\n");
7986 + next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC);
7987 + if(next != NULL) {
7989 + write_lock_bh(&ring_mgmt_lock);
7990 + list_add(&next->list, &ring_table);
7991 + write_unlock_bh(&ring_mgmt_lock);
7993 + if(net_ratelimit())
7994 + printk("[PF_RING] ring_insert: out of memory\n");
7997 + ring_table_size++;
7998 + //ring_proc_add(ring_sk(sk));
7999 + pfr = (struct ring_opt *)ring_sk(sk);
8000 + pfr->ring_pid = current->pid;
8003 +/* ********************************** */
8008 + * For each of the elements in the list:
8009 + * - check if this is the element we want to delete
8010 + * - if it is, remove it from the list, and free it.
8012 + * stop when we find the one we're looking for (break),
8013 + * or when we reach the end of the list.
8015 +static inline void ring_remove(struct sock *sk)
8017 + struct list_head *ptr, *tmp_ptr;
8018 + struct ring_element *entry;
8020 +#if defined(RING_DEBUG)
8021 + printk("[PF_RING] ring_remove()\n");
8024 + list_for_each_safe(ptr, tmp_ptr, &ring_table) {
8025 + entry = list_entry(ptr, struct ring_element, list);
8027 + if(entry->sk == sk) {
8030 + ring_table_size--;
8035 +#if defined(RING_DEBUG)
8036 + printk("[PF_RING] leaving ring_remove()\n");
8040 +/* ********************************** */
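+/* Editorial sketch (not part of the original patch): ring_remove() above
+ * relies on the kernel's list_for_each_safe(), which keeps a lookahead
+ * pointer so the current entry can be unlinked and freed mid-walk.  The same
+ * idea on a plain singly linked list, in user space. */
+#if 0 /* illustration only */
+#include <stdlib.h>
+struct elem { int key; struct elem *next; };
+static void remove_key(struct elem **head, int key)
+{
+	struct elem **link = head;
+	struct elem *cur, *next;
+	for (cur = *head; cur != NULL; cur = next) {
+		next = cur->next;          /* saved before cur may be freed */
+		if (cur->key == key) {
+			*link = next;      /* unlink, then free safely */
+			free(cur);
+			break;
+		}
+		link = &cur->next;
+	}
+}
+#endif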
8042 +static inline FlowSlot* get_insert_slot(struct ring_opt *pfr)
8044 + if(pfr->ring_slots != NULL) {
8045 + FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx
8046 + *pfr->slots_info->slot_len]);
8047 +#if defined(RING_DEBUG)
8048 + printk("[PF_RING] get_insert_slot(%d): returned slot [slot_state=%d]\n",
8049 + pfr->slots_info->insert_idx, slot->slot_state);
8053 +#if defined(RING_DEBUG)
8054 + printk("[PF_RING] get_insert_slot(%d): NULL slot\n", pfr->slots_info->insert_idx);
8060 +/* ********************************** */
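+/* Editorial sketch (not part of the original patch): the slot accessors
+ * above treat the ring as one flat byte array of tot_slots fixed-size
+ * records, so slot i simply starts i * slot_len bytes from the base. */
+#if 0 /* illustration only */
+#include <stdio.h>
+int main(void) {
+	char ring[4 * 16];                   /* 4 slots of 16 bytes       */
+	unsigned slot_len = 16, idx = 2;
+	char *slot = &ring[idx * slot_len];  /* as get_insert_slot() does */
+	printf("slot %u at offset %ld\n", idx, (long)(slot - ring)); /* 32 */
+	return 0;
+}
+#endif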
8062 +static inline FlowSlot* get_remove_slot(struct ring_opt *pfr)
8064 +#if defined(RING_DEBUG)
8065 + printk("[PF_RING] get_remove_slot(%d)\n", pfr->slots_info->remove_idx);
8068 + if(pfr->ring_slots != NULL)
8069 + return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx*
8070 + pfr->slots_info->slot_len]));
8075 +/* ******************************************************* */
8077 +static int parse_pkt(struct sk_buff *skb,
8078 + u_int16_t skb_displ,
8079 + struct pfring_pkthdr *hdr)
8082 + struct ethhdr *eh = (struct ethhdr*)(skb->data-skb_displ);
8085 + memset(&hdr->parsed_pkt, 0, sizeof(struct pkt_parsing_info));
8086 + hdr->parsed_header_len = 9;
8088 + hdr->parsed_pkt.eth_type = ntohs(eh->h_proto);
8089 + hdr->parsed_pkt.pkt_detail.offset.eth_offset = -skb_displ;
8091 + if(hdr->parsed_pkt.eth_type == 0x8100 /* 802.1q (VLAN) */)
8093 + hdr->parsed_pkt.pkt_detail.offset.vlan_offset = hdr->parsed_pkt.pkt_detail.offset.eth_offset + sizeof(struct ethhdr);
8094 + hdr->parsed_pkt.vlan_id = (skb->data[hdr->parsed_pkt.pkt_detail.offset.vlan_offset] & 15) * 256
8095 + + skb->data[hdr->parsed_pkt.pkt_detail.offset.vlan_offset + 1];
8096 + hdr->parsed_pkt.eth_type = (skb->data[hdr->parsed_pkt.pkt_detail.offset.vlan_offset + 2]) * 256
8097 + + skb->data[hdr->parsed_pkt.pkt_detail.offset.vlan_offset + 3];
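+	/* Editorial note (not part of the original patch): the 802.1Q TCI is
+	 * the two bytes after the TPID; the low 4 bits of its first byte are
+	 * the VID's high nibble, hence the "& 15" and "* 256" above, while
+	 * the PCP/DEI bits in the top nibble are deliberately masked off. */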
8103 + hdr->parsed_pkt.vlan_id = 0; /* Any VLAN */
8106 + if(hdr->parsed_pkt.eth_type == 0x0800 /* IP */) {
8107 + hdr->parsed_pkt.pkt_detail.offset.l3_offset = hdr->parsed_pkt.pkt_detail.offset.eth_offset+displ+sizeof(struct ethhdr);
8108 + ip = (struct iphdr*)(skb->data+hdr->parsed_pkt.pkt_detail.offset.l3_offset);
8110 + hdr->parsed_pkt.ipv4_src = ntohl(ip->saddr), hdr->parsed_pkt.ipv4_dst = ntohl(ip->daddr), hdr->parsed_pkt.l3_proto = ip->protocol;
8111 + hdr->parsed_pkt.ipv4_tos = ip->tos;
8112 + hdr->parsed_pkt.pkt_detail.offset.l4_offset = hdr->parsed_pkt.pkt_detail.offset.l3_offset+ip->ihl*4;
8114 + if((ip->protocol == IPPROTO_TCP) || (ip->protocol == IPPROTO_UDP))
8116 + if(ip->protocol == IPPROTO_TCP)
8118 + struct tcphdr *tcp = (struct tcphdr*)(skb->data+hdr->parsed_pkt.pkt_detail.offset.l4_offset);
8119 + hdr->parsed_pkt.l4_src_port = ntohs(tcp->source), hdr->parsed_pkt.l4_dst_port = ntohs(tcp->dest);
8120 + hdr->parsed_pkt.pkt_detail.offset.payload_offset = hdr->parsed_pkt.pkt_detail.offset.l4_offset+(tcp->doff * 4);
8121 + hdr->parsed_pkt.tcp_flags = (tcp->fin * TH_FIN_MULTIPLIER) + (tcp->syn * TH_SYN_MULTIPLIER) + (tcp->rst * TH_RST_MULTIPLIER) +
8122 + (tcp->psh * TH_PUSH_MULTIPLIER) + (tcp->ack * TH_ACK_MULTIPLIER) + (tcp->urg * TH_URG_MULTIPLIER);
8123 + } else if(ip->protocol == IPPROTO_UDP)
8125 + struct udphdr *udp = (struct udphdr*)(skb->data+hdr->parsed_pkt.pkt_detail.offset.l4_offset);
8126 + hdr->parsed_pkt.l4_src_port = ntohs(udp->source), hdr->parsed_pkt.l4_dst_port = ntohs(udp->dest);
8127 + hdr->parsed_pkt.pkt_detail.offset.payload_offset = hdr->parsed_pkt.pkt_detail.offset.l4_offset+sizeof(struct udphdr);
8129 + hdr->parsed_pkt.pkt_detail.offset.payload_offset = hdr->parsed_pkt.pkt_detail.offset.l4_offset;
8131 + hdr->parsed_pkt.l4_src_port = hdr->parsed_pkt.l4_dst_port = 0;
8133 + hdr->parsed_pkt.pkt_detail.offset.eth_offset = skb_displ;
8135 + return(1); /* IP */
8136 + } /* TODO: handle IPv6 */
8138 + return(0); /* No IP */
8141 +/* ********************************** */
8143 +inline u_int32_t hash_pkt(u_int16_t vlan_id, u_int8_t proto,
8144 + u_int32_t host_peer_a, u_int32_t host_peer_b,
8145 + u_int16_t port_peer_a, u_int16_t port_peer_b)
8147 + return(vlan_id+proto+host_peer_a+host_peer_b+port_peer_a+port_peer_b);
8150 +/* ********************************** */
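+/* Editorial sketch (not part of the original patch): hash_pkt() above is a
+ * plain sum, so both directions of a flow hash identically -- which is what
+ * the bidirectional bucket matching below relies on; unrelated tuples can of
+ * course collide. */
+#if 0 /* illustration only */
+#include <stdio.h>
+#include <stdint.h>
+static uint32_t hash_pkt(uint16_t vlan, uint8_t proto,
+                         uint32_t a, uint32_t b, uint16_t pa, uint16_t pb) {
+	return vlan + proto + a + b + pa + pb;
+}
+int main(void) {
+	/* 10.0.0.1:1234 <-> 10.0.0.2:80, TCP (proto 6), no VLAN */
+	uint32_t h1 = hash_pkt(0, 6, 0x0a000001, 0x0a000002, 1234, 80);
+	uint32_t h2 = hash_pkt(0, 6, 0x0a000002, 0x0a000001, 80, 1234);
+	printf("%u %u same=%d\n", h1, h2, h1 == h2); /* same=1 */
+	return 0;
+}
+#endif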
8152 +inline u_int32_t hash_pkt_header(struct pfring_pkthdr *hdr, u_char mask_src, u_char mask_dst)
8154 + return(hash_pkt(hdr->parsed_pkt.vlan_id,
8155 + hdr->parsed_pkt.l3_proto,
8156 + mask_src ? 0 : hdr->parsed_pkt.ipv4_src,
8157 + mask_dst ? 0 : hdr->parsed_pkt.ipv4_dst,
8158 + mask_src ? 0 : hdr->parsed_pkt.l4_src_port,
8159 + mask_dst ? 0 : hdr->parsed_pkt.l4_dst_port));
8162 +/* ********************************** */
8164 +static int hash_bucket_match(filtering_hash_bucket *hash_bucket,
8165 + struct pfring_pkthdr *hdr,
8166 + u_char mask_src, u_char mask_dst)
8168 + if((hash_bucket->rule.proto == hdr->parsed_pkt.l3_proto)
8169 + && (hash_bucket->rule.vlan_id == hdr->parsed_pkt.vlan_id)
8170 + && (((hash_bucket->rule.host_peer_a == (mask_src ? 0 : hdr->parsed_pkt.ipv4_src))
8171 + && (hash_bucket->rule.host_peer_b == (mask_dst ? 0 : hdr->parsed_pkt.ipv4_dst))
8172 + && (hash_bucket->rule.port_peer_a == (mask_src ? 0 : hdr->parsed_pkt.l4_src_port))
8173 + && (hash_bucket->rule.port_peer_b == (mask_dst ? 0 : hdr->parsed_pkt.l4_dst_port)))
8175 + ((hash_bucket->rule.host_peer_a == (mask_dst ? 0 : hdr->parsed_pkt.ipv4_dst))
8176 + && (hash_bucket->rule.host_peer_b == (mask_src ? 0 : hdr->parsed_pkt.ipv4_src))
8177 + && (hash_bucket->rule.port_peer_a == (mask_dst ? 0 : hdr->parsed_pkt.l4_dst_port))
8178 + && (hash_bucket->rule.port_peer_b == (mask_src ? 0 : hdr->parsed_pkt.l4_src_port))))) {
8179 + hash_bucket->rule.jiffies_last_match = jiffies;
8185 +/* ********************************** */
8187 +inline int hash_bucket_match_rule(filtering_hash_bucket *hash_bucket,
8188 + hash_filtering_rule *rule)
8193 + printk("[PF_RING] (%u,%d,%d.%d.%d.%d:%u,%d.%d.%d.%d:%u) (%u,%d,%d.%d.%d.%d:%u,%d.%d.%d.%d:%u)\n",
8194 + hash_bucket->rule.vlan_id, hash_bucket->rule.proto,
8195 + ((hash_bucket->rule.host_peer_a >> 24) & 0xff),
8196 + ((hash_bucket->rule.host_peer_a >> 16) & 0xff),
8197 + ((hash_bucket->rule.host_peer_a >> 8) & 0xff),
8198 + ((hash_bucket->rule.host_peer_a >> 0) & 0xff),
8199 + hash_bucket->rule.port_peer_a,
8200 + ((hash_bucket->rule.host_peer_b >> 24) & 0xff),
8201 + ((hash_bucket->rule.host_peer_b >> 16) & 0xff),
8202 + ((hash_bucket->rule.host_peer_b >> 8) & 0xff),
8203 + ((hash_bucket->rule.host_peer_b >> 0) & 0xff),
8204 + hash_bucket->rule.port_peer_b,
8205 + rule->vlan_id, rule->proto,
8206 + ((rule->host_peer_a >> 24) & 0xff),
8207 + ((rule->host_peer_a >> 16) & 0xff),
8208 + ((rule->host_peer_a >> 8) & 0xff),
8209 + ((rule->host_peer_a >> 0) & 0xff),
8210 + rule->port_peer_a,
8211 + ((rule->host_peer_b >> 24) & 0xff),
8212 + ((rule->host_peer_b >> 16) & 0xff),
8213 + ((rule->host_peer_b >> 8) & 0xff),
8214 + ((rule->host_peer_b >> 0) & 0xff),
8215 + rule->port_peer_b);
8217 + if((hash_bucket->rule.proto == rule->proto)
8218 + && (hash_bucket->rule.vlan_id == rule->vlan_id)
8219 + && (((hash_bucket->rule.host_peer_a == rule->host_peer_a)
8220 + && (hash_bucket->rule.host_peer_b == rule->host_peer_b)
8221 + && (hash_bucket->rule.port_peer_a == rule->port_peer_a)
8222 + && (hash_bucket->rule.port_peer_b == rule->port_peer_b))
8224 + ((hash_bucket->rule.host_peer_a == rule->host_peer_b)
8225 + && (hash_bucket->rule.host_peer_b == rule->host_peer_a)
8226 + && (hash_bucket->rule.port_peer_a == rule->port_peer_b)
8227 + && (hash_bucket->rule.port_peer_b == rule->port_peer_a)))) {
8228 + hash_bucket->rule.jiffies_last_match = jiffies;
8234 +/* ********************************** */
8236 +inline int hash_filtering_rule_match(hash_filtering_rule *a,
8237 + hash_filtering_rule *b)
8242 + printk("[PF_RING] (%u,%d,%d.%d.%d.%d:%u,%d.%d.%d.%d:%u) (%u,%d,%d.%d.%d.%d:%u,%d.%d.%d.%d:%u)\n",
8243 + a->vlan_id, a->proto,
8244 + ((a->host_peer_a >> 24) & 0xff),
8245 + ((a->host_peer_a >> 16) & 0xff),
8246 + ((a->host_peer_a >> 8) & 0xff),
8247 + ((a->host_peer_a >> 0) & 0xff),
8249 + ((a->host_peer_b >> 24) & 0xff),
8250 + ((a->host_peer_b >> 16) & 0xff),
8251 + ((a->host_peer_b >> 8) & 0xff),
8252 + ((a->host_peer_b >> 0) & 0xff),
8255 + b->vlan_id, b->proto,
8256 + ((b->host_peer_a >> 24) & 0xff),
8257 + ((b->host_peer_a >> 16) & 0xff),
8258 + ((b->host_peer_a >> 8) & 0xff),
8259 + ((b->host_peer_a >> 0) & 0xff),
8261 + ((b->host_peer_b >> 24) & 0xff),
8262 + ((b->host_peer_b >> 16) & 0xff),
8263 + ((b->host_peer_b >> 8) & 0xff),
8264 + ((b->host_peer_b >> 0) & 0xff),
8268 + if((a->proto == b->proto)
8269 + && (a->vlan_id == b->vlan_id)
8270 + && (((a->host_peer_a == b->host_peer_a)
8271 + && (a->host_peer_b == b->host_peer_b)
8272 + && (a->port_peer_a == b->port_peer_a)
8273 + && (a->port_peer_b == b->port_peer_b))
8275 + ((a->host_peer_a == b->host_peer_b)
8276 + && (a->host_peer_b == b->host_peer_a)
8277 + && (a->port_peer_a == b->port_peer_b)
8278 + && (a->port_peer_b == b->port_peer_a)))) {
8284 +/* ********************************** */
8286 +/* 0 = no match, 1 = match */
8287 +static int match_filtering_rule(struct ring_opt *the_ring,
8288 + filtering_rule_element *rule,
8289 + struct pfring_pkthdr *hdr,
8290 + struct sk_buff *skb,
8292 + struct parse_buffer *parse_memory_buffer[],
8293 + u_int8_t *free_parse_mem,
8294 + u_int *last_matched_plugin,
8295 + packet_action_behaviour *behaviour)
8299 + /* if(debug) printk("[PF_RING] match_filtering_rule()\n"); */
8301 + *behaviour = use_rule_forward_policy; /* Default */
8303 + if((rule->rule.core_fields.vlan_id > 0) && (hdr->parsed_pkt.vlan_id != rule->rule.core_fields.vlan_id)) return(0);
8304 + if((rule->rule.core_fields.proto > 0) && (hdr->parsed_pkt.l3_proto != rule->rule.core_fields.proto)) return(0);
8306 + if(rule->rule.core_fields.host_low > 0) {
8307 + if(((hdr->parsed_pkt.ipv4_src < rule->rule.core_fields.host_low)
8308 + || (hdr->parsed_pkt.ipv4_src > rule->rule.core_fields.host_high))
8309 + && ((hdr->parsed_pkt.ipv4_dst < rule->rule.core_fields.host_low)
8310 + || (hdr->parsed_pkt.ipv4_dst > rule->rule.core_fields.host_high)))
8314 + if((rule->rule.core_fields.port_high > 0)
8315 + && (!((hdr->parsed_pkt.l4_src_port >= rule->rule.core_fields.port_low)
8316 + && (hdr->parsed_pkt.l4_src_port <= rule->rule.core_fields.port_high)))
8317 + && (!((hdr->parsed_pkt.l4_dst_port >= rule->rule.core_fields.port_low)
8318 + && (hdr->parsed_pkt.l4_dst_port <= rule->rule.core_fields.port_high))))
8321 + if(rule->rule.balance_pool > 0) {
8322 + u_int32_t balance_hash = hash_pkt_header(hdr, 0, 0) % rule->rule.balance_pool;
8323 + if(balance_hash != rule->rule.balance_id) return(0);
8326 + if(rule->pattern != NULL) {
8327 + if((hdr->parsed_pkt.pkt_detail.offset.payload_offset > 0)
8328 + && (hdr->caplen > hdr->parsed_pkt.pkt_detail.offset.payload_offset)) {
8329 + char *payload = (char*)&(skb->data[hdr->parsed_pkt.pkt_detail.offset.payload_offset /* -displ */]);
8330 + int i, rc, payload_len = hdr->caplen - hdr->parsed_pkt.pkt_detail.offset.payload_offset - displ;
8332 + if(payload_len > 0) {
8334 + printk("[PF_RING] Trying to match pattern [caplen=%d][len=%d][displ=%d][payload_offset=%d][",
8335 + hdr->caplen, payload_len, displ, hdr->parsed_pkt.pkt_detail.offset.payload_offset);
8337 + for(i=0; i<payload_len; i++) printk("[%d/%c]", i, payload[i] & 0xFF);
8341 + payload[payload_len] = '\0';
8343 + if(debug) printk("[PF_RING] Attempt to match [%s]\n", payload);
8344 + rc = regexec(rule->pattern, payload);
8347 + printk("[PF_RING] Match returned: %d [payload_len=%d][%s]\n", rc, payload_len, payload);
8350 + return(0); /* No match */
8352 + return(0); /* No payload data */
8354 + return(0); /* No payload data */
8357 + if((rule->rule.extended_fields.filter_plugin_id > 0)
8358 + && (rule->rule.extended_fields.filter_plugin_id < MAX_PLUGIN_ID)
8359 + && (plugin_registration[rule->rule.extended_fields.filter_plugin_id] != NULL)
8360 + && (plugin_registration[rule->rule.extended_fields.filter_plugin_id]->pfring_plugin_filter_skb != NULL)
8365 + printk("[PF_RING] rule->plugin_id [rule_id=%d][filter_plugin_id=%d][plugin_action=%d][ptr=%p]\n",
8366 + rule->rule.rule_id,
8367 + rule->rule.extended_fields.filter_plugin_id,
8368 + rule->rule.plugin_action.plugin_id,
8369 + plugin_registration[rule->rule.plugin_action.plugin_id]);
8371 + rc = plugin_registration[rule->rule.extended_fields.filter_plugin_id]
8372 + ->pfring_plugin_filter_skb(the_ring, rule, hdr, skb,
8373 + &parse_memory_buffer[rule->rule.extended_fields.filter_plugin_id]);
8375 + if(parse_memory_buffer[rule->rule.extended_fields.filter_plugin_id]) *free_parse_mem = 1;
8378 + return(0); /* No match */
8380 + *last_matched_plugin = rule->rule.extended_fields.filter_plugin_id;
8381 + hdr->parsed_pkt.last_matched_plugin_id = rule->rule.extended_fields.filter_plugin_id;
8384 + printk("[PF_RING] [last_matched_plugin = %d][buffer=%p][len=%d]\n",
8385 + *last_matched_plugin, parse_memory_buffer[rule->rule.extended_fields.filter_plugin_id],
8386 + parse_memory_buffer[rule->rule.extended_fields.filter_plugin_id] ?
8387 + parse_memory_buffer[rule->rule.extended_fields.filter_plugin_id]->mem_len : 0);
8391 + /* Action to be performed in case of match */
8392 + if((rule->rule.plugin_action.plugin_id != 0)
8393 + && (rule->rule.plugin_action.plugin_id < MAX_PLUGIN_ID)
8394 + && (plugin_registration[rule->rule.plugin_action.plugin_id] != NULL)
8395 + && (plugin_registration[rule->rule.plugin_action.plugin_id]->pfring_plugin_handle_skb != NULL)
8397 + if(debug) printk("[PF_RING] Calling pfring_plugin_handle_skb()\n");
8399 + plugin_registration[rule->rule.plugin_action.plugin_id]
8400 + ->pfring_plugin_handle_skb(the_ring, rule, NULL, hdr, skb,
8401 + rule->rule.extended_fields.filter_plugin_id,
8402 + &parse_memory_buffer[rule->rule.extended_fields.filter_plugin_id],
8405 + if(*last_matched_plugin == 0)
8406 + *last_matched_plugin = rule->rule.plugin_action.plugin_id;
8408 + if(parse_memory_buffer[rule->rule.plugin_action.plugin_id]) *free_parse_mem = 1;
8410 + if(debug) printk("[PF_RING] Skipping pfring_plugin_handle_skb(plugin_action=%d)\n",
8411 + rule->rule.plugin_action.plugin_id);
8415 + printk("[PF_RING] MATCH: match_filtering_rule(vlan=%u, proto=%u, sip=%u, sport=%u, dip=%u, dport=%u)\n",
8416 + hdr->parsed_pkt.vlan_id, hdr->parsed_pkt.l3_proto, hdr->parsed_pkt.ipv4_src, hdr->parsed_pkt.l4_src_port,
8417 + hdr->parsed_pkt.ipv4_dst, hdr->parsed_pkt.l4_dst_port);
8418 + printk("[PF_RING] [rule(vlan=%u, proto=%u, ip=%u-%u, port=%u-%u)(behaviour=%d)]\n",
8419 + rule->rule.core_fields.vlan_id, rule->rule.core_fields.proto,
8420 + rule->rule.core_fields.host_low, rule->rule.core_fields.host_high,
8421 + rule->rule.core_fields.port_low,
8422 + rule->rule.core_fields.port_high, *behaviour);
8425 + rule->rule.jiffies_last_match = jiffies;
8426 + return(1); /* match */
8429 +/* ********************************** */
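+/* Editorial sketch (not part of the original patch): the regcomp()/regexec()
+ * pair from the embedded library above, used the way match_filtering_rule()
+ * applies rule->pattern to the payload; shown as a stand-alone function.
+ * Error handling is elided. */
+#if 0 /* illustration only */
+static void pattern_example(void)
+{
+	char pat[] = "get .*http";  /* array: regcomp() lowercases in place */
+	char payload[] = "GET /index.html HTTP/1.0";
+	int patternsize;
+	regexp *prog = regcomp(pat, &patternsize);
+	if (prog != NULL && regexec(prog, payload)) {
+		/* matched: with case_insensitive=1, regexec() lowercases
+		 * the subject string before matching, too */
+	}
+}
+#endif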
8431 +static void add_pkt_to_ring(struct sk_buff *skb,
8432 + struct ring_opt *pfr,
8433 + struct pfring_pkthdr *hdr,
8434 + int displ, short channel_id,
8435 + int offset, void* plugin_mem)
8437 + char *ring_bucket;
8439 + FlowSlot *theSlot;
8440 + int32_t the_bit = 1 << channel_id;
8442 + if(!pfr->ring_active) return;
8444 +#if defined(RING_DEBUG)
8445 + printk("[PF_RING] --> add_pkt_to_ring(len=%d) [pfr->channel_id=%d][channel_id=%d]\n",
8446 + hdr->len, pfr->channel_id, channel_id);
8449 + if((pfr->channel_id != RING_ANY_CHANNEL)
8450 + && (channel_id != RING_ANY_CHANNEL)
8451 + && ((pfr->channel_id & the_bit) != the_bit))
8452 + return; /* Wrong channel */
8454 + write_lock_bh(&pfr->ring_index_lock);
8455 + idx = pfr->slots_info->insert_idx;
8456 + idx++, theSlot = get_insert_slot(pfr);
8457 + pfr->slots_info->tot_pkts++;
8459 + if((theSlot == NULL) || (theSlot->slot_state != 0)) {
8460 + /* No room left */
8461 + pfr->slots_info->tot_lost++;
8462 + write_unlock_bh(&pfr->ring_index_lock);
8466 + ring_bucket = &theSlot->bucket;
8467 + memcpy(ring_bucket, hdr, sizeof(struct pfring_pkthdr)); /* Copy extended packet header */
8469 + if((plugin_mem != NULL) && (offset > 0)) {
8470 + memcpy(&ring_bucket[sizeof(struct pfring_pkthdr)], plugin_mem, offset);
8474 + hdr->caplen = min(pfr->bucket_len-offset, hdr->caplen);
8476 + if(hdr->caplen > 0) {
8477 +#if defined(RING_DEBUG)
8478 + printk("[PF_RING] --> [caplen=%d][len=%d][displ=%d][parsed_header_len=%d][bucket_len=%d]\n",
8479 + hdr->caplen, hdr->len, displ, hdr->parsed_header_len, pfr->bucket_len);
8481 + skb_copy_bits(skb, -displ, &ring_bucket[sizeof(struct pfring_pkthdr)+offset], hdr->caplen);
8483 + if(hdr->parsed_header_len >= pfr->bucket_len) {
8484 + static u_char print_once = 0;
8487 + printk("[PF_RING] WARNING: bucket len [%d] is shorter than the plugin parsed header len [%d]\n",
8488 + pfr->bucket_len, hdr->parsed_header_len);
8495 + if(idx == pfr->slots_info->tot_slots)
8496 + pfr->slots_info->insert_idx = 0;
8498 + pfr->slots_info->insert_idx = idx;
8500 +#if defined(RING_DEBUG)
8501 + printk("[PF_RING] ==> insert_idx=%d\n", pfr->slots_info->insert_idx);
8504 + pfr->slots_info->tot_insert++;
8505 + theSlot->slot_state = 1;
8506 + write_unlock_bh(&pfr->ring_index_lock);
8508 + /* wakeup in case of poll() */
8509 + if(waitqueue_active(&pfr->ring_slots_waitqueue))
8510 + wake_up_interruptible(&pfr->ring_slots_waitqueue);
8513 +/* ********************************** */
8515 +static int add_hdr_to_ring(struct ring_opt *pfr,
8516 + struct pfring_pkthdr *hdr) {
8517 + read_lock_bh(&ring_mgmt_lock);
8518 + add_pkt_to_ring(NULL, pfr, hdr, 0, 0, 0, NULL);
8519 + read_unlock_bh(&ring_mgmt_lock);
8523 +/* ********************************** */
8525 +/* Free filtering placeholders */
8526 +static void free_parse_memory(struct parse_buffer *parse_memory_buffer[]) {
8529 + for(i=1; i<=max_registered_plugin_id; i++)
8530 + if(parse_memory_buffer[i]) {
8531 + if(parse_memory_buffer[i]->mem != NULL) {
8532 + kfree(parse_memory_buffer[i]->mem);
8535 + kfree(parse_memory_buffer[i]);
8539 +/* ********************************** */
8541 +static int add_skb_to_ring(struct sk_buff *skb,
8542 + struct ring_opt *pfr,
8543 + struct pfring_pkthdr *hdr,
8549 + struct list_head *ptr, *tmp_ptr;
8550 + u_int8_t free_parse_mem = 0;
8551 + u_int last_matched_plugin = 0, debug = 0;
8552 + u_char hash_found = 0;
8553 + struct parse_buffer *parse_memory_buffer[MAX_PLUGIN_ID] = { NULL };
8554 + /* This is a memory holder
8555 + for storing parsed packet information
8556 + that will then be freed when the packet has been handled */
8560 + if(!pfr->ring_active) return(-1);
8561 + atomic_set(&pfr->num_ring_users, 1);
8563 + /* [1] BPF Filtering (from af_packet.c) */
8564 + if(pfr->bpfFilter != NULL) {
8565 + unsigned res = 1, len;
8567 + len = skb->len-skb->data_len;
8569 + skb->data -= displ;
8570 + res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len);
8571 + skb->data += displ;
8574 + /* Filter failed */
8575 +#if defined(RING_DEBUG)
8576 + printk("[PF_RING] add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]"
8577 + "[insertIdx=%d][pkt_type=%d][cloned=%d]\n",
8578 + (int)skb->len, pfr->slots_info->tot_pkts,
8579 + pfr->slots_info->insert_idx,
8580 + skb->pkt_type, skb->cloned);
8582 + atomic_set(&pfr->num_ring_users, 0);
8587 +#if defined(RING_DEBUG)
8588 + printk("[PF_RING] add_skb_to_ring: [displ=%d][len=%d][caplen=%d]"
8589 + "[is_ip_pkt=%d][%d -> %d]\n",
8590 + displ, hdr->len, hdr->caplen,
8591 + is_ip_pkt, hdr->parsed_pkt.l4_src_port,
8592 + hdr->parsed_pkt.l4_dst_port);
8595 + /* ************************************* */
8597 +#if defined(RING_DEBUG)
8598 + printk("[PF_RING] add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]"
8599 + "[pkt_type=%d][cloned=%d]\n",
8600 + (int)skb->len, pfr->slots_info->tot_pkts,
8601 + pfr->slots_info->insert_idx,
8602 + skb->pkt_type, skb->cloned);
8606 + fwd_pkt = pfr->rules_default_accept_policy;
8607 + /* printk("[PF_RING] rules_default_accept_policy: [fwd_pkt=%d]\n", fwd_pkt); */
8609 + /* ************************** */
8611 + /* [2] Filter packet according to rules */
8614 + printk("[PF_RING] About to evaluate packet [len=%d][tot=%llu][insertIdx=%d]"
8615 + "[pkt_type=%d][cloned=%d]\n",
8616 + (int)skb->len, pfr->slots_info->tot_pkts,
8617 + pfr->slots_info->insert_idx,
8618 + skb->pkt_type, skb->cloned);
8620 + /* [2.1] Search the hash */
8621 + if(pfr->filtering_hash != NULL) {
8623 + filtering_hash_bucket *hash_bucket;
8625 + hash_idx = hash_pkt_header(hdr, 0, 0) % DEFAULT_RING_HASH_SIZE;
8626 + hash_bucket = pfr->filtering_hash[hash_idx];
8628 + while(hash_bucket != NULL) {
8629 + if(hash_bucket_match(hash_bucket, hdr, 0, 0)) {
8633 + hash_bucket = hash_bucket->next;
8637 + packet_action_behaviour behaviour = forward_packet_and_stop_rule_evaluation;
8639 + if((hash_bucket->rule.plugin_action.plugin_id != 0)
8640 + && (hash_bucket->rule.plugin_action.plugin_id < MAX_PLUGIN_ID)
8641 + && (plugin_registration[hash_bucket->rule.plugin_action.plugin_id] != NULL)
8642 + && (plugin_registration[hash_bucket->rule.plugin_action.plugin_id]->pfring_plugin_handle_skb != NULL)
8644 + plugin_registration[hash_bucket->rule.plugin_action.plugin_id]
8645 + ->pfring_plugin_handle_skb(pfr, NULL, hash_bucket, hdr, skb,
8646 + 0 /* no plugin */,
8647 + &parse_memory_buffer[hash_bucket->rule.plugin_action.plugin_id],
8650 + if(parse_memory_buffer[hash_bucket->rule.plugin_action.plugin_id]) free_parse_mem = 1;
8651 + last_matched_plugin = hash_bucket->rule.plugin_action.plugin_id;
8652 + hdr->parsed_pkt.last_matched_plugin_id = hash_bucket->rule.plugin_action.plugin_id;
8655 + if((behaviour == forward_packet_and_stop_rule_evaluation)
8656 + || (behaviour == forward_packet_add_rule_and_stop_rule_evaluation)
8659 + else if(behaviour == dont_forward_packet_and_stop_rule_evaluation)
8662 + if(hash_bucket->rule.rule_action == forward_packet_and_stop_rule_evaluation) {
8664 + } else if(hash_bucket->rule.rule_action == dont_forward_packet_and_stop_rule_evaluation) {
8666 + } else if(hash_bucket->rule.rule_action == execute_action_and_continue_rule_evaluation) {
8667 + hash_found = 0; /* This way we also evaluate the list of rules */
8671 + /* printk("[PF_RING] Packet not found\n"); */
8675 + /* [2.2] Search rules list */
8676 + if((!hash_found) && (pfr->num_filtering_rules > 0)) {
8677 + list_for_each_safe(ptr, tmp_ptr, &pfr->rules)
8679 + filtering_rule_element *entry;
8680 + packet_action_behaviour behaviour = forward_packet_and_stop_rule_evaluation;
8682 + entry = list_entry(ptr, filtering_rule_element, list);
8684 + if(match_filtering_rule(pfr, entry, hdr, skb, displ,
8685 + parse_memory_buffer, &free_parse_mem,
8686 + &last_matched_plugin, &behaviour))
8689 + if(behaviour == use_rule_forward_policy)
8690 + behaviour = entry->rule.rule_action;
8692 + if(debug) printk("[PF_RING] behaviour=%d\n", behaviour);
8694 + if(behaviour == forward_packet_and_stop_rule_evaluation) {
8697 + } else if(behaviour == forward_packet_add_rule_and_stop_rule_evaluation) {
8698 + filtering_hash_bucket *hash_bucket;
8702 + hash_bucket = (filtering_hash_bucket*)kcalloc(1, sizeof(filtering_hash_bucket), GFP_KERNEL);
8707 + hash_bucket->rule.vlan_id = hdr->parsed_pkt.vlan_id;
8708 + hash_bucket->rule.proto = hdr->parsed_pkt.l3_proto;
8709 + hash_bucket->rule.host_peer_a = hdr->parsed_pkt.ipv4_src;
8710 + hash_bucket->rule.host_peer_b = hdr->parsed_pkt.ipv4_dst;
8711 + hash_bucket->rule.port_peer_a = hdr->parsed_pkt.l4_src_port;
8712 + hash_bucket->rule.port_peer_b = hdr->parsed_pkt.l4_dst_port;
8713 + hash_bucket->rule.rule_action = forward_packet_and_stop_rule_evaluation;
8714 + hash_bucket->rule.jiffies_last_match = jiffies; /* Avoid immediate rule purging */
8716 + //write_lock_bh(&pfr->ring_rules_lock);
8717 + rc = pfr->handle_hash_rule(pfr, hash_bucket, 1 /* add_rule_from_plugin */);
8718 + pfr->num_filtering_rules++;
8719 + // write_unlock_bh(&pfr->ring_rules_lock);
8722 + kfree(hash_bucket);
8725 + if(debug) printk("[PF_RING] Added rule: [%d.%d.%d.%d:%d <-> %d.%d.%d.%d:%d][tot_rules=%d]\n",
8726 + ((hash_bucket->rule.host_peer_a >> 24) & 0xff),
8727 + ((hash_bucket->rule.host_peer_a >> 16) & 0xff),
8728 + ((hash_bucket->rule.host_peer_a >> 8) & 0xff),
8729 + ((hash_bucket->rule.host_peer_a >> 0) & 0xff),
8730 + hash_bucket->rule.port_peer_a,
8731 + ((hash_bucket->rule.host_peer_b >> 24) & 0xff),
8732 + ((hash_bucket->rule.host_peer_b >> 16) & 0xff),
8733 + ((hash_bucket->rule.host_peer_b >> 8) & 0xff),
8734 + ((hash_bucket->rule.host_peer_b >> 0) & 0xff),
8735 + hash_bucket->rule.port_peer_b,
8736 + pfr->num_filtering_rules);
8741 + } else if(behaviour == dont_forward_packet_and_stop_rule_evaluation) {
8745 + if(entry->rule.rule_action == forward_packet_and_stop_rule_evaluation) {
8748 + } else if(entry->rule.rule_action == dont_forward_packet_and_stop_rule_evaluation) {
8751 + } else if(entry->rule.rule_action == execute_action_and_continue_rule_evaluation) {
8752 + /* The action has already been performed inside match_filtering_rule()
8753 + hence instead of stopping rule evaluation, the next rule
8754 + will be evaluated */
8762 + /* We accept the packet: it needs to be queued */
8764 + /* [3] Packet sampling */
8765 + if(pfr->sample_rate > 1) {
8766 + write_lock_bh(&pfr->ring_index_lock);
8767 + pfr->slots_info->tot_pkts++;
8769 + if(pfr->pktToSample == 0) {
8770 + pfr->pktToSample = pfr->sample_rate;
8772 + pfr->pktToSample--;
8774 +#if defined(RING_DEBUG)
8775 + printk("[PF_RING] add_skb_to_ring(skb): sampled packet [len=%d]"
8776 + "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n",
8777 + (int)skb->len, pfr->slots_info->tot_pkts,
8778 + pfr->slots_info->insert_idx,
8779 + skb->pkt_type, skb->cloned);
8782 + write_unlock_bh(&pfr->ring_index_lock);
8783 + if(free_parse_mem) free_parse_memory(parse_memory_buffer);
8784 + atomic_set(&pfr->num_ring_users, 0);
8788 + write_unlock_bh(&pfr->ring_index_lock);
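+ /*
+  * With sample_rate = N only one packet in N is queued: pktToSample
+  * counts down and the packet that finds it at zero is inserted, after
+  * which the counter is re-armed. A hypothetical userland sketch
+  * (assumes setsockopt level 0 and a 32-bit sample_rate; cf. the
+  * SO_SET_SAMPLING_RATE handler below):
+  *
+  *   u_int32_t rate = 10;   // keep one packet out of every 10
+  *   setsockopt(fd, 0, SO_SET_SAMPLING_RATE, &rate, sizeof(rate));
+  */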
8791 + /* [4] Check if there is a reflector device defined */
8792 + if((pfr->reflector_dev != NULL)
8793 + && (!netif_queue_stopped(pfr->reflector_dev) /* TX is in good shape */)
8796 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30))
8797 + struct netdev_queue *txq = netdev_get_tx_queue(pfr->reflector_dev, 0 /* TX queue 0 */);
8801 + atomic_inc(&skb->users); /* Keep others from freeing the skb underneath us */
8803 + HARD_TX_LOCK(pfr->reflector_dev,
8804 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30))
8807 + smp_processor_id());
8808 + skb->data -= displ, skb->len += displ;
8809 + ret = pfr->reflector_dev->hard_start_xmit(skb, pfr->reflector_dev);
8810 + skb->data += displ, skb->len -= displ;
8811 + HARD_TX_UNLOCK(pfr->reflector_dev
8812 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30))
8817 +#if defined(RING_DEBUG)
8818 + printk("[PF_RING] reflect(len=%d, displ=%d): %d\n", skb->len, displ, ret);
8821 + atomic_set(&pfr->num_ring_users, 0); /* Done */
8822 + if(free_parse_mem) free_parse_memory(parse_memory_buffer);
8823 + return(ret == NETDEV_TX_OK ? 0 : -ENETDOWN); /* -ENETDOWN */
8826 + /* No reflector device: the packet needs to be queued */
8827 + if(hdr->caplen > 0) {
8828 + /* Copy the packet into the bucket */
8832 + if((last_matched_plugin > 0)
8833 + && (parse_memory_buffer[last_matched_plugin] != NULL)) {
8834 + offset = hdr->parsed_header_len = parse_memory_buffer[last_matched_plugin]->mem_len;
8836 + hdr->parsed_pkt.last_matched_plugin_id = last_matched_plugin;
8838 +#if defined(RING_DEBUG)
8839 + printk("[PF_RING] --> [last_matched_plugin = %d][parsed_header_len=%d]\n",
8840 + last_matched_plugin, hdr->parsed_header_len);
8843 + if(offset > pfr->bucket_len) offset = hdr->parsed_header_len = pfr->bucket_len;
8845 + mem = parse_memory_buffer[last_matched_plugin]->mem;
8847 + offset = 0, hdr->parsed_header_len = 0, mem = NULL;
8849 + add_pkt_to_ring(skb, pfr, hdr, displ, channel_id, offset, mem);
8853 +#if defined(RING_DEBUG)
8854 + printk("[PF_RING] [pfr->slots_info->insert_idx=%d]\n", pfr->slots_info->insert_idx);
8857 + if(free_parse_mem) free_parse_memory(parse_memory_buffer);
8858 + atomic_set(&pfr->num_ring_users, 0);
8863 +/* ********************************** */
8865 +static u_int hash_skb(ring_cluster_element *cluster_ptr,
8866 + struct sk_buff *skb,
8872 + if(cluster_ptr->cluster.hashing_mode == cluster_round_robin)
8874 + idx = cluster_ptr->cluster.hashing_id++;
8878 + /* Per-flow clustering */
8879 + if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr))
8884 + Always points to the IP part of the packet
8886 + ip = (struct iphdr*)(skb->data+displ);
8887 + idx = ip->saddr+ip->daddr+ip->protocol;
8889 + if(ip->protocol == IPPROTO_TCP)
8891 + struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ
8892 + +sizeof(struct iphdr));
8893 + idx += tcp->source+tcp->dest;
8895 + else if(ip->protocol == IPPROTO_UDP)
8897 + struct udphdr *udp = (struct udphdr*)(skb->data+displ
8898 + +sizeof(struct iphdr));
8899 + idx += udp->source+udp->dest;
8906 + return(idx % cluster_ptr->cluster.num_cluster_elements);
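+ /*
+  * Note: the per-flow index sums addresses and ports, so it is
+  * direction-independent: e.g. 192.168.0.1:80 <-> 10.0.0.1:34000 over TCP
+  * hashes to (saddr + daddr + IPPROTO_TCP + 80 + 34000) %
+  * num_cluster_elements either way, keeping both directions of a flow on
+  * the same cluster element.
+  */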
8909 +/* ********************************** */
8911 +static int register_plugin(struct pfring_plugin_registration *reg)
8913 + if(reg == NULL) return(-1);
8916 + printk("[PF_RING] --> register_plugin(%d)\n", reg->plugin_id);
8919 + if((reg->plugin_id >= MAX_PLUGIN_ID) || (reg->plugin_id == 0))
8920 + return(-EINVAL);
8922 + if(plugin_registration[reg->plugin_id] != NULL)
8923 + return(-EINVAL); /* plugin already registered */
8925 + plugin_registration[reg->plugin_id] = reg;
8926 + plugin_registration_size++;
8928 + max_registered_plugin_id = max(max_registered_plugin_id, reg->plugin_id);
8930 + printk("[PF_RING] registered plugin [id=%d][max=%d][%p]\n",
8931 + reg->plugin_id, max_registered_plugin_id, plugin_registration[reg->plugin_id]);
8932 + try_module_get(THIS_MODULE); /* Increment usage count */
8937 +/* ********************************** */
8939 +int unregister_plugin(u_int16_t pfring_plugin_id)
8943 + if(pfring_plugin_id >= MAX_PLUGIN_ID)
8944 + return(-EINVAL);
8946 + if(plugin_registration[pfring_plugin_id] == NULL)
8947 + return(-EINVAL); /* plugin not registered */
8949 + struct list_head *ptr, *tmp_ptr, *ring_ptr, *ring_tmp_ptr;
8951 + plugin_registration[pfring_plugin_id] = NULL;
8952 + plugin_registration_size--;
8954 + read_lock_bh(&ring_mgmt_lock);
8955 + list_for_each_safe(ring_ptr, ring_tmp_ptr, &ring_table) {
8956 + struct ring_element *entry = list_entry(ring_ptr, struct ring_element, list);
8957 + struct ring_opt *pfr = ring_sk(entry->sk);
8959 + list_for_each_safe(ptr, tmp_ptr, &pfr->rules)
8961 + filtering_rule_element *rule;
8963 + rule = list_entry(ptr, filtering_rule_element, list);
8965 + if(rule->rule.plugin_action.plugin_id == pfring_plugin_id) {
8966 + if(plugin_registration[pfring_plugin_id]
8967 + && plugin_registration[pfring_plugin_id]->pfring_plugin_free_ring_mem) {
8968 + /* Custom free function */
8969 + plugin_registration[pfring_plugin_id]->pfring_plugin_free_ring_mem(rule);
8971 + if(rule->plugin_data_ptr != NULL) {
8972 + kfree(rule->plugin_data_ptr);
8973 + rule->plugin_data_ptr = NULL;
8977 + rule->rule.plugin_action.plugin_id = 0;
8981 + read_unlock_bh(&ring_mgmt_lock);
8983 + for(i=MAX_PLUGIN_ID-1; i>0; i--) {
8984 + if(plugin_registration[i] != NULL) {
8985 + max_registered_plugin_id = i;
8990 + printk("[PF_RING] unregistered plugin [id=%d][max=%d]\n",
8991 + pfring_plugin_id, max_registered_plugin_id);
8992 + module_put(THIS_MODULE); /* Decrement usage count */
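+ /*
+  * Registration sketch for a hypothetical plugin (plugins reach this
+  * function through the hook installed via set_register_pfring_plugin();
+  * only the callbacks referenced in this file are shown, the remaining
+  * members of struct pfring_plugin_registration are left zeroed):
+  *
+  *   static struct pfring_plugin_registration my_reg = {
+  *     .plugin_id                   = 7,  // any unused id in (0, MAX_PLUGIN_ID)
+  *     .pfring_plugin_handle_skb    = my_handle_skb,
+  *     .pfring_plugin_get_stats     = my_get_stats,
+  *     .pfring_plugin_free_ring_mem = my_free_ring_mem,
+  *   };
+  *
+  *   register_plugin(&my_reg);   // on plugin load
+  *   ...
+  *   unregister_plugin(7);       // on plugin unload
+  */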
8997 +/* ********************************** */
8999 +static int skb_ring_handler(struct sk_buff *skb,
9000 + u_char recv_packet,
9001 + u_char real_skb /* 1=real skb, 0=faked skb */,
9004 + struct sock *skElement;
9005 + int rc = 0, is_ip_pkt;
9006 + struct list_head *ptr;
9007 + struct pfring_pkthdr hdr;
9009 + struct sk_buff *skk = NULL;
9010 + struct sk_buff *orig_skb = skb;
9013 + uint64_t rdt = _rdtsc(), rdt1, rdt2;
9016 + if((!skb) /* Invalid skb */
9017 + || ((!enable_tx_capture) && (!recv_packet)))
9020 + An outgoing packet is about to be sent out,
9021 + but we decided not to handle transmitted packets.
9027 +#if defined(RING_DEBUG)
9029 + struct timeval tv;
9031 + skb_get_timestamp(skb, &tv);
9032 + printk("[PF_RING] skb_ring_handler() [skb=%p][%u.%u][len=%d][dev=%s][csum=%u]\n",
9033 + skb, (unsigned int)tv.tv_sec, (unsigned int)tv.tv_usec, skb->len,
9034 + skb->dev->name == NULL ? "<NULL>" : skb->dev->name, skb->csum);
9038 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21))
9039 + if(channel_id == RING_ANY_CHANNEL /* Unknown channel */)
9040 + channel_id = skb->iif; /* Might have been set by the driver */
9043 +#if defined (RING_DEBUG)
9044 + /* printk("[PF_RING] channel_id=%d\n", channel_id); */
9052 + /* Hack for identifying a packet received by the e1000 */
9054 + displ = SKB_DISPLACEMENT;
9056 + displ = 0; /* Received by the e1000 wrapper */
9060 + is_ip_pkt = parse_pkt(skb, displ, &hdr);
9062 + /* (de)Fragmentation <fusco@ntop.org> */
9063 + if (enable_ip_defrag
9067 + && (ring_table_size > 0))
9069 + struct sk_buff *cloned = NULL;
9070 + struct iphdr* iphdr = NULL;
9072 + skb_reset_network_header(skb);
9073 + skb_reset_transport_header(skb);
9074 + skb_set_network_header(skb, ETH_HLEN-displ);
9076 + iphdr = ip_hdr(skb);
9079 +#if defined (RING_DEBUG)
9080 + printk("[PF_RING] [version=%d] %X -> %X\n", iphdr->version, iphdr->saddr, iphdr->daddr);
9082 + if (iphdr->frag_off & htons(IP_MF | IP_OFFSET))
9084 + if((cloned = skb_clone(skb, GFP_ATOMIC)) != NULL)
9086 +#if defined (RING_DEBUG)
9087 + int offset = ntohs(iphdr->frag_off);
9088 + offset &= IP_OFFSET;
9091 + printk("[PF_RING] There is a fragment to handle [proto=%d][frag_off=%u]"
9092 + "[ip_id=%u][network_header=%d][displ=%d]\n",
9093 + iphdr->protocol, offset, ntohs(iphdr->id),
9094 + hdr.parsed_pkt.pkt_detail.offset.l3_offset-displ, displ);
9096 + skk = ring_gather_frags(cloned);
9100 +#if defined (RING_DEBUG)
9101 + printk("[PF_RING] IP reasm on new skb [skb_len=%d][head_len=%d][nr_frags=%d][frag_list=%p]\n",
9102 + (int)skk->len, skb_headlen(skk),
9103 + skb_shinfo(skk)->nr_frags, skb_shinfo(skk)->frag_list);
9106 + parse_pkt(skb, displ, &hdr);
9107 + hdr.len = hdr.caplen = skb->len+displ;
9109 + //printk("[PF_RING] Fragment queued \n");
9110 + return(0); /* mask rcvd fragments */
9116 +#if defined (RING_DEBUG)
9117 + printk("[PF_RING] Does not seem to be a fragmented IP packet [iphdr=%p]\n", iphdr);
9123 + /* BD - API changed for time keeping */
9124 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
9125 + if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp);
9126 + hdr.ts.tv_sec = skb->stamp.tv_sec, hdr.ts.tv_usec = skb->stamp.tv_usec;
9127 +#elif (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22))
9128 + if(skb->tstamp.off_sec == 0) __net_timestamp(skb);
9129 + hdr.ts.tv_sec = skb->tstamp.off_sec, hdr.ts.tv_usec = skb->tstamp.off_usec;
9130 +#else /* 2.6.22 and above */
9131 + if(skb->tstamp.tv64 == 0) __net_timestamp(skb);
9132 + hdr.ts = ktime_to_timeval(skb->tstamp);
9135 + hdr.len = hdr.caplen = skb->len+displ;
9137 + /* Keep the ring from being manipulated while we access it */
9138 + read_lock_bh(&ring_mgmt_lock);
9141 + printk("[PF_RING] -----------------------------------\n");
9144 + /* [1] Check unclustered sockets */
9145 + list_for_each(ptr, &ring_table) {
9146 + struct ring_opt *pfr;
9147 + struct ring_element *entry;
9149 + entry = list_entry(ptr, struct ring_element, list);
9151 + skElement = entry->sk;
9152 + pfr = ring_sk(skElement);
9155 + if(pfr && (pfr->ring_slots != NULL)) {
9156 + /* if(pfr->ring_netdev && pfr->ring_netdev->name && strcmp(pfr->ring_netdev->name, "eth0")) */
9157 + printk("[PF_RING] Received packet [device=%s][socket=%s][%p]\n",
9158 + skb->dev->name ? skb->dev->name : "<unknown>",
9159 + pfr->ring_netdev->name ? pfr->ring_netdev->name : "<unknown>", pfr);
9164 + && (pfr->cluster_id == 0 /* No cluster */)
9165 + && (pfr->ring_slots != NULL)
9166 + && ((pfr->ring_netdev == skb->dev)
9167 + || ((skb->dev->flags & IFF_SLAVE)
9168 + && (pfr->ring_netdev == skb->dev->master)))) {
9169 + /* We've found the ring where the packet can be stored */
9170 + int old_caplen = hdr.caplen; /* Keep the old length */
9171 + hdr.caplen = min(hdr.caplen, pfr->bucket_len);
9173 + printk("[PF_RING] MATCH received packet [device=%s][socket=%s][%p]\n",
9174 + skb->dev->name ? skb->dev->name : "<unknown>",
9175 + pfr->ring_netdev->name ? pfr->ring_netdev->name : "<unknown>", pfr);
9178 + add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ, channel_id);
9179 + hdr.caplen = old_caplen;
9180 + rc = 1; /* Ring found: we've done our job */
9184 + /* [2] Check socket clusters */
9185 + list_for_each(ptr, &ring_cluster_list) {
9186 + ring_cluster_element *cluster_ptr;
9187 + struct ring_opt *pfr;
9189 + cluster_ptr = list_entry(ptr, ring_cluster_element, list);
9191 + if(cluster_ptr->cluster.num_cluster_elements > 0) {
9192 + u_int skb_hash = hash_skb(cluster_ptr, skb, displ);
9194 + skElement = cluster_ptr->cluster.sk[skb_hash];
9196 + if(skElement != NULL) {
9197 + pfr = ring_sk(skElement);
9200 + && (pfr->ring_slots != NULL)
9201 + && ((pfr->ring_netdev == skb->dev)
9202 + || ((skb->dev->flags & IFF_SLAVE)
9203 + && (pfr->ring_netdev == skb->dev->master)))) {
9204 + /* We've found the ring where the packet can be stored */
9205 + add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ, channel_id);
9206 + rc = 1; /* Ring found: we've done our job */
9212 + read_unlock_bh(&ring_mgmt_lock);
9215 + rdt1 = _rdtsc()-rdt1;
9222 + /* Fragment handling */
9227 + if(transparent_mode) {
9230 + if(recv_packet && real_skb) {
9231 +#if defined(RING_DEBUG)
9232 + printk("[PF_RING] kfree_skb()\n");
9235 + kfree_skb(orig_skb);
9241 + rdt2 = _rdtsc()-rdt2;
9242 + rdt = _rdtsc()-rdt;
9244 +#if defined(RING_DEBUG)
9245 + printk("[PF_RING] # cycles: %d [lock cost %d %d%%][free cost %d %d%%]\n",
9246 + (int)rdt, (int)(rdt-rdt1),
9247 + (int)((float)((rdt-rdt1)*100)/(float)rdt),
9248 + (int)rdt2,
9249 + (int)((float)(rdt2*100)/(float)rdt));
9253 + //printk("[PF_RING] Returned %d\n", rc);
9254 + return(rc); /* 0 = packet not handled */
9257 +/* ********************************** */
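+ /* Scratch skb filled in by buffer_ring_handler() below, for drivers that
+    hand PF_RING a raw buffer rather than a real sk_buff */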
9259 +struct sk_buff skb;
9261 +static int buffer_ring_handler(struct net_device *dev,
9262 + char *data, int len)
9264 +#if defined(RING_DEBUG)
9265 + printk("[PF_RING] buffer_ring_handler: [dev=%s][len=%d]\n",
9266 + dev->name == NULL ? "<NULL>" : dev->name, len);
9269 + skb.dev = dev, skb.len = len, skb.data = data, skb.data_len = len;
9271 + /* BD - API changed for time keeping */
9272 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
9273 + skb.stamp.tv_sec = 0;
9274 +#elif (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22))
9275 + skb.tstamp.off_sec = 0;
9277 + skb.tstamp.tv64 = 0;
9280 + return(skb_ring_handler(&skb, 1, 0 /* fake skb */, -1 /* Unknown channel */));
9283 +/* ************************************* */
9285 +static int handle_filtering_hash_bucket(struct ring_opt *pfr,
9286 + filtering_hash_bucket* rule,
9289 + u_int32_t hash_value = hash_pkt(rule->rule.vlan_id, rule->rule.proto,
9290 + rule->rule.host_peer_a, rule->rule.host_peer_b,
9291 + rule->rule.port_peer_a, rule->rule.port_peer_b) % DEFAULT_RING_HASH_SIZE;
9292 + int rc = -1, debug = 0;
9294 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket(vlan=%u, proto=%u, "
9295 + "sip=%d.%d.%d.%d, sport=%u, dip=%d.%d.%d.%d, dport=%u, "
9296 + "hash_value=%u, add_rule=%d) called\n",
9297 + rule->rule.vlan_id, rule->rule.proto,
9298 + ((rule->rule.host_peer_a >> 24) & 0xff),
9299 + ((rule->rule.host_peer_a >> 16) & 0xff),
9300 + ((rule->rule.host_peer_a >> 8) & 0xff),
9301 + ((rule->rule.host_peer_a >> 0) & 0xff),
9302 + rule->rule.port_peer_a,
9303 + ((rule->rule.host_peer_b >> 24) & 0xff),
9304 + ((rule->rule.host_peer_b >> 16) & 0xff),
9305 + ((rule->rule.host_peer_b >> 8) & 0xff),
9306 + ((rule->rule.host_peer_b >> 0) & 0xff),
9307 + rule->rule.port_peer_b,
9308 + hash_value, add_rule);
9311 + if(pfr->filtering_hash == NULL)
9312 + pfr->filtering_hash = (filtering_hash_bucket**)kcalloc(DEFAULT_RING_HASH_SIZE,
9313 + sizeof(filtering_hash_bucket*),
9315 + if(pfr->filtering_hash == NULL) {
9316 + /* kfree(rule); */
9317 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() returned %d [0]\n", -EFAULT);
9322 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() allocated memory\n");
9324 + if(pfr->filtering_hash == NULL) {
9325 + /* We're trying to delete a hash rule from an empty hash */
9329 + if(pfr->filtering_hash[hash_value] == NULL) {
9331 + pfr->filtering_hash[hash_value] = rule, rule->next = NULL, rc = 0;
9333 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() returned %d [1]\n", -1);
9334 + return(-1); /* Unable to find the specified rule */
9337 + filtering_hash_bucket *prev = NULL, *bucket = pfr->filtering_hash[hash_value];
9339 + while(bucket != NULL) {
9340 + if(hash_filtering_rule_match(&bucket->rule, &rule->rule)) {
9342 + if(debug) printk("[PF_RING] Duplicate found while adding rule: discarded\n");
9343 + /* kfree(rule); */
9346 + /* We've found the bucket to delete */
9348 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() found a bucket to delete: removing it\n");
9350 + pfr->filtering_hash[hash_value] = bucket->next;
9352 + prev->next = bucket->next;
9354 + /* Free the bucket */
9355 + if(bucket->plugin_data_ptr) kfree(bucket->plugin_data_ptr);
9357 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() returned %d [2]\n", 0);
9362 + bucket = bucket->next;
9367 + /* If we reached this point, the rule is unique */
9369 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() no duplicate rule found: adding the rule\n");
9370 + rule->next = pfr->filtering_hash[hash_value];
9371 + pfr->filtering_hash[hash_value] = rule;
9374 + /* The rule we searched for has not been found */
9379 + if(debug) printk("[PF_RING] handle_filtering_hash_bucket() returned %d [3]\n", rc);
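+ /*
+  * Hypothetical userland sketch: install an exact-match hash rule that
+  * drops one TCP flow (setsockopt level 0 and host byte order for the
+  * fields are assumed; cf. the SO_ADD_FILTERING_RULE handler below):
+  *
+  *   hash_filtering_rule r;
+  *   memset(&r, 0, sizeof(r));
+  *   r.proto       = IPPROTO_TCP;
+  *   r.host_peer_a = ntohl(inet_addr("192.168.0.1"));
+  *   r.host_peer_b = ntohl(inet_addr("10.0.0.1"));
+  *   r.port_peer_a = 80;
+  *   r.port_peer_b = 34000;
+  *   r.rule_action = dont_forward_packet_and_stop_rule_evaluation;
+  *   setsockopt(fd, 0, SO_ADD_FILTERING_RULE, &r, sizeof(r));
+  */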
9384 +/* ********************************** */
9386 +static int ring_create(
9387 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))
9390 + struct socket *sock, int protocol)
9393 + struct ring_opt *pfr;
9396 +#if defined(RING_DEBUG)
9397 + printk("[PF_RING] ring_create()\n");
9400 + /* Are you root, superuser or so ? */
9401 + if(!capable(CAP_NET_ADMIN))
9404 + if(sock->type != SOCK_RAW)
9405 + return -ESOCKTNOSUPPORT;
9407 + if(protocol != htons(ETH_P_ALL))
9408 + return -EPROTONOSUPPORT;
9410 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
9411 + MOD_INC_USE_COUNT;
9416 + // BD: broke this out to keep it simple and clear as to what the different kernel versions require
9418 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
9419 + sk = sk_alloc(PF_RING, GFP_KERNEL, 1); /* Kernel 2.4 */
9422 +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
9423 + sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);
9425 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24))
9426 + // BD: API changed in 2.6.12, ref:
9427 + // http://svn.clkao.org/svnweb/linux/revision/?rev=28201
9428 + sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);
9430 + sk = sk_alloc(net, PF_INET, GFP_KERNEL, &ring_proto);
9438 + sock->ops = &ring_ops;
9439 + sock_init_data(sock, sk);
9440 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
9441 +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11))
9442 + sk_set_owner(sk, THIS_MODULE);
9447 + ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL));
9449 + if (!(pfr = ring_sk(sk))) {
9453 + memset(pfr, 0, sizeof(*pfr));
9454 + pfr->ring_active = 0; /* We activate as soon as somebody waits for packets */
9455 + pfr->channel_id = RING_ANY_CHANNEL;
9456 + pfr->bucket_len = DEFAULT_BUCKET_LEN;
9457 + pfr->handle_hash_rule = handle_filtering_hash_bucket;
9458 + init_waitqueue_head(&pfr->ring_slots_waitqueue);
9459 + rwlock_init(&pfr->ring_index_lock);
9460 + rwlock_init(&pfr->ring_rules_lock);
9461 + atomic_set(&pfr->num_ring_users, 0);
9462 + INIT_LIST_HEAD(&pfr->rules);
9464 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
9465 + sk->sk_family = PF_RING;
9466 + sk->sk_destruct = ring_sock_destruct;
9468 + sk->family = PF_RING;
9469 + sk->destruct = ring_sock_destruct;
9470 + sk->num = protocol;
9475 +#if defined(RING_DEBUG)
9476 + printk("[PF_RING] ring_create() - created\n");
9481 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
9482 + MOD_DEC_USE_COUNT;
9487 +/* *********************************************** */
9489 +static int ring_release(struct socket *sock)
9491 + struct sock *sk = sock->sk;
9492 + struct ring_opt *pfr = ring_sk(sk);
9493 + struct list_head *ptr, *tmp_ptr;
9494 + void * ring_memory_ptr;
9499 + pfr->ring_active = 0;
9501 + while(atomic_read(&pfr->num_ring_users) > 0) {
9505 +#if defined(RING_DEBUG)
9506 + printk("[PF_RING] called ring_release\n");
9510 + The calls below must be placed outside the
9511 + write_lock_bh...write_unlock_bh block.
9514 + ring_proc_remove(ring_sk(sk));
9516 + if(pfr->ring_netdev && (pfr->ring_netdev->ifindex < MAX_NUM_DEVICES)) {
9517 + struct list_head *ptr, *tmp_ptr;
9518 + device_ring_list_element *entry;
9520 + list_for_each_safe(ptr, tmp_ptr, &device_ring_list[pfr->ring_netdev->ifindex]) {
9521 + entry = list_entry(ptr, device_ring_list_element, list);
9523 + if(entry->the_ring == pfr) {
9531 + write_lock_bh(&ring_mgmt_lock);
9536 + list_for_each_safe(ptr, tmp_ptr, &pfr->rules)
9538 + filtering_rule_element *rule;
9540 + rule = list_entry(ptr, filtering_rule_element, list);
9542 + if(plugin_registration[rule->rule.plugin_action.plugin_id]
9543 + && plugin_registration[rule->rule.plugin_action.plugin_id]->pfring_plugin_free_ring_mem) {
9544 + /* Custom free function */
9545 + plugin_registration[rule->rule.plugin_action.plugin_id]->pfring_plugin_free_ring_mem(rule);
9548 + printk("[PF_RING] --> default_free [rule->rule.plugin_action.plugin_id=%d]\n",
9549 + rule->rule.plugin_action.plugin_id);
9551 + if(rule->plugin_data_ptr != NULL) {
9552 + kfree(rule->plugin_data_ptr);
9553 + rule->plugin_data_ptr = NULL;
9557 + if(rule->pattern) kfree(rule->pattern);
9563 + /* Filtering hash rules */
9564 + if(pfr->filtering_hash) {
9567 + for(i=0; i<DEFAULT_RING_HASH_SIZE; i++) {
9568 + if(pfr->filtering_hash[i] != NULL) {
9569 + filtering_hash_bucket *scan = pfr->filtering_hash[i], *next;
9571 + while(scan != NULL) {
9572 + next = scan->next;
9573 + if(scan->plugin_data_ptr != NULL) kfree(scan->plugin_data_ptr);
9580 + kfree(pfr->filtering_hash);
9583 + if(pfr->reflector_dev != NULL)
9584 + dev_put(pfr->reflector_dev); /* Release device */
9586 + /* Free the ring buffer later, vfree needs interrupts enabled */
9587 + ring_memory_ptr = pfr->ring_memory;
9588 + ring_sk(sk) = NULL;
9590 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
9591 + skb_queue_purge(&sk->sk_write_queue);
9595 + write_unlock_bh(&ring_mgmt_lock);
9596 + if(pfr->appl_name != NULL) kfree(pfr->appl_name);
9598 + if(ring_memory_ptr != NULL) {
9599 +#if defined(RING_DEBUG)
9600 + printk("[PF_RING] ring_release: rvfree\n");
9602 + rvfree(ring_memory_ptr, pfr->slots_info->tot_mem);
9607 +#if defined(RING_DEBUG)
9608 + printk("[PF_RING] ring_release: rvfree done\n");
9611 +#if defined(RING_DEBUG)
9612 + printk("[PF_RING] ring_release: done\n");
9618 +/* ********************************** */
9621 + * We create a ring for this socket and bind it to the specified device
9623 +static int packet_ring_bind(struct sock *sk, struct net_device *dev)
9625 + u_int the_slot_len;
9626 + u_int32_t tot_mem;
9627 + struct ring_opt *pfr = ring_sk(sk);
9628 + // struct page *page, *page_end;
9630 + if(!dev) return(-1);
9632 +#if defined(RING_DEBUG)
9633 + printk("[PF_RING] packet_ring_bind(%s) called\n", dev->name);
9636 + /* Ring memory layout: a FlowSlotInfo header followed by
9638 + num_slots fixed-size FlowSlot entries. */
9654 + the_slot_len = sizeof(u_char) /* flowSlot.slot_state */
9658 + + sizeof(struct pfring_pkthdr)
9659 + + pfr->bucket_len /* flowSlot.bucket */;
9661 + tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len;
9662 + if (tot_mem % PAGE_SIZE)
9663 + tot_mem += PAGE_SIZE - (tot_mem % PAGE_SIZE);
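+ /*
+  * Worked example (illustrative): with the default bucket_len of 128
+  * bytes, the_slot_len = 1 + sizeof(struct pfring_pkthdr) + 128 (plus any
+  * per-slot fields elided above); with num_slots = 4096 this gives
+  * tot_mem = sizeof(FlowSlotInfo) + 4096 * the_slot_len, rounded up to a
+  * PAGE_SIZE multiple so that the whole area can be mmap()ed as pages.
+  */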
9665 + pfr->ring_memory = rvmalloc(tot_mem);
9667 + if (pfr->ring_memory != NULL) {
9668 + printk("[PF_RING] successfully allocated %lu bytes at 0x%08lx\n",
9669 + (unsigned long) tot_mem, (unsigned long) pfr->ring_memory);
9671 + printk("[PF_RING] ERROR: not enough memory for ring\n");
9675 + // memset(pfr->ring_memory, 0, tot_mem); // rvmalloc does the memset already
9677 + pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory;
9678 + pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo));
9680 + pfr->slots_info->version = RING_FLOWSLOT_VERSION;
9681 + pfr->slots_info->slot_len = the_slot_len;
9682 + pfr->slots_info->data_len = pfr->bucket_len;
9683 + pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len;
9684 + pfr->slots_info->tot_mem = tot_mem;
9685 + pfr->slots_info->sample_rate = 1;
9687 + printk("[PF_RING] allocated %d slots [slot_len=%d][tot_mem=%u]\n",
9688 + pfr->slots_info->tot_slots, pfr->slots_info->slot_len,
9689 + pfr->slots_info->tot_mem);
9695 + for(i=0; i<pfr->slots_info->tot_slots; i++) {
9696 + unsigned long idx = i*pfr->slots_info->slot_len;
9697 + FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx];
9698 + slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0;
9703 + pfr->sample_rate = 1; /* No sampling */
9704 + pfr->insert_page_id = 1, pfr->insert_slot_id = 0;
9705 + pfr->rules_default_accept_policy = 1, pfr->num_filtering_rules = 0;
9706 + ring_proc_add(ring_sk(sk), dev);
9708 + if(dev->ifindex < MAX_NUM_DEVICES) {
9709 + device_ring_list_element *elem;
9711 + /* printk("[PF_RING] Adding ring to device index %d\n", dev->ifindex); */
9713 + elem = kmalloc(sizeof(device_ring_list_element), GFP_ATOMIC);
9714 + if(elem != NULL) {
9715 + elem->the_ring = pfr;
9716 + INIT_LIST_HEAD(&elem->list);
9717 + list_add(&elem->list, &device_ring_list[dev->ifindex]);
9718 + /* printk("[PF_RING] Added ring to device index %d\n", dev->ifindex); */
9724 + Leave this as the last statement: once
9725 + ring_netdev != NULL the socket is ready to be used.
9727 + pfr->ring_netdev = dev;
9732 +/* ************************************* */
9734 +/* Bind to a device */
9735 +static int ring_bind(struct socket *sock,
9736 + struct sockaddr *sa, int addr_len)
9738 + struct sock *sk=sock->sk;
9739 + struct net_device *dev = NULL;
9741 +#if defined(RING_DEBUG)
9742 + printk("[PF_RING] ring_bind() called\n");
10748 + if(addr_len != sizeof(struct sockaddr))
10749 + return -EINVAL;
10750 + if(sa->sa_family != PF_RING)
10751 + return -EINVAL;
10752 + if(sa->sa_data == NULL)
10753 + return -EINVAL;
9755 + /* Safety check: add trailing zero if missing */
9756 + sa->sa_data[sizeof(sa->sa_data)-1] = '\0';
9758 +#if defined(RING_DEBUG)
9759 + printk("[PF_RING] searching device %s\n", sa->sa_data);
9762 + if((dev = __dev_get_by_name(
9763 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))
9766 + sa->sa_data)) == NULL) {
9767 +#if defined(RING_DEBUG)
9768 + printk("[PF_RING] search failed\n");
9772 + return(packet_ring_bind(sk, dev));
9775 +/* ************************************* */
9778 + * rvmalloc / rvfree / kvirt_to_pa copied from usbvideo.c
9780 +unsigned long kvirt_to_pa(unsigned long adr)
9782 + unsigned long kva, ret;
9784 + kva = (unsigned long) page_address(vmalloc_to_page((void *)adr));
9785 + kva |= adr & (PAGE_SIZE-1); /* restore the offset */
9790 +/* ************************************* */
9792 +static int do_memory_mmap(struct vm_area_struct *vma,
9793 + unsigned long size, char *ptr,
9794 + u_int flags, int mode) {
9795 + unsigned long start;
9796 + unsigned long page;
9798 + /* we do not want to have this area swapped out, lock it */
9799 + vma->vm_flags |= flags;
9800 + start = vma->vm_start;
9807 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
9808 + page = vmalloc_to_pfn(ptr);
9809 + rc = remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED);
9811 + page = vmalloc_to_page(ptr);
9812 + page = kvirt_to_pa(ptr);
9813 + rc = remap_page_range(vma, start, page, PAGE_SIZE, PAGE_SHARED);
9815 + } else if(mode == 1) {
9816 + rc = remap_pfn_range(vma, start,
9817 + __pa(ptr) >> PAGE_SHIFT,
9818 + PAGE_SIZE, PAGE_SHARED);
9820 + rc = remap_pfn_range(vma, start,
9821 + ((unsigned long)ptr) >> PAGE_SHIFT,
9822 + PAGE_SIZE, PAGE_SHARED);
9826 +#if defined(RING_DEBUG)
9827 + printk("[PF_RING] remap_pfn_range() failed\n");
9832 + start += PAGE_SIZE;
9834 + if (size > PAGE_SIZE) {
9835 + size -= PAGE_SIZE;
9844 +/* ************************************* */
9846 +static int ring_mmap(struct file *file,
9847 + struct socket *sock,
9848 + struct vm_area_struct *vma)
9850 + struct sock *sk = sock->sk;
9851 + struct ring_opt *pfr = ring_sk(sk);
9853 + unsigned long size = (unsigned long)(vma->vm_end - vma->vm_start);
9855 + if(size % PAGE_SIZE) {
9856 +#if defined(RING_DEBUG)
9857 + printk("[PF_RING] ring_mmap() failed: "
9858 + "len is not multiple of PAGE_SIZE\n");
9863 +#if defined(RING_DEBUG)
9864 + printk("[PF_RING] ring_mmap() called, size: %ld bytes\n", size);
9867 + if((pfr->dna_device == NULL) && (pfr->ring_memory == NULL)) {
9868 +#if defined(RING_DEBUG)
9869 + printk("[PF_RING] ring_mmap() failed: "
9870 + "mapping area to an unbound socket\n");
9875 + if(pfr->dna_device == NULL) {
9876 + /* if userspace tries to mmap beyond end of our buffer, fail */
9877 + if(size > pfr->slots_info->tot_mem) {
9878 +#if defined(RING_DEBUG)
9879 + printk("[PF_RING] ring_mmap() failed: "
9880 + "area too large [%ld > %d]\n",
9881 + size, pfr->slots_info->tot_mem);
9886 +#if defined(RING_DEBUG)
9887 + printk("[PF_RING] mmap [slot_len=%d]"
9888 + "[tot_slots=%d] for ring on device %s\n",
9889 + pfr->slots_info->slot_len, pfr->slots_info->tot_slots,
9890 + pfr->ring_netdev->name);
9893 + if((rc = do_memory_mmap(vma, size, pfr->ring_memory, VM_LOCKED, 0)) < 0)
9897 + if(pfr->dna_device == NULL) return(-EAGAIN);
9899 + switch(pfr->mmap_count) {
9901 + if((rc = do_memory_mmap(vma, size,
9902 + (void*)pfr->dna_device->packet_memory,
9903 + VM_LOCKED, 1)) < 0)
9908 + if((rc = do_memory_mmap(vma, size,
9909 + (void*)pfr->dna_device->descr_packet_memory,
9910 + VM_LOCKED, 1)) < 0)
9915 + if((rc = do_memory_mmap(vma, size,
9916 + (void*)pfr->dna_device->phys_card_memory,
9917 + (VM_RESERVED | VM_IO), 2)) < 0)
9925 + pfr->mmap_count++;
9928 +#if defined(RING_DEBUG)
9929 + printk("[PF_RING] ring_mmap succeeded\n");
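+ /*
+  * Hypothetical userland sketch (the PF_RING address family value is
+  * defined elsewhere in this patch; error checking omitted):
+  *
+  *   int fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL));
+  *   struct sockaddr sa;
+  *   sa.sa_family = PF_RING;
+  *   snprintf(sa.sa_data, sizeof(sa.sa_data), "eth0");
+  *   bind(fd, &sa, sizeof(sa));
+  *   size_t len = ring_size;  // PAGE_SIZE multiple <= slots_info->tot_mem
+  *   char *ring = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  *   FlowSlotInfo *slots_info = (FlowSlotInfo*)ring;
+  */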
9935 +/* ************************************* */
9937 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
9938 +static int ring_recvmsg(struct kiocb *iocb, struct socket *sock,
9939 + struct msghdr *msg, size_t len, int flags)
9941 + static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len,
9942 + int flags, struct scm_cookie *scm)
9946 + struct ring_opt *pfr = ring_sk(sock->sk);
9947 + u_int32_t queued_pkts, num_loops = 0;
9949 +#if defined(RING_DEBUG)
9950 + printk("[PF_RING] ring_recvmsg called\n");
9953 + pfr->ring_active = 1;
9954 + slot = get_remove_slot(pfr);
9956 + while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) {
9957 + wait_event_interruptible(pfr->ring_slots_waitqueue, 1);
9959 +#if defined(RING_DEBUG)
9960 + printk("[PF_RING] -> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n",
9961 + slot->slot_state, queued_pkts, num_loops);
9964 + if(queued_pkts > 0) {
9965 + if(num_loops++ > MAX_QUEUE_LOOPS)
9970 +#if defined(RING_DEBUG)
9972 + printk("[PF_RING] ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n",
9973 + queued_pkts, num_loops);
9976 + return(queued_pkts);
9979 +/* ************************************* */
9981 +unsigned int ring_poll(struct file * file,
9982 + struct socket *sock, poll_table *wait)
9985 + struct ring_opt *pfr = ring_sk(sock->sk);
9988 + /* printk("[PF_RING] -- poll called\n"); */
9990 + if(pfr->dna_device == NULL) {
9991 + /* PF_RING mode */
9993 +#if defined(RING_DEBUG)
9994 + printk("[PF_RING] poll called (non DNA device)\n");
9997 + pfr->ring_active = 1;
9998 + slot = get_remove_slot(pfr);
10000 + if((slot != NULL) && (slot->slot_state == 0))
10001 + poll_wait(file, &pfr->ring_slots_waitqueue, wait);
10003 +#if defined(RING_DEBUG)
10004 + printk("[PF_RING] poll returning %d\n", slot->slot_state);
10007 + if((slot != NULL) && (slot->slot_state == 1))
10008 + return(POLLIN | POLLRDNORM);
10014 +#if defined(RING_DEBUG)
10015 + printk("[PF_RING] poll called on DNA device [%d]\n",
10016 + *pfr->dna_device->interrupt_received);
10019 + if(pfr->dna_device->wait_packet_function_ptr == NULL)
10022 + rc = pfr->dna_device->wait_packet_function_ptr(pfr->dna_device->adapter_ptr, 1);
10023 + if(rc == 0) /* No packet arrived yet */ {
10024 + /* poll_wait(file, pfr->dna_device->packet_waitqueue, wait); */
10026 + rc = pfr->dna_device->wait_packet_function_ptr(pfr->dna_device->adapter_ptr, 0);
10028 + //*pfr->dna_device->interrupt_received = rc;
10029 + if(rc == 0) rc = *pfr->dna_device->interrupt_received;
10031 +#if defined(RING_DEBUG)
10032 + printk("[PF_RING] poll %s return [%d]\n",
10033 + pfr->ring_netdev->name,
10034 + *pfr->dna_device->interrupt_received);
10038 + return(POLLIN | POLLRDNORM);
10045 +/* ************************************* */
10047 +int add_to_cluster_list(ring_cluster_element *el,
10048 + struct sock *sock)
10050 + if(el->cluster.num_cluster_elements == CLUSTER_LEN)
10051 + return(-1); /* Cluster full */
10053 + ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster.cluster_id;
10054 + el->cluster.sk[el->cluster.num_cluster_elements] = sock;
10055 + el->cluster.num_cluster_elements++;
10059 +/* ************************************* */
10061 +int remove_from_cluster_list(struct ring_cluster *el,
10062 + struct sock *sock)
10066 + for(i=0; i<CLUSTER_LEN; i++)
10067 + if(el->sk[i] == sock) {
10068 + el->num_cluster_elements--;
10070 + if(el->num_cluster_elements > 0) {
10071 + /* The cluster contains other elements */
10072 + for(j=i; j<CLUSTER_LEN-1; j++)
10073 + el->sk[j] = el->sk[j+1];
10075 + el->sk[CLUSTER_LEN-1] = NULL;
10077 + /* Empty cluster */
10078 + memset(el->sk, 0, sizeof(el->sk));
10084 + return(-1); /* Not found */
10087 +/* ************************************* */
10089 +static int remove_from_cluster(struct sock *sock,
10090 + struct ring_opt *pfr)
10092 + struct list_head *ptr, *tmp_ptr;
10094 +#if defined(RING_DEBUG)
10095 + printk("[PF_RING] --> remove_from_cluster(%d)\n", pfr->cluster_id);
10098 + if(pfr->cluster_id == 0 /* 0 = No Cluster */)
10099 + return(0); /* Nothing to do */
10101 + list_for_each_safe(ptr, tmp_ptr, &ring_cluster_list) {
10102 + ring_cluster_element *cluster_ptr;
10104 + cluster_ptr = list_entry(ptr, ring_cluster_element, list);
10106 + if(cluster_ptr->cluster.cluster_id == pfr->cluster_id) {
10107 + return(remove_from_cluster_list(&cluster_ptr->cluster, sock));
10111 + return(-EINVAL); /* Not found */
10114 +/* ************************************* */
10116 +static int add_to_cluster(struct sock *sock,
10117 + struct ring_opt *pfr,
10118 + u_short cluster_id)
10120 + struct list_head *ptr, *tmp_ptr;
10121 + ring_cluster_element *cluster_ptr;
10123 +#if defined(RING_DEBUG)
10124 + printk("[PF_RING] --> add_to_cluster(%d)\n", cluster_id);
10127 + if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL);
10129 + if(pfr->cluster_id != 0)
10130 + remove_from_cluster(sock, pfr);
10132 + list_for_each_safe(ptr, tmp_ptr, &ring_cluster_list) {
10133 + cluster_ptr = list_entry(ptr, ring_cluster_element, list);
10135 + if(cluster_ptr->cluster.cluster_id == cluster_id) {
10136 + return(add_to_cluster_list(cluster_ptr, sock));
10140 + /* There's no existing cluster. We need to create one */
10141 + if((cluster_ptr = kmalloc(sizeof(ring_cluster_element),
10142 + GFP_KERNEL)) == NULL)
10145 + INIT_LIST_HEAD(&cluster_ptr->list);
10147 + cluster_ptr->cluster.cluster_id = cluster_id;
10148 + cluster_ptr->cluster.num_cluster_elements = 1;
10149 + cluster_ptr->cluster.hashing_mode = cluster_per_flow; /* Default */
10150 + cluster_ptr->cluster.hashing_id = 0;
10152 + memset(cluster_ptr->cluster.sk, 0, sizeof(cluster_ptr->cluster.sk));
10153 + cluster_ptr->cluster.sk[0] = sock;
10154 + pfr->cluster_id = cluster_id;
10156 + list_add(&cluster_ptr->list, &ring_cluster_list); /* Add as first entry */
10158 + return(0); /* 0 = OK */
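+ /*
+  * Hypothetical userland sketch: two sockets bound to the same device and
+  * added to cluster 1 share its traffic, by default per flow (see
+  * hash_skb() above); setsockopt level 0 is assumed:
+  *
+  *   u_int cluster = 1;
+  *   setsockopt(fd1, 0, SO_ADD_TO_CLUSTER, &cluster, sizeof(cluster));
+  *   setsockopt(fd2, 0, SO_ADD_TO_CLUSTER, &cluster, sizeof(cluster));
+  */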
10161 +/* ************************************* */
10163 +static int ring_map_dna_device(struct ring_opt *pfr,
10164 + dna_device_mapping *mapping) {
10167 + if(mapping->operation == remove_device_mapping) {
10168 + pfr->dna_device = NULL;
10170 + printk("[PF_RING] ring_map_dna_device(%s): removed mapping\n",
10171 + mapping->device_name);
10174 + struct list_head *ptr, *tmp_ptr;
10175 + dna_device_list *entry;
10177 + list_for_each_safe(ptr, tmp_ptr, &ring_dna_devices_list) {
10178 + entry = list_entry(ptr, dna_device_list, list);
10180 + if((!strcmp(entry->dev.netdev->name, mapping->device_name))
10181 + && (entry->dev.channel_id == mapping->channel_id)) {
10182 + pfr->dna_device = &entry->dev, pfr->ring_netdev = entry->dev.netdev;
10185 + printk("[PF_RING] ring_map_dna_device(%s): added mapping\n",
10186 + mapping->device_name);
10193 + printk("[PF_RING] ring_map_dna_device(%s): mapping failed\n",
10194 + mapping->device_name);
10199 +/* ************************************* */
10201 +static void purge_idle_hash_rules(struct ring_opt *pfr, uint16_t rule_inactivity)
10203 + int i, num_purged_rules = 0, debug = 0;
10204 + unsigned long expire_jiffies = jiffies - msecs_to_jiffies(1000*rule_inactivity);
10207 + printk("[PF_RING] purge_idle_hash_rules(rule_inactivity=%d)\n", rule_inactivity);
10209 + /* Free filtering hash rules inactive for more than rule_inactivity seconds */
10210 + if(pfr->filtering_hash != NULL) {
10211 + for(i=0; i<DEFAULT_RING_HASH_SIZE; i++) {
10212 + if(pfr->filtering_hash[i] != NULL) {
10213 + filtering_hash_bucket *scan = pfr->filtering_hash[i], *next, *prev = NULL;
10215 + while(scan != NULL) {
10216 + next = scan->next;
10218 + if(scan->rule.jiffies_last_match < expire_jiffies) {
10219 + /* Expired rule: free it */
10222 + printk("[PF_RING] Purging hash rule "
10223 + /* "[last_match=%u][expire_jiffies=%u]" */
10224 + "[%d.%d.%d.%d:%d <-> %d.%d.%d.%d:%d][purged=%d][tot_rules=%d]\n",
10226 + (unsigned int)scan->rule.jiffies_last_match,
10227 + (unsigned int)expire_jiffies,
10229 + ((scan->rule.host_peer_a >> 24) & 0xff),
10230 + ((scan->rule.host_peer_a >> 16) & 0xff),
10231 + ((scan->rule.host_peer_a >> 8) & 0xff),
10232 + ((scan->rule.host_peer_a >> 0) & 0xff),
10233 + scan->rule.port_peer_a,
10234 + ((scan->rule.host_peer_b >> 24) & 0xff),
10235 + ((scan->rule.host_peer_b >> 16) & 0xff),
10236 + ((scan->rule.host_peer_b >> 8) & 0xff),
10237 + ((scan->rule.host_peer_b >> 0) & 0xff),
10238 + scan->rule.port_peer_b,
10239 + num_purged_rules, pfr->num_filtering_rules);
10241 + if(scan->plugin_data_ptr != NULL) kfree(scan->plugin_data_ptr);
10245 + pfr->filtering_hash[i] = next;
10247 + prev->next = next;
10249 + pfr->num_filtering_rules--, num_purged_rules++;
10260 + printk("[PF_RING] Purged %d hash rules [tot_rules=%d]\n",
10261 + num_purged_rules, pfr->num_filtering_rules);
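+ /*
+  * Hypothetical userland sketch: purge hash rules idle for more than 60
+  * seconds (a 16-bit seconds value, as read by the
+  * SO_PURGE_IDLE_HASH_RULES handler below):
+  *
+  *   u_int16_t inactivity = 60;
+  *   setsockopt(fd, 0, SO_PURGE_IDLE_HASH_RULES, &inactivity, sizeof(inactivity));
+  */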
10264 +/* ************************************* */
10266 +/* Code taken/inspired from core/sock.c */
10267 +static int ring_setsockopt(struct socket *sock,
10268 + int level, int optname,
10269 + char __user *optval, int optlen)
10271 + struct ring_opt *pfr = ring_sk(sock->sk);
10272 + int val, found, ret = 0 /* OK */;
10273 + u_int cluster_id, debug = 0;
10274 + int32_t channel_id;
10275 + char devName[8], applName[32+1];
10276 + struct list_head *prev = NULL;
10277 + filtering_rule_element *entry, *rule;
10278 + u_int16_t rule_id, rule_inactivity;
10283 + if (get_user(val, (int *)optval))
10290 + case SO_ATTACH_FILTER:
10292 + if (optlen == sizeof(struct sock_fprog))
10294 + unsigned int fsize;
10295 + struct sock_fprog fprog;
10296 + struct sk_filter *filter;
10303 + Do not call copy_from_user within a held
10304 + spinlock (e.g. ring_mgmt_lock) as this caused
10305 + problems when certain debugging was enabled under
10306 + 2.6.5 -- including hard lockups of the machine.
10308 + if(copy_from_user(&fprog, optval, sizeof(fprog)))
10311 + /* Fix below courtesy of Noam Dev <noamdev@gmail.com> */
10312 + fsize = sizeof(struct sock_filter) * fprog.len;
10313 + filter = kmalloc(fsize + sizeof(struct sk_filter), GFP_KERNEL);
10315 + if(filter == NULL)
10321 + if(copy_from_user(filter->insns, fprog.filter, fsize))
10324 + filter->len = fprog.len;
10326 + if(sk_chk_filter(filter->insns, filter->len) != 0)
10328 + /* Bad filter specified */
10330 + pfr->bpfFilter = NULL;
10334 + /* get the lock, set the filter, release the lock */
10335 + write_lock(&pfr->ring_rules_lock);
10336 + pfr->bpfFilter = filter;
10337 + write_unlock(&pfr->ring_rules_lock);
10342 + case SO_DETACH_FILTER:
10343 + write_lock(&pfr->ring_rules_lock);
10345 + if(pfr->bpfFilter != NULL)
10347 + kfree(pfr->bpfFilter);
10348 + pfr->bpfFilter = NULL;
10351 + write_unlock(&pfr->ring_rules_lock);
10354 + case SO_ADD_TO_CLUSTER:
10355 + if (optlen!=sizeof(val))
10358 + if (copy_from_user(&cluster_id, optval, sizeof(cluster_id)))
10361 + write_lock(&pfr->ring_rules_lock);
10362 + ret = add_to_cluster(sock->sk, pfr, cluster_id);
10363 + write_unlock(&pfr->ring_rules_lock);
10366 + case SO_REMOVE_FROM_CLUSTER:
10367 + write_lock(&pfr->ring_rules_lock);
10368 + ret = remove_from_cluster(sock->sk, pfr);
10369 + write_unlock(&pfr->ring_rules_lock);
10372 + case SO_SET_CHANNEL_ID:
10373 + if(optlen != sizeof(channel_id))
10376 + if(copy_from_user(&channel_id, optval, sizeof(channel_id)))
10379 + pfr->channel_id = channel_id;
10380 +#if defined(RING_DEBUG)
10381 + printk("[PF_RING] [pfr->channel_id=%d][channel_id=%d]\n",
10382 + pfr->channel_id, channel_id);
10387 + case SO_SET_APPL_NAME:
10388 + if(optlen > sizeof(applName) /* Names should not be too long */)
10391 + if(copy_from_user(&applName, optval, optlen))
10394 + if(pfr->appl_name != NULL) kfree(pfr->appl_name);
10395 + pfr->appl_name = (char*)kmalloc(optlen+1, GFP_ATOMIC);
10396 + if(pfr->appl_name != NULL) {
10397 + memcpy(pfr->appl_name, applName, optlen);
10398 + pfr->appl_name[optlen] = '\0';
10404 + case SO_PURGE_IDLE_HASH_RULES:
10405 + if(optlen != sizeof(rule_inactivity))
10408 + if(copy_from_user(&rule_inactivity, optval, sizeof(rule_inactivity)))
10411 + if(rule_inactivity > 0) {
10412 + write_lock(&pfr->ring_rules_lock);
10413 + purge_idle_hash_rules(pfr, rule_inactivity);
10414 + write_unlock(&pfr->ring_rules_lock);
10420 + case SO_SET_REFLECTOR:
10421 + if(optlen >= (sizeof(devName)-1))
10426 + if(copy_from_user(devName, optval, optlen))
10430 + devName[optlen] = '\0';
10432 +#if defined(RING_DEBUG)
10433 + printk("[PF_RING] +++ SO_SET_REFLECTOR(%s)\n", devName);
10436 + write_lock(&pfr->ring_rules_lock);
10437 + pfr->reflector_dev = dev_get_by_name(
10438 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))
10442 + write_unlock(&pfr->ring_rules_lock);
10444 +#if defined(RING_DEBUG)
10445 + if(pfr->reflector_dev != NULL)
10446 + printk("[PF_RING] SO_SET_REFLECTOR(%s): succeeded\n", devName);
10448 + printk("[PF_RING] SO_SET_REFLECTOR(%s): device unknown\n", devName);
10452 + case SO_TOGGLE_FILTER_POLICY:
10453 + if(optlen != sizeof(u_int8_t))
10456 + u_int8_t new_policy;
10458 + if(copy_from_user(&new_policy, optval, optlen))
10461 + write_lock(&pfr->ring_rules_lock);
10462 + pfr->rules_default_accept_policy = new_policy;
10463 + write_unlock(&pfr->ring_rules_lock);
10465 + if(debug) printk("[PF_RING] SO_TOGGLE_FILTER_POLICY: default policy is %s\n",
10466 + pfr->rules_default_accept_policy ? "accept" : "drop");
10471 + case SO_ADD_FILTERING_RULE:
10472 + if(debug) printk("[PF_RING] +++ SO_ADD_FILTERING_RULE(len=%d)\n", optlen);
10474 + if(optlen == sizeof(filtering_rule)) {
10475 + struct list_head *ptr, *tmp_ptr;
10477 + if(debug) printk("[PF_RING] Allocating memory\n");
10479 + rule = (filtering_rule_element*)kcalloc(1, sizeof(filtering_rule_element), GFP_KERNEL);
10484 + if(copy_from_user(&rule->rule, optval, optlen))
10487 + INIT_LIST_HEAD(&rule->list);
10489 + if(rule->rule.extended_fields.filter_plugin_id > 0) {
10492 + if(rule->rule.extended_fields.filter_plugin_id >= MAX_PLUGIN_ID)
10494 + else if(plugin_registration[rule->rule.extended_fields.filter_plugin_id] == NULL)
10503 + if(rule->rule.plugin_action.plugin_id > 0) {
10506 + if(rule->rule.plugin_action.plugin_id >= MAX_PLUGIN_ID)
10508 + else if(plugin_registration[rule->rule.plugin_action.plugin_id] == NULL)
10517 + /* Compile pattern if present */
10518 + if(strlen(rule->rule.extended_fields.payload_pattern) > 0)
10522 + rule->pattern = regcomp(rule->rule.extended_fields.payload_pattern,
10525 + if(rule->pattern == NULL) {
10526 + printk("[PF_RING] Unable to compile pattern '%s'\n",
10527 + rule->rule.extended_fields.payload_pattern);
10528 + rule->pattern = NULL;
10530 + printk("[PF_RING] Compiled pattern '%s'\n", rule->rule.extended_fields.payload_pattern);
10532 + rule->pattern = NULL;
10534 + write_lock(&pfr->ring_rules_lock);
10535 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: About to add rule %d\n", rule->rule.rule_id);
10537 + /* Implement an ordered add */
10538 + list_for_each_safe(ptr, tmp_ptr, &pfr->rules)
10540 + entry = list_entry(ptr, filtering_rule_element, list);
10542 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: [current rule %d][rule to add %d]\n",
10543 + entry->rule.rule_id, rule->rule.rule_id);
10545 + if(entry->rule.rule_id == rule->rule.rule_id)
10547 + memcpy(&entry->rule, &rule->rule, sizeof(filtering_rule));
10548 + if(entry->pattern != NULL) kfree(entry->pattern);
10549 + entry->pattern = rule->pattern;
10552 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: overwritten rule_id %d\n", entry->rule.rule_id);
10554 + } else if(entry->rule.rule_id > rule->rule.rule_id) {
10555 + if(prev == NULL) {
10556 + list_add(&rule->list, &pfr->rules); /* Add as first entry */
10557 + pfr->num_filtering_rules++;
10558 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: added rule %d as head rule\n", rule->rule.rule_id);
10560 + list_add(&rule->list, prev);
10561 + pfr->num_filtering_rules++;
10562 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: added rule %d\n", rule->rule.rule_id);
10575 + list_add(&rule->list, &pfr->rules); /* Add as first entry */
10576 + pfr->num_filtering_rules++;
10577 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: added rule %d as first rule\n", rule->rule.rule_id);
10581 + list_add_tail(&rule->list, &pfr->rules); /* Add as last entry */
10582 + pfr->num_filtering_rules++;
10583 + if(debug) printk("[PF_RING] SO_ADD_FILTERING_RULE: added rule %d as last rule\n", rule->rule.rule_id);
10587 + write_unlock(&pfr->ring_rules_lock);
10588 + } else if(optlen == sizeof(hash_filtering_rule)) {
10589 + /* This is a hash rule */
10590 + filtering_hash_bucket *rule = (filtering_hash_bucket*)kcalloc(1, sizeof(filtering_hash_bucket), GFP_KERNEL);
10596 + if(copy_from_user(&rule->rule, optval, optlen))
10599 + write_lock(&pfr->ring_rules_lock);
10600 + rc = handle_filtering_hash_bucket(pfr, rule, 1 /* add */);
10601 + pfr->num_filtering_rules++;
10602 + write_unlock(&pfr->ring_rules_lock);
10609 + printk("[PF_RING] Bad rule length (%d): discarded\n", optlen);
10614 + case SO_REMOVE_FILTERING_RULE:
10615 + if(optlen == sizeof(u_int16_t /* rule_id */))
10617 + /* This is a list rule */
10618 + u_int8_t rule_found = 0;
10619 + struct list_head *ptr, *tmp_ptr;
10621 + if(copy_from_user(&rule_id, optval, optlen))
10624 + write_lock(&pfr->ring_rules_lock);
10626 + list_for_each_safe(ptr, tmp_ptr, &pfr->rules)
10628 + entry = list_entry(ptr, filtering_rule_element, list);
10630 + if(entry->rule.rule_id == rule_id)
10632 + if(entry->pattern) kfree(entry->pattern);
10634 + pfr->num_filtering_rules--;
10635 + if(entry->plugin_data_ptr != NULL) kfree(entry->plugin_data_ptr);
10637 + if(debug) printk("[PF_RING] SO_REMOVE_FILTERING_RULE: rule %d has been removed\n", rule_id);
10643 + write_unlock(&pfr->ring_rules_lock);
10644 + if(!rule_found) {
10645 + if(debug) printk("[PF_RING] SO_REMOVE_FILTERING_RULE: rule %d does not exist\n", rule_id);
10646 + return -EFAULT; /* Rule not found */
10648 + } else if(optlen == sizeof(hash_filtering_rule)) {
10649 + /* This is a hash rule */
10650 + filtering_hash_bucket rule;
10653 + if(copy_from_user(&rule.rule, optval, optlen))
10656 + write_lock(&pfr->ring_rules_lock);
10657 + rc = handle_filtering_hash_bucket(pfr, &rule, 0 /* delete */);
10658 + pfr->num_filtering_rules--;
10659 + write_unlock(&pfr->ring_rules_lock);
10660 + if(rc != 0) return(rc);
10665 + case SO_SET_SAMPLING_RATE:
10666 + if(optlen != sizeof(pfr->sample_rate))
10669 + if(copy_from_user(&pfr->sample_rate, optval, sizeof(pfr->sample_rate)))
10673 + case SO_ACTIVATE_RING:
10674 + if(debug) printk("[PF_RING] * SO_ACTIVATE_RING *\n");
10675 + found = 1, pfr->ring_active = 1;
10678 + case SO_RING_BUCKET_LEN:
10679 + if(optlen != sizeof(u_int32_t))
10682 + if(copy_from_user(&pfr->bucket_len, optval, optlen))
10687 + case SO_MAP_DNA_DEVICE:
10688 + if(optlen != sizeof(dna_device_mapping))
10691 + dna_device_mapping mapping;
10693 + if(copy_from_user(&mapping, optval, optlen))
10696 + ret = ring_map_dna_device(pfr, &mapping), found = 1;
10709 + return(sock_setsockopt(sock, level, optname, optval, optlen));
10712 +/* ************************************* */
10714 +static int ring_getsockopt(struct socket *sock,
10715 + int level, int optname,
10716 + char __user *optval,
10717 + int __user *optlen)
10719 + int len, debug = 0;
10720 + struct ring_opt *pfr = ring_sk(sock->sk);
10725 + if(get_user(len, optlen))
10733 + case SO_GET_RING_VERSION:
10735 + u_int32_t version = RING_VERSION_NUM;
10737 + if(copy_to_user(optval, &version, sizeof(version)))
10742 + case PACKET_STATISTICS:
10744 + struct tpacket_stats st;
10746 + if (len > sizeof(struct tpacket_stats))
10747 + len = sizeof(struct tpacket_stats);
10749 + st.tp_packets = pfr->slots_info->tot_insert;
10750 + st.tp_drops = pfr->slots_info->tot_lost;
10752 + if (copy_to_user(optval, &st, len))
10757 + case SO_GET_HASH_FILTERING_RULE_STATS:
10759 + int rc = -EFAULT;
10761 + if(len >= sizeof(hash_filtering_rule)) {
10762 + hash_filtering_rule rule;
10765 + if(pfr->filtering_hash == NULL) {
10766 + printk("[PF_RING] so_get_hash_filtering_rule_stats(): no hash table allocated\n");
10770 + if(copy_from_user(&rule, optval, sizeof(rule))) {
10771 + printk("[PF_RING] so_get_hash_filtering_rule_stats: copy_from_user() failure\n");
10776 + printk("[PF_RING] so_get_hash_filtering_rule_stats"
10777 + "(vlan=%u, proto=%u, sip=%u, sport=%u, dip=%u, dport=%u)\n",
10778 + rule.vlan_id, rule.proto,
10779 + rule.host_peer_a, rule.port_peer_a,
10780 + rule.host_peer_b, rule.port_peer_b);
10782 + hash_idx = hash_pkt(rule.vlan_id, rule.proto,
10783 + rule.host_peer_a, rule.host_peer_b,
10784 + rule.port_peer_a, rule.port_peer_b) % DEFAULT_RING_HASH_SIZE;
10786 + if(pfr->filtering_hash[hash_idx] != NULL) {
10787 + filtering_hash_bucket *bucket;
10789 + read_lock(&pfr->ring_rules_lock);
10790 + bucket = pfr->filtering_hash[hash_idx];
10792 + if(debug) printk("[PF_RING] so_get_hash_filtering_rule_stats(): bucket=%p\n", bucket);
10794 + while(bucket != NULL) {
10795 + if(hash_bucket_match_rule(bucket, &rule)) {
10796 + char *buffer = kmalloc(len, GFP_ATOMIC);
10798 + if(buffer == NULL) {
10799 + printk("[PF_RING] so_get_hash_filtering_rule_stats() no memory failure\n");
10802 + if((plugin_registration[rule.plugin_action.plugin_id] == NULL)
10803 + || (plugin_registration[rule.plugin_action.plugin_id]->pfring_plugin_get_stats == NULL)) {
10804 + printk("[PF_RING] Found rule but pluginId %d is not registered\n",
10805 + rule.plugin_action.plugin_id);
10808 + rc = plugin_registration[rule.plugin_action.plugin_id]->
10809 + pfring_plugin_get_stats(pfr, NULL, bucket, buffer, len);
10812 + if(copy_to_user(optval, buffer, rc)) {
10813 + printk("[PF_RING] copy_to_user() failure\n");
10820 + bucket = bucket->next;
10823 + read_unlock(&pfr->ring_rules_lock);
10826 + printk("[PF_RING] so_get_hash_filtering_rule_stats(): entry not found [hash_idx=%d]\n",
10835 + case SO_GET_FILTERING_RULE_STATS:
10837 + char *buffer = NULL;
10838 + int rc = -EFAULT;
10839 + struct list_head *ptr, *tmp_ptr;
10840 + u_int16_t rule_id;
10842 + if(len < sizeof(rule_id))
10845 + if(copy_from_user(&rule_id, optval, sizeof(rule_id)))
10849 + printk("[PF_RING] SO_GET_FILTERING_RULE_STATS: rule_id=%d\n", rule_id);
10851 + read_lock(&pfr->ring_rules_lock);
10852 + list_for_each_safe(ptr, tmp_ptr, &pfr->rules)
10854 + filtering_rule_element *rule;
10856 + rule = list_entry(ptr, filtering_rule_element, list);
10857 + if(rule->rule.rule_id == rule_id)
10859 + buffer = kmalloc(len, GFP_ATOMIC);
10861 + if(buffer == NULL)
10864 + if((plugin_registration[rule->rule.plugin_action.plugin_id] == NULL)
10865 + || (plugin_registration[rule->rule.plugin_action.plugin_id]->pfring_plugin_get_stats == NULL)) {
10866 + printk("[PF_RING] Found rule %d but pluginId %d is not registered\n",
10867 + rule_id, rule->rule.plugin_action.plugin_id);
10870 + rc = plugin_registration[rule->rule.plugin_action.plugin_id]
10871 + ->pfring_plugin_get_stats(pfr, rule, NULL, buffer, len);
10874 + if(copy_to_user(optval, buffer, rc)) {
10883 + read_unlock(&pfr->ring_rules_lock);
10884 + if(buffer != NULL) kfree(buffer);
10886 + /* printk("[PF_RING] SO_GET_FILTERING_RULE_STATS *END*\n"); */
10891 + case SO_GET_MAPPED_DNA_DEVICE:
10893 + if(pfr->dna_device == NULL)
10896 + if (len > sizeof(dna_device))
10897 + len = sizeof(dna_device);
10899 + if (copy_to_user(optval, pfr->dna_device, len))
10906 + return -ENOPROTOOPT;
10909 + if(put_user(len, optlen))
10915 +/* ************************************* */
10917 +u_int get_num_device_free_slots(int ifindex) {
10920 + if((ifindex >= 0) && (ifindex < MAX_NUM_DEVICES)) {
10921 + struct list_head *ptr, *tmp_ptr;
10922 + device_ring_list_element *entry;
10924 + list_for_each_safe(ptr, tmp_ptr, &device_ring_list[ifindex]) {
10925 + int num_free_slots;
10927 + entry = list_entry(ptr, device_ring_list_element, list);
10929 + num_free_slots = get_num_ring_free_slots(entry->the_ring);
10931 + if(num_free_slots == 0)
10935 + num = num_free_slots;
10936 + else if(num > num_free_slots)
10937 + num = num_free_slots;
10945 +/* ************************************* */
10947 +void dna_device_handler(dna_device_operation operation,
10948 + unsigned long packet_memory,
10949 + u_int packet_memory_num_slots,
10950 + u_int packet_memory_slot_len,
10951 + u_int packet_memory_tot_len,
10952 + void *descr_packet_memory,
10953 + u_int descr_packet_memory_num_slots,
10954 + u_int descr_packet_memory_slot_len,
10955 + u_int descr_packet_memory_tot_len,
10956 + u_int channel_id,
10957 + void *phys_card_memory,
10958 + u_int phys_card_memory_len,
10959 + struct net_device *netdev,
10960 + dna_device_model device_model,
10961 + wait_queue_head_t *packet_waitqueue,
10962 + u_int8_t *interrupt_received,
10963 + void *adapter_ptr,
10964 + dna_wait_packet wait_packet_function_ptr) {
10968 + printk("[PF_RING] dna_device_handler(%s)\n", netdev->name);
10970 + if(operation == add_device_mapping) {
10971 + dna_device_list *next;
10973 + next = kmalloc(sizeof(dna_device_list), GFP_ATOMIC);
10974 + if(next != NULL) {
10975 + next->dev.packet_memory = packet_memory;
10976 + next->dev.packet_memory_num_slots = packet_memory_num_slots;
10977 + next->dev.packet_memory_slot_len = packet_memory_slot_len;
10978 + next->dev.packet_memory_tot_len = packet_memory_tot_len;
10979 + next->dev.descr_packet_memory = descr_packet_memory;
10980 + next->dev.descr_packet_memory_num_slots = descr_packet_memory_num_slots;
10981 + next->dev.descr_packet_memory_slot_len = descr_packet_memory_slot_len;
10982 + next->dev.descr_packet_memory_tot_len = descr_packet_memory_tot_len;
10983 + next->dev.phys_card_memory = phys_card_memory;
10984 + next->dev.phys_card_memory_len = phys_card_memory_len;
10985 + next->dev.channel_id = channel_id;
10986 + next->dev.netdev = netdev;
10987 + next->dev.device_model = device_model;
10988 + next->dev.packet_waitqueue = packet_waitqueue;
10989 + next->dev.interrupt_received = interrupt_received;
10990 + next->dev.adapter_ptr = adapter_ptr;
10991 + next->dev.wait_packet_function_ptr = wait_packet_function_ptr;
10992 + list_add(&next->list, &ring_dna_devices_list);
10993 + dna_devices_list_size++;
10995 + printk("[PF_RING] Could not kmalloc slot!!\n");
10998 + struct list_head *ptr, *tmp_ptr;
10999 + dna_device_list *entry;
11001 + list_for_each_safe(ptr, tmp_ptr, &ring_dna_devices_list) {
11002 + entry = list_entry(ptr, dna_device_list, list);
11004 + if((entry->dev.netdev == netdev)
11005 + && (entry->dev.channel_id == channel_id)) {
11006 + list_del(ptr);
11007 + kfree(entry);
11008 + dna_devices_list_size--;
11009 + break;
11010 + }
11015 + printk("[PF_RING] dna_device_handler(%s): [dna_devices_list_size=%d]\n",
11016 + netdev->name, dna_devices_list_size);
11019 +/* ************************************* */
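+/* ioctl handler for ring sockets: the standard interface ioctls are
+   delegated to the inet_dgram_ops handler when CONFIG_INET is available;
+   anything else is rejected with -ENOIOCTLCMD. */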
11021 +static int ring_ioctl(struct socket *sock,
11022 + unsigned int cmd, unsigned long arg)
11023 +{
11024 + switch(cmd) {
11025 +#ifdef CONFIG_INET
11026 + case SIOCGIFFLAGS:
11027 + case SIOCSIFFLAGS:
11028 + case SIOCGIFCONF:
11029 + case SIOCGIFMETRIC:
11030 + case SIOCSIFMETRIC:
11035 + case SIOCSIFLINK:
11036 + case SIOCGIFHWADDR:
11037 + case SIOCSIFHWADDR:
11040 + case SIOCSIFSLAVE:
11041 + case SIOCGIFSLAVE:
11042 + case SIOCGIFINDEX:
11043 + case SIOCGIFNAME:
11044 + case SIOCGIFCOUNT:
11045 + case SIOCSIFHWBROADCAST:
11046 + return(inet_dgram_ops.ioctl(sock, cmd, arg));
11047 +#endif
11049 + default:
11050 + return -ENOIOCTLCMD;
11056 +/* ************************************* */
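+/* Socket operations for PF_RING sockets: connection-oriented calls are
+   stubbed out with the kernel's sock_no_* helpers, while the ring-specific
+   entry points (bind, mmap, poll, get/setsockopt, recvmsg, ...) do the
+   real work. */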
11058 +static struct proto_ops ring_ops = {
11059 + .family = PF_RING,
11060 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
11061 + .owner = THIS_MODULE,
11062 +#endif
11064 + /* Operations that make no sense on ring sockets. */
11065 + .connect = sock_no_connect,
11066 + .socketpair = sock_no_socketpair,
11067 + .accept = sock_no_accept,
11068 + .getname = sock_no_getname,
11069 + .listen = sock_no_listen,
11070 + .shutdown = sock_no_shutdown,
11071 + .sendpage = sock_no_sendpage,
11072 + .sendmsg = sock_no_sendmsg,
11074 + /* Now the operations that really occur. */
11075 + .release = ring_release,
11076 + .bind = ring_bind,
11077 + .mmap = ring_mmap,
11078 + .poll = ring_poll,
11079 + .setsockopt = ring_setsockopt,
11080 + .getsockopt = ring_getsockopt,
11081 + .ioctl = ring_ioctl,
11082 + .recvmsg = ring_recvmsg,
11083 +};
11085 +/* ************************************ */
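+/* Protocol family descriptor: lets socket(PF_RING, ...) reach ring_create(). */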
11087 +static struct net_proto_family ring_family_ops = {
11088 + .family = PF_RING,
11089 + .create = ring_create,
11090 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
11091 + .owner = THIS_MODULE,
11092 +#endif
11093 +};
11095 +/* BD: API changed in 2.6.12, ref:
11096 +   http://svn.clkao.org/svnweb/linux/revision/?rev=28201 */
11097 +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11))
11098 +static struct proto ring_proto = {
11099 + .name = "PF_RING",
11100 + .owner = THIS_MODULE,
11101 + .obj_size = sizeof(struct sock),
11102 +};
11103 +#endif
11105 +/* ************************************ */
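+/* Module unload path: drop every active ring, cluster and DNA mapping,
+   clear the hooks previously installed into the kernel (so the stack stops
+   calling into this module), then unregister the PF_RING family and the
+   /proc entries. */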
11107 +static void __exit ring_exit(void)
11108 +{
11109 + struct list_head *ptr, *tmp_ptr;
11110 + struct ring_element *entry;
11112 + list_for_each_safe(ptr, tmp_ptr, &ring_table) {
11113 + entry = list_entry(ptr, struct ring_element, list);
11114 + list_del(ptr);
11115 + kfree(entry);
11116 + }
11118 + list_for_each_safe(ptr, tmp_ptr, &ring_cluster_list) {
11119 + ring_cluster_element *cluster_ptr;
11121 + cluster_ptr = list_entry(ptr, ring_cluster_element, list);
11122 + list_del(ptr);
11124 + kfree(cluster_ptr);
11127 + list_for_each_safe(ptr, tmp_ptr, &ring_dna_devices_list) {
11128 + dna_device_list *elem;
11130 + elem = list_entry(ptr, dna_device_list, list);
11131 + list_del(ptr);
11132 + kfree(elem);
11133 + }
11136 + set_register_pfring_plugin(NULL);
11137 + set_unregister_pfring_plugin(NULL);
11138 + set_skb_ring_handler(NULL);
11139 + set_add_hdr_to_ring(NULL);
11140 + set_buffer_ring_handler(NULL);
11141 + set_read_device_pfring_free_slots(NULL);
11142 + set_ring_dna_device_handler(NULL);
11143 + sock_unregister(PF_RING);
11144 + ring_proc_term();
11145 + printk("[PF_RING] unloaded\n");
11148 +/* ************************************ */
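+/* Module load path: initialize the global lists, register the PF_RING
+   socket family and install the packet/plugin/DNA hooks. The buffer handler
+   is read back afterwards to verify that the hooks were actually taken. */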
11150 +static int __init ring_init(void)
11151 +{
11152 + int i;
11154 + printk("[PF_RING] Welcome to PF_RING %s\n"
11155 + "(C) 2004-09 L.Deri <deri@ntop.org>\n",
11156 + RING_VERSION);
11158 + INIT_LIST_HEAD(&ring_table);
11159 + INIT_LIST_HEAD(&ring_cluster_list);
11160 + INIT_LIST_HEAD(&ring_dna_devices_list);
11162 + for(i=0; i<MAX_NUM_DEVICES; i++)
11163 + INIT_LIST_HEAD(&device_ring_list[i]);
11165 + sock_register(&ring_family_ops);
11167 + set_skb_ring_handler(skb_ring_handler);
11168 + set_add_hdr_to_ring(add_hdr_to_ring);
11169 + set_buffer_ring_handler(buffer_ring_handler);
11170 + set_register_pfring_plugin(register_plugin);
11171 + set_unregister_pfring_plugin(unregister_plugin);
11172 + set_read_device_pfring_free_slots(get_num_device_free_slots);
11173 + set_ring_dna_device_handler(dna_device_handler);
11175 + if(get_buffer_ring_handler() != buffer_ring_handler) {
11176 + printk("[PF_RING] set_buffer_ring_handler FAILED\n");
11178 + set_skb_ring_handler(NULL);
11179 + set_buffer_ring_handler(NULL);
11180 + sock_unregister(PF_RING);
11181 + return -1;
11182 + } else {
11183 + printk("[PF_RING] Ring slots %d\n", num_slots);
11184 + printk("[PF_RING] Slot version %d\n", RING_FLOWSLOT_VERSION);
11185 + printk("[PF_RING] Capture TX %s\n",
11186 + enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]");
11187 + printk("[PF_RING] IP Defragment %s\n", enable_ip_defrag ? "Yes" : "No");
11188 + printk("[PF_RING] Initialized correctly\n");
11190 + ring_proc_init();
11191 + return 0;
11192 + }
11193 +}
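+/* Load-time configuration sketch: the variables echoed above (num_slots,
+   enable_tx_capture, enable_ip_defrag) are exposed as module parameters in
+   the stock PF_RING build; the values below are purely illustrative:
+
+     insmod ./ring.ko num_slots=8192 enable_tx_capture=1 enable_ip_defrag=0
+*/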
11195 +module_init(ring_init);
11196 +module_exit(ring_exit);
11198 +MODULE_LICENSE("GPL");
11199 +MODULE_AUTHOR("Luca Deri <deri@ntop.org>");
11200 +MODULE_DESCRIPTION("Packet capture acceleration by means of a ring buffer");
11202 +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
11203 +MODULE_ALIAS_NETPROTO(PF_RING);