--- linux-2.6.4/include/linux/pkt_sched.h.orig 2004-04-02 23:46:35.869438752 +0200 +++ linux-2.6.4/include/linux/pkt_sched.h 2004-04-02 23:59:09.936803088 +0200 @@ -442,4 +442,116 @@ #define TCA_ATM_MAX TCA_ATM_STATE +/* WRR section */ + +/* Other includes */ +#include + +// A sub weight and of a class +// All numbers are represented as parts of (2^64-1). +struct tc_wrr_class_weight { + __u64 val; // Current value (0 is not valid) + __u64 decr; // Value pr bytes (2^64-1 is not valid) + __u64 incr; // Value pr seconds (2^64-1 is not valid) + __u64 min; // Minimal value (0 is not valid) + __u64 max; // Minimal value (0 is not valid) + + // The time where the above information was correct: + time_t tim; +}; + +// Pakcet send when modifying a class: +struct tc_wrr_class_modf { + // Not-valid values are ignored. + struct tc_wrr_class_weight weight1; + struct tc_wrr_class_weight weight2; +}; + +// Packet returned when quering a class: +struct tc_wrr_class_stats { + char used; // If this is false the information below is invalid + + struct tc_wrr_class_modf class_modf; + + unsigned char addr[ETH_ALEN]; + char usemac; // True if addr is a MAC address, else it is an IP address + // (this value is only for convience, it is always the same + // value as in the qdisc) + int heappos; // Current heap position or 0 if not in heap + __u64 penal_ls; // Penalty value in heap (ls) + __u64 penal_ms; // Penalty value in heap (ms) +}; + +// Qdisc-wide penalty information (boolean values - 2 not valid) +struct tc_wrr_qdisc_weight { + char weight_mode; // 0=No automatic change to weight + // 1=Decrease normally + // 2=Also multiply with number of machines + // 3=Instead multiply with priority divided + // with priority of the other. 
+ // -1=no change +}; + +// Packet send when modifing a qdisc: +struct tc_wrr_qdisc_modf { + // Not-valid values are ignored: + struct tc_wrr_qdisc_weight weight1; + struct tc_wrr_qdisc_weight weight2; +}; + +// Packet send when creating a qdisc: +struct tc_wrr_qdisc_crt { + struct tc_wrr_qdisc_modf qdisc_modf; + + char srcaddr; // 1=lookup source, 0=lookup destination + char usemac; // 1=Classify on MAC addresses, 0=classify on IP + char usemasq; // 1=Classify based on masqgrading - only valid + // if usemac is zero + int bands_max; // Maximal number of bands (i.e.: classes) + int proxy_maxconn; // If differnt from 0 then we support proxy remapping + // of packets. And this is the number of maximal + // concurrent proxy connections. +}; + +// Packet returned when quering a qdisc: +struct tc_wrr_qdisc_stats { + struct tc_wrr_qdisc_crt qdisc_crt; + int proxy_curconn; + int nodes_in_heap; // Current number of bands wanting to send something + int bands_cur; // Current number of bands used (i.e.: MAC/IP addresses seen) + int bands_reused; // Number of times this band has been reused. + int packets_requed; // Number of times packets have been requeued. + __u64 priosum; // Sum of priorities in heap where 1 is 2^32 +}; + +struct tc_wrr_qdisc_modf_std { + // This indicates which of the tc_wrr_qdisc_modf structers this is: + char proxy; // 0=This struct + + // Should we also change a class? + char change_class; + + // Only valid if change_class is false + struct tc_wrr_qdisc_modf qdisc_modf; + + // Only valid if change_class is true: + unsigned char addr[ETH_ALEN]; // Class to change (non-used bytes should be 0) + struct tc_wrr_class_modf class_modf; // The change +}; + +// Used for proxyrempping: +struct tc_wrr_qdisc_modf_proxy { + // This indicates which of the tc_wrr_qdisc_modf structers this is: + char proxy; // 1=This struct + + // This is 1 if the proxyremap information should be reset + char reset; + + // changec is the number of elements in changes. 
+ int changec; + + // This is an array of type ProxyRemapBlock: + long changes[0]; +}; + /* Delay section */ struct tc_dly_qopt { diff -uNr linux-2.6.4/net/sched.orig/Kconfig linux-2.6.4/net/sched/Kconfig --- linux-2.6.4/net/sched.orig/Kconfig 2004-04-02 23:46:35.000000000 +0200 +++ linux-2.6.4/net/sched/Kconfig 2004-04-03 00:06:53.438340144 +0200 @@ -39,6 +39,10 @@ To compile this code as a module, choose M here: the module will be called sch_htb. +config NET_SCH_WRR + tristate "WRR packet scheduler" + depends on NET_SCHED + config NET_SCH_HFSC tristate "HFSC packet scheduler" depends on NET_SCHED diff -uNr linux-2.6.4/net/sched.orig/Makefile linux-2.6.4/net/sched/Makefile --- linux-2.6.4/net/sched.orig/Makefile 2004-04-02 23:46:35.000000000 +0200 +++ linux-2.6.4/net/sched/Makefile 2004-04-03 00:05:52.359625520 +0200 @@ -10,6 +10,7 @@ obj-$(CONFIG_NET_CLS_POLICE) += police.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o +obj-$(CONFIG_NET_SCH_WRR) += sch_wrr.o obj-$(CONFIG_NET_SCH_CSZ) += sch_csz.o obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o diff -uNr linux-2.6.4/net/sched.orig/proxydict.c linux-2.6.4/net/sched/proxydict.c --- linux-2.6.4/net/sched.orig/proxydict.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.4/net/sched/proxydict.c 2004-04-03 00:02:21.565671072 +0200 @@ -0,0 +1,153 @@ +#ifndef __KERNEL__ +#include +#include +#endif + +#include "proxyremap.h" +#include "proxydict.h" + + +/*-------------------------------------------------------------------------- +Implementation. 
+*/
+
+// Hash function: bucket index in [0,hash_size). proto is reduced to unsigned char so char and unsigned char callers hash alike.
+#define hash_fnc(m,server,port,proto) \
+  (((unsigned char)(proto)*7u+(unsigned)(server)*13u+(unsigned)(port)*5u)%(unsigned)((m)->hash_size))
+
+// Size of hash table given maximal number of connections:
+#define hash_size_max_con(max_con) (2*(max_con))
+
+// The memory area we maintain:
+typedef struct {
+  int hash_size;   // Number of buckets in hash_table
+  int max_con;     // Number of entries in next[] and info[]
+  int cur_con;     // Number of entries currently in use
+
+  int free_first;  // Index of the first entry on the free list
+
+  // Then we have:
+  //   int hash_table[hash_size];
+  //   int next[max_con];
+  //   ProxyRemapBlock info[max_con];
+  //
+  // The idea is the following:
+  //   Given a connection we map it by hash_fnc into hash_table. This gives an
+  //   index in next which contains a -1 terminated linked list of connections
+  //   mapping to that hash value.
+  //
+  //   The entries in next that are not allocated are also kept in a linked
+  //   list whose first free index is free_first.
+} memory;
+// Accessors for the header and for the three arrays laid out directly after it:
+#define Memory(m) ((memory*)(m))
+#define Hash_table(m) ((int*)(((char*)(m))+sizeof(memory)))
+#define Next(m) ((int*)(((char*)(m))+sizeof(memory)+ \
+  sizeof(int)*((memory*)(m))->hash_size))
+#define Info(m) ((ProxyRemapBlock*)(((char*)(m))+ \
+  sizeof(memory)+ \
+  sizeof(int)*((memory*)(m))->hash_size+\
+  sizeof(int)*((memory*)(m))->max_con \
+  ))
+// Returns the number of bytes needed for the header plus all three arrays for max_con connections.
+int proxyGetMemSize(int max_con) {
+  return sizeof(memory)+
+    sizeof(int)*hash_size_max_con(max_con)+
+    sizeof(int)*max_con+
+    sizeof(ProxyRemapBlock)*max_con;
+}
+// Initializes an area of proxyGetMemSize(max_con) bytes: empty hash table, every entry on the free list.
+void proxyInitMem(void* data, int max_con) {
+  // Init m:
+  memory* m=Memory(data);
+  m->max_con=max_con;
+  m->cur_con=0;
+  m->hash_size=hash_size_max_con(max_con);
+
+  {
+    // Get pointers (only valid once hash_size and max_con are set above):
+    int* hash_table=Hash_table(data);
+    int* next=Next(data);
+    int i;
+
+    // Init the hash table: -1 marks an empty bucket
+    for(i=0; i<m->hash_size; i++) hash_table[i]=-1;
+
+    // Init the free-list: entry i links to i+1; the last link is never followed (inserts stop at cur_con==max_con)
+    for(i=0; i<m->max_con; i++) next[i]=i+1;
+    m->free_first=0;
+  }
+}
+// Returns the current number of open connections stored in the area.
+int proxyGetCurConn(void* data) {
+  return Memory(data)->cur_con;
+}
+// Returns the maximal number of connections the area was initialized for.
+int proxyGetMaxConn(void* data) {
+  return Memory(data)->max_con;
+}
+// Returns the stored block whose server endpoint is (ipaddr,port,proto), or 0 if there is none.
+ProxyRemapBlock* proxyLookup(void* data, unsigned ipaddr, unsigned short port, char proto) {
+  memory* m=Memory(data);
+  int* hash_table=Hash_table(m);
+  int* next=Next(m);
+  ProxyRemapBlock* info=Info(m);
+  int i;
+
+  for(i=hash_table[hash_fnc(m,ipaddr,port,proto)]; i!=-1; i=next[i]) {
+    if(info[i].proto==(unsigned char)proto &&  // stored proto is unsigned char
+       info[i].sport==port &&
+       info[i].saddr==ipaddr) return &info[i];
+  }
+
+  return 0;
+}
+// Opens (blk->open!=0) or closes the connection in blk. Returns 0, or -1 if the table is full / the block is not found.
+int proxyConsumeBlock(void* data, ProxyRemapBlock* blk) {
+  memory* m=Memory(data);
+  int* hash_table=Hash_table(m);
+  int* next=Next(m);
+  ProxyRemapBlock* info=Info(m);
+  int hash=hash_fnc(m,blk->saddr,blk->sport,blk->proto);
+  int foo;
+
+  if(blk->open) {
+    if(m->cur_con == m->max_con) return -1;
+
+    // Insert the block at a free entry:
+    info[m->free_first]=*blk;
+    m->cur_con++;
+
+    foo=next[m->free_first];
+
+    // And insert it at the head of its chain in the hash table:
+    next[m->free_first]=hash_table[hash];
+    hash_table[hash]=m->free_first;
+    m->free_first=foo;
+  } else {
+    int* toupdate;
+
+    // Find the block; toupdate points at the link that refers to it
+    for(toupdate=&hash_table[hash];
+        *toupdate!=-1;
+        toupdate=&next[*toupdate]) {
+      if(info[*toupdate].proto==blk->proto &&
+         info[*toupdate].sport==blk->sport &&
+         info[*toupdate].saddr==blk->saddr) break;
+    }
+    if(*toupdate==-1) return -1;
+
+    foo=*toupdate;
+
+    // Delete it from the hashing list:
+    *toupdate=next[*toupdate];
+
+    // And put it on the free list:
+    next[foo]=m->free_first;
+    m->free_first=foo;
+
+    m->cur_con--;
+  }
+
+  return 0;
+}
diff -uNr linux-2.6.4/net/sched.orig/proxydict.h linux-2.6.4/net/sched/proxydict.h
--- linux-2.6.4/net/sched.orig/proxydict.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.4/net/sched/proxydict.h 2004-04-03 00:02:21.567670768 +0200
@@ -0,0 +1,32 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*--------------------------------------------------------------------------
+This is common code for handling the tables containing information about
+which proxyserver connections are associated with which machines.
+*/
+
+// Returns the number of bytes that should be available in the area
+// maintained by this module given the maximal number of concurrent
+// connections.
+int proxyGetMemSize(int max_connections);
+
+// Initializes a memory area to use. There must be as many bytes
+// available as returned by proxyGetMemSize.
+void proxyInitMem(void* data, int max_connections);
+
+// Queries:
+int proxyGetCurConn(void* data); // Returns current number of connections
+int proxyGetMaxConn(void* data); // Returns maximal number of connections
+
+// This is called to open and close connections. Returns -1 if
+// a protocol error occurs (i.e.: if it is discovered)
+int proxyConsumeBlock(void* data, ProxyRemapBlock*);
+
+// Returns the RemapBlock associated with this connection or 0:
+ProxyRemapBlock* proxyLookup(void* data, unsigned ipaddr, unsigned short port, char proto);
+
+#ifdef __cplusplus
+}
+#endif
diff -uNr linux-2.6.4/net/sched.orig/proxyremap.h linux-2.6.4/net/sched/proxyremap.h
--- linux-2.6.4/net/sched.orig/proxyremap.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.4/net/sched/proxyremap.h 2004-04-03 00:02:21.568670616 +0200
@@ -0,0 +1,33 @@
+#ifndef PROXYREMAP_H
+#define PROXYREMAP_H
+
+// This describes the information that is written in proxyremap.log and which
+// is used in the communication between proxyremapserver and proxyremapclient.
+// Everything is in network order.
+
+// First this header is sent:
+#define PROXY_WELCOME_LINE "ProxyRemap 1.02. This is a binary protocol.\r\n"
+
+// Then this block is sent every time a connection is opened or closed.
+// Note how it is aligned to use little space - arrays of this
+// structure are saved in many places.
+typedef struct {
+  // Server endpoint of connection:
+  unsigned saddr;
+  unsigned short sport;
+
+  // IP protocol for this connection (typically udp or tcp):
+  unsigned char proto;
+
+  // Is the connection opened or closed?
+ unsigned char open; + + // Client the packets should be accounted to: + unsigned caddr; + unsigned char macaddr[6]; // Might be 0. + + // An informal two-charecter code from the proxyserver. Used for debugging. + char proxyinfo[2]; +} ProxyRemapBlock; + +#endif diff -uNr linux-2.6.4/net/sched.orig/sch_wrr.c linux-2.6.4/net/sched/sch_wrr.c --- linux-2.6.4/net/sched.orig/sch_wrr.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.4/net/sched/sch_wrr.c 2004-04-03 00:02:21.574669704 +0200 @@ -0,0 +1,1364 @@ +/*----------------------------------------------------------------------------- +Weighted Round Robin scheduler. + +Written by Christian Worm Mortensen, cworm@it-c.dk. + +Introduction +============ +This module implements a weighted round robin queue with build-in classifier. +The classifier currently map each MAC or IP address (configurable either MAC +or IP and either source or destination) to different classes. Each such class +is called a band. Whan using MAC addresses only bridged packets can be +classified other packets go to a default MAC address. + +Each band has a weight value, where 0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// There seems to be problems when calling functions from userspace when +// using vmalloc and vfree. 
+//#define my_malloc(size) vmalloc(size) +//#define my_free(ptr) vfree(ptr) +#define my_malloc(size) kmalloc(size,GFP_KERNEL) +#define my_free(ptr) kfree(ptr) + +// Kernel depend stuff: +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) + #define KERNEL22 +#endif + +#ifdef KERNEL22 + #define LOCK_START start_bh_atomic(); + #define LOCK_END end_bh_atomic(); + #define ENQUEUE_SUCCESS 1 + #define ENQUEUE_FAIL 0 + #ifdef CONFIG_IP_MASQUERADE + #include + #define MASQ_SUPPORT + #endif +#else + #define LOCK_START sch_tree_lock(sch); + #define LOCK_END sch_tree_unlock(sch); + #define ENQUEUE_SUCCESS 0 + #define ENQUEUE_FAIL NET_XMIT_DROP + #ifdef CONFIG_NETFILTER + #include + #define MASQ_SUPPORT + #endif +#endif + +#include "proxydict.c" + +// The penalty (priority) type: +typedef u64 penalty_base_t; +#define penalty_base_t_max ((penalty_base_t)-1) +typedef struct penalty_t { + penalty_base_t ms; + penalty_base_t ls; +} penalty_t; +#define penalty_leq(a,b) (a.mselements=0; + h->root_1=poll-1; + + for(i=0; ielements==0; +} + +static char heap_contains(struct heap* h, int id) { + return h->root_1[id+1].id2idx!=0; +} + +static int heap_root(struct heap* h) { + return h->root_1[1].id; +} + +static penalty_t heap_get_penalty(struct heap* h, int id) { + return h->root_1[ h->root_1[id+1].id2idx ].penalty; +} + +static void heap_penalty_changed_internal(struct heap* h,int idx); + +static void heap_set_penalty(struct heap* h, int id, penalty_t p) { + int idx=h->root_1[id+1].id2idx; + h->root_1[idx].penalty=p; + heap_penalty_changed_internal(h,idx); +} + +static void heap_insert(struct heap* h, int id, penalty_t p) { + // Insert at the end of the heap: + h->elements++; + h->root_1[h->elements].id=id; + h->root_1[h->elements].penalty=p; + h->root_1[id+1].id2idx=h->elements; + + // And put it in the right position: + heap_penalty_changed_internal(h,h->elements); +} + +static void heap_remove(struct heap* h, int id) { + int idx=h->root_1[id+1].id2idx; + int mvid; + 
h->root_1[id+1].id2idx=0; + + if(h->elements==idx) { h->elements--; return; } + + mvid=h->root_1[h->elements].id; + h->root_1[idx].id=mvid; + h->root_1[idx].penalty=h->root_1[h->elements].penalty; + h->root_1[mvid+1].id2idx=idx; + + h->elements--; + heap_penalty_changed_internal(h,idx); +} + +static void heap_swap(struct heap* h, int idx0, int idx1) { + penalty_t tmp_p; + int tmp_id; + int id0,id1; + + // Simple content: + tmp_p=h->root_1[idx0].penalty; + tmp_id=h->root_1[idx0].id; + h->root_1[idx0].penalty=h->root_1[idx1].penalty; + h->root_1[idx0].id=h->root_1[idx1].id; + h->root_1[idx1].penalty=tmp_p; + h->root_1[idx1].id=tmp_id; + + // Update reverse pointers: + id0=h->root_1[idx0].id; + id1=h->root_1[idx1].id; + h->root_1[id0+1].id2idx=idx0; + h->root_1[id1+1].id2idx=idx1; +} + +static void heap_penalty_changed_internal(struct heap* h,int cur) { + if(cur==1 || penalty_leq(h->root_1[cur>>1].penalty,h->root_1[cur].penalty)) { + // We are in heap order upwards - so we should move the element down + for(;;) { + int nxt0=cur<<1; + int nxt1=nxt0+1; + penalty_t pen_c=h->root_1[cur].penalty; + penalty_t pen_0=nxt0<=h->elements ? h->root_1[nxt0].penalty : penalty_max; + penalty_t pen_1=nxt1<=h->elements ? h->root_1[nxt1].penalty : penalty_max; + + if(penalty_le(pen_0,pen_c) && penalty_leq(pen_0,pen_1)) { + // Swap with child 0: + heap_swap(h,cur,nxt0); + cur=nxt0; + } else if(penalty_le(pen_1,pen_c)) { + // Swap with child 1: + heap_swap(h,cur,nxt1); + cur=nxt1; + } else { + // Heap in heap order: + return; + } + } + } else { + // We are not in heap order upwards (and thus we must be it downwards). + // We move up: + while(cur!=1) { // While not root + int nxt=cur>>1; + if(penalty_leq(h->root_1[nxt].penalty,h->root_1[cur].penalty)) return; + heap_swap(h,cur,nxt); + cur=nxt; + } + } +}; + +//----------------------------------------------------------------------------- +// Classification based on MAC or IP adresses. 
Note that of historical reason +// these are prefixed with mac_ since originally only MAC bases classification +// was supported. +// +// This code should be in a separate filter module - but it isn't. + +// Interface: + +struct mac_head; + +// Initialices/destroys the structure we maintain. +// Returns -1 on error +static int mac_init(struct mac_head*, int max_macs, char srcaddr, + char usemac, char usemasq, void* proxyremap); +static void mac_done(struct mac_head*); +static void mac_reset(struct mac_head*); + +// Classify a packet. Returns a number n where 0<=n>1; + m_ptr=((const char*)base)+m_idx*size; + + i=compare(key,m_ptr); + if(i<0) // key is less + return bsearch(key,base,m_idx,size,compare); + else if(i>0) + return bsearch(key,((const char*)m_ptr)+size,nmemb-m_idx-1,size,compare); + + return m_ptr; +} + +static int mac_init(struct mac_head* h, int max_macs, char srcaddr, + char usemac, char usemasq,void* proxyremap) { + h->mac_cur=0; + h->mac_reused=0; + h->incr_time=0; + h->srcaddr=srcaddr; + h->usemac=usemac; + h->usemasq=usemasq; + h->mac_max=max_macs; + h->proxyremap=proxyremap; + + h->macs=(struct mac_addr*) + my_malloc( sizeof(struct mac_addr)*max_macs); + h->cls2mac=(char*)my_malloc( 6*max_macs); + if(!h->macs || !h->cls2mac) { + if(h->macs) my_free(h->macs); + if(h->cls2mac) my_free(h->cls2mac); + return -1; + } + return 0; +} + +static void mac_done(struct mac_head* h) { + my_free(h->macs); + my_free(h->cls2mac); +} + +static void mac_reset(struct mac_head* h) { + h->mac_cur=0; + h->mac_reused=0; + h->incr_time=0; +} + +static int lookup_mac(struct mac_head* h, unsigned char* addr) { + int i; + int class; + + // First try to find the address in the table: + struct mac_addr* m=(struct mac_addr*) + bsearch(addr,h->macs,h->mac_cur,sizeof(struct mac_addr),mac_compare); + if(m) { + // Found: + m->lastused=h->incr_time++; + return m->class; + } + + // Okay - the MAC adress was not in table + if(h->mac_cur==h->mac_max) { + // And the table is full - 
delete the oldest entry: + + // Find the oldest entry: + int lowidx=0; + int i; + for(i=1; imac_cur; i++) + if(h->macs[i].lastused < h->macs[lowidx].lastused) lowidx=i; + + class=h->macs[lowidx].class; + + // And delete it: + memmove(&h->macs[lowidx],&h->macs[lowidx+1], + (h->mac_cur-lowidx-1)*sizeof(struct mac_addr)); + h->mac_reused++; + h->mac_cur--; + } else { + class=h->mac_cur; + } + + // The table is now not full - find the position we should put the address in: + for(i=0; imac_cur; i++) if(mac_compare(addr,&h->macs[i])<0) break; + + // We should insert at position i: + memmove(&h->macs[i+1],&h->macs[i],(h->mac_cur-i)*sizeof(struct mac_addr)); + m=&h->macs[i]; + memcpy(m->addr,addr,ETH_ALEN); + m->lastused=h->incr_time++; + m->class=class; + h->mac_cur++; + + // Finally update the cls2mac variabel: + memcpy(h->cls2mac+ETH_ALEN*class,addr,ETH_ALEN); + + return m->class; +} + +int valid_ip_checksum(struct iphdr* ip, int size) { + __u16 header_len=ip->ihl<<2; + __u16 c=0; + __u16* ipu=(u16*)ip; + int a; + + // We require 4 bytes in the packet since we access the port numbers: + if((size>1); a++, ipu++) { + if(a!=5) { // If not the checksum field + __u16 oldc=c; + c+=(*ipu); + if(ccheck==(__u16)~c; +} + +static int mac_classify(struct mac_head* head, struct sk_buff *skb) +{ + // We set this to the address we map to. In case we map to an IP + // address the last two entries are set to 0. 
+ unsigned char addr[ETH_ALEN]; + + + // This is the size of the network part of the packet, I think: + int size=((char*)skb->data+skb->len)-((char*)skb->nh.iph); + + // Set a default value for the address: + memset(addr,0,ETH_ALEN); + + // Accept IP-ARP traffic with big-enough packets: + if(ntohs(skb->protocol)==ETH_P_ARP && + ntohs(skb->nh.arph->ar_pro)==ETH_P_IP) { + // Map all ARP trafic to a default adress to make sure + // it goes through + } else if ((ntohs(skb->protocol)==ETH_P_IP) && + valid_ip_checksum(skb->nh.iph,size)) { + // Accept IP packets which have correct checksum. + + // This is the IP header: + struct iphdr* iph=skb->nh.iph; + + // And this is the port numbers: + const __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + __u16 sport=portp[0]; + __u16 dport=portp[1]; + + // We will set this to the IP address of the packet that should be + // accounted to: + unsigned ipaddr; + + // Used below: + ProxyRemapBlock* prm; + + // Set ipaddr: + if(head->srcaddr) + ipaddr=iph->saddr; + else + ipaddr=iph->daddr; + +#ifdef MASQ_SUPPORT + // Update ipaddr if packet is masqgraded: + if(head->usemasq) { + #ifdef KERNEL22 + struct ip_masq* src; + + // HACK!: + // ip_masq_in_get must be called for packets comming from the outside + // to the firewall. 
We have a a packet which is comming from the + // firewall to the outside - so we switch the parameters: + if((src=ip_masq_in_get( + iph->protocol, + iph->daddr,dport, + iph->saddr,sport))) { + // Use masqgraded address: + ipaddr=src->saddr; + + // It seems like we must put it back: + ip_masq_put(src); + } + #else + // Thanks to Rusty Russell for help with the following code: + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct; + ct = ip_conntrack_get(skb, &ctinfo); + if (ct) { + if(head->srcaddr) + ipaddr=ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.src.ip; + else + ipaddr=ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.dst.ip; + } + #endif + } +#endif + + // Set prm based on ipaddr: + prm=0; + if(head->proxyremap) { + if(head->srcaddr) { + prm=proxyLookup(head->proxyremap,ipaddr,sport,skb->nh.iph->protocol); + } else { + prm=proxyLookup(head->proxyremap,ipaddr,dport,skb->nh.iph->protocol); + } + } + + // And finally set addr to the address: + memset(addr,0,ETH_ALEN); + if(prm) { + // This package should be remapped: + if(head->usemac) + memcpy(addr,prm->macaddr,ETH_ALEN); + else { + memcpy(addr,&prm->caddr,sizeof(unsigned)); + } + } else { + // This packet should not be remapped: + if(head->usemac) { + // We should find MAC address of packet. + // Unfortunatly, this is not always available. + // On bridged packets it always is, however.. + #ifdef KERNEL22 + if(skb->pkt_bridged) { + if(head->srcaddr) { + memcpy(addr,skb->mac.ethernet->h_source,ETH_ALEN); + } else { + memcpy(addr,skb->mac.ethernet->h_dest,ETH_ALEN); + } + } + #endif + } else { + memcpy(addr,&ipaddr,4); + } + } + } else { + // All other traffic is dropped - this ensures that packets + // we consider probably have valid addresses so we don't + // get to many strange addresses into our table. And that we + // don't use bandwidth on strange packets.. 
+ return -1; + } + + return lookup_mac(head,addr); +} + +//----------------------------------------------------------------------------- +// The qdisc itself + +// Pr-class information. +struct wrrc_sched_data { + struct Qdisc* que; // The queue for this class + struct tc_wrr_class_modf class_modf; // Information about the class. + + // For classes in the heap this is the priority value priosum + // was updated with for this class: + u64 priosum_val; +}; + +// Pr-qdisc information: +struct wrr_sched_data +{ + // A heap containing all the bands that will send something + struct heap h; + struct heap_element* poll; // bandc elements + + // The sum of the prioities of the elements in the heap where + // a priority of 1 is saved as 2^32 + u64 priosum; + + // A class for each band + struct wrrc_sched_data* bands; // bandc elements + + // Information maintained by the proxydict module of 0 if we + // have no proxy remapping + void* proxydict; + + // Always incrementning counters, we always have that any value of + // counter_low_penal < any value of counter_high_penal. + penalty_base_t counter_low_penal; + penalty_base_t counter_high_penal; + + // Penalty updating: + struct tc_wrr_qdisc_modf qdisc_modf; + + // Statistics: + int packets_requed; + + // The filter: + struct mac_head filter; + int bandc; // Number of bands +}; + +// Priority handling. +// weight is in interval [0..2^32] +// priosum has whole numbers in the upper and fragments in the lower 32 bits. +static void weight_transmit(struct tc_wrr_class_weight* p, + struct tc_wrr_qdisc_weight q, + unsigned heapsize, + u64 priosum, u64 weight, + unsigned size) { + + unsigned long now=jiffies/HZ; + + // Penalty for transmitting: + u64 change,old; + u32 divisor; + + change=0; + switch(q.weight_mode) { + case 1: change=p->decr*size; break; + case 2: change=p->decr*size*heapsize; break; + case 3: // Note: 64 bit division is not always available.. 
+ divisor=(u32)(weight>>16); + if(divisor<=0) divisor=1; + change=p->decr*size*(((u32)(priosum>>16))/divisor); break; + } + old=p->val; + p->val-=change; + if(p->val>old || p->valmin) p->val=p->min; + + // Credit for time went: + change=(now-p->tim)*p->incr; + p->tim=now; + old=p->val; + p->val+=change; + if(p->valval>p->max) p->val=p->max; +} + +static void weight_setdefault(struct tc_wrr_class_weight* p) { + p->val=(u64)-1; + p->decr=0; + p->incr=0; + p->min=(u64)-1; + p->max=(u64)-1; + p->tim=jiffies/HZ; +} + +static void weight_setvalue(struct tc_wrr_class_weight* dst, + struct tc_wrr_class_weight* src) { + if(src->val!=0) { + dst->val=src->val; + dst->tim=jiffies/HZ; + } + if(src->min!=0) dst->min=src->min; + if(src->max!=0) dst->max=src->max; + if(src->decr!=((u64)-1)) dst->decr=src->decr; + if(src->incr!=((u64)-1)) dst->incr=src->incr; + if(dst->valmin) dst->val=dst->min; + if(dst->val>dst->max) dst->val=dst->max; +} + +static void wrr_destroy(struct Qdisc *sch) +{ + struct wrr_sched_data *q=(struct wrr_sched_data *)sch->data; + int i; + + // Destroy our filter: + mac_done(&q->filter); + + // Destroy all our childre ques: + for(i=0; ibandc; i++) + qdisc_destroy(q->bands[i].que); + + // And free memory: + my_free(q->bands); + my_free(q->poll); + if(q->proxydict) my_free(q->proxydict); + + MOD_DEC_USE_COUNT; +} + +static int wrr_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + int i,maciniterr; + char crterr; + struct tc_wrr_qdisc_crt *qopt; + + // Parse options: + if (!opt) return -EINVAL; // Options must be specified + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) return -EINVAL; + qopt = RTA_DATA(opt); + + if(qopt->bands_max>8192 || qopt->bands_max<2) { + // More than 8192 queues or less than 2? That cannot be true - it must be + // an error... 
+ return -EINVAL; + } + + if(qopt->proxy_maxconn<0 || qopt->proxy_maxconn>20000) { + // More than this number of maximal concurrent connections is unrealistic + return -EINVAL; + } + +#ifndef MASQ_SUPPORT + if(qopt->usemasq) { + return -ENOSYS; + } +#endif + +#ifndef KERNEL22 + if(qopt->usemac) { // Not supported - please fix this! + return -ENOSYS; + } +#endif + + q->bandc=qopt->bands_max; + q->qdisc_modf=qopt->qdisc_modf; + + // Create structures: + q->poll=(struct heap_element*) + my_malloc( sizeof(struct heap_element)*q->bandc); + q->bands=(struct wrrc_sched_data*) + my_malloc( sizeof(struct wrrc_sched_data)*q->bandc); + + if(qopt->proxy_maxconn>0) { + q->proxydict=my_malloc(proxyGetMemSize(qopt->proxy_maxconn)); + } else { + q->proxydict=0; + } + + // Init mac module: + maciniterr=mac_init(&q->filter,qopt->bands_max,qopt->srcaddr, + qopt->usemac,qopt->usemasq,q->proxydict); + + // See if we got the memory we wanted: + if(!q->poll || !q->bands || + (qopt->proxy_maxconn>0 && !q->proxydict) || maciniterr<0) { + if(q->poll) my_free(q->poll); + if(q->bands) my_free(q->bands); + if(q->proxydict) my_free(q->proxydict); + if(maciniterr>=0) mac_done(&q->filter); + return -ENOMEM; + } + + // Initialize proxy: + if(q->proxydict) { + proxyInitMem(q->proxydict,qopt->proxy_maxconn); + } + + // Initialize values: + q->counter_low_penal=0; + q->counter_high_penal=penalty_base_t_max>>1; + q->packets_requed=0; + + // Initialize empty heap: + heap_init(&q->h,q->bandc,q->poll); + q->priosum=0; + + // Initialize each band: + crterr=0; + for (i=0; ibandc; i++) { + weight_setdefault(&q->bands[i].class_modf.weight1); + weight_setdefault(&q->bands[i].class_modf.weight2); + if(!crterr) { + struct Qdisc *child=qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if(child) + q->bands[i].que = child; + else { + // Queue couldn't be created :-( + crterr=1; + } + } + if(crterr) q->bands[i].que = &noop_qdisc; + } + + MOD_INC_USE_COUNT; + + if(crterr) { + // Destroy again: + wrr_destroy(sch); + 
return -ENOMEM; + } + + return 0; +} + +static void wrr_reset(struct Qdisc* sch) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + int i; + + // Reset own values: + q->counter_low_penal=0; + q->counter_high_penal=penalty_base_t_max>>1; + q->packets_requed=0; + + // Reset filter: + mac_reset(&q->filter); + + // Reinitialize heap: + heap_init(&q->h,q->bandc,q->poll); + q->priosum=0; + + // Reset all bands: + for (i=0; ibandc; i++) { + weight_setdefault(&q->bands[i].class_modf.weight1); + weight_setdefault(&q->bands[i].class_modf.weight2); + qdisc_reset(q->bands[i].que); + } + + // Reset proxy remapping information: + if(q->proxydict) + proxyInitMem(q->proxydict,proxyGetMaxConn(q->proxydict)); +} + +static int wrr_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + int retvalue=ENQUEUE_FAIL; + + // The packet is in skb. + int band=mac_classify(&q->filter,skb); + + if(band>=0) { + // Enque packet for this band: + struct Qdisc* qdisc = q->bands[band].que; + + if ((retvalue=qdisc->enqueue(skb, qdisc)) == ENQUEUE_SUCCESS) { + // Successfull + sch->stats.bytes += skb->len; + sch->stats.packets++; + sch->q.qlen++; + + // Insert band into heap if not already there: + if(!heap_contains(&q->h,band)) { + penalty_t p; + if(!heap_empty(&q->h)) + p.ms=heap_get_penalty(&q->h,heap_root(&q->h)).ms; + else + p.ms=0; + p.ls=q->counter_low_penal++; + heap_insert(&q->h,band,p); + q->bands[band].priosum_val= + ((q->bands[band].class_modf.weight1.val>>48)+1)* + ((q->bands[band].class_modf.weight2.val>>48)+1); + q->priosum+=q->bands[band].priosum_val; + } + } + } else { + // If we decide not to enque it seems like we also need to free the packet: + kfree_skb(skb); + } + + if(retvalue!=ENQUEUE_SUCCESS) { + // Packet not enqued: + sch->stats.drops++; + } + + return retvalue; +} + +static struct sk_buff *wrr_dequeue(struct Qdisc* sch) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + 
struct sk_buff* skb; + int band; + u64 weight,priosum; + struct wrrc_sched_data* b; + + // Return if heap is empty: + if(heap_empty(&q->h)) return 0; + + // Find root element: + band=heap_root(&q->h); + + // Find priority of this element in interval [1;2^32] + b=&q->bands[band]; + weight=((b->class_modf.weight1.val>>48)+1)* + ((b->class_modf.weight2.val>>48)+1); //weight is in interval [1;2^32] + priosum=q->priosum; + q->priosum-=q->bands[band].priosum_val; + + // Deque the packet from the root: + skb=q->bands[band].que->dequeue(q->bands[band].que); + + if(skb) { + // There was a packet in this que. + unsigned adjlen; + penalty_t p; + + // Find length of packet adjusted with priority: + adjlen=(u32)(weight>>(32-16)); + if(adjlen==0) adjlen=1; + adjlen=(skb->len<<16)/adjlen; + + // Update penalty information for this class: + weight_transmit(&b->class_modf.weight1,q->qdisc_modf.weight1,q->h.elements,priosum,weight,skb->len); + weight_transmit(&b->class_modf.weight2,q->qdisc_modf.weight2,q->h.elements,priosum,weight,skb->len); + q->bands[band].priosum_val=((b->class_modf.weight1.val>>48)+1)* + ((b->class_modf.weight2.val>>48)+1); + q->priosum+=q->bands[band].priosum_val; + + // And update the class in the heap + p=heap_get_penalty(&q->h,band); + p.ms+=adjlen; + p.ls=q->counter_high_penal++; + heap_set_penalty(&q->h,band,p); + + // Return packet: + sch->q.qlen--; + return skb; + } + + // No packet - so machine should be removed from heap: + heap_remove(&q->h,band); + + // And try again: + return wrr_dequeue(sch); +} + +static int wrr_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + struct Qdisc* qdisc; + int ret; + + // Find band we took it from: + int band=mac_classify(&q->filter,skb); + if(band<0) { + // Who should now free the pakcet? 
+ printk(KERN_DEBUG "sch_wrr: Oops - packet requed could never have been queued.\n"); + sch->stats.drops++; + return ENQUEUE_FAIL; + } + + q->packets_requed++; + + // Try to requeue it on that machine: + qdisc=q->bands[band].que; + + if((ret=qdisc->ops->requeue(skb,qdisc))==ENQUEUE_SUCCESS) { + // On success: + sch->q.qlen++; + + // We should restore priority information - but we don't + // + // p=heap_get_penalty(&q->h,band); + // ... + // heap_set_penalty(&q->h,band,p); + + return ENQUEUE_SUCCESS; + } else { + sch->stats.drops++; + return ret; + } +} + +static unsigned wrr_drop(struct Qdisc* sch) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + + // Ugly... Drop button up in heap: + int i; + + for(i=q->h.elements; i>=1; i--) { + int band=q->h.root_1[i].id; + if(q->bands[band].que->ops->drop(q->bands[band].que)) { + // On success + sch->q.qlen--; + return 1; + } + } + + return 0; +} + +static int wrr_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_wrr_qdisc_stats opt; + + opt.qdisc_crt.qdisc_modf=q->qdisc_modf; + opt.qdisc_crt.srcaddr=q->filter.srcaddr; + opt.qdisc_crt.usemac=q->filter.usemac; + opt.qdisc_crt.usemasq=q->filter.usemasq; + opt.qdisc_crt.bands_max=q->filter.mac_max; + opt.nodes_in_heap=q->h.elements; + opt.bands_cur=q->filter.mac_cur; + opt.bands_reused=q->filter.mac_reused; + opt.packets_requed=q->packets_requed; + opt.priosum=q->priosum; + + if(q->proxydict) { + opt.qdisc_crt.proxy_maxconn=proxyGetMaxConn(q->proxydict); + opt.proxy_curconn=proxyGetCurConn(q->proxydict); + } else { + opt.qdisc_crt.proxy_maxconn=0; + opt.proxy_curconn=0; + } + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: // seems like RTA_PUT jump to this label.. 
+ skb_trim(skb, b - skb->data); + return -1; +} + +static int wrr_tune_std(struct Qdisc *sch, struct rtattr *opt) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + struct tc_wrr_qdisc_modf_std *qopt = RTA_DATA(opt); + + if(opt->rta_len < RTA_LENGTH(sizeof(*qopt))) return -EINVAL; + + LOCK_START + + if(qopt->change_class) { + int idx=lookup_mac(&q->filter,qopt->addr); + weight_setvalue + (&q->bands[idx].class_modf.weight1,&qopt->class_modf.weight1); + weight_setvalue + (&q->bands[idx].class_modf.weight2,&qopt->class_modf.weight2); + } else { + if(qopt->qdisc_modf.weight1.weight_mode!=-1) + q->qdisc_modf.weight1.weight_mode=qopt->qdisc_modf.weight1.weight_mode; + if(qopt->qdisc_modf.weight2.weight_mode!=-1) + q->qdisc_modf.weight2.weight_mode=qopt->qdisc_modf.weight2.weight_mode; + } + + LOCK_END + + return 0; +} + +static int wrr_tune_proxy(struct Qdisc *sch, struct rtattr *opt) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + struct tc_wrr_qdisc_modf_proxy *qopt = RTA_DATA(opt); + int i; + + // Return if we are not configured with proxy support: + if(!q->proxydict) return -ENOSYS; + + // Return if not enough data given: + if(opt->rta_lenrta_len< + RTA_LENGTH(sizeof(*qopt)+sizeof(ProxyRemapBlock)*qopt->changec)) + return -EINVAL; + + LOCK_START; + + if(qopt->reset) { + proxyInitMem(q->proxydict,proxyGetMaxConn(q->proxydict)); + } + + // Do all the changes: + for(i=0; ichangec; i++) { + proxyConsumeBlock(q->proxydict,&((ProxyRemapBlock*)&qopt->changes)[i]); + } + + LOCK_END; + + return 0; +} + +static int wrr_tune(struct Qdisc *sch, struct rtattr *opt) { + if(((struct tc_wrr_qdisc_modf_std*)RTA_DATA(opt))->proxy) { + return wrr_tune_proxy(sch,opt); + } else { + return wrr_tune_std(sch,opt); + } +} + +//----------------------------------------------------------------------------- +// Classes. +// External and internal IDs are equal. They are the band number plus 1. 
+ +// Replace a class with another: +static int wrr_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + if(arg>q->bandc || arg==0) return -EINVAL; + arg--; + + if (new == NULL) + new = &noop_qdisc; + +#ifdef KERNEL22 + *old = xchg(&q->bands[arg].que, new); +#else + LOCK_START + *old = q->bands[arg].que; + q->bands[arg].que = new; + qdisc_reset(*old); + LOCK_END +#endif + + return 0; +} + +// Returns the qdisc for a class: +static struct Qdisc * wrr_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + if(arg>q->bandc || arg==0) return NULL; + arg--; + return q->bands[arg].que; +} + +static unsigned long wrr_get(struct Qdisc *sch, u32 classid) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid); + if(band>q->bandc || band==0) return 0; + return band; +} + +static void wrr_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int wrr_delete(struct Qdisc *sch, unsigned long cl) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + if(cl==0 || cl>q->bandc) return -ENOENT; + cl--; + return 0; +} + +static int wrr_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_wrr_class_stats opt; + + // Handle of this class: + tcm->tcm_handle = sch->handle|cl; + + if(cl==0 || cl>q->bandc) + goto rtattr_failure; + cl--; + + if(cl>=q->filter.mac_cur) { + // Band is unused: + memset(&opt,0,sizeof(opt)); + opt.used=0; + } else { + opt.used=1; + opt.class_modf.weight1=q->bands[cl].class_modf.weight1; + opt.class_modf.weight2=q->bands[cl].class_modf.weight2; + weight_transmit(&opt.class_modf.weight1,q->qdisc_modf.weight1,0,0,0,0); + 
weight_transmit(&opt.class_modf.weight2,q->qdisc_modf.weight2,0,0,0,0); + memcpy(opt.addr,q->filter.cls2mac+cl*ETH_ALEN,ETH_ALEN); + opt.usemac=q->filter.usemac; + opt.heappos=q->h.root_1[cl+1].id2idx; + if(opt.heappos!=0) { // Is in heap + opt.penal_ls=heap_get_penalty(&q->h,cl).ls; + opt.penal_ms=heap_get_penalty(&q->h,cl).ms; + } else { + opt.penal_ls=0; + opt.penal_ms=0; + } + } + + // Put quing information: + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int wrr_change(struct Qdisc *sch, u32 handle, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct tc_wrr_class_modf *copt = RTA_DATA(opt); + + if(cl==0 || cl>q->bandc) return -EINVAL; + cl--; + + if (opt->rta_len < RTA_LENGTH(sizeof(*copt))) return -EINVAL; + + LOCK_START; + + weight_setvalue(&q->bands[cl].class_modf.weight1,&copt->weight1); + weight_setvalue(&q->bands[cl].class_modf.weight2,&copt->weight2); + + LOCK_END; + + return 0; +} + +static void wrr_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct wrr_sched_data *q = (struct wrr_sched_data *)sch->data; + int prio; + + if (arg->stop) return; + + for (prio = 1; prio <= q->bandc; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, prio, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** wrr_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + +static unsigned long wrr_bind(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return wrr_get(sch, classid); +} + +//----------------------------------------------------------------------------- +// Generel + +static struct Qdisc_class_ops wrr_class_ops = +{ + wrr_graft, + wrr_leaf, + + wrr_get, + wrr_put, + wrr_change, + wrr_delete, + wrr_walk, + + 
wrr_find_tcf, + wrr_bind, + wrr_put, + +#if !defined(KERNEL22) || defined(CONFIG_RTNETLINK) + wrr_dump_class, +#endif +}; + +struct Qdisc_ops wrr_qdisc_ops = +{ + NULL, + &wrr_class_ops, + "wrr", + sizeof(struct wrr_sched_data), + + wrr_enqueue, + wrr_dequeue, + wrr_requeue, + wrr_drop, + + wrr_init, + wrr_reset, + wrr_destroy, + wrr_tune, + +#if !defined(KERNEL22) || defined(CONFIG_RTNETLINK) + wrr_dump, +#endif +}; + +#ifdef MODULE + +int init_module(void) +{ + return register_qdisc(&wrr_qdisc_ops); +} + +void cleanup_module(void) +{ + unregister_qdisc(&wrr_qdisc_ops); +} + +#ifndef KERNEL22 + MODULE_LICENSE("GPL"); +#endif + +#endif +