diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/Documentation/Configure.help linux-2.2.19-vs-1.0.8/Documentation/Configure.help
--- linux-2.2.19/Documentation/Configure.help	Tue Mar 27 09:33:35 2001
+++ linux-2.2.19-vs-1.0.8/Documentation/Configure.help	Tue Mar 27 09:32:02 2001
@@ -2807,6 +2807,118 @@
   The module will be called ip_masq_markfw.o. If you want to compile
   it as a module, say M here and read Documentation/modules.txt.
 
+IP: masquerading virtual server support
+CONFIG_IP_MASQUERADE_VS
+  IP Virtual Server support lets you build a virtual server based on
+  a cluster of two or more real servers. This option must be enabled
+  on at least one of the clustered computers, which will take care of
+  intercepting incoming connections to a single IP address and
+  scheduling them to the real servers.
+
+  Three request dispatching techniques are implemented: virtual server
+  via NAT, virtual server via tunneling, and virtual server via direct
+  routing. The round-robin, weighted round-robin, least-connection,
+  weighted least-connection, locality-based least-connection, or
+  locality-based least-connection with replication scheduling
+  algorithm can be used to choose which server a connection is
+  directed to, so that load is balanced among the servers.
+  For more information and the administration program, please visit
+  the following URL:
+
+	http://www.linuxvirtualserver.org/
+  If you want this, say Y.
+
+IP virtual server debugging
+CONFIG_IP_VS_DEBUG
+  Say Y here if you want to get additional messages useful in
+  debugging the IP virtual server code. You can change the debug
+  level in /proc/sys/net/ipv4/vs/debug_level.
+
+IP masquerading VS table size (the Nth power of 2)
+CONFIG_IP_MASQUERADE_VS_TAB_BITS
+  Using a large IPVS hash table for the virtual server greatly reduces
+  collisions in the hash table when there are hundreds of thousands
+  of active connections.
+
+  Note that the table size must be a power of 2; the number entered
+  here is the exponent, so the table size is 2 raised to that power.
+  For example, the default value of 12 gives a table size of 4096.
+  Don't make the number too small, or performance will suffer. You can
+  adapt the table size to your virtual server application: a good rule
+  is to make it not much smaller than the number of connections per
+  second multiplied by the average time a connection stays in the
+  masquerading table. For example, if your virtual server receives 200
+  connections per second and a connection stays in the masquerading
+  table for 200 seconds on average, the table should hold on the order
+  of 200x200 entries, so 32768 (2**15) is a reasonable size.
+
+  Note also that each connection effectively occupies 128 bytes and
+  each hash entry uses 8 bytes, so you can estimate how much memory is
+  needed for your box (a rough estimate is sketched below).
+
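As a rough illustration of the sizing arithmetic above, the following stand-alone user-space sketch estimates the memory needed for the example figures given in that help text (200 connections per second, 200 seconds average lifetime, a 2**15 hash table). The 128-byte and 8-byte figures come from the paragraph above; the input values are just sample numbers, not part of the patch.

	#include <stdio.h>

	int main(void)
	{
		unsigned long conns_per_sec = 200;   /* example rate from the help text */
		unsigned long avg_seconds   = 200;   /* average time an entry stays in the table */
		unsigned long tab_bits      = 15;    /* CONFIG_IP_MASQUERADE_VS_TAB_BITS */

		unsigned long entries  = conns_per_sec * avg_seconds;  /* ~40000 concurrent entries */
		unsigned long tab_size = 1UL << tab_bits;               /* 32768 buckets */
		unsigned long bytes    = entries * 128 + tab_size * 8;  /* figures from the help text */

		printf("approx. %lu KB of memory\n", bytes / 1024);
		return 0;
	}

For the example numbers this works out to roughly 5 MB, which is why the help text suggests adapting the table size to the expected connection load.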
+IPVS: round-robin scheduling
+CONFIG_IP_MASQUERADE_VS_RR
+  The round-robin scheduling algorithm simply directs network
+  connections to different real servers in a round-robin manner.
+  If you want to compile it in kernel, say Y. If you want to compile
+  it as a module, say M here and read Documentation/modules.txt.
+
+IPVS: weighted round-robin scheduling
+CONFIG_IP_MASQUERADE_VS_WRR
+  The weighted round-robin scheduling algorithm directs network
+  connections to different real servers based on server weights in a
+  round-robin manner. Servers with higher weights receive new
+  connections before those with lower weights, servers with higher
+  weights get more connections than those with lower weights, and
+  servers with equal weights get an equal number of connections.
+  If you want to compile it in kernel, say Y. If you want to compile
+  it as a module, say M here and read Documentation/modules.txt.
+
+IPVS: least-connection scheduling
+CONFIG_IP_MASQUERADE_VS_LC
+  The least-connection scheduling algorithm directs network
+  connections to the server with the least number of active
+  connections.
+  If you want to compile it in kernel, say Y. If you want to compile
+  it as a module, say M here and read Documentation/modules.txt.
+
+IPVS: weighted least-connection scheduling
+CONFIG_IP_MASQUERADE_VS_WLC
+  The weighted least-connection scheduling algorithm directs network
+  connections to the server with the least number of active
+  connections normalized by the server weight.
+  If you want to compile it in kernel, say Y. If you want to compile
+  it as a module, say M here and read Documentation/modules.txt.
+
+IPVS: locality-based least-connection scheduling
+CONFIG_IP_MASQUERADE_VS_LBLC
+  The locality-based least-connection scheduling algorithm is for
+  destination-IP load balancing and is usually used in a cache
+  cluster. It normally directs packets destined for an IP address to
+  the server assigned to that address, as long as the server is alive
+  and not overloaded. If that server is overloaded (its number of
+  active connections is larger than its weight) and another server is
+  at half of its load, the weighted least-connection server is
+  assigned to that IP address instead.
+  If you want to compile it in kernel, say Y. If you want to compile
+  it as a module, say M here and read Documentation/modules.txt.
+
+IPVS: locality-based least-connection with replication scheduling
+CONFIG_IP_MASQUERADE_VS_LBLCR
+  The locality-based least-connection with replication scheduling
+  algorithm is also for destination-IP load balancing and is usually
+  used in a cache cluster. It differs from LBLC scheduling as follows:
+  the load balancer maintains a mapping from each target to the set of
+  server nodes that can serve that target. Requests for a target are
+  assigned to the least-connection node in the target's server set. If
+  all the nodes in the server set are overloaded, the scheduler picks
+  the least-connection node in the whole cluster and adds it to the
+  server set for the target. If the server set has not been modified
+  for the specified time, the most loaded node is removed from the
+  server set in order to avoid a high degree of replication.
+  If you want to compile it in kernel, say Y. If you want to compile
+  it as a module, say M here and read Documentation/modules.txt.
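To make the scheduler descriptions above concrete, here is a minimal sketch of how a weighted least-connection style pick over the destination list of a virtual service could look, using struct ip_vs_service and struct ip_vs_dest as defined by this patch in include/net/ip_vs.h further below. It is illustrative only: the function name is made up, locking is assumed to be handled by the caller, and it does not reproduce the actual ip_vs_wlc.c module; the real schedulers register themselves through struct ip_vs_scheduler and its schedule callback.

	#include <linux/list.h>
	#include <asm/atomic.h>
	#include <net/ip_vs.h>	/* struct ip_vs_service, struct ip_vs_dest (added by this patch) */

	/*
	 * Hypothetical example: return the destination with the smallest
	 * ratio of active connections to weight.  The caller is assumed
	 * to hold the appropriate read lock on the service.
	 */
	static struct ip_vs_dest *example_wlc_pick(struct ip_vs_service *svc)
	{
		struct list_head *l, *e;
		struct ip_vs_dest *dest, *least = NULL;

		l = &svc->destinations;
		for (e = l->next; e != l; e = e->next) {
			dest = list_entry(e, struct ip_vs_dest, n_list);
			if (dest->weight <= 0)
				continue;	/* skip destinations with no weight */
			/* compare activeconns/weight without dividing */
			if (least == NULL ||
			    atomic_read(&dest->activeconns) * least->weight <
			    atomic_read(&least->activeconns) * dest->weight)
				least = dest;
		}
		return least;	/* NULL if no usable destination was found */
	}

The cross-multiplied comparison avoids integer division while still ordering servers by connections per unit of weight; a plain least-connection scheduler would simply drop the weight factors.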
+ IP: aliasing support CONFIG_IP_ALIAS Sometimes it is useful to give several IP addresses to a single diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/ip_masq.h linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h --- linux-2.2.19/include/linux/ip_masq.h Sat Oct 23 17:02:32 1999 +++ linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h Tue Dec 12 19:17:27 2000 @@ -103,6 +103,27 @@ #define IP_MASQ_MFW_SCHED 0x01 +/* + * Virtual server stuff + */ +struct ip_vs_user { + /* virtual service options */ + u_int16_t protocol; + u_int32_t vaddr; /* virtual address */ + u_int16_t vport; + u_int32_t vfwmark; /* firwall mark of virtual */ + unsigned vs_flags; /* virtual service flags */ + unsigned timeout; /* persistent timeout in ticks */ + u_int32_t netmask; /* persistent netmask */ + + /* destination specific options */ + u_int32_t daddr; /* real destination address */ + u_int16_t dport; + unsigned masq_flags; /* destination flags */ + int weight; /* destination weight */ +}; + + #define IP_FW_MASQCTL_MAX 256 #define IP_MASQ_TNAME_MAX 32 @@ -115,6 +136,7 @@ struct ip_autofw_user autofw_user; struct ip_mfw_user mfw_user; struct ip_masq_user user; + struct ip_vs_user vs_user; unsigned char m_raw[IP_FW_MASQCTL_MAX]; } u; }; @@ -124,7 +146,9 @@ #define IP_MASQ_TARGET_CORE 1 #define IP_MASQ_TARGET_MOD 2 /* masq_mod is selected by "name" */ #define IP_MASQ_TARGET_USER 3 -#define IP_MASQ_TARGET_LAST 4 +#define IP_MASQ_TARGET_VS 4 +#define IP_MASQ_TARGET_LAST 5 + #define IP_MASQ_CMD_NONE 0 /* just peek */ #define IP_MASQ_CMD_INSERT 1 @@ -136,5 +160,9 @@ #define IP_MASQ_CMD_LIST 7 /* actually fake: done via /proc */ #define IP_MASQ_CMD_ENABLE 8 #define IP_MASQ_CMD_DISABLE 9 +#define IP_MASQ_CMD_ADD_DEST 10 /* for adding dest in IPVS */ +#define IP_MASQ_CMD_DEL_DEST 11 /* for deleting dest in IPVS */ +#define IP_MASQ_CMD_SET_DEST 12 /* for setting dest in IPVS */ #endif /* _LINUX_IP_MASQ_H */ + diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/sysctl.h linux-2.2.19-vs-1.0.8/include/linux/sysctl.h --- linux-2.2.19/include/linux/sysctl.h Tue Mar 27 09:33:48 2001 +++ linux-2.2.19-vs-1.0.8/include/linux/sysctl.h Tue Mar 27 09:32:20 2001 @@ -196,6 +196,7 @@ NET_IPV4_NEIGH=17, NET_IPV4_ROUTE=18, NET_IPV4_FIB_HASH=19, + NET_IPV4_VS=20, NET_IPV4_TCP_TIMESTAMPS=33, NET_IPV4_TCP_WINDOW_SCALING=34, @@ -275,6 +276,32 @@ NET_IPV4_CONF_LOG_MARTIANS=11, NET_IPV4_CONF_HIDDEN=12, NET_IPV4_CONF_ARPFILTER=13 +}; + +/* /proc/sys/net/ipv4/vs */ + +enum +{ + NET_IPV4_VS_AMEMTHRESH=1, + NET_IPV4_VS_AMDROPRATE=2, + NET_IPV4_VS_DROP_ENTRY=3, + NET_IPV4_VS_DROP_PACKET=4, + NET_IPV4_VS_SECURE_TCP=5, + NET_IPV4_VS_TO_ES=6, + NET_IPV4_VS_TO_SS=7, + NET_IPV4_VS_TO_SR=8, + NET_IPV4_VS_TO_FW=9, + NET_IPV4_VS_TO_TW=10, + NET_IPV4_VS_TO_CL=11, + NET_IPV4_VS_TO_CW=12, + NET_IPV4_VS_TO_LA=13, + NET_IPV4_VS_TO_LI=14, + NET_IPV4_VS_TO_SA=15, + NET_IPV4_VS_TO_UDP=16, + NET_IPV4_VS_TO_ICMP=17, + NET_IPV4_VS_DEBUG_LEVEL=18, + NET_IPV4_VS_LBLC_EXPIRE=19, + NET_IPV4_VS_LBLCR_EXPIRE=20, }; /* /proc/sys/net/ipv6 */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip.h linux-2.2.19-vs-1.0.8/include/net/ip.h --- linux-2.2.19/include/net/ip.h Tue Mar 27 09:33:48 2001 +++ linux-2.2.19-vs-1.0.8/include/net/ip.h Tue Mar 27 17:48:23 2001 @@ -47,6 +47,9 @@ #define IPSKB_MASQUERADED 1 #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 +#ifdef CONFIG_IP_MASQUERADE_VS +#define IPSKB_REDIRECTED 8 +#endif }; struct ipcm_cookie diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_masq.h 
linux-2.2.19-vs-1.0.8/include/net/ip_masq.h --- linux-2.2.19/include/net/ip_masq.h Tue Mar 27 09:33:48 2001 +++ linux-2.2.19-vs-1.0.8/include/net/ip_masq.h Wed Apr 18 16:17:59 2001 @@ -12,8 +12,15 @@ #include #include #include +#ifdef CONFIG_SYSCTL +#include +#endif #endif /* __KERNEL__ */ +#ifdef CONFIG_IP_MASQUERADE_VS +struct ip_vs_dest; +#endif + /* * This define affects the number of ports that can be handled * by each of the protocol helper modules. @@ -66,10 +73,6 @@ #define IP_MASQ_MOD_CTL 0x00 #define IP_MASQ_USER_CTL 0x01 -#ifdef __KERNEL__ - -#define IP_MASQ_TAB_SIZE 256 - #define IP_MASQ_F_NO_DADDR 0x0001 /* no daddr yet */ #define IP_MASQ_F_NO_DPORT 0x0002 /* no dport set yet */ #define IP_MASQ_F_NO_SADDR 0x0004 /* no sport set yet */ @@ -86,6 +89,22 @@ #define IP_MASQ_F_USER 0x2000 /* from uspace */ #define IP_MASQ_F_SIMPLE_HASH 0x8000 /* prevent s+d and m+d hashing */ +#ifdef CONFIG_IP_MASQUERADE_VS +#define IP_MASQ_F_VS 0x00010000 /* virtual server related */ +#define IP_MASQ_F_VS_NO_OUTPUT 0x00020000 /* output packets avoid masq */ +#define IP_MASQ_F_VS_INACTIVE 0x00040000 /* not established */ +#define IP_MASQ_F_VS_FWD_MASK 0x00700000 /* mask for the fdw method */ +#define IP_MASQ_F_VS_LOCALNODE 0x00100000 /* local node destination */ +#define IP_MASQ_F_VS_TUNNEL 0x00200000 /* packets will be tunneled */ +#define IP_MASQ_F_VS_DROUTE 0x00400000 /* direct routing */ + /* masquerading otherwise */ +#define IP_MASQ_VS_FWD(ms) (ms->flags & IP_MASQ_F_VS_FWD_MASK) +#endif /* CONFIG_IP_MASQUERADE_VS */ + +#ifdef __KERNEL__ + +#define IP_MASQ_TAB_SIZE 256 + /* * Delta seq. info structure * Each MASQ struct has 2 (output AND input seq. changes). @@ -114,9 +133,13 @@ struct ip_masq *control; /* Master control connection */ atomic_t n_control; /* Number of "controlled" masqs */ unsigned flags; /* status flags */ - unsigned timeout; /* timeout */ + unsigned long timeout; /* timeout */ unsigned state; /* state info */ struct ip_masq_timeout_table *timeout_table; +#ifdef CONFIG_IP_MASQUERADE_VS + struct ip_vs_dest *dest; /* real server */ + atomic_t in_pkts; /* incoming packet counter */ +#endif /* CONFIG_IP_MASQUERADE_VS */ }; /* @@ -179,7 +202,7 @@ extern struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE]; extern const char * ip_masq_state_name(int state); extern struct ip_masq_hook *ip_masq_user_hook; -extern u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope); +extern int ip_masq_select_addr(struct sk_buff *skb,__u32 *maddr); /* * * IP_MASQ_APP: IP application masquerading definitions @@ -354,6 +377,10 @@ static const char *strProt[] = {"UDP","TCP","ICMP"}; int msproto = masq_proto_num(proto); +#ifdef CONFIG_IP_MASQUERADE_VS + if (proto == IPPROTO_IP) + return "IP "; +#endif /* CONFIG_IP_MASQUERADE_VS */ if (msproto<0||msproto>2) { sprintf(buf, "IP_%d", proto); return buf; @@ -372,6 +399,9 @@ IP_MASQ_S_CLOSE_WAIT, IP_MASQ_S_LAST_ACK, IP_MASQ_S_LISTEN, +#ifdef CONFIG_IP_MASQUERADE_VS + IP_MASQ_S_SYNACK, +#endif IP_MASQ_S_UDP, IP_MASQ_S_ICMP, IP_MASQ_S_LAST @@ -395,8 +425,33 @@ if (!mstim) return; + ms->timeout_table = NULL; atomic_dec(&mstim->refcnt); } + +#ifdef CONFIG_IP_MASQUERADE_VS + +extern struct ip_masq_timeout_table masq_timeout_table_dos; +extern void ip_masq_secure_tcp_set(int on); + +/* + * This is a simple mechanism to ignore packets when + * we are loaded. 
Just set ip_masq_drop_rate to 'n' and + * we start to drop 1/n of the packets + */ + +extern int ip_masq_drop_rate; +extern int ip_masq_drop_counter; + +static __inline__ int ip_masq_todrop(void) +{ + if (!ip_masq_drop_rate) return 0; + if (--ip_masq_drop_counter > 0) return 0; + ip_masq_drop_counter = ip_masq_drop_rate; + return 1; +} + +#endif /* CONFIG_IP_MASQUERADE_VS */ #endif /* __KERNEL__ */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_vs.h linux-2.2.19-vs-1.0.8/include/net/ip_vs.h --- linux-2.2.19/include/net/ip_vs.h Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/include/net/ip_vs.h Tue Apr 24 18:07:00 2001 @@ -0,0 +1,392 @@ +/* + * IP virtual server + * data structure and functionality definitions + */ + +#include + +#ifndef _IP_VS_H +#define _IP_VS_H + +#define IP_VS_VERSION_CODE 0x010008 +#define NVERSION(version) \ + (version >> 16) & 0xFF, \ + (version >> 8) & 0xFF, \ + version & 0xFF + +/* + * Virtual Service Flags + */ +#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */ +#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */ + +/* + * Destination Server Flags + */ +#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */ + +/* + * The default IP_VS_TEMPLATE_TIMEOUT is a little larger than average + * connection time plus MASQUERADE_EXPIRE_TCP_FIN(2*60*HZ). Because the + * template won't be released until its controlled masq entries are + * expired. + * If IP_VS_TEMPLATE_TIMEOUT is too less, the template will soon expire + * and will be put in expire again and again, which requires additional + * overhead. If it is too large, the same will always visit the same + * server, which will make dynamic load imbalance worse. + */ +#define IP_VS_TEMPLATE_TIMEOUT 6*60*HZ + +#ifdef __KERNEL__ + +extern int ip_vs_forwarding_related_icmp(struct sk_buff *skb); + +#ifdef CONFIG_IP_VS_DEBUG +extern int ip_vs_get_debug_level(void); +#define IP_VS_DBG(level, msg...) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "IPVS: " ## msg); \ + } while (0) +#else /* NO DEBUGGING at ALL */ +#define IP_VS_DBG(level, msg...) do {} while (0) +#endif + +#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " ## msg ) +#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " ## msg ) +#define IP_VS_WARNING(msg...) 
\ + printk(KERN_WARNING "IPVS: " ## msg) + +#ifdef CONFIG_IP_VS_DEBUG +#define EnterFunction(level) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "Enter: %s, %s line %i\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + } while (0) +#define LeaveFunction(level) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "Leave: %s, %s line %i\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + } while (0) +#else +#define EnterFunction(level) do {} while (0) +#define LeaveFunction(level) do {} while (0) +#endif + + +/* + * IPVS statistics object + */ +struct ip_vs_stats +{ + spinlock_t lock; /* spin lock */ + __u32 conns; /* connections scheduled */ + __u32 inpkts; /* incoming packets */ + __u32 outpkts; /* outgoing packets */ + __u64 inbytes; /* incoming bytes */ + __u64 outbytes; /* outgoing bytes */ +}; + + +/* + * The real server destination forwarding entry + * with ip address, port + */ +struct ip_vs_dest { + struct list_head n_list; /* for the dests in the service */ + struct list_head d_list; /* for table with all the dests */ + + __u32 addr; /* IP address of real server */ + __u16 port; /* port number of the service */ + unsigned flags; /* dest status flags */ + unsigned masq_flags; /* flags to copy to masq */ + atomic_t activeconns; /* active connections */ + atomic_t inactconns; /* inactive connections */ + atomic_t refcnt; /* reference counter */ + int weight; /* server weight */ + struct ip_vs_stats stats; /* statistics */ + + /* for virtual service */ + struct ip_vs_service *svc; /* service that it belongs to */ + __u16 protocol; /* which protocol (TCP/UDP) */ + __u32 vaddr; /* IP address for virtual service */ + __u16 vport; /* port number for the service */ + __u32 vfwmark; /* firewall mark of the service */ +}; + + +/* + * The scheduler object + */ +struct ip_vs_scheduler { + struct list_head n_list; /* d-linked list head */ + char *name; /* scheduler name */ + atomic_t refcnt; /* reference counter */ + + /* scheduler initializing service */ + int (*init_service)(struct ip_vs_service *svc); + /* scheduling service finish */ + int (*done_service)(struct ip_vs_service *svc); + /* scheduler updating service */ + int (*update_service)(struct ip_vs_service *svc); + + /* selecting a server from the given service */ + struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, + struct iphdr *iph); +}; + + +/* + * The information about the virtual service offered to the net + * and the forwarding entries + */ +struct ip_vs_service { + struct list_head s_list; /* for normal service table */ + struct list_head f_list; /* for fwmark-based service table */ + atomic_t refcnt; /* reference counter */ + + __u16 protocol; /* which protocol (TCP/UDP) */ + __u32 addr; /* IP address for virtual service */ + __u16 port; /* port number for the service */ + __u32 fwmark; /* firewall mark of the service */ + unsigned flags; /* service status flags */ + unsigned timeout; /* persistent timeout in ticks */ + __u32 netmask; /* grouping granularity */ + struct list_head destinations; /* real server d-linked list */ + struct ip_vs_stats stats; /* statistics for the service */ + + /* for scheduling */ + struct ip_vs_scheduler *scheduler; /* bound scheduler object */ + void *sched_data; /* scheduler application data */ +}; + + +/* + * IP Virtual Server masq entry hash table + */ +#define IP_VS_TAB_BITS CONFIG_IP_MASQUERADE_VS_TAB_BITS +#define IP_VS_TAB_SIZE (1 << IP_VS_TAB_BITS) +#define IP_VS_TAB_MASK (IP_VS_TAB_SIZE - 1) +extern struct list_head *ip_vs_table; + +/* + * 
Hash and unhash functions + */ +extern int ip_vs_hash(struct ip_masq *ms); +extern int ip_vs_unhash(struct ip_masq *ms); + +/* + * Registering/unregistering scheduler functions + */ +extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); +extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); + +/* + * Lookup functions for the hash table (caller must lock table) + */ +extern struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); +extern struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); + +/* + * Creating a masquerading entry for IPVS + */ +extern struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags); + +/* + * IPVS data and functions + */ +extern rwlock_t __ip_vs_lock; + +extern void ip_vs_set_state(struct ip_masq *ms, int new_state); +extern void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest); +extern void ip_vs_unbind_masq(struct ip_masq *ms); + +extern int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen); +extern struct ip_vs_service * +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport); +extern struct ip_vs_service * ip_vs_lookup_svc_fwm(__u32 fwmark); +extern struct ip_vs_dest * +__ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport); +extern struct ip_vs_dest *ip_vs_lookup_dest(struct ip_vs_service *svc, + __u32 daddr, __u16 dport); +extern struct ip_masq * ip_vs_schedule(struct ip_vs_service *svc, + struct iphdr *iph); +extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb); +extern int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr); +extern int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr); + +/* + * init function + */ +extern int ip_vs_init(void); + +/* + * init function prototypes for scheduling modules + * these function will be called when they are built in kernel + */ +extern int ip_vs_rr_init(void); +extern int ip_vs_wrr_init(void); +extern int ip_vs_lc_init(void); +extern int ip_vs_wlc_init(void); +extern int ip_vs_lblc_init(void); +extern int ip_vs_lblcr_init(void); + + +/* + * Slow timer functions for IPVS + */ +extern void add_sltimer(struct timer_list * timer); +extern int del_sltimer(struct timer_list * timer); +extern void mod_sltimer(struct timer_list *timer, unsigned long expires); + + +/* + * IP Virtual Server statistics + */ +extern struct ip_vs_stats ip_vs_stats; + +extern __inline__ void +ip_vs_in_stats(struct ip_masq *ms, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = ms->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.inpkts++; + dest->stats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.inpkts++; + dest->svc->stats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.inpkts++; + ip_vs_stats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +extern __inline__ void +ip_vs_out_stats(struct ip_masq *ms, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = ms->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.outpkts++; + dest->stats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.outpkts++; + dest->svc->stats.outbytes += skb->len; + 
spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.outpkts++; + ip_vs_stats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +extern __inline__ void +ip_vs_conn_stats(struct ip_masq *ms, struct ip_vs_service *svc) +{ + spin_lock(&ms->dest->stats.lock); + ms->dest->stats.conns++; + spin_unlock(&ms->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +/* + * ip_vs_fwd_tag returns the forwarding tag of the masq + */ +extern __inline__ char ip_vs_fwd_tag(struct ip_masq *ms) +{ + char fwd = 'M'; + + switch (IP_MASQ_VS_FWD(ms)) { + case IP_MASQ_F_VS_LOCALNODE: fwd = 'L'; break; + case IP_MASQ_F_VS_TUNNEL: fwd = 'T'; break; + case IP_MASQ_F_VS_DROUTE: fwd = 'R'; break; + } + return fwd; +} + + +extern __inline__ char * ip_vs_fwd_name(unsigned masq_flags) +{ + char *fwd; + + switch (masq_flags & IP_MASQ_F_VS_FWD_MASK) { + case IP_MASQ_F_VS_LOCALNODE: + fwd = "Local"; + break; + case IP_MASQ_F_VS_TUNNEL: + fwd = "Tunnel"; + break; + case IP_MASQ_F_VS_DROUTE: + fwd = "Route"; + break; + default: + fwd = "Masq"; + } + return fwd; +} + + +/* + * ip_vs_forward forwards the packet through tunneling, direct + * routing or local node (passing to the upper layer). + * Return values mean: + * 0 skb must be passed to the upper layer + * -1 skb must be released + * -2 skb has been released + */ +extern __inline__ int ip_vs_forward(struct sk_buff *skb, struct ip_masq *ms) +{ + int ret = -1; + + atomic_inc(&ms->in_pkts); + + switch (IP_MASQ_VS_FWD(ms)) { + case IP_MASQ_F_VS_TUNNEL: + if (ip_vs_tunnel_xmit(skb, ms->saddr) == 0) { + IP_VS_DBG(10, "tunneling failed.\n"); + } else { + IP_VS_DBG(10, "tunneling succeeded.\n"); + } + ret = -2; + break; + + case IP_MASQ_F_VS_DROUTE: + if (ip_vs_dr_xmit(skb, ms->saddr) == 0) { + IP_VS_DBG(10, "direct routing failed.\n"); + } else { + IP_VS_DBG(10, "direct routing succeeded.\n"); + } + ret = -2; + break; + + case IP_MASQ_F_VS_LOCALNODE: + ret = 0; + } + + return ret; +} + +#endif /* __KERNEL__ */ + +#endif /* _IP_VS_H */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Config.in linux-2.2.19-vs-1.0.8/net/ipv4/Config.in --- linux-2.2.19/net/ipv4/Config.in Sat Dec 16 23:10:12 2000 +++ linux-2.2.19-vs-1.0.8/net/ipv4/Config.in Tue Dec 12 18:35:06 2000 @@ -51,6 +51,17 @@ tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW fi + bool 'IP: masquerading virtual server support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_VS + if [ "$CONFIG_IP_MASQUERADE_VS" = "y" ]; then + bool ' IP virtual server debugging' CONFIG_IP_VS_DEBUG + int ' IP masquerading VS table size (the Nth power of 2)' CONFIG_IP_MASQUERADE_VS_TAB_BITS 12 + tristate ' IPVS: round-robin scheduling' CONFIG_IP_MASQUERADE_VS_RR + tristate ' IPVS: weighted round-robin scheduling' CONFIG_IP_MASQUERADE_VS_WRR + tristate ' IPVS: least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LC + tristate ' IPVS: weighted least-connection scheduling' CONFIG_IP_MASQUERADE_VS_WLC + tristate ' IPVS: locality-based least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LBLC + tristate ' IPVS: locality-based least-connection with replication scheduling' CONFIG_IP_MASQUERADE_VS_LBLCR + fi fi fi fi diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Makefile linux-2.2.19-vs-1.0.8/net/ipv4/Makefile --- 
linux-2.2.19/net/ipv4/Makefile Tue Jan 5 07:31:34 1999 +++ linux-2.2.19-vs-1.0.8/net/ipv4/Makefile Sat Dec 2 22:32:10 2000 @@ -91,6 +91,58 @@ endif +ifeq ($(CONFIG_IP_MASQUERADE_VS),y) + IPV4X_OBJS += ip_vs.o + + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),y) + IPV4_OBJS += ip_vs_rr.o + else + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),m) + M_OBJS += ip_vs_rr.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),y) + IPV4_OBJS += ip_vs_wrr.o + else + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),m) + M_OBJS += ip_vs_wrr.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),y) + IPV4_OBJS += ip_vs_lc.o + else + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),m) + M_OBJS += ip_vs_lc.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),y) + IPV4_OBJS += ip_vs_wlc.o + else + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),m) + M_OBJS += ip_vs_wlc.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),y) + IPV4_OBJS += ip_vs_lblc.o + else + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),m) + M_OBJS += ip_vs_lblc.o + endif + endif + + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),y) + IPV4_OBJS += ip_vs_lblcr.o + else + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),m) + M_OBJS += ip_vs_lblcr.o + endif + endif +endif + M_OBJS += ip_masq_user.o M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_forward.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c --- linux-2.2.19/net/ipv4/ip_forward.c Fri Jan 7 09:45:02 2000 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c Fri Feb 2 15:38:28 2001 @@ -41,6 +41,9 @@ #include #ifdef CONFIG_IP_MASQUERADE #include +#ifdef CONFIG_IP_MASQUERADE_VS +#include +#endif #endif #include #include @@ -103,6 +106,14 @@ } #endif +#ifdef CONFIG_IP_MASQUERADE_VS + if (iph->protocol == IPPROTO_ICMP && + !(IPCB(skb)->flags&IPSKB_MASQUERADED)) { + /* Related ICMP packet for IPVS ? */ + fw_res = ip_vs_forwarding_related_icmp(skb); + if (fw_res > 0) return ip_local_deliver(skb); + } +#endif #ifdef CONFIG_IP_TRANSPARENT_PROXY if (ip_chksock(skb)) diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_input.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c --- linux-2.2.19/net/ipv4/ip_input.c Tue Mar 27 09:33:49 2001 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c Tue Mar 27 09:32:21 2001 @@ -250,6 +250,15 @@ */ { int ret; + +#ifdef CONFIG_IP_MASQUERADE_VS + if((IPCB(skb)->flags&IPSKB_REDIRECTED)) { + printk(KERN_DEBUG "ip_input(): ipvs recursion detected. Check ipvs configuration\n"); + kfree_skb(skb); + return 0; + } +#endif + /* * Some masq modules can re-inject packets if * bad configured. 
@@ -262,6 +271,12 @@ } ret = ip_fw_demasquerade(&skb); +#ifdef CONFIG_IP_MASQUERADE_VS + if (ret == -2) { + /* skb has already been released */ + return 0; + } +#endif if (ret < 0) { kfree_skb(skb); return 0; diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_masq.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c --- linux-2.2.19/net/ipv4/ip_masq.c Tue Mar 27 09:33:49 2001 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c Wed Apr 18 19:58:48 2001 @@ -50,7 +50,12 @@ * Kai Bankett : do not toss other IP protos in proto_doff() * Dan Kegel : pointed correct NAT behavior for UDP streams * Julian Anastasov : use daddr and dport as hash keys - * + * Wensong Zhang : Added virtual server support + * Peter Kese : added masq TCP state handling for input-only + * Julian Anastasov : step to mSR after SYN in INPUT_ONLY table + * Julian Anastasov : fixed huge expire bug for IPVS after bad checksum + * Wensong Zhang : added server status checking for IPVS + * */ #include @@ -85,6 +90,10 @@ #include #include +#ifdef CONFIG_IP_MASQUERADE_VS +#include +#endif /* CONFIG_IP_MASQUERADE_VS */ + int sysctl_ip_masq_debug = 0; int sysctl_ip_masq_udp_dloose = 0; @@ -98,6 +107,21 @@ struct ip_masq_hook *ip_masq_user_hook = NULL; +#ifdef CONFIG_IP_MASQUERADE_VS +/* + * Use different state/timeout tables + */ +#ifndef IP_MASQ_MANY_STATE_TABLES +#define IP_MASQ_MANY_STATE_TABLES +#endif + +int ip_masq_drop_rate = 0; +int ip_masq_drop_counter = 0; + +#endif + +#ifndef CONFIG_IP_MASQUERADE_VS + /* * Timeout table[state] */ @@ -106,38 +130,104 @@ ATOMIC_INIT(0), /* refcnt */ 0, /* scale */ { - 30*60*HZ, /* IP_MASQ_S_NONE, */ - 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */ - 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */ - 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */ - 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */ - 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */ - 10*HZ, /* IP_MASQ_S_CLOSE, */ - 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */ - 30*HZ, /* IP_MASQ_S_LAST_ACK, */ - 2*60*HZ, /* IP_MASQ_S_LISTEN, */ - 5*60*HZ, /* IP_MASQ_S_UDP, */ - 1*60*HZ, /* IP_MASQ_S_ICMP, */ - 2*HZ,/* IP_MASQ_S_LAST */ + [IP_MASQ_S_NONE] = 30*60*HZ, + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ, + [IP_MASQ_S_SYN_SENT] = 2*60*HZ, + [IP_MASQ_S_SYN_RECV] = 1*60*HZ, + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ, + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ, + [IP_MASQ_S_CLOSE] = 10*HZ, + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ, + [IP_MASQ_S_LAST_ACK] = 30*HZ, + [IP_MASQ_S_LISTEN] = 2*60*HZ, + [IP_MASQ_S_UDP] = 5*60*HZ, + [IP_MASQ_S_ICMP] = 1*60*HZ, + [IP_MASQ_S_LAST] = 2*HZ, }, /* timeout */ }; +#else /* CONFIG_IP_MASQUERADE_VS */ + +/* + * Timeout table[state] + */ +/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */ +static struct ip_masq_timeout_table masq_timeout_table = { + ATOMIC_INIT(0), /* refcnt */ + 0, /* scale */ + { + [IP_MASQ_S_NONE] = 30*60*HZ, + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ, + [IP_MASQ_S_SYN_SENT] = 2*60*HZ, + [IP_MASQ_S_SYN_RECV] = 1*60*HZ, + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ, + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ, + [IP_MASQ_S_CLOSE] = 10*HZ, + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ, + [IP_MASQ_S_LAST_ACK] = 30*HZ, + [IP_MASQ_S_LISTEN] = 2*60*HZ, + [IP_MASQ_S_SYNACK] = 120*HZ, + [IP_MASQ_S_UDP] = 5*60*HZ, + [IP_MASQ_S_ICMP] = 1*60*HZ, + [IP_MASQ_S_LAST] = 2*HZ, + }, /* timeout */ +}; + + +struct ip_masq_timeout_table masq_timeout_table_dos = { + ATOMIC_INIT(0), /* refcnt */ + 0, /* scale */ + { + [IP_MASQ_S_NONE] = 15*60*HZ, + [IP_MASQ_S_ESTABLISHED] = 8*60*HZ, + [IP_MASQ_S_SYN_SENT] = 60*HZ, + [IP_MASQ_S_SYN_RECV] = 10*HZ, + [IP_MASQ_S_FIN_WAIT] = 60*HZ, + [IP_MASQ_S_TIME_WAIT] = 60*HZ, + [IP_MASQ_S_CLOSE] = 10*HZ, + 
[IP_MASQ_S_CLOSE_WAIT] = 60*HZ, + [IP_MASQ_S_LAST_ACK] = 30*HZ, + [IP_MASQ_S_LISTEN] = 2*60*HZ, + [IP_MASQ_S_SYNACK] = 100*HZ, + [IP_MASQ_S_UDP] = 3*60*HZ, + [IP_MASQ_S_ICMP] = 1*60*HZ, + [IP_MASQ_S_LAST] = 2*HZ, + }, /* timeout */ +}; + +/* + * Timeout table to use for the VS entries + * If NULL we use the default table (masq_timeout_table). + * Under flood attack we switch to masq_timeout_table_dos + */ + +struct ip_masq_timeout_table *ip_vs_timeout_table = &masq_timeout_table; + +#endif /* CONFIG_IP_MASQUERADE_VS */ + +#ifdef CONFIG_IP_MASQUERADE_VS +#define MASQUERADE_EXPIRE_RETRY(ms) (ms->timeout_table? ms->timeout_table->timeout[IP_MASQ_S_TIME_WAIT] : masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]) +#else #define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT] +#endif static const char * state_name_table[IP_MASQ_S_LAST+1] = { - "NONE", /* IP_MASQ_S_NONE, */ - "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */ - "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */ - "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */ - "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */ - "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */ - "CLOSE", /* IP_MASQ_S_CLOSE, */ - "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */ - "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */ - "LISTEN", /* IP_MASQ_S_LISTEN, */ - "UDP", /* IP_MASQ_S_UDP, */ - "ICMP", /* IP_MASQ_S_ICMP, */ - "BUG!", /* IP_MASQ_S_LAST */ + [IP_MASQ_S_NONE] = "NONE", + [IP_MASQ_S_ESTABLISHED] = "ESTABLISHED", + [IP_MASQ_S_SYN_SENT] = "SYN_SENT", + [IP_MASQ_S_SYN_RECV] = "SYN_RECV", + [IP_MASQ_S_FIN_WAIT] = "FIN_WAIT", + [IP_MASQ_S_TIME_WAIT] = "TIME_WAIT", + [IP_MASQ_S_CLOSE] = "CLOSE", + [IP_MASQ_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_MASQ_S_LAST_ACK] = "LAST_ACK", + [IP_MASQ_S_LISTEN] = "LISTEN", +#ifdef CONFIG_IP_MASQUERADE_VS + [IP_MASQ_S_SYNACK] = "SYNACK", +#endif + [IP_MASQ_S_UDP] = "UDP", + [IP_MASQ_S_ICMP] = "ICMP", + [IP_MASQ_S_LAST] = "BUG!", }; #define mNO IP_MASQ_S_NONE @@ -150,6 +240,9 @@ #define mCW IP_MASQ_S_CLOSE_WAIT #define mLA IP_MASQ_S_LAST_ACK #define mLI IP_MASQ_S_LISTEN +#ifdef CONFIG_IP_MASQUERADE_VS +#define mSA IP_MASQ_S_SYNACK +#endif struct masq_tcp_states_t { int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */ @@ -159,46 +252,111 @@ { if (state >= IP_MASQ_S_LAST) return "ERR!"; - return state_name_table[state]; + return state_name_table[state] ? 
state_name_table[state] : "?"; } +#ifndef CONFIG_IP_MASQUERADE_VS + struct masq_tcp_states_t masq_tcp_states [] = { /* INPUT */ /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */ /*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, /*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }}, -/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }}, +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI }}, /*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }}, /* OUTPUT */ /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */ -/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }}, +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI }}, /*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }}, -/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }}, +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES }}, /*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }}, }; -static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output) +#else /* CONFIG_IP_MASQUERADE_VS */ + +struct masq_tcp_states_t masq_tcp_states [] = { +/* INPUT */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mTW }}, +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }}, +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mSR }}, + +/* OUTPUT */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI, mSR }}, +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }}, +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }}, +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }}, + +/* INPUT-ONLY */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }}, +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }}, +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }}, +}; + +struct masq_tcp_states_t masq_tcp_states_dos [] = { +/* INPUT */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSA }}, +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mSA }}, +/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI, mSA }}, +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }}, + +/* OUTPUT */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ +/*syn*/ {{mSS, mES, mSS, mSA, mSS, mSS, mSS, mSS, mSS, mLI, mSA }}, +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }}, +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }}, +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }}, + +/* INPUT-ONLY */ +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ +/*syn*/ {{mSA, mES, mES, mSR, mSA, mSA, mSA, mSA, mSA, mSA, mSA }}, +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }}, +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }}, +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }}, +}; + +struct masq_tcp_states_t *ip_vs_state_table = masq_tcp_states; + +void ip_masq_secure_tcp_set(int on) +{ + if (on) { + ip_vs_state_table = masq_tcp_states_dos; + ip_vs_timeout_table = &masq_timeout_table_dos; + } else { + ip_vs_state_table = 
masq_tcp_states; + ip_vs_timeout_table = &masq_timeout_table; + } +} + +#endif /* CONFIG_IP_MASQUERADE_VS */ + +#define MASQ_STATE_INPUT 0 +#define MASQ_STATE_OUTPUT 4 +#define MASQ_STATE_INPUT_ONLY 8 + +static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int state_off) { /* - * [0-3]: input states, [4-7]: output. + * [0-3]: input states, [4-7]: output, [8-11] input only states. */ - if (output) - output=4; - if (th->rst) - return output+3; + return state_off+3; if (th->syn) - return output+0; + return state_off+0; if (th->fin) - return output+1; + return state_off+1; if (th->ack) - return output+2; + return state_off+2; return -1; } - static int masq_set_state_timeout(struct ip_masq *ms, int state) { struct ip_masq_timeout_table *mstim = ms->timeout_table; @@ -221,18 +379,34 @@ return state; } -static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th) +static int masq_tcp_state(struct ip_masq *ms, int state_off, struct tcphdr *th) { int state_idx; int new_state = IP_MASQ_S_CLOSE; - if ((state_idx = masq_tcp_state_idx(th, output)) < 0) { +#ifdef CONFIG_IP_MASQUERADE_VS + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (ms->flags & IP_MASQ_F_VS_NO_OUTPUT) { + if (state_off == MASQ_STATE_OUTPUT) + ms->flags &= ~IP_MASQ_F_VS_NO_OUTPUT; + else state_off = MASQ_STATE_INPUT_ONLY; + } +#endif + + if ((state_idx = masq_tcp_state_idx(th, state_off)) < 0) { IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n", - output, state_idx); + state_off, state_idx); goto tcp_state_out; } +#ifdef CONFIG_IP_MASQUERADE_VS + new_state = ip_vs_state_table[state_idx].next_state[ms->state]; +#else new_state = masq_tcp_states[state_idx].next_state[ms->state]; +#endif tcp_state_out: if (new_state!=ms->state) @@ -247,6 +421,15 @@ ntohl(ms->daddr), ntohs(ms->dport), ip_masq_state_name(ms->state), ip_masq_state_name(new_state)); + +#ifdef CONFIG_IP_MASQUERADE_VS + /* + * Increase/Decrease the active connection counter and + * set ms->flags according to ms->state and new_state. 
+ */ + ip_vs_set_state(ms, new_state); +#endif /* CONFIG_IP_MASQUERADE_VS */ + return masq_set_state_timeout(ms, new_state); } @@ -254,7 +437,7 @@ /* * Handle state transitions */ -static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp) +static int masq_set_state(struct ip_masq *ms, int state_off, struct iphdr *iph, void *tp) { switch (iph->protocol) { case IPPROTO_ICMP: @@ -262,7 +445,7 @@ case IPPROTO_UDP: return masq_set_state_timeout(ms, IP_MASQ_S_UDP); case IPPROTO_TCP: - return masq_tcp_state(ms, output, tp); + return masq_tcp_state(ms, state_off, tp); } return -1; } @@ -361,6 +544,9 @@ EXPORT_SYMBOL(ip_masq_get_debug_level); EXPORT_SYMBOL(ip_masq_new); +#ifdef CONFIG_IP_MASQUERADE_VS +EXPORT_SYMBOL(ip_masq_new_vs); +#endif /* CONFIG_IP_MASQUERADE_VS */ EXPORT_SYMBOL(ip_masq_listen); EXPORT_SYMBOL(ip_masq_free_ports); EXPORT_SYMBOL(ip_masq_out_get); @@ -423,9 +609,17 @@ { if (tout) { ms->timer.expires = jiffies+tout; +#ifdef CONFIG_IP_MASQUERADE_VS + add_sltimer(&ms->timer); +#else add_timer(&ms->timer); +#endif } else { +#ifdef CONFIG_IP_MASQUERADE_VS + del_sltimer(&ms->timer); +#else del_timer(&ms->timer); +#endif } } @@ -741,6 +935,10 @@ struct ip_masq *ms; read_lock(&__ip_masq_lock); +#ifdef CONFIG_IP_MASQUERADE_VS + ms = __ip_vs_out_get(protocol, s_addr, s_port, d_addr, d_port); + if (ms == NULL) +#endif /* CONFIG_IP_MASQUERADE_VS */ ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port); read_unlock(&__ip_masq_lock); @@ -754,7 +952,11 @@ struct ip_masq *ms; read_lock(&__ip_masq_lock); - ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port); +#ifdef CONFIG_IP_MASQUERADE_VS + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port); + if (ms == NULL) +#endif /* CONFIG_IP_MASQUERADE_VS */ + ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port); read_unlock(&__ip_masq_lock); if (ms) @@ -791,7 +993,11 @@ static void masq_expire(unsigned long data) { struct ip_masq *ms = (struct ip_masq *)data; +#ifdef CONFIG_IP_MASQUERADE_VS + ms->timeout = MASQUERADE_EXPIRE_RETRY(ms); +#else ms->timeout = MASQUERADE_EXPIRE_RETRY; +#endif /* * hey, I'm using it @@ -826,6 +1032,15 @@ if (ms->control) ip_masq_control_del(ms); +#ifdef CONFIG_IP_MASQUERADE_VS + if (ms->flags & IP_MASQ_F_VS) { + if (ip_vs_unhash(ms)) { + ip_vs_unbind_masq(ms); + ip_masq_unbind_app(ms); + } + } + else +#endif /* CONFIG_IP_MASQUERADE_VS */ if (ip_masq_unhash(ms)) { if (ms->flags&IP_MASQ_F_MPORT) { atomic_dec(&mport_count); @@ -839,6 +1054,9 @@ * refcnt==1 implies I'm the only one referrer */ if (atomic_read(&ms->refcnt) == 1) { +#ifdef IP_MASQ_MANY_STATE_TABLES + ip_masq_timeout_detach(ms); +#endif kfree_s(ms,sizeof(*ms)); sysctl_ip_always_defrag--; MOD_DEC_USE_COUNT; @@ -1077,6 +1295,83 @@ return NULL; } + +#ifdef CONFIG_IP_MASQUERADE_VS +/* + * Create a new masquerade entry for IPVS, all parameters {maddr, + * mport, saddr, sport, daddr, dport, mflags} are known. No need + * to allocate a free mport. And, hash it into the ip_vs_table. + * + * Be careful, it can be called from u-space + */ + +struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) +{ + struct ip_masq *ms; + static int n_fails = 0; + int prio; + + prio = (mflags&IP_MASQ_F_USER) ? 
GFP_KERNEL : GFP_ATOMIC; + + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio); + if (ms == NULL) { + if (++n_fails < 5) + IP_VS_ERR("ip_masq_new_vs(proto=%s): no memory available.\n", + masq_proto_name(proto)); + return NULL; + } + MOD_INC_USE_COUNT; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,2,14) + sysctl_ip_always_defrag++; +#endif + memset(ms, 0, sizeof(*ms)); + INIT_LIST_HEAD(&ms->s_list); + INIT_LIST_HEAD(&ms->m_list); + INIT_LIST_HEAD(&ms->d_list); + init_timer(&ms->timer); + ms->timer.data = (unsigned long)ms; + ms->timer.function = masq_expire; + ip_masq_timeout_attach(ms,ip_vs_timeout_table); + ms->protocol = proto; + ms->saddr = saddr; + ms->sport = sport; + ms->daddr = daddr; + ms->dport = dport; + ms->maddr = maddr; + ms->mport = mport; + ms->flags = mflags; + ms->app_data = NULL; + ms->control = NULL; + + atomic_set(&ms->n_control,0); + atomic_set(&ms->refcnt,0); + atomic_set(&ms->in_pkts,0); + + if (mflags & IP_MASQ_F_USER) + write_lock_bh(&__ip_masq_lock); + else + write_lock(&__ip_masq_lock); + + /* + * Hash it in the ip_vs_table + */ + ip_vs_hash(ms); + + if (mflags & IP_MASQ_F_USER) + write_unlock_bh(&__ip_masq_lock); + else + write_unlock(&__ip_masq_lock); + + ip_masq_bind_app(ms); + n_fails = 0; + atomic_inc(&ms->refcnt); + masq_set_state_timeout(ms, IP_MASQ_S_NONE); + return ms; +} +#endif /* CONFIG_IP_MASQUERADE_VS */ + + /* * Get transport protocol data offset, check against size * return: @@ -1153,25 +1448,20 @@ return -1; } +#ifndef CONFIG_IP_MASQUERADE_VS /* Lets determine our maddr now, shall we? */ - if (maddr == 0) { - struct rtable *rt; - struct rtable *skb_rt = (struct rtable*)skb->dst; - struct device *skb_dev = skb_rt->u.dst.dev; - - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) { - /* Fallback on old method */ - /* This really shouldn't happen... */ - maddr = inet_select_addr(skb_dev, skb_rt->rt_gateway, RT_SCOPE_UNIVERSE); - } else { - /* Route lookup succeeded */ - maddr = rt->rt_src; - ip_rt_put(rt); - } + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { + return -1; } +#endif switch (iph->protocol) { case IPPROTO_ICMP: +#ifdef CONFIG_IP_MASQUERADE_VS + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { + return -1; + } +#endif return(ip_fw_masq_icmp(skb_p, maddr)); case IPPROTO_UDP: if (h.uh->check == 0) @@ -1230,6 +1520,17 @@ ms = ip_masq_out_get_iph(iph); if (ms!=NULL) { +#ifdef CONFIG_IP_MASQUERADE_VS + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { + /* + * Drop this packet but don't + * start the timer from the beginning + */ + __ip_masq_put(ms); + add_sltimer(&ms->timer); + return -1; + } +#endif /* * If sysctl !=0 and no pkt has been received yet @@ -1280,6 +1581,33 @@ ms->daddr = iph->daddr; } } else { +#ifdef CONFIG_IP_MASQUERADE_VS + struct ip_vs_dest *dest; + + /* + * Check if the packet is from our real service + */ + read_lock(&__ip_vs_lock); + dest = __ip_vs_lookup_real_service(iph->protocol, + iph->saddr, h.portp[0]); + read_unlock(&__ip_vs_lock); + if (dest) { + /* + * Notify the real server: there is + * no existing entry if it is not RST packet + * or not TCP packet. 
+ */ + if (!h.th->rst || iph->protocol != IPPROTO_TCP) + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return -1; + } + + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { + return -1; + } +#endif + /* * Nope, not found, create a new entry for it */ @@ -1392,11 +1720,17 @@ IP_MASQ_DEBUG(2, "O-routed from %08X:%04X with masq.addr %08X\n", ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr)); - masq_set_state(ms, 1, iph, h.portp); +#ifdef CONFIG_IP_MASQUERADE_VS + /* do the IPVS statistics */ + if (ms->flags & IP_MASQ_F_VS) + ip_vs_out_stats(ms, skb); +#endif + + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, h.portp); ip_masq_put(ms); return 0; - } +} /* * Restore original addresses and ports in the original IP @@ -1438,6 +1772,12 @@ ms = __ip_masq_out_get(iph->protocol, iph->daddr, portp[1], iph->saddr, portp[0]); +#ifdef CONFIG_IP_MASQUERADE_VS + if (ms == NULL) + ms = __ip_vs_out_get(iph->protocol, + iph->daddr, portp[1], + iph->saddr, portp[0]); +#endif /* CONFIG_IP_MASQUERADE_VS */ read_unlock(&__ip_masq_lock); if (ms) { IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n", @@ -1459,6 +1799,12 @@ ms = __ip_masq_in_get(iph->protocol, iph->daddr, portp[1], iph->saddr, portp[0]); +#ifdef CONFIG_IP_MASQUERADE_VS + if (ms == NULL) + ms = __ip_vs_in_get(iph->protocol, + iph->daddr, portp[1], + iph->saddr, portp[0]); +#endif /* CONFIG_IP_MASQUERADE_VS */ read_unlock(&__ip_masq_lock); if (ms) { IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n", @@ -1469,8 +1815,8 @@ return 1; } return 0; - } + /* * Handle ICMP messages in forward direction. * Find any that might be relevant, check against existing connections, @@ -1556,7 +1902,7 @@ ntohs(icmp_id(icmph)), icmph->type); - masq_set_state(ms, 1, iph, icmph); + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, icmph); ip_masq_put(ms); return 1; @@ -1684,11 +2030,28 @@ pptr[1], ciph->saddr, pptr[0]); +#ifdef CONFIG_IP_MASQUERADE_VS + if (ms == NULL) { + ms = __ip_vs_out_get(ciph->protocol, + ciph->daddr, pptr[1], + ciph->saddr, pptr[0]); + } +#endif /* CONFIG_IP_MASQUERADE_VS */ read_unlock(&__ip_masq_lock); if (ms == NULL) return 0; +#ifdef CONFIG_IP_MASQUERADE_VS + if (IP_MASQ_VS_FWD(ms) != 0) { + IP_VS_INFO("shouldn't get here, because tun/dr is on the half connection\n"); + } + + /* do the IPVS statistics */ + if (ms->flags & IP_MASQ_F_VS) + ip_vs_out_stats(ms, skb); +#endif /* CONFIG_IP_MASQUERADE_VS */ + /* Now we do real damage to this packet...! 
*/ /* First change the source IP address, and recalc checksum */ iph->saddr = ms->maddr; @@ -1739,6 +2102,87 @@ return skb; } +#ifdef CONFIG_IP_MASQUERADE_VS + +/* + * Check whether this ICMP packet in the FORWARD path is for + * related IPVS connection and needs to be delivered locally + */ + +int ip_vs_forwarding_related_icmp(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); + unsigned short size = ntohs(iph->tot_len) - (iph->ihl * 4); + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + struct ip_masq *ms; + union ip_masq_tphdr h; + int doff; + + /* + * PACKET_HOST only, see ip_forward + */ + + h.raw = (char*) iph + iph->ihl * 4; + + doff = proto_doff(iph->protocol, h.raw, size); + + if (doff <= 0) return 0; + + IP_VS_DBG(10, "icmp fwd/rev (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n", + icmph->type, ntohs(icmp_id(icmph)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return 0; + + /* + * If we get here we have an ICMP error of one of the above 3 types + * Now find the contained IP header + */ + + ciph = (struct iphdr *) (icmph + 1); + size -= sizeof(struct icmphdr); + if (size < sizeof(struct iphdr)) return 0; + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if (ciph->protocol == IPPROTO_TCP) { + if (size < sizeof(struct tcphdr)) return 0; + } + else + if (ciph->protocol == IPPROTO_UDP) { + if (size < sizeof(struct udphdr)) return 0; + } + else return 0; + + /* We don't ensure for now the checksum is correct */ + + /* This is pretty much what __ip_masq_in_get_iph() does, + except params are wrong way round */ + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); + + read_lock(&__ip_masq_lock); + ms = __ip_vs_in_get(ciph->protocol, + ciph->daddr, + pptr[1], + ciph->saddr, + pptr[0]); + read_unlock(&__ip_masq_lock); + + if (!ms) return 0; + IP_VS_DBG(10, "Delivering locally ICMP for %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u to %u.%u.%u.%u\n", + NIPQUAD(ciph->daddr), ntohs(pptr[1]), + NIPQUAD(ciph->saddr), ntohs(pptr[0]), + NIPQUAD(ms->saddr)); + __ip_masq_put(ms); + + return 1; +} +#endif /* CONFIG_IP_MASQUERADE_VS */ + /* * Handle ICMP messages in reverse (demasquerade) direction. 
* Find any that might be relevant, check against existing connections, @@ -1812,7 +2256,7 @@ ntohs(icmp_id(icmph)), icmph->type); - masq_set_state(ms, 0, iph, icmph); + masq_set_state(ms, MASQ_STATE_INPUT, iph, icmph); ip_masq_put(ms); return 1; @@ -1914,9 +2358,11 @@ * *outgoing* so the ports are reversed (and addresses) */ pptr = (__u16 *)&(((char *)ciph)[csize]); +#ifndef CONFIG_IP_MASQUERADE_VS if (ntohs(pptr[0]) < PORT_MASQ_BEGIN || ntohs(pptr[0]) > PORT_MASQ_END) return 0; +#endif /* Ensure the checksum is correct */ if (ip_compute_csum((unsigned char *) icmph, len)) @@ -1927,7 +2373,6 @@ return(-1); } - IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08X:%04X -> %08X:%04X\n", ntohl(ciph->saddr), ntohs(pptr[0]), ntohl(ciph->daddr), ntohs(pptr[1])); @@ -1935,6 +2380,14 @@ /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */ read_lock(&__ip_masq_lock); +#ifdef CONFIG_IP_MASQUERADE_VS + ms = __ip_vs_in_get(ciph->protocol, + ciph->daddr, + pptr[1], + ciph->saddr, + pptr[0]); + if (ms == NULL) +#endif /* CONFIG_IP_MASQUERADE_VS */ ms = __ip_masq_in_get(ciph->protocol, ciph->daddr, pptr[1], @@ -1945,10 +2398,23 @@ if (ms == NULL) return 0; +#ifdef CONFIG_IP_MASQUERADE_VS + /* do the IPVS statistics */ + if (ms->flags & IP_MASQ_F_VS) + ip_vs_in_stats(ms, skb); + + if (IP_MASQ_VS_FWD(ms) != 0) { + int ret = ip_vs_forward(skb, ms); + __ip_masq_put(ms); + return ret; + } +#endif /* CONFIG_IP_MASQUERADE_VS */ + if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { __ip_masq_put(ms); return -1; } + ciph = (struct iphdr *) (icmph + 1); pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); @@ -1998,7 +2464,10 @@ int csum = 0; int csum_ok = 0; __u32 maddr; - +#ifdef CONFIG_IP_MASQUERADE_VS + struct ip_vs_service *svc = NULL; +#endif + /* * Big tappo: only PACKET_HOST (nor loopback neither mcasts) * ... don't know why 1st test DOES NOT include 2nd (?) @@ -2039,13 +2508,21 @@ return(ip_fw_demasq_icmp(skb_p)); case IPPROTO_TCP: case IPPROTO_UDP: - /* + /* * Make sure packet is in the masq range * ... or some mod-ule relaxes input range * ... or there is still some `special' mport opened */ +#ifdef CONFIG_IP_MASQUERADE_VS + svc = ip_vs_lookup_service(skb->fwmark, + iph->protocol, maddr, h.portp[1]); + if (!svc && + (ntohs(h.portp[1]) < PORT_MASQ_BEGIN + || ntohs(h.portp[1]) > PORT_MASQ_END) +#else if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN || ntohs(h.portp[1]) > PORT_MASQ_END) +#endif /* CONFIG_IP_MASQUERADE_VS */ #ifdef CONFIG_IP_MASQUERADE_MOD && (ip_masq_mod_in_rule(skb, iph) != 1) #endif @@ -2100,6 +2577,21 @@ ms = ip_masq_in_get_iph(iph); +#ifdef CONFIG_IP_MASQUERADE_VS + /* + * Checking the server status + */ + if (ms && ms->dest && !(ms->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* + * If the dest is not avaiable, don't restart the timer + * of the packet, but silently drop it. + */ + add_sltimer(&ms->timer); + __ip_masq_put(ms); + return -1; + } +#endif + /* * Give additional modules a chance to create an entry */ @@ -2116,6 +2608,27 @@ ip_masq_mod_in_update(skb, iph, ms); #endif +#ifdef CONFIG_IP_MASQUERADE_VS + if (!ms && + (h.th->syn || (iph->protocol!=IPPROTO_TCP)) && svc) { + if (ip_masq_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + return -1; + } + /* + * Let the virtual server select a real server + * for the incomming connection, and create a + * masquerading entry. 
+ */ + ms = ip_vs_schedule(svc, iph); + if (!ms) + return ip_vs_leave(svc, skb); + ip_vs_conn_stats(ms, svc); + } +#endif /* CONFIG_IP_MASQUERADE_VS */ if (ms != NULL) { @@ -2168,13 +2681,43 @@ } } + +#ifdef CONFIG_IP_MASQUERADE_VS + /* do the IPVS statistics */ + if (ms->flags & IP_MASQ_F_VS) + ip_vs_in_stats(ms, skb); + + if (IP_MASQ_VS_FWD(ms) != 0) { + int ret; + + /* + * Sorry for setting state of masq entry so early + * no matter whether the packet is forwarded + * successfully or not, because ip_vs_forward may + * have already released the skb. Although it + * brokes the original sematics, it won't lead to + * serious errors. We look forward to fixing it + * under the Rusty's netfilter framework both for + * correctness and modularization. + */ + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp); + + ret = ip_vs_forward(skb, ms); + ip_masq_put(ms); + return ret; + } + + IP_VS_DBG(10, "masquerading packet...\n"); +#endif /* CONFIG_IP_MASQUERADE_VS */ + if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) { ip_masq_put(ms); return -1; } + iph->daddr = ms->saddr; h.portp[1] = ms->sport; - + /* * Invalidate csum saving if tunnel has masq helper */ @@ -2231,15 +2774,28 @@ h.uh->check = 0xFFFF; break; } - ip_send_check(iph); + ip_send_check(iph); IP_MASQ_DEBUG(2, "I-routed to %08X:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1])); - masq_set_state (ms, 0, iph, h.portp); + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp); ip_masq_put(ms); return 1; } +#ifdef CONFIG_IP_MASQUERADE_VS + if (svc) { + /* + * Drop packet if it belongs to virtual service but no entry + * is found or created. Furthermore, send DEST_UNREACH icmp + * packet to clients if it is not RST or it is not TCP. + */ + if (!h.th->rst || iph->protocol != IPPROTO_TCP) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + } + return -1; + } +#endif /* sorry, all this trouble for a no-hit :) */ return 0; @@ -2350,7 +2906,6 @@ len += sprintf(buffer+len, "%-127s\n", temp); if(len >= length) { - read_unlock_bh(&__ip_masq_lock); goto done; } @@ -2358,9 +2913,52 @@ read_unlock_bh(&__ip_masq_lock); } -done: +#ifdef CONFIG_IP_MASQUERADE_VS + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) + { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. 
+ */ + read_lock_bh(&__ip_masq_lock); + l = &ip_vs_table[idx]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + pos += 128; + if (pos <= offset) { + len = 0; + continue; + } + + /* + * We have locked the tables, no need to del/add timers + * nor cli() 8) + */ + + sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu", + masq_proto_name(ms->protocol), + ntohl(ms->saddr), ntohs(ms->sport), + ntohl(ms->daddr), ntohs(ms->dport), + ntohs(ms->mport), + ms->out_seq.init_seq, + ms->out_seq.delta, + ms->out_seq.previous_delta, + ms->timer.expires-jiffies); + len += sprintf(buffer+len, "%-127s\n", temp); + + if(len >= length) { + read_unlock_bh(&__ip_masq_lock); + goto done; + } + } + read_unlock_bh(&__ip_masq_lock); + + } +#endif /* CONFIG_IP_MASQUERADE_VS */ + +done: begin = len - (pos - offset); *start = buffer + begin; len -= begin; @@ -2386,17 +2984,29 @@ len, sizeof(struct ip_fw_masq)); } else { masq = (struct ip_fw_masq *)m; - if (masq->tcp_timeout) + if (masq->tcp_timeout) { masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED] +#ifdef CONFIG_IP_MASQUERADE_VS + = masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED] +#endif = masq->tcp_timeout; + } - if (masq->tcp_fin_timeout) + if (masq->tcp_fin_timeout) { masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT] +#ifdef CONFIG_IP_MASQUERADE_VS + = masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT] +#endif = masq->tcp_fin_timeout; + } - if (masq->udp_timeout) + if (masq->udp_timeout) { masq_timeout_table.timeout[IP_MASQ_S_UDP] +#ifdef CONFIG_IP_MASQUERADE_VS + = masq_timeout_table_dos.timeout[IP_MASQ_S_UDP] +#endif = masq->udp_timeout; + } ret = 0; } return ret; @@ -2468,6 +3078,11 @@ ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen); break; #endif +#ifdef CONFIG_IP_MASQUERADE_VS + case IP_MASQ_TARGET_VS: + ret = ip_vs_ctl(optname, &masq_ctl, optlen); + break; +#endif } /* @@ -2529,12 +3144,25 @@ } } #endif /* CONFIG_PROC_FS */ + /* - * Wrapper over inet_select_addr() + * Determine maddr from skb */ -u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) +int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr) { - return inet_select_addr(dev, dst, scope); + struct rtable *rt; + struct rtable *skb_rt = (struct rtable*)skb->dst; + struct device *skb_dev = skb_rt->u.dst.dev; + struct iphdr *iph = skb->nh.iph; + + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) { + return -1; + } else { + /* Route lookup succeeded */ + *maddr = rt->rt_src; + ip_rt_put(rt); + return 0; + } } /* @@ -2587,7 +3215,7 @@ (char *) IPPROTO_ICMP, ip_masq_user_info }); -#endif +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW ip_autofw_init(); #endif @@ -2596,6 +3224,9 @@ #endif #ifdef CONFIG_IP_MASQUERADE_MFW ip_mfw_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_VS + ip_vs_init(); #endif ip_masq_app_init(); diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c --- linux-2.2.19/net/ipv4/ip_vs.c Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c Mon May 14 22:04:50 2001 @@ -0,0 +1,3015 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a part + * of IP masquerading code. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
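The rewritten ip_masq_select_addr() above no longer wraps inet_select_addr(); it asks the routing code, via ip_route_output(), which local source address would be used to reach the packet's destination and returns rt->rt_src. A loose userspace analogue of the same question is the connect()+getsockname() trick below; it is only an illustration of the idea, not the kernel mechanism.

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static int select_addr(const char *dst, struct in_addr *src)
    {
        struct sockaddr_in sin;
        socklen_t len = sizeof(sin);
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
            return -1;
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(9);                 /* no packet is actually sent */
        inet_pton(AF_INET, dst, &sin.sin_addr);

        if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
            getsockname(fd, (struct sockaddr *)&sin, &len) < 0) {
            close(fd);
            return -1;
        }
        *src = sin.sin_addr;                     /* analogue of rt->rt_src */
        close(fd);
        return 0;
    }

    int main(void)
    {
        struct in_addr src;
        char buf[INET_ADDRSTRLEN];

        if (select_addr("192.0.2.1", &src) == 0)
            printf("maddr would be %s\n",
                   inet_ntop(AF_INET, &src, buf, sizeof(buf)));
        return 0;
    }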
+ * + * Version: $Id$ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : fixed the overflow bug in ip_vs_procinfo + * Wensong Zhang : added editing dest and service functions + * Wensong Zhang : changed the names of some functions + * Wensong Zhang : fixed the unlocking bug in ip_vs_del_dest + * Wensong Zhang : added a separate hash table for IPVS + * Wensong Zhang : added slow timer for IPVS masq entries + * Julian Anastasov : fixed the number of active connections + * Wensong Zhang : added persistent port + * Wensong Zhang : fixed the incorrect lookup in hash table + * Wensong Zhang : added server status checking + * Wensong Zhang : fixed the incorrect slow timer vector layout + * Wensong Zhang : fixed the sltimer added twice bug of mst + * Julian Anastasov : fixed the IP_MASQ_F_VS_INACTIVE cleared bug after editing dest + * Wensong Zhang : added the inactive connection counter + * Wensong Zhang : changed the body of ip_vs_schedule + * Julian Anastasov : fixed the unlocking bug in ip_vs_schedule + * Julian Anastasov : fixed the uncounting bug in creating masqs by template + * Wensong Zhang : changed some condition orders for a bit performance + * Julian Anastasov : don't touch counters in ip_vs_unbind_masq for templates + * Wensong Zhang : added the hash table for virtual services + * Wensong Zhang : changed destination lists to d-linked lists + * Wensong Zhang : changed the scheduler list to the d-linked list + * Wensong Zhang : added new persistent service handling + * Julian Anastasov : fixed the counting bug in ip_vs_unbind_masq again + * (don't touch counters for templates) + * Wensong Zhang : changed some IP_VS_ERR to IP_VS_DBG in the ip_vs_tunnel_xmit + * Wensong Zhang : added different timeout support for persistent svc + * Wensong Zhang : fixed the bug that persistent svc cannot be edited + * Julian Anastasov : removed extra read_unlock in __ip_vs_lookup_service + * Julian Anastasov : changed not to restart template timers if dest is unavailable + * Julian Anastasov : added the destination trash + * Wensong Zhang : added the update_service call in ip_vs_del_dest + * Wensong Zhang : added the ip_vs_leave function + * Lars Marowsky-Bree : added persistence granularity support + * Julian Anastasov : changed some comestics things for debugging + * Wensong Zhang : use vmalloc to allocate big ipvs hash table + * Wensong Zhang : changed the tunneling/direct routing methods a little + * Julian Anastasov : fixed the return bug of ip_vs_leave(-2 instead of -3) + * Roberto Nibali : fixed the undefined variable bug in the IP_VS_DBG of ip_vs_dr_xmit + * Julian Anastasov : changed ICMP_PROT_UNREACH to ICMP_PORT_UNREACH in ip_vs_leave + * Wensong Zhang : added port zero support for persistent services + * Wensong Zhang : fixed the bug that virtual ftp service blocks other services not listed in ipvs table + * Wensong Zhang : invalidate a persistent template when its dest is unavailable + * Julian Anastasov : changed two IP_VS_ERR calls to IP_VS_DBG + * Wensong Zhang : added random drop of syn entries + * Wensong Zhang : added random drop of UDP entris + * Julian Anastasov : added droprate defense against DoS attack + * Julian Anastasov : added secure_tcp defense against DoS attack + * Wensong Zhang : 
revisited dropentry defense against DoS attach + * Horms : added the fwmark service feature + * Wensong Zhang : changed to two service hash tables + * Julian Anastasov : corrected trash_dest lookup for both + * normal service and fwmark service + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_KMOD +#include +#endif + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_bind_masq); +EXPORT_SYMBOL(ip_vs_unbind_masq); +EXPORT_SYMBOL(ip_vs_lookup_dest); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + +int sysctl_ip_vs_drop_entry = 0; +int sysctl_ip_vs_drop_packet = 0; +int sysctl_ip_vs_secure_tcp = 0; +int sysctl_ip_vs_amemthresh = 1024; +int sysctl_ip_vs_am_droprate = 10; + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + + +int ip_vs_dropentry = 0; + +static inline void update_defense_level(void) +{ + int ip_vs_amem = nr_free_pages+page_cache_size+(buffermem>>PAGE_SHIFT); + int nomem = (ip_vs_amem < sysctl_ip_vs_amemthresh); + + /* drop_entry */ + switch (sysctl_ip_vs_drop_entry) { + case 0: + ip_vs_dropentry = 0; + break; + case 1: + if (nomem) { + ip_vs_dropentry = 1; + sysctl_ip_vs_drop_entry = 2; + } else { + ip_vs_dropentry = 0; + } + break; + case 2: + if (nomem) { + ip_vs_dropentry = 1; + } else { + ip_vs_dropentry = 0; + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + ip_vs_dropentry = 1; + break; + } + + /* drop_packet */ + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_masq_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_masq_drop_rate = ip_masq_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-ip_vs_amem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_masq_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_masq_drop_rate = ip_masq_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-ip_vs_amem); + } else { + ip_masq_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_masq_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + + /* secure_tcp */ + switch (sysctl_ip_vs_secure_tcp) { + case 0: + ip_masq_secure_tcp_set(0); + break; + case 1: + if (nomem) { + ip_masq_secure_tcp_set(1); + sysctl_ip_vs_secure_tcp = 2; + } else { + ip_masq_secure_tcp_set(0); + } + break; + case 2: + if (nomem) { + ip_masq_secure_tcp_set(1); + } else { + ip_masq_secure_tcp_set(0); + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + ip_masq_secure_tcp_set(1); + break; + } +} + + +static inline int todrop_entry(struct ip_masq *ms) +{ + /* + * The drop rate array needs tuning for real environments. + */ + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + if (ms->timeout+jiffies-ms->timer.expires < 60*HZ) + return 0; + + i = atomic_read(&ms->in_pkts); + if (i > 8) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +static inline void ip_vs_random_dropentry(void) +{ + int i; + struct ip_masq *ms; + struct list_head *l,*e; + struct ip_masq *mst; + void (*fn)(unsigned long); + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (i=0; i < (IP_VS_TAB_SIZE>>5); i++) { + /* + * Lock is actually needed in this loop. 
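The drop_packet branch of update_defense_level() above derives its rate from amemthresh / (amemthresh - available), so the rate shrinks (dropping gets more aggressive, roughly one packet in R) as free memory falls toward zero. A minimal sketch of that arithmetic, using the default amemthresh of 1024 pages:

    #include <stdio.h>

    /* Returns R such that roughly one packet in R is dropped; 0 = no drops. */
    static int drop_rate(int amemthresh, int available)
    {
        if (available >= amemthresh)
            return 0;                              /* enough memory */
        return amemthresh / (amemthresh - available);
    }

    int main(void)
    {
        printf("%d\n", drop_rate(1024, 2048));     /* 0  - no dropping            */
        printf("%d\n", drop_rate(1024, 1000));     /* 42 - drop 1 in 42 packets   */
        printf("%d\n", drop_rate(1024,  512));     /* 2  - drop every 2nd packet  */
        printf("%d\n", drop_rate(1024,    0));     /* 1  - drop every packet      */
        return 0;
    }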
+ */ + write_lock(&__ip_masq_lock); + + l = &ip_vs_table[net_random()&IP_VS_TAB_MASK]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + if (ms->dport == 0) + /* masq template */ + continue; + switch(ms->state) { + case IP_MASQ_S_SYN_RECV: + case IP_MASQ_S_SYNACK: + break; + + case IP_MASQ_S_ESTABLISHED: + case IP_MASQ_S_UDP: + if (todrop_entry(ms)) + break; + continue; + + default: + continue; + } + + /* + * Drop the entry, and drop its mst if not referenced + */ + write_unlock(&__ip_masq_lock); + IP_VS_DBG(4, "Drop masq\n"); + mst = ms->control; + fn = (ms->timer).function; + del_sltimer(&ms->timer); + fn((unsigned long)ms); + if (mst && !atomic_read(&mst->n_control)) { + IP_VS_DBG(4, "Drop masq template\n"); + del_sltimer(&mst->timer); + fn((unsigned long)mst); + } + write_lock(&__ip_masq_lock); + } + write_unlock(&__ip_masq_lock); + } +} + + +/* + * The following block implements slow timers for IPVS, most code is stolen + * from linux/kernel/sched.c + * Slow timer is used to avoid the overhead of cascading timers, when lots + * of masq entries (>50,000) are cluttered in the system. + */ +#define SHIFT_BITS 6 +#define TVN_BITS 8 +#define TVR_BITS 10 +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct sltimer_vec { + int index; + struct timer_list *vec[TVN_SIZE]; +}; + +struct sltimer_vec_root { + int index; + struct timer_list *vec[TVR_SIZE]; +}; + +static struct sltimer_vec sltv3 = { 0 }; +static struct sltimer_vec sltv2 = { 0 }; +static struct sltimer_vec_root sltv1 = { 0 }; + +static struct sltimer_vec * const sltvecs[] = { + (struct sltimer_vec *)&sltv1, &sltv2, &sltv3 +}; + +#define NOOF_SLTVECS (sizeof(sltvecs) / sizeof(sltvecs[0])) + +static unsigned long sltimer_jiffies = 0; + +static inline void insert_sltimer(struct timer_list *timer, + struct timer_list **vec, int idx) +{ + if ((timer->next = vec[idx])) + vec[idx]->prev = timer; + vec[idx] = timer; + timer->prev = (struct timer_list *)&vec[idx]; +} + +static inline void internal_add_sltimer(struct timer_list *timer) +{ + /* + * must be cli-ed when calling this + */ + unsigned long expires = timer->expires; + unsigned long idx = (expires - sltimer_jiffies) >> SHIFT_BITS; + + if (idx < TVR_SIZE) { + int i = (expires >> SHIFT_BITS) & TVR_MASK; + insert_sltimer(timer, sltv1.vec, i); + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> (SHIFT_BITS+TVR_BITS)) & TVN_MASK; + insert_sltimer(timer, sltv2.vec, i); + } else if ((signed long) idx < 0) { + /* + * can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + insert_sltimer(timer, sltv1.vec, sltv1.index); + } else if (idx <= 0xffffffffUL) { + int i = (expires >> (SHIFT_BITS+TVR_BITS+TVN_BITS)) & TVN_MASK; + insert_sltimer(timer, sltv3.vec, i); + } else { + /* Can only get here on architectures with 64-bit jiffies */ + timer->next = timer->prev = timer; + } +} + +rwlock_t sltimerlist_lock = RW_LOCK_UNLOCKED; + +void add_sltimer(struct timer_list *timer) +{ + write_lock(&sltimerlist_lock); + if (timer->prev) + goto bug; + internal_add_sltimer(timer); +out: + write_unlock(&sltimerlist_lock); + return; + +bug: + printk("bug: kernel sltimer added twice at %p.\n", + __builtin_return_address(0)); + goto out; +} + +static inline int detach_sltimer(struct timer_list *timer) +{ + struct timer_list *prev = timer->prev; + if (prev) { + struct timer_list *next = timer->next; + prev->next = next; + if 
(next) + next->prev = prev; + return 1; + } + return 0; +} + +void mod_sltimer(struct timer_list *timer, unsigned long expires) +{ + write_lock(&sltimerlist_lock); + timer->expires = expires; + detach_sltimer(timer); + internal_add_sltimer(timer); + write_unlock(&sltimerlist_lock); +} + +int del_sltimer(struct timer_list * timer) +{ + int ret; + + write_lock(&sltimerlist_lock); + ret = detach_sltimer(timer); + timer->next = timer->prev = 0; + write_unlock(&sltimerlist_lock); + return ret; +} + + +static inline void cascade_sltimers(struct sltimer_vec *tv) +{ + /* + * cascade all the timers from tv up one level + */ + struct timer_list *timer; + timer = tv->vec[tv->index]; + /* + * We are removing _all_ timers from the list, so we don't have to + * detach them individually, just clear the list afterwards. + */ + while (timer) { + struct timer_list *tmp = timer; + timer = timer->next; + internal_add_sltimer(tmp); + } + tv->vec[tv->index] = NULL; + tv->index = (tv->index + 1) & TVN_MASK; +} + +static inline void run_sltimer_list(void) +{ + write_lock(&sltimerlist_lock); + while ((long)(jiffies - sltimer_jiffies) >= 0) { + struct timer_list *timer; + if (!sltv1.index) { + int n = 1; + do { + cascade_sltimers(sltvecs[n]); + } while (sltvecs[n]->index == 1 && ++n < NOOF_SLTVECS); + } + while ((timer = sltv1.vec[sltv1.index])) { + void (*fn)(unsigned long) = timer->function; + unsigned long data = timer->data; + detach_sltimer(timer); + timer->next = timer->prev = NULL; + write_unlock(&sltimerlist_lock); + fn(data); + write_lock(&sltimerlist_lock); + } + sltimer_jiffies += 1< */ +struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * IPVS scheduler list + */ +struct list_head ip_vs_schedulers; + +/* + * Trash for destinations + */ +struct list_head ip_vs_dest_trash; + +/* + * FTP & NULL virtual service counters + */ +atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + if (scheduler->n_list.next != &scheduler->n_list) { + IP_VS_ERR("register_ip_vs_scheduler(): scheduler already linked\n"); + return -EINVAL; + } + + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + + return 0; +} + + +/* + * Unregister a scheduler in the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + /* + * Only allow unregistration if it is not referenced + */ + if (atomic_read(&scheduler->refcnt)) { + IP_VS_ERR("unregister_ip_vs_scheduler(): is in use by %d guys. failed\n", + atomic_read(&scheduler->refcnt)); + return -EINVAL; + } + + if (scheduler->n_list.next == &scheduler->n_list) { + IP_VS_ERR("unregister_ip_vs_scheduler(): scheduler is not in the list. 
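The slow-timer code above is a cascading timer wheel with a coarse granularity of 2^SHIFT_BITS (64) jiffies, so the 1024-slot root vector spans 65536 jiffies (about 11 minutes at HZ=100) before the second-level vector is needed. The slot arithmetic from internal_add_sltimer(), reproduced as a standalone sketch:

    #include <stdio.h>

    #define SHIFT_BITS 6
    #define TVN_BITS   8
    #define TVR_BITS   10
    #define TVN_SIZE   (1 << TVN_BITS)
    #define TVR_SIZE   (1 << TVR_BITS)
    #define TVN_MASK   (TVN_SIZE - 1)
    #define TVR_MASK   (TVR_SIZE - 1)

    static void place(unsigned long now, unsigned long expires)
    {
        unsigned long idx = (expires - now) >> SHIFT_BITS;

        if (idx < TVR_SIZE)
            printf("level 1, slot %lu\n", (expires >> SHIFT_BITS) & TVR_MASK);
        else if (idx < 1UL << (TVR_BITS + TVN_BITS))
            printf("level 2, slot %lu\n",
                   (expires >> (SHIFT_BITS + TVR_BITS)) & TVN_MASK);
        else
            printf("level 3, slot %lu\n",
                   (expires >> (SHIFT_BITS + TVR_BITS + TVN_BITS)) & TVN_MASK);
    }

    int main(void)
    {
        place(0, 15 * 60 * 100);   /* 15 min at HZ=100 -> level 2 */
        place(0, 100);             /* 1 s               -> level 1 */
        return 0;
    }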
failed\n"); + return -EINVAL; + } + + /* + * Removed it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + + return 0; +} + + +/* + * Bind a service with a scheduler + * Must called with the __ip_vs_lock lock, and return bool. + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + atomic_inc(&scheduler->refcnt); + + if(scheduler->init_service) + if(scheduler->init_service(svc) != 0) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return -EINVAL; + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + * Must called with the __ip_vs_lock lock, and return bool. + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if(sched->done_service) + if(sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + + atomic_dec(&sched->refcnt); + svc->scheduler = NULL; + + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +struct ip_vs_scheduler * ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + struct list_head *l, *e; + + IP_VS_DBG(6, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + read_lock_bh(&__ip_vs_lock); + + l = &ip_vs_schedulers; + for (e=l->next; e!=l; e=e->next) { + sched = list_entry(e, struct ip_vs_scheduler, n_list); + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_lock); + return sched; + } + } + + read_unlock_bh(&__ip_vs_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler * ip_vs_lookup_scheduler(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + char module_name[IP_MASQ_TNAME_MAX+8]; + sprintf(module_name,"ip_vs_%s",sched_name); +#ifdef CONFIG_KMOD + request_module(module_name); +#endif /* CONFIG_KMOD */ + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + + +/* + * Returns hash value for IPVS masq entry + */ + +static __inline__ unsigned +ip_vs_hash_key(unsigned proto, __u32 addr, __u16 port) +{ + unsigned addrh = ntohl(addr); + + return (proto^addrh^(addrh>>IP_VS_TAB_BITS)^ntohs(port)) + & IP_VS_TAB_MASK; +} + + +/* + * Hashes ip_masq in ip_vs_table by proto,addr,port. + * should be called with locked tables. + * returns bool success. + */ +int ip_vs_hash(struct ip_masq *ms) +{ + unsigned hash; + + if (ms->flags & IP_MASQ_F_HASHED) { + IP_VS_ERR("ip_vs_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Note: because ip_masq_put sets masq expire only if its + * refcnt==IP_MASQ_NTABLES, otherwise the masq entry + * will never expire. + */ + atomic_add(IP_MASQ_NTABLES, &ms->refcnt); + + /* + * Hash by proto,d{addr,port}, + * which are client address and port in IPVS. 
+ */ + hash = ip_vs_hash_key(ms->protocol, ms->daddr, ms->dport); + list_add(&ms->m_list, &ip_vs_table[hash]); + + ms->flags |= IP_MASQ_F_HASHED; + return 1; +} + + +/* + * Unhashes ip_masq from ip_vs_table. + * should be called with locked tables. + * returns bool success. + */ +int ip_vs_unhash(struct ip_masq *ms) +{ + if (!(ms->flags & IP_MASQ_F_HASHED)) { + IP_VS_ERR("ip_vs_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the list and decrease its reference counter. + */ + list_del(&ms->m_list); + atomic_sub(IP_MASQ_NTABLES, &ms->refcnt); + + ms->flags &= ~IP_MASQ_F_HASHED; + return 1; +} + + +/* + * Gets ip_masq associated with supplied parameters in the ip_vs_table. + * Called for pkts coming from OUTside-to-INside. + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + * Caller must lock tables + */ +struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_masq *ms; + struct list_head *l,*e; + + hash = ip_vs_hash_key(protocol, s_addr, s_port); + + l = &ip_vs_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + if (s_addr==ms->daddr && s_port==ms->dport && + d_port==ms->mport && d_addr==ms->maddr && + protocol==ms->protocol) { + /* HIT */ + atomic_inc(&ms->refcnt); + goto out; + } + } + ms = NULL; + + out: + IP_VS_DBG(7, "look/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + masq_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + ms?"hit":"not hit"); + + return ms; +} + + +/* + * Gets ip_masq associated with supplied parameters in the ip_vs_table. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + * Caller must lock tables + */ +struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_masq *ms; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_hash_key(protocol, d_addr, d_port); + + l = &ip_vs_table[hash]; + for (e=l->next; e!=l; e=e->next) { + ms = list_entry(e, struct ip_masq, m_list); + if (d_addr == ms->daddr && d_port == ms->dport && + s_port == ms->sport && s_addr == ms->saddr && + protocol == ms->protocol) { + /* HIT */ + atomic_inc(&ms->refcnt); + goto out; + } + } + ms = NULL; + + out: + IP_VS_DBG(7, "look/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + masq_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + ms?"hit":"not hit"); + + return ms; +} + + +/* + * Called by ip_vs_sched_persist to look for masq template. 
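As the comment in ip_vs_hash() above notes, IPVS hashes a connection by its client address and port, so all packets from one client land in the same bucket for both directions of lookup. A standalone copy of ip_vs_hash_key(), with the table size fixed at 2^12 buckets purely for the example (the real size comes from CONFIG_IP_MASQUERADE_VS_TAB_BITS):

    #include <arpa/inet.h>
    #include <stdio.h>

    #define TAB_BITS 12
    #define TAB_SIZE (1 << TAB_BITS)
    #define TAB_MASK (TAB_SIZE - 1)

    static unsigned hash_key(unsigned proto, unsigned addr_net,
                             unsigned short port_net)
    {
        unsigned addrh = ntohl(addr_net);

        return (proto ^ addrh ^ (addrh >> TAB_BITS) ^ ntohs(port_net)) & TAB_MASK;
    }

    int main(void)
    {
        struct in_addr client;

        inet_pton(AF_INET, "198.51.100.7", &client);
        printf("bucket = %u\n",
               hash_key(6 /* TCP */, client.s_addr, htons(35612)));
        return 0;
    }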
+ */ +static __inline__ struct ip_masq *ip_vs_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_masq *ms; + + read_lock(&__ip_masq_lock); + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port); + read_unlock(&__ip_masq_lock); + + return ms; +} + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes ip_vs_service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + * Returns bool success. + */ +int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes ip_vs_service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + * Returns bool success. + */ +int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Remove it from the ip_vs_svc_table table. + */ + list_del(&svc->s_list); + } else { + /* + * Remove it from the ip_vs_svc_fwm_table table. + */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Lookup service by {proto,addr,port} in the service table. + */ +static __inline__ struct ip_vs_service * +__ip_vs_lookup_service(__u16 protocol, __u32 vaddr, __u16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + * Note: as long as IP_VS_SVC_TAB_BITS is larger than zero, + * and have different hash + * keys, there is no need to do protcol checking. + */ + hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + + l = &ip_vs_svc_table[hash]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + if ((svc->addr == vaddr) + && (svc->port == vport)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + + +/* + * Lookup service by fwmark in the service table. 
+ */ +static __inline__ struct ip_vs_service * __ip_vs_lookup_svc_fwm(__u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + struct list_head *l,*e; + + /* + * Check for fwmark-indexed entries + */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + l = &ip_vs_svc_fwm_table[hash]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + if (svc->fwmark == fwmark) { + /* HIT */ + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_lock); + + if (fwmark) { + /* + * Check the table hashed by fwmark first + */ + svc = __ip_vs_lookup_svc_fwm(fwmark); + if (svc) + goto out; + } + + /* + * Check the table hashed by + * first for "full" addressed entries + */ + svc = __ip_vs_lookup_service(protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport==FTPDATA || ntohs(vport)>=PROT_SOCK)){ + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_lookup_service(protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_lookup_service(protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_lock); + + IP_VS_DBG(5, "lookup_service fwm %d %s %u.%u.%u.%u:%d %s\n", + fwmark, + masq_proto_name(protocol), + NIPQUAD(vaddr), ntohs(vport), + svc?"hit":"not hit"); + + return svc; +} + + +/* + * Bind a destination with a service + */ +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +/* + * Unbind a destination with its service + */ +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n", + masq_proto_name(svc->protocol), + NIPQUAD(svc->addr), ntohs(svc->port)); + kfree_s(svc, sizeof(struct ip_vs_service)); + } +} + + +/* + * Returns hash value for real service + */ +static __inline__ unsigned +ip_vs_rs_hashkey(__u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by proto,addr,port. + * should be called with locked tables. + * returns bool success. + */ +int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + * returns bool success. + */ +int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by {proto,addr,port} in the real service table. 
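The fallback order in ip_vs_lookup_service() above is worth spelling out: a firewall-mark match wins, then an exact protocol/address/port match, then the FTP control service (so FTP data traffic still finds its virtual service), then the port-zero catch-all service. A plain C pseudocode sketch of that order; by_fwmark and by_triplet are stand-ins for the hash-table searches, and ports are in host order for readability:

    #include <stdio.h>

    struct svc { const char *how; };

    static struct svc *by_fwmark(unsigned fwmark)                     { (void)fwmark; return NULL; }
    static struct svc *by_triplet(int proto, unsigned a, unsigned p)  { (void)proto; (void)a; (void)p; return NULL; }

    static struct svc *lookup(unsigned fwmark, int proto, unsigned vaddr,
                              unsigned vport, int ftpsvc_cnt, int nullsvc_cnt)
    {
        struct svc *svc;

        if (fwmark && (svc = by_fwmark(fwmark)))        /* 1. fwmark service        */
            return svc;
        if ((svc = by_triplet(proto, vaddr, vport)))    /* 2. exact proto/addr/port */
            return svc;
        if (proto == 6 /* TCP */ && ftpsvc_cnt &&
            (vport == 20 || vport >= 1024))             /* 3. maybe FTP data        */
            svc = by_triplet(proto, vaddr, 21);
        if (!svc && nullsvc_cnt)                        /* 4. port-zero catch-all   */
            svc = by_triplet(proto, vaddr, 0);
        return svc;
    }

    int main(void)
    {
        printf("%p\n", (void *)lookup(0, 6, 0, 80, 0, 0));
        return 0;
    }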
+ */ +struct ip_vs_dest * __ip_vs_lookup_real_service(__u16 protocol, + __u32 daddr, __u16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(daddr, dport); + + l = &ip_vs_rtable[hash]; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, d_list); + if ((dest->addr == daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || dest->vfwmark)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, + __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + struct list_head *l, *e; + + read_lock_bh(&__ip_vs_lock); + + /* + * Find the destination for the given service + */ + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if ((dest->addr == daddr) && (dest->port == dport)) { + /* HIT */ + read_unlock_bh(&__ip_vs_lock); + return dest; + } + } + + read_unlock_bh(&__ip_vs_lock); + return NULL; +} + + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * Called by ip_vs_add_dest with the __ip_vs_lock. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some masq entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +struct ip_vs_dest * __ip_vs_get_trash_dest(struct ip_vs_service *svc, + __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + struct list_head *l, *e; + + /* + * Find the destination in trash + */ + l = &ip_vs_dest_trash; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%d still in trash, " + "refcnt=%d\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->addr == daddr && + dest->port == dport && + dest->vfwmark == svc->fwmark && + (svc->fwmark || + (dest->protocol == svc->protocol && + dest->vaddr == svc->addr && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG(3, "Remove destination %u/%u.%u.%u.%u:%d " + "from trash\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port)); + e = e->prev; + list_del(&dest->n_list); + __ip_vs_unbind_svc(dest); + kfree_s(dest, sizeof(*dest)); + } + } + return NULL; +} + + +/* + * Update a destination in the given service + */ +void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, + struct ip_masq_ctl *mctl) +{ + struct ip_vs_user *mm = &mctl->u.vs_user; + + /* + * Set the weight and the flags + */ + dest->weight = mm->weight; + dest->masq_flags = mm->masq_flags; + + dest->masq_flags |= IP_MASQ_F_VS; + dest->masq_flags |= IP_MASQ_F_VS_INACTIVE; + + /* + * Check if local node and update the flags + */ + if (inet_addr_type(mm->daddr) == RTN_LOCAL) { + dest->masq_flags = (dest->masq_flags & ~IP_MASQ_F_VS_FWD_MASK) + | IP_MASQ_F_VS_LOCALNODE; + } + + /* + * Set the IP_MASQ_F_VS_NO_OUTPUT flag if not masquerading + */ + if 
((dest->masq_flags & IP_MASQ_F_VS_FWD_MASK) != 0) { + dest->masq_flags |= IP_MASQ_F_VS_NO_OUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + ip_vs_rs_hash(dest); + } + + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + __ip_vs_bind_svc(dest, svc); + } + } + + /* + * Set the dest status flags + */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; +} + + +/* + * Create a destination for the given service + */ +struct ip_vs_dest *ip_vs_new_dest(struct ip_vs_service *svc, + struct ip_masq_ctl *mctl) +{ + struct ip_vs_dest *dest; + struct ip_vs_user *mm = &mctl->u.vs_user; + + EnterFunction(2); + + dest = (struct ip_vs_dest*) kmalloc(sizeof(struct ip_vs_dest), + GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return NULL; + } + memset(dest, 0, sizeof(struct ip_vs_dest)); + + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + dest->addr = mm->daddr; + dest->port = mm->dport; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + dest->stats.lock = SPIN_LOCK_UNLOCKED; + __ip_vs_update_dest(svc, dest, mctl); + + LeaveFunction(2); + + return dest; +} + + +/* + * Add a destination into an existing service + */ +int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) +{ + struct ip_vs_dest *dest; + struct ip_vs_user *mm = &mctl->u.vs_user; + __u32 daddr = mm->daddr; + __u16 dport = mm->dport; + + EnterFunction(2); + + if (mm->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + write_lock_bh(&__ip_vs_lock); + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = __ip_vs_get_trash_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%d from trash, " + "refcnt=%d, service %u.%u.%u.%u:%d\n", + NIPQUAD(daddr), ntohs(dport), + atomic_read(&dest->refcnt), + NIPQUAD(dest->vaddr), + ntohs(dest->vport)); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + list_add(&dest->n_list, &svc->destinations); + + __ip_vs_update_dest(svc, dest, mctl); + + write_unlock_bh(&__ip_vs_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + dest = ip_vs_new_dest(svc, mctl); + if (dest == NULL) { + write_unlock_bh(&__ip_vs_lock); + IP_VS_ERR("ip_vs_add_dest(): out of memory\n"); + return -ENOMEM; + } + + /* + * Add the dest entry into the list + */ + list_add(&dest->n_list, &svc->destinations); + atomic_inc(&dest->refcnt); + + write_unlock_bh(&__ip_vs_lock); + + LeaveFunction(2); + return 0; +} + + +/* + * Edit a destination in the given service + */ +int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) +{ + struct ip_vs_dest *dest; + struct ip_vs_user *mm = &mctl->u.vs_user; + __u32 daddr = mm->daddr; + __u16 dport = mm->dport; + + EnterFunction(2); + + if (mm->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if 
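The flag handling in __ip_vs_update_dest() above decides whether reply traffic will pass back through the director: a destination that is actually a local address becomes a "local node", any non-NAT forwarding method (tunneling, direct routing, local node) gets the no-output flag, and only NAT destinations are hashed into the real-service table so that replies can be matched. A small model of that derivation; the flag values here are illustrative, not the kernel's:

    #include <stdio.h>

    #define FWD_MASK   0x03   /* 0 = NAT, non-zero = tunnel/droute/localnode */
    #define LOCALNODE  0x01
    #define NO_OUTPUT  0x10

    static unsigned dest_flags(unsigned requested, int addr_is_local)
    {
        unsigned flags = requested;

        if (addr_is_local)                 /* deliver to the director itself   */
            flags = (flags & ~FWD_MASK) | LOCALNODE;

        if (flags & FWD_MASK)              /* replies bypass the director      */
            flags |= NO_OUTPUT;
        /* else: NAT - real service gets hashed so replies can be rewritten */

        return flags;
    }

    int main(void)
    {
        printf("NAT dest:    %#x\n", dest_flags(0x00, 0));
        printf("tunnel dest: %#x\n", dest_flags(0x02, 0));
        return 0;
    }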
(dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_lock); + + __ip_vs_update_dest(svc, dest, mctl); + + write_unlock_bh(&__ip_vs_lock); + + LeaveFunction(2); + return 0; +} + + +/* + * Delete a destination from the given service + */ +void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del(&dest->n_list); + + /* + * Remove it from the d-linked list with the real services. + */ + ip_vs_rs_unhash(dest); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree_s(dest, sizeof(*dest)); + } else { + IP_VS_DBG(3, "Move dest %u.%u.%u.%u:%d into trash, " + "refcnt=%d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + +int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) +{ + struct ip_vs_dest *dest; + struct ip_vs_user *mm = &mctl->u.vs_user; + __u32 daddr = mm->daddr; + __u16 dport = mm->dport; + + EnterFunction(2); + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_lock); + + /* + * Remove dest from the destination list + */ + __ip_vs_del_dest(dest); + + /* + * Called the update_service function of its scheduler + */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +int ip_vs_add_service(struct ip_masq_ctl *mctl) +{ + struct ip_vs_user *mm = &mctl->u.vs_user; + __u16 protocol = mm->protocol; + __u32 vaddr = mm->vaddr; + __u16 vport = mm->vport; + __u32 vfwmark = mm->vfwmark; + + int ret = 0; + struct ip_vs_scheduler *sched; + struct ip_vs_service *svc; + + EnterFunction(2); + + /* + * Lookup the scheduler, by 'mctl->m_tname' + */ + sched = ip_vs_lookup_scheduler(mctl->m_tname); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n", + mctl->m_tname); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_lock); + + /* + * Check if the service already exists + */ + if (vfwmark == 0) + svc = __ip_vs_lookup_service(protocol, vaddr, vport); + else + svc = __ip_vs_lookup_svc_fwm(vfwmark); + + if (svc != NULL) { + IP_VS_DBG(1, "ip_vs_add_service: service already exists.\n"); + ret = -EEXIST; + goto out; + } + + svc = (struct ip_vs_service*) + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out; + } + memset(svc, 0, sizeof(struct ip_vs_service)); + + svc->protocol = protocol; + svc->addr = vaddr; + svc->port = vport; + svc->fwmark = vfwmark; + svc->flags = mm->vs_flags; + svc->timeout = mm->timeout; + svc->netmask = mm->netmask; + + INIT_LIST_HEAD(&svc->destinations); + atomic_set(&svc->refcnt, 0); + svc->stats.lock = SPIN_LOCK_UNLOCKED; + + /* + 
* Bind the scheduler + */ + ip_vs_bind_scheduler(svc, sched); + + /* + * Hash the service into the service table + */ + ip_vs_svc_hash(svc); + + /* + * Update the virtual service counters + */ + if (vport == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (vport == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + out: + write_unlock_bh(&__ip_vs_lock); + LeaveFunction(2); + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) +{ + struct ip_vs_user *mm = &mctl->u.vs_user; + struct ip_vs_scheduler *sched; + + EnterFunction(2); + + /* + * Lookup the scheduler, by 'mctl->m_tname' + */ + sched = ip_vs_lookup_scheduler(mctl->m_tname); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n", + mctl->m_tname); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_lock); + + /* + * Set the flags and timeout value + */ + svc->flags = mm->vs_flags | IP_VS_SVC_F_HASHED; + svc->timeout = mm->timeout; + svc->netmask = mm->netmask; + + /* + * Unbind the old scheduler + */ + ip_vs_unbind_scheduler(svc); + + /* + * Bind the new scheduler + */ + ip_vs_bind_scheduler(svc, sched); + + write_unlock_bh(&__ip_vs_lock); + + LeaveFunction(2); + return 0; +} + + +/* + * Delete a service from the service list + */ +int __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct list_head *l; + struct ip_vs_dest *dest; + + /* + * Unbind scheduler + */ + ip_vs_unbind_scheduler(svc); + + /* + * Unlink the whole destination list + */ + l = &svc->destinations; + while (l->next != l) { + dest = list_entry(l->next, struct ip_vs_dest, n_list); + __ip_vs_del_dest(dest); + } + + /* + * Unhash it from the service table + */ + if (ip_vs_svc_unhash(svc)) { + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) { + IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n", + masq_proto_name(svc->protocol), + NIPQUAD(svc->addr), ntohs(svc->port)); + kfree_s(svc, sizeof(struct ip_vs_service)); + } + } else { + /* + * Called the update_service function of its scheduler + */ + svc->scheduler->update_service(svc); + return -EPERM; + } + + return 0; +} + +int ip_vs_del_service(struct ip_vs_service *svc) +{ + EnterFunction(2); + + if (svc == NULL) + return -EEXIST; + + write_lock_bh(&__ip_vs_lock); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_lock); + LeaveFunction(2); + return 0; +} + + +/* + * Flush all the virtual services + */ +int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc; + struct list_head *l; + + write_lock_bh(&__ip_vs_lock); + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + while (l->next != l) { + svc = list_entry(l->next,struct ip_vs_service,s_list); + + if (__ip_vs_del_service(svc)) + goto out; + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + while (l->next != l) { + svc = list_entry(l->next,struct ip_vs_service,f_list); + + if (__ip_vs_del_service(svc)) + goto out; + } + } + + out: + write_unlock_bh(&__ip_vs_lock); + return 0; +} + + +/* + * Change the connection counter and the flags if the masq state changes + * Called by the masq_tcp_state function. 
+ */ +void ip_vs_set_state(struct ip_masq *ms, int new_state) +{ + struct ip_vs_dest *dest = ms->dest; + + if (dest && + (ms->flags & IP_MASQ_F_VS) && (new_state != ms->state)) { + if (!(ms->flags & IP_MASQ_F_VS_INACTIVE) && + (new_state != IP_MASQ_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + ms->flags |= IP_MASQ_F_VS_INACTIVE; + } else if ((ms->flags & IP_MASQ_F_VS_INACTIVE) && + (new_state == IP_MASQ_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + ms->flags &= ~IP_MASQ_F_VS_INACTIVE; + } + + IP_VS_DBG(8, "Set-state masq fwd:%c s:%s c:%u.%u.%u.%u:%d " + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n", + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), + NIPQUAD(ms->daddr), ntohs(ms->dport), + NIPQUAD(ms->maddr), ntohs(ms->mport), + NIPQUAD(ms->saddr), ntohs(ms->sport), + ms->flags, atomic_read(&ms->refcnt)); + } +} + + +/* + * Bind a masq entry with a virtual service destination + * Called when a new masq entry is created for VS. + */ +void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest) +{ + ms->flags |= dest->masq_flags; + ms->dest = dest; + + /* + * Increase the refcnt counter of the dest. + */ + atomic_inc(&dest->refcnt); + + IP_VS_DBG(9, "Bind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n", + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), + NIPQUAD(ms->daddr), ntohs(ms->dport), + NIPQUAD(ms->maddr), ntohs(ms->mport), + NIPQUAD(ms->saddr), ntohs(ms->sport), + ms->flags, atomic_read(&ms->refcnt), + atomic_read(&dest->refcnt)); +} + + +/* + * Unbind a masq entry with its VS destination + * Called by the masq_expire function. + */ +void ip_vs_unbind_masq(struct ip_masq *ms) +{ + struct ip_vs_dest *dest = ms->dest; + + IP_VS_DBG(9, "Unbind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n", + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), + NIPQUAD(ms->daddr),ntohs(ms->dport), + NIPQUAD(ms->maddr),ntohs(ms->mport), + NIPQUAD(ms->saddr),ntohs(ms->sport), + ms->flags, atomic_read(&ms->refcnt), + atomic_read(&dest->refcnt)); + + if (dest) { + /* + * Decrease the inactconns or activeconns counter + * if it is not a masq template (ms->dport!=0). + */ + if (ms->dport) { + if (ms->flags & IP_MASQ_F_VS_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); + } +} + + +/* + * Checking if the destination of a masq template is available. + * If available, return 1, otherwise return 0 and invalidate this + * masq template. + */ +int ip_vs_check_template(struct ip_masq *mst) +{ + struct ip_vs_dest *dest = mst->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE)) { + IP_VS_DBG(9, "check_template: dest not available for prot %s " + "src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d -> %X:%X\n", + masq_proto_name(mst->protocol), + NIPQUAD(mst->daddr), ntohs(mst->dport), + NIPQUAD(mst->maddr), ntohs(mst->mport), + (dest!=NULL)? ntohl(dest->addr):0, + (dest!=NULL)? ntohs(dest->port):0); + + /* + * Invalidate the masq template + */ + ip_vs_unhash(mst); + mst->sport = 65535; + mst->mport = 65535; + mst->dport = 0; + ip_vs_hash(mst); + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. 
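The counter rule implemented by ip_vs_set_state() above: each real server keeps an active count (connections in ESTABLISHED) and an inactive count (everything else, e.g. SYN_RECV or FIN_WAIT), and a state change simply moves a connection from one count to the other. A minimal single-threaded model of that rule (the kernel uses atomic_t for the counters):

    #include <stdio.h>

    enum state { SYN_RECV, ESTABLISHED, FIN_WAIT };

    struct dest { int activeconns, inactconns; };
    struct conn { enum state state; int inactive; struct dest *dest; };

    static void set_state(struct conn *c, enum state new_state)
    {
        if (new_state == c->state)
            return;
        if (!c->inactive && new_state != ESTABLISHED) {
            c->dest->activeconns--; c->dest->inactconns++; c->inactive = 1;
        } else if (c->inactive && new_state == ESTABLISHED) {
            c->dest->activeconns++; c->dest->inactconns--; c->inactive = 0;
        }
        c->state = new_state;
    }

    int main(void)
    {
        struct dest d = { 0, 1 };                 /* new conns start inactive */
        struct conn c = { SYN_RECV, 1, &d };

        set_state(&c, ESTABLISHED);
        set_state(&c, FIN_WAIT);
        printf("active=%d inactive=%d\n", d.activeconns, d.inactconns);  /* 0 1 */
        return 0;
    }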
+ */ + atomic_dec(&mst->refcnt); + return 0; + } + return 1; +} + + +/* + * IPVS persistent scheduling function + * It creates a masq entry according to its template if exists, or selects + * a server and creates a masq entry plus a template. + */ +struct ip_masq * +ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_masq *ms = NULL; + struct ip_vs_dest *dest; + const __u16 *portp; + struct ip_masq *mst; + __u16 dport; /* destination port to forward */ + __u32 snet; /* source network of the client, after masking */ + + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + + /* Mask saddr with the netmask to adjust template granularity */ + snet = iph->saddr & svc->netmask; + + IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d " + "snet %u.%u.%u.%u/%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), ntohs(portp[0]), + NIPQUAD(iph->daddr), ntohs(portp[1]), + NIPQUAD(snet), NIPQUAD(svc->netmask)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initilize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a template masq like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + if (portp[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + mst = ip_vs_in_get(iph->protocol, snet, 0, + iph->daddr, portp[1]); + else + mst = ip_vs_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!mst || !ip_vs_check_template(mst)) { + /* + * No template found or the dest of the masq + * template is not available. + */ + read_lock(&__ip_vs_lock); + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "P-schedule: no dest found.\n"); + read_unlock(&__ip_vs_lock); + return NULL; + } + + /* + * Create a template like for non-ftp service, + * and + * for ftp service. + */ + if (svc->port != FTPPORT) + mst = ip_masq_new_vs(iph->protocol, + iph->daddr, portp[1], + dest->addr, dest->port, + snet, 0, + 0); + else + mst = ip_masq_new_vs(iph->protocol, + iph->daddr, 0, + dest->addr, 0, + snet, 0, + 0); + if (mst == NULL) { + IP_VS_ERR("ip_masq_new_vs template failed\n"); + read_unlock(&__ip_vs_lock); + return NULL; + } + + /* + * Bind the template with dest and set timeout. + */ + ip_vs_bind_masq(mst, dest); + mst->timeout = svc->timeout; + + read_unlock(&__ip_vs_lock); + } else { + /* + * Template found and its destination is available. + */ + dest = mst->dest; + + /* + * Delete its timer so that it can be put back. + */ + del_sltimer(&mst->timer); + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. + * fwmark template: + * port zero template: + */ + if (svc->fwmark) + mst = ip_vs_in_get(IPPROTO_IP, snet, 0, + htonl(svc->fwmark), 0); + else + mst = ip_vs_in_get(iph->protocol, + snet, 0, iph->daddr, 0); + + if (!mst || !ip_vs_check_template(mst)) { + /* + * If it is not persistent port zero, return NULL. 
+ */ + if (svc->port) + return NULL; + + read_lock(&__ip_vs_lock); + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "P-schedule: no dest found.\n"); + read_unlock(&__ip_vs_lock); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) + mst = ip_masq_new_vs(IPPROTO_IP, + htonl(svc->fwmark), 0, + dest->addr, 0, + snet, 0, + 0); + else + mst = ip_masq_new_vs(iph->protocol, + iph->daddr, 0, + dest->addr, 0, + snet, 0, + 0); + if (mst == NULL) { + IP_VS_ERR("ip_masq_new_vs template failed\n"); + read_unlock(&__ip_vs_lock); + return NULL; + } + + /* + * Bind the template with dest and set timeout. + */ + ip_vs_bind_masq(mst, dest); + mst->timeout = svc->timeout; + read_unlock(&__ip_vs_lock); + } else { + dest = mst->dest; + + /* + * Delete its timer so that it can be put back. + */ + del_sltimer(&mst->timer); + } + dport = portp[1]; + } + + /* + * Create a new masq according to the template + */ + ms = ip_masq_new_vs(iph->protocol, + iph->daddr, portp[1], + dest->addr, dport, + iph->saddr, portp[0], + 0); + if (ms == NULL) { + IP_VS_ERR("ip_masq_new_vs failed\n"); + ip_masq_put(mst); + return NULL; + } + + /* + * Bind the masq entry with the vs dest. + */ + ip_vs_bind_masq(ms, dest); + + /* + * Increase the inactive connection counter + * because it is in Syn-Received + * state (inactive) when the masq is created. + */ + atomic_inc(&dest->inactconns); + + /* + * Add its control + */ + ip_masq_control_add(ms, mst); + + ip_masq_put(mst); + return ms; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a masq entry. + */ +struct ip_masq *ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_masq *ms = NULL; + struct ip_vs_dest *dest; + const __u16 *portp; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, iph); + + /* + * Non-persistent service + */ + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + if (!svc->fwmark && portp[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported in persistent services, check your ipvs configuration\n"); + return NULL; + } + + read_lock(&__ip_vs_lock); + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + read_unlock(&__ip_vs_lock); + return NULL; + } + + /* + * Create a masquerading entry. + */ + ms = ip_masq_new_vs(iph->protocol, + iph->daddr, portp[1], + dest->addr, dest->port?dest->port:portp[1], + iph->saddr, portp[0], + 0); + if (ms == NULL) { + IP_VS_ERR("Schedule: ip_masq_new_vs failed\n"); + read_unlock(&__ip_vs_lock); + return NULL; + } + + /* + * Bind the masq entry with the vs dest. + */ + ip_vs_bind_masq(ms, dest); + + /* + * Increase the inactive connection counter because it is in + * Syn-Received state (inactive) when the masq is created. + */ + atomic_inc(&dest->inactconns); + + IP_VS_DBG(9, "Schedule masq fwd:%c s:%s c:%u.%u.%u.%u:%d " + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n", + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), + NIPQUAD(ms->daddr),ntohs(ms->dport), + NIPQUAD(ms->maddr),ntohs(ms->mport), + NIPQUAD(ms->saddr),ntohs(ms->sport), + ms->flags, atomic_read(&ms->refcnt)); + + read_unlock(&__ip_vs_lock); + + return ms; +} + + +/* + * Pass or drop the packet. + * Called by ip_fw_demasquerade, when the virtual service is available but + * no destination is available for a new connection. 
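The effect of ip_vs_sched_persist() above is that the client address, masked with the service netmask (the persistence granularity), selects a template that remembers which real server was chosen, so later connections from the same masked network reuse that server until the template expires. A simplified model of just that behaviour; the fixed 16-entry template table and round-robin server pick are demo assumptions, not the kernel's data structures:

    #include <arpa/inet.h>
    #include <stdio.h>

    #define NSERVERS 2

    struct template { unsigned snet; int server; int valid; };

    static struct template templates[16];
    static int next_server;

    static int schedule_persist(unsigned caddr, unsigned netmask)
    {
        unsigned snet = caddr & netmask;
        int i, free_slot = -1;

        for (i = 0; i < 16; i++) {
            if (templates[i].valid && templates[i].snet == snet)
                return templates[i].server;     /* reuse remembered server */
            if (!templates[i].valid && free_slot < 0)
                free_slot = i;
        }
        if (free_slot < 0)
            free_slot = 0;                      /* demo only: recycle slot 0 */
        templates[free_slot].snet = snet;       /* no template: pick a server */
        templates[free_slot].server = next_server++ % NSERVERS;
        templates[free_slot].valid = 1;
        return templates[free_slot].server;
    }

    int main(void)
    {
        unsigned mask = htonl(0xffffff00);      /* persistence granularity /24 */
        unsigned a, b;

        inet_pton(AF_INET, "203.0.113.10", &a);
        inet_pton(AF_INET, "203.0.113.99", &b);
        printf("%d %d\n", schedule_persist(a, mask), schedule_persist(b, mask));
        return 0;
    }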
+ */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (portp[1] != FTPPORT)) + return 0; + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + kfree_skb(skb); + return -2; +} + + +/* + * IPVS user control entry + */ +int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_user *mm = &mctl->u.vs_user; + __u32 vaddr = mm->vaddr; + __u16 vport = mm->vport; + int proto_num = masq_proto_num(mm->protocol); + + /* + * Check the size of mctl, no overflow... + */ + if (optlen != sizeof(*mctl)) + return -EINVAL; + + /* + * Flush all the virtual service... + */ + if (mctl->m_cmd == IP_MASQ_CMD_FLUSH) + return ip_vs_flush(); + + /* + * Check for valid protocol: TCP or UDP + */ + if (mm->vfwmark == 0 && (proto_num < 0 || proto_num > 1)) { + IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s", + ntohs(mm->protocol), + NIPQUAD(vaddr), ntohs(vport), mctl->m_tname); + return -EFAULT; + } + + /* + * Lookup the exact service by (protocol, vaddr, vport) + */ + read_lock(&__ip_vs_lock); + + if (mm->vfwmark == 0) + svc = __ip_vs_lookup_service(mm->protocol, vaddr, vport); + else + svc = __ip_vs_lookup_svc_fwm(mm->vfwmark); + + read_unlock(&__ip_vs_lock); + + switch (mctl->m_cmd) { + case IP_MASQ_CMD_ADD: + if (svc != NULL) + return -EEXIST; + + return ip_vs_add_service(mctl); + + case IP_MASQ_CMD_SET: + if (svc == NULL) + return -ESRCH; + + return ip_vs_edit_service(svc, mctl); + + case IP_MASQ_CMD_DEL: + if (svc == NULL) + return -ESRCH; + else + return ip_vs_del_service(svc); + + case IP_MASQ_CMD_ADD_DEST: + if (svc == NULL) + return -ESRCH; + else + return ip_vs_add_dest(svc, mctl); + + case IP_MASQ_CMD_SET_DEST: + if (svc == NULL) + return -ESRCH; + else + return ip_vs_edit_dest(svc, mctl); + + case IP_MASQ_CMD_DEL_DEST: + if (svc == NULL) + return -ESRCH; + else + return ip_vs_del_dest(svc, mctl); + } + return -EINVAL; +} + + +#ifdef CONFIG_SYSCTL + +static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write, + struct file * filp,void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(); + } + } + return ret; +} + +ctl_table ipv4_vs_table[] = { +#ifdef CONFIG_IP_VS_DEBUG + {NET_IPV4_VS_DEBUG_LEVEL, "debug_level", + &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {NET_IPV4_VS_AMEMTHRESH, "amemthresh", + &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_AMDROPRATE, "am_droprate", + &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_DROP_ENTRY, "drop_entry", + &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL, + 
&ip_vs_sysctl_defense_mode}, + {NET_IPV4_VS_DROP_PACKET, "drop_packet", + &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL, + &ip_vs_sysctl_defense_mode}, + {NET_IPV4_VS_SECURE_TCP, "secure_tcp", + &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL, + &ip_vs_sysctl_defense_mode}, + {NET_IPV4_VS_TO_ES, "timeout_established", + &masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SS, "timeout_synsent", + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_SENT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SR, "timeout_synrecv", + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_RECV], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_FW, "timeout_finwait", + &masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_TW, "timeout_timewait", + &masq_timeout_table_dos.timeout[IP_MASQ_S_TIME_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_CL, "timeout_close", + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_CW, "timeout_closewait", + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_LA, "timeout_lastack", + &masq_timeout_table_dos.timeout[IP_MASQ_S_LAST_ACK], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_LI, "timeout_listen", + &masq_timeout_table_dos.timeout[IP_MASQ_S_LISTEN], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SA, "timeout_synack", + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYNACK], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_UDP, "timeout_udp", + &masq_timeout_table_dos.timeout[IP_MASQ_S_UDP], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_ICMP, "timeout_icmp", + &masq_timeout_table_dos.timeout[IP_MASQ_S_ICMP], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0} +}; +#endif + +#ifdef CONFIG_PROC_FS +/* + * Write the contents of the VS rule table to a PROCfs file. + */ +static int ip_vs_procinfo(char *buf, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len=0; + off_t pos=0; + char temp[64], temp2[32]; + int idx; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct list_head *l, *e, *p, *q; + + /* + * Note: since the length of the buffer is usually the multiple + * of 512, it is good to use fixed record of the divisor of 512, + * so that records won't be truncated at buffer boundary. 
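+ *
+ * Concretely, the three header lines are each padded to 64 bytes
+ * ("%-63s\n"), which is why pos starts at 192 below, and every
+ * service or destination line is emitted as one 64-byte record:
+ *
+ * len += sprintf(buf+len, "%-63s\n", temp); /* 63 chars + '\n' = 64 */
+ * pos += 64; /* 512/64 = 8 whole records per block */
+ *
+ * so a record never straddles a 512-byte read boundary.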
+ */ + pos = 192; + if (pos > offset) { + sprintf(temp, + "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_TAB_SIZE); + len += sprintf(buf+len, "%-63s\n", temp); + len += sprintf(buf+len, "%-63s\n", + "Prot LocalAddress:Port Scheduler Flags"); + len += sprintf(buf+len, "%-63s\n", + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn"); + } + + read_lock_bh(&__ip_vs_lock); + + /* print the service table hashed by */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + pos += 64; + if (pos > offset) { + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + sprintf(temp2, "persistent %d %08X", + svc->timeout, + ntohl(svc->netmask)); + else + temp2[0] = '\0'; + + sprintf(temp, "%s %08X:%04X %s %s", + masq_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + svc->scheduler->name, temp2); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 64; + if (pos <= offset) + continue; + sprintf(temp, + " -> %08X:%04X %-7s %-6d %-10d %-10d", + ntohl(dest->addr), + ntohs(dest->port), + ip_vs_fwd_name(dest->masq_flags), + dest->weight, + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + } + } + + /* print the service table hashed by fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + pos += 64; + if (pos > offset) { + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + sprintf(temp2, "persistent %d %08X", + svc->timeout, + ntohl(svc->netmask)); + else + temp2[0] = '\0'; + + sprintf(temp, "FWM %08X %s %s", + svc->fwmark, + svc->scheduler->name, temp2); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 64; + if (pos <= offset) + continue; + sprintf(temp, + " -> %08X:%04X %-7s %-6d %-10d %-10d", + ntohl(dest->addr), + ntohs(dest->port), + ip_vs_fwd_name(dest->masq_flags), + dest->weight, + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + } + } + + done: + read_unlock_bh(&__ip_vs_lock); + + *start = buf+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + +struct proc_dir_entry ip_vs_proc_entry = { + 0, /* dynamic inode */ + 2, "vs", /* namelen and name */ + S_IFREG | S_IRUGO, /* mode */ + 1, 0, 0, 0, /* nlinks, owner, group, size */ + &proc_net_inode_operations, /* operations */ + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, /* next, parent, subdir */ + NULL, /* data */ + &ip_vs_procinfo, /* function to generate proc data */ +}; + + +/* + * Write the IPVS statistic information to a PROCfs file. 
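+ *
+ * The 64-bit byte counters are printed as two 32-bit hex words
+ * ("%8X%08X", high word first), so a reader of this file can
+ * rebuild a value with something like:
+ *
+ * __u64 inbytes = ((__u64)hi << 32) | lo; /* hi, lo parsed from the line */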
+ */ +struct ip_vs_stats ip_vs_stats = {SPIN_LOCK_UNLOCKED, 0, 0}; + +static int +ip_vs_stats_get_info(char *buf, char **start, off_t offset, + int length, int *eof, void *data) +{ + int idx; + int len=0; + off_t pos=0; + char temp[128]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct list_head *l, *e, *p, *q; + + pos += 128; + if (pos > offset) { + len += sprintf(buf+len, "%-63s\n", +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + "TotalConns InPkts OutPkts InBytes OutBytes"); + spin_lock(&ip_vs_stats.lock); + sprintf(temp, " %8X %8X %8X %8X%08X %8X%08X", + ip_vs_stats.conns, + ip_vs_stats.inpkts, + ip_vs_stats.outpkts, + (__u32)(ip_vs_stats.inbytes >> 32), + (__u32)ip_vs_stats.inbytes, + (__u32)(ip_vs_stats.outbytes >> 32), + (__u32)ip_vs_stats.outbytes); + spin_unlock(&ip_vs_stats.lock); + len += sprintf(buf+len, "%-63s\n", temp); + } + + read_lock_bh(&__ip_vs_lock); + + /* print the service statistics */ + pos += 128; + if (pos > offset) { + len += sprintf(buf+len, "%-127s\n", + "\nVirtual Service\n" + "Pro VirtService Conns InPkts OutPkts InBytes OutBytes"); + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + pos += 128; + if (pos <= offset) + continue; + spin_lock(&svc->stats.lock); + sprintf(temp, "%3s %08X:%04X %8X %8X %8X %8X%08X %8X%08X", + masq_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + svc->stats.conns, + svc->stats.inpkts, + svc->stats.outpkts, + (__u32)(svc->stats.inbytes >> 32), + (__u32)svc->stats.inbytes, + (__u32)(svc->stats.outbytes >> 32), + (__u32)svc->stats.outbytes); + spin_unlock(&svc->stats.lock); + len += sprintf(buf+len, "%-127s\n", temp); + if (pos >= offset+length) + goto done; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + pos += 128; + if (pos <= offset) + continue; + spin_lock(&svc->stats.lock); + sprintf(temp, "FWM %08X %8X %8X %8X %8X%08X %8X%08X", + svc->fwmark, + svc->stats.conns, + svc->stats.inpkts, + svc->stats.outpkts, + (__u32)(svc->stats.inbytes >> 32), + (__u32)svc->stats.inbytes, + (__u32)(svc->stats.outbytes >> 32), + (__u32)svc->stats.outbytes); + spin_unlock(&svc->stats.lock); + len += sprintf(buf+len, "%-127s\n", temp); + if (pos >= offset+length) + goto done; + } + } + + /* print the real server statistics */ + pos += 128; + if (pos > offset) { + len += sprintf(buf+len, "%-127s\n", + "\nReal Service\n" + "Pro VirtService RealService Conns InPkts OutPkts InBytes OutBytes"); + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 128; + if (pos <= offset) + continue; + spin_lock(&dest->stats.lock); + sprintf(temp, + "%3s %08X:%04X %08X:%04X %8X %8X %8X %8X%08X %8X%08X", + masq_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + ntohl(dest->addr), + ntohs(dest->port), + dest->stats.conns, + dest->stats.inpkts, + dest->stats.outpkts, + (__u32)(dest->stats.inbytes >> 32), + (__u32)dest->stats.inbytes, + (__u32)(dest->stats.outbytes >> 32), + (__u32)dest->stats.outbytes); + spin_unlock(&dest->stats.lock); + len += sprintf(buf+len, "%-127s\n", temp); + if (pos >= 
offset+length) + goto done; + } + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q,struct ip_vs_dest,n_list); + pos += 128; + if (pos <= offset) + continue; + spin_lock(&dest->stats.lock); + sprintf(temp, + "FWM %08X %08X:%04X %8X %8X %8X %8X%08X %8X%08X", + svc->fwmark, + ntohl(dest->addr), + ntohs(dest->port), + dest->stats.conns, + dest->stats.inpkts, + dest->stats.outpkts, + (__u32)(dest->stats.inbytes >> 32), + (__u32)dest->stats.inbytes, + (__u32)(dest->stats.outbytes >> 32), + (__u32)dest->stats.outbytes); + spin_unlock(&dest->stats.lock); + len += sprintf(buf+len, "%-127s\n", temp); + if (pos >= offset+length) + goto done; + } + } + } + done: + read_unlock_bh(&__ip_vs_lock); + + *start = buf+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + +struct proc_dir_entry ip_vs_stat_proc_entry = { + 0, /* dynamic inode */ + 8, "vs_stats", /* namelen and name */ + S_IFREG | S_IRUGO, /* mode */ + 1, 0, 0, 0, /* nlinks, owner, group, size */ + &proc_net_inode_operations, /* operations */ + NULL, /* get_info */ + NULL, /* fill_inode */ + NULL, NULL, NULL, /* next, parent, subdir */ + NULL, /* data */ + &ip_vs_stats_get_info, /* function to generate proc data */ +}; + +#endif + + +/* + * This function encapsulates the packet in a new IP header, its destination + * will be set to the daddr. Most code of this function is from ipip.c. + * Usage: + * It is called in the ip_vs_forward() function. The load balancer + * selects a real server from a cluster based on a scheduling algorithm, + * encapsulates the packet and forwards it to the selected server. All real + * servers are configured with "ifconfig tunl0 up". + * When the server receives the encapsulated packet, it decapsulates the + * packet, processes the request and return the reply packets directly to + * the client without passing the load balancer. This can greatly + * increase the scalability of virtual server. + * Returns: + * if succeeded, return 1; otherwise, return 0. 
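+ *
+ * The encapsulated datagram carries a second, outer IP header:
+ *
+ * +----------------------------+----------------+---------+
+ * | outer iphdr (IPPROTO_IPIP) | original iphdr | payload |
+ * +----------------------------+----------------+---------+
+ *
+ * so the usable path MTU shrinks by sizeof(struct iphdr); hence the
+ * mtu = rt->u.dst.pmtu - sizeof(struct iphdr) check below and the
+ * ICMP_FRAG_NEEDED reply for oversized DF datagrams.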
+ */ + +int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr) +{ + struct rtable *rt; /* Route to the other host */ + struct device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + u8 tos = old_iph->tos; + u16 df = old_iph->frag_off; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = daddr; + u32 src = 0; + int mtu; + + if (skb->protocol != __constant_htons(ETH_P_IP)) { + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): protocol error, ETH_P_IP: %d, skb protocol: %d\n", + __constant_htons(ETH_P_IP),skb->protocol); + goto tx_error; + } + + if (ip_route_output(&rt, dst, src, RT_TOS(tos), 0)) { + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): route error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(dst)); + goto tx_error_icmp; + } + tdev = rt->u.dst.dev; + + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst && mtu < skb->dst->pmtu) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + skb->h.raw = skb->nh.raw; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR("ip_vs_tunnel_xmit(): no memory for new_skb\n"); + return 0; + } + kfree_skb(skb); + skb = new_skb; + } + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + iph->tot_len = htons(skb->len); + iph->id = htons(ip_id_count++); + ip_send_check(iph); + + IPCB(skb)->flags |= IPSKB_REDIRECTED; + IPCB(skb)->flags |= IPSKB_MASQUERADED; + + ip_send(skb); + return 1; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return 0; +} + + +/* + * Direct Routing + */ +int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + u8 tos = iph->tos; + int mtu; + + if (ip_route_output(&rt, daddr, 0, RT_TOS(tos), 0)) { + IP_VS_DBG(0, "ip_vs_dr_xmit(): route error, dest: %u.%u.%u.%u\n", + NIPQUAD(daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = rt->u.dst.pmtu; + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG(0, "ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + IPCB(skb)->flags |= IPSKB_REDIRECTED; + IPCB(skb)->flags |= IPSKB_MASQUERADED; + + ip_send(skb); + return 1; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return 0; +} + + +/* + * Initialize IP virtual server + */ +__initfunc(int ip_vs_init(void)) +{ + int idx; + + /* + * Allocate the ip_vs_table and initialize its list head. + * Initilize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable, + * ip_vs_schedulers and ip_vs_dest_trash. + */ + if (!(ip_vs_table = + vmalloc(IP_VS_TAB_SIZE*sizeof(struct list_head)))) { + return -ENOMEM; + } + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_table[idx]); + } + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_TAB_SIZE, + (long) (IP_VS_TAB_SIZE*sizeof(struct list_head))/1024); + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + INIT_LIST_HEAD(&ip_vs_schedulers); + INIT_LIST_HEAD(&ip_vs_dest_trash); + + /* + * Hook the slow_timer handler in the system timer. 
+ */ + slow_timer.expires = jiffies+SLTIMER_PERIOD; + add_timer(&slow_timer); + +#ifdef CONFIG_PROC_FS + ip_masq_proc_register(&ip_vs_proc_entry); + ip_masq_proc_register(&ip_vs_stat_proc_entry); +#endif + +#ifdef CONFIG_IP_MASQUERADE_VS_RR + ip_vs_rr_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_VS_WRR + ip_vs_wrr_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_VS_LC + ip_vs_lc_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_VS_WLC + ip_vs_wlc_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_VS_LBLC + ip_vs_lblc_init(); +#endif +#ifdef CONFIG_IP_MASQUERADE_VS_LBLCR + ip_vs_lblcr_init(); +#endif + return 0; +} diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c --- linux-2.2.19/net/ipv4/ip_vs_lblc.c Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c Fri Feb 2 18:49:08 2001 @@ -0,0 +1,645 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Version: $Id$ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include +#ifdef CONFIG_KMOD +#include +#endif +#include +#include +#include +#include +#include +#ifdef CONFIG_IP_MASQUERADE_MOD +#include +#endif +#include +#include +#include +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (5*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day (by default). 
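+ *
+ * Concretely, the "half hour" is COUNT_FOR_FULL_EXPIRATION (30)
+ * consecutive runs of the periodic timer, which fires every
+ * CHECK_EXPIRE_INTERVAL (60*HZ), and the "day" is
+ * sysctl_ip_vs_lblc_expiration (24*60*60*HZ), exported for tuning
+ * by the sysctl table below as net/ipv4/vs/lblc_expiration.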
+ */ +#define COUNT_FOR_FULL_EXPIRATION 30 +int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + + +/* + * IPVS LBLC sysctl table + */ +struct ip_vs_lblc_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[2]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = { + NULL, + {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration", + &sysctl_ip_vs_lblc_expiration, + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir}, + {0}} +}; + + +/* + * new/free a ip_vs_lblc_entry, which is a mapping of a destination + * IP address to a server. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + return en; +} + + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static int +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblc_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. + * returns bool success. 
+ */ +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, + struct ip_vs_lblc_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblc_entry *en; + struct list_head *l,*e; + + hash = ip_vs_lblc_hashkey(addr); + + read_lock(&tbl->lock); + + l = &tbl->bucket[hash]; + for (e=l->next; e!=l; e=e->next) { + en = list_entry(e, struct ip_vs_lblc_entry, list); + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + int i; + struct list_head *l; + struct ip_vs_lblc_entry *en; + + for (i=0; ilock); + for (l=&tbl->bucket[i]; l->next!=l; ) { + en = list_entry(l->next, + struct ip_vs_lblc_entry, list); + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblc_entry *en; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblc_entry, list); + if ((now - en->lastuse) < + sysctl_ip_vs_lblc_expiration) { + e = e->next; + continue; + } + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. 
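+ *
+ * A rough worked example of the partial check (hypothetical load,
+ * default CONFIG_IP_VS_LBLC_TAB_BITS=10, so max_size = 1024*16 =
+ * 16384): with 17000 entries,
+ *
+ * goal = (17000 - 16384) * 4/3 = 821 (capped at max_size/2 = 8192)
+ *
+ * so at most that many entries idle longer than ENTRY_TIMEOUT are
+ * reclaimed in one timer run.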
+ */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_lblc_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblc_entry *en; + + tbl = (struct ip_vs_lblc_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) < tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblc_entry, list); + if ((now - en->lastuse) < ENTRY_TIMEOUT) { + e = e->next; + continue; + } + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblc_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->lock = RW_LOCK_UNLOCKED; + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblc_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + + MOD_INC_USE_COUNT; + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_lblc_table)); + + MOD_DEC_USE_COUNT; + return 0; +} + + +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + l = &svc->destinations; + if (l == l->next) + return NULL; + + /* + * We think the overhead of processing active connections is fifty + * times than that of inactive conncetions in average. (This fifty + * times might be not accurate, we will change it later.) We use + * the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. 
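+ *
+ * A small worked example (hypothetical numbers): server A has
+ * weight 2 with 10 active and 3 inactive connections, overhead
+ * 10*50+3 = 503; server B has weight 1 with 4 active and 100
+ * inactive, overhead 4*50+100 = 300.  The loads are 503/2 = 251.5
+ * and 300/1 = 300, so A is the better choice, and the integer test
+ * agrees: 503*1 > 300*2 is false, so B never replaces A.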
+ */ + + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (least->weight > 0) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) + { + dest = list_entry(e, struct ip_vs_dest, n_list); + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * dest->weight > doh * least->weight) + { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), least->weight, loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > dest->weight) { + register struct list_head *l, *e; + struct ip_vs_dest *d; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + d = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&d->activeconns)*2 < d->weight) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblc_table *tbl; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblc_table *)svc->sched_data; + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblc_new(iph->daddr, dest); + if (en == NULL) { + return NULL; + } + ip_vs_lblc_hash(tbl, en); + } else { + dest = en->dest; + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) + || dest->weight <= 0 + || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + {0}, /* n_list */ + "lblc", /* name */ + ATOMIC_INIT(0), /* refcnt */ + ip_vs_lblc_init_svc, /* service initializer */ + ip_vs_lblc_done_svc, /* service done */ + ip_vs_lblc_update_svc, /* service updater */ + ip_vs_lblc_schedule, /* select a server from the destination list */ +}; + + +__initfunc(int ip_vs_lblc_init(void)) +{ + IP_VS_INFO("Initializing LBLC scheduling\n"); + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); + lblc_sysctl_table.sysctl_header = + register_sysctl_table(lblc_sysctl_table.root_dir, 0); + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); + + /* module initialization by 'request_module' */ + if (register_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) + return -EIO; + + lblc_sysctl_table.sysctl_header = + register_sysctl_table(lblc_sysctl_table.root_dir, 
0); + + IP_VS_INFO("LBLC scheduling module loaded.\n"); + + return 0; +} + +void cleanup_module(void) +{ + /* module cleanup by 'release_module' */ + if (unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) { + IP_VS_INFO("cannot remove LBLC scheduling module\n"); + } else { + IP_VS_INFO("LBLC scheduling module unloaded.\n"); + } + unregister_sysctl_table(lblc_sysctl_table.sysctl_header); +} + +#endif /* MODULE */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblcr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c --- linux-2.2.19/net/ipv4/ip_vs_lblcr.c Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c Tue Mar 27 17:37:00 2001 @@ -0,0 +1,834 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Version: $Id$ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include +#include +#ifdef CONFIG_KMOD +#include +#endif +#include +#include +#include +#include +#include +#ifdef CONFIG_IP_MASQUERADE_MOD +#include +#endif +#include +#include +#include +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. 
+ */ +#define COUNT_FOR_FULL_EXPIRATION 30 +int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + write_lock(&set->lock); + if (set->list != NULL) { + e->next = set->list->next; + set->list = e; + } else { + e->next = NULL; + set->list = e; + } + write_unlock(&set->lock); + + atomic_inc(&set->size); + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } + write_unlock(&set->lock); +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. 
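+ * Only the list element itself is freed here, and the reference
+ * taken in ip_vs_dest_set_insert() is dropped; the ip_vs_dest is
+ * released by the core code once its refcnt permits.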
+ */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if ((least->weight > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh*dest->weight > doh*least->weight) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), least->weight, loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (most->weight > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if (moh*dest->weight < doh*most->weight + && dest->weight > 0) { + most = dest; + moh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), most->weight, moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ +struct ip_vs_lblcr_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[2]; + ctl_table vs_dir[2]; + 
ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = { + NULL, + {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration", + &sysctl_ip_vs_lblcr_expiration, + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir}, + {0}} +}; + + +/* + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. + */ +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) +{ + struct ip_vs_lblcr_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + en->set.lock = RW_LOCK_UNLOCKED; + + return en; +} + + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static int +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblcr_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. + * returns bool success. + */ +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, + struct ip_vs_lblcr_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblcr_entry *en; + struct list_head *l,*e; + + hash = ip_vs_lblcr_hashkey(addr); + l = &tbl->bucket[hash]; + + read_lock(&tbl->lock); + + for (e=l->next; e!=l; e=e->next) { + en = list_entry(e, struct ip_vs_lblcr_entry, list); + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. 
+ */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct list_head *l; + struct ip_vs_lblcr_entry *en; + + for (i=0; ilock); + for (l=&tbl->bucket[i]; l->next!=l; ) { + en = list_entry(l->next, + struct ip_vs_lblcr_entry, list); + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblcr_entry, list); + if ((now - en->lastuse) < + sysctl_ip_vs_lblcr_expiration) { + e = e->next; + continue; + } + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + tbl = (struct ip_vs_lblcr_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) < tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblcr_entry, list); + if ((now - en->lastuse) < ENTRY_TIMEOUT) { + e = e->next; + continue; + } + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblcr_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->lock = RW_LOCK_UNLOCKED; + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + + MOD_INC_USE_COUNT; + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + 
del_timer(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_lblcr_table)); + + MOD_DEC_USE_COUNT; + return 0; +} + + +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + l = &svc->destinations; + if (l == l->next) + return NULL; + + /* + * We think the overhead of processing active connections is fifty + * times than that of inactive conncetions in average. (This fifty + * times might be not accurate, we will change it later.) We use + * the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (least->weight > 0) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh*dest->weight > doh*least->weight) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), least->weight, loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. 
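+ * For example (hypothetical numbers): a server with weight 100 and
+ * 120 active connections counts as overloaded only if some other
+ * destination of the service is below half load, i.e. has
+ * activeconns*2 < weight, say weight 100 with 40 active connections.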
+ */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > dest->weight) { + register struct list_head *l, *e; + struct ip_vs_dest *d; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + d = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&d->activeconns)*2 < d->weight) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblcr_table *tbl; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblcr_new(iph->daddr); + if (en == NULL) { + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + ip_vs_lblcr_hash(tbl, en); + } else { + dest = ip_vs_dest_set_min(&en->set); + if (!dest || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + } + if (atomic_read(&en->set.size) > 1 && + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + struct ip_vs_dest *m; + m = ip_vs_dest_set_max(&en->set); + if (m) ip_vs_dest_set_erase(&en->set, m); + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + {0}, /* n_list */ + "lblcr", /* name */ + ATOMIC_INIT(0), /* refcnt */ + ip_vs_lblcr_init_svc, /* service initializer */ + ip_vs_lblcr_done_svc, /* service done */ + ip_vs_lblcr_update_svc, /* service updater */ + ip_vs_lblcr_schedule, /* select a server from the destination list */ +}; + + +__initfunc(int ip_vs_lblcr_init(void)) +{ + IP_VS_INFO("Initializing LBLCR scheduling\n"); + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); + lblcr_sysctl_table.sysctl_header = + register_sysctl_table(lblcr_sysctl_table.root_dir, 0); + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); + + /* module initialization by 'request_module' */ + if (register_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) + return -EIO; + + lblcr_sysctl_table.sysctl_header = + register_sysctl_table(lblcr_sysctl_table.root_dir, 0); + + IP_VS_INFO("LBLCR scheduling module loaded.\n"); + + return 0; +} + +void cleanup_module(void) +{ + /* module cleanup by 'release_module' */ + if (unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) { + IP_VS_INFO("cannot remove LBLCR scheduling module\n"); + } else { + IP_VS_INFO("LBLCR scheduling module unloaded.\n"); + } + unregister_sysctl_table(lblcr_sysctl_table.sysctl_header); +} + +#endif /* MODULE */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c --- linux-2.2.19/net/ipv4/ip_vs_lc.c Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c Fri Nov 24 10:02:53 2000 @@ -0,0 +1,159 @@ +/* + * IPVS: 
Least-Connection Scheduling module + * + * Version: $Id$ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include +#ifdef CONFIG_KMOD +#include +#endif +#include +#include +#include +#include +#ifdef CONFIG_IP_MASQUERADE_MOD +#include +#endif +#include +#include + + +static int ip_vs_lc_init_svc(struct ip_vs_service *svc) +{ + MOD_INC_USE_COUNT; + return 0; +} + + +static int ip_vs_lc_done_svc(struct ip_vs_service *svc) +{ + MOD_DEC_USE_COUNT; + return 0; +} + + +static int ip_vs_lc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest* ip_vs_lc_schedule(struct ip_vs_service *svc, + struct iphdr *iph) +{ + struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int lac, dac; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + l = &svc->destinations; + if (l == l->next) + return NULL; + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + for (e=l->next; e!=l; e=e->next) { + least = list_entry (e, struct ip_vs_dest, n_list); + if (least->weight > 0) { + lac = (atomic_read(&least->activeconns) << 5) + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
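+ * The loop below compares the same (activeconns<<5) + inactconns
+ * metric, i.e. one active connection weighs as much as 32 inactive
+ * ones; e.g. 3 active and 10 inactive give (3<<5)+10 = 106.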
+ */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if (dest->weight == 0) + continue; + dac = (atomic_read(&dest->activeconns) << 5) + + atomic_read(&dest->inactconns); + if (dac < lac) { + least = dest; + lac = dac; + } + } + + IP_VS_DBG(6, "LC: server %d.%d.%d.%d:%d activeconns %d inactconns %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + {0}, /* n_list */ + "lc", /* name */ + ATOMIC_INIT(0), /* refcnt */ + ip_vs_lc_init_svc, /* service initializer */ + ip_vs_lc_done_svc, /* service done */ + ip_vs_lc_update_svc, /* service updater */ + ip_vs_lc_schedule, /* select a server from the destination list */ +}; + + +__initfunc(int ip_vs_lc_init(void)) +{ + IP_VS_INFO("Initializing LC scheduling\n"); + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); + + /* module initialization by 'request_module' */ + if(register_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0) + return -EIO; + + IP_VS_INFO("LC scheduling module loaded.\n"); + + return 0; +} + +void cleanup_module(void) +{ + /* module cleanup by 'release_module' */ + if(unregister_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0) + IP_VS_INFO("cannot remove LC scheduling module\n"); + else + IP_VS_INFO("LC scheduling module unloaded.\n"); +} + +#endif /* MODULE */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_rr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c --- linux-2.2.19/net/ipv4/ip_vs_rr.c Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c Fri Nov 24 10:04:12 2000 @@ -0,0 +1,145 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Version: $Id$ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include +#ifdef CONFIG_KMOD +#include +#endif +#include +#include +#include +#include +#ifdef CONFIG_IP_MASQUERADE_MOD +#include +#endif +#include +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + MOD_INC_USE_COUNT; + return 0; +} + + +static int ip_vs_rr_done_svc(struct ip_vs_service *svc) +{ + MOD_DEC_USE_COUNT; + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest* ip_vs_rr_schedule(struct ip_vs_service *svc, + struct iphdr *iph) +{ + register struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + if (q == &svc->destinations) { + q = q->next; + continue; + } + dest = list_entry(q, struct ip_vs_dest, n_list); + if (dest->weight > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + return NULL; + + out: + svc->sched_data = q; + IP_VS_DBG(6, "RR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), dest->weight); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + {0}, /* n_list */ + "rr", /* name */ + ATOMIC_INIT(0), /* refcnt */ + ip_vs_rr_init_svc, /* service initializer */ + ip_vs_rr_done_svc, /* service done */ + ip_vs_rr_update_svc, /* service updater */ + ip_vs_rr_schedule, /* select a server from the destination list */ +}; + + +__initfunc(int ip_vs_rr_init(void)) +{ + IP_VS_INFO("Initializing RR scheduling\n"); + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_rr_scheduler) ; +} + + +#ifdef MODULE +EXPORT_NO_SYMBOLS; + +int init_module(void) +{ + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); + + /* module initialization by 'request_module' */ + if(register_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0) + return -EIO; + + IP_VS_INFO("RR scheduling module loaded.\n"); + + return 0; +} + +void cleanup_module(void) +{ + /* module cleanup by 'release_module' */ + if(unregister_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0) + IP_VS_INFO("cannot remove RR scheduling module\n"); + else + IP_VS_INFO("RR scheduling module unloaded.\n"); +} + +#endif /* MODULE */ diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wlc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c --- linux-2.2.19/net/ipv4/ip_vs_wlc.c Thu Jan 1 08:00:00 1970 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c Fri Nov 24 09:59:32 2000 @@ -0,0 +1,176 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Version: $Id$ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Changes:
+ *	Wensong Zhang		: changed the ip_vs_wlc_schedule to return dest
+ *	Wensong Zhang		: changed to use the inactconns in scheduling
+ *	Wensong Zhang		: changed some cosmetic things for debugging
+ *	Wensong Zhang		: changed for the d-linked destination list
+ *	Wensong Zhang		: added the ip_vs_wlc_update_svc
+ *	Wensong Zhang		: added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <net/ip_masq.h>
+#ifdef CONFIG_IP_MASQUERADE_MOD
+#include <net/ip_masq_mod.h>
+#endif
+#include <linux/ip_fw.h>
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlc_init_svc(struct ip_vs_service *svc)
+{
+	MOD_INC_USE_COUNT;
+	return 0;
+}
+
+
+static int
+ip_vs_wlc_done_svc(struct ip_vs_service *svc)
+{
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+
+static int
+ip_vs_wlc_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+	register struct list_head *l, *e;
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+	l = &svc->destinations;
+	if (l == l->next)
+		return NULL;
+
+	/*
+	 * We think the overhead of processing active connections is fifty
+	 * times that of inactive connections on average. (This factor of
+	 * fifty may not be accurate; we will change it later.) We use
+	 * the following formula to estimate the overhead:
+	 *		  dest->activeconns*50 + dest->inactconns
+	 * and the load:
+	 *		  (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 * h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+
+	for (e=l->next; e!=l; e=e->next) {
+		least = list_entry(e, struct ip_vs_dest, n_list);
+		if (least->weight > 0) {
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 * Find the destination with the least load.
+	 */
+  nextstage:
+	for (e=e->next; e!=l; e=e->next) {
+		dest = list_entry(e, struct ip_vs_dest, n_list);
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if (loh * dest->weight > doh * least->weight) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "WLC: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt), least->weight, loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+	{0},			/* n_list */
+	"wlc",			/* name */
+	ATOMIC_INIT (0),	/* refcnt */
+	ip_vs_wlc_init_svc,	/* service initializer */
+	ip_vs_wlc_done_svc,	/* service done */
+	ip_vs_wlc_update_svc,	/* service updater */
+	ip_vs_wlc_schedule,	/* select a server from the destination list */
+};
+
+
+__initfunc(int ip_vs_wlc_init (void))
+{
+	IP_VS_INFO("Initializing WLC scheduling\n");
+	INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+	INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
+
+	/* module initialization by 'request_module' */
+	if (register_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
+		return -EIO;
+
+	IP_VS_INFO("WLC scheduling module loaded.\n");
+
+	return 0;
+}
+
+void cleanup_module(void)
+{
+	/* module cleanup by 'release_module' */
+	if (unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
+		IP_VS_INFO("cannot remove WLC scheduling module\n");
+	else
+		IP_VS_INFO("WLC scheduling module unloaded.\n");
+}
+
+#endif /* MODULE */
diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wrr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c
--- linux-2.2.19/net/ipv4/ip_vs_wrr.c	Thu Jan  1 08:00:00 1970
+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c	Fri Nov 24 09:57:23 2000
@@ -0,0 +1,209 @@
+/*
+ * IPVS:	Weighted Round-Robin Scheduling module
+ *
+ * Version:	$Id$
+ *
+ * Authors:	Wensong Zhang
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *	Wensong Zhang		: changed the ip_vs_wrr_schedule to return dest
+ *	Wensong Zhang		: changed some cosmetic things for debugging
+ *	Wensong Zhang		: changed for the d-linked destination list
+ *	Wensong Zhang		: added the ip_vs_wrr_update_svc
+ *	Julian Anastasov	: return -ENOMEM instead of ENOMEM in the
+ *				  ip_vs_wrr_init_svc
+ *	Julian Anastasov	: fixed the bug of returning destination
+ *				  with weight 0 when all weights are zero
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <net/ip_masq.h>
+#ifdef CONFIG_IP_MASQUERADE_MOD
+#include <net/ip_masq_mod.h>
+#endif
+#include <linux/ip_fw.h>
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+	struct list_head *cl;	/* current list head */
+	int cw;			/* current weight */
+};
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+	/*
+	 * Allocate the mark variable for WRR scheduling
+	 */
+	svc->sched_data = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+
+	if (svc->sched_data == NULL) {
+		IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	memset(svc->sched_data, 0, sizeof(struct ip_vs_wrr_mark));
+
+	((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
+
+	MOD_INC_USE_COUNT;
+	return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+	/*
+	 * Release the mark variable
+	 */
+	kfree_s(svc->sched_data, sizeof(struct ip_vs_wrr_mark));
+
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+	((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
+	return 0;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
+ */
+int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+	register struct list_head *l, *e;
+	struct ip_vs_dest *dest;
+	int weight = 0;
+
+	l = &svc->destinations;
+	for (e=l->next; e!=l; e=e->next) {
+		dest = list_entry(e, struct ip_vs_dest, n_list);
+		if (dest->weight > weight)
+			weight = dest->weight;
+	}
+
+	return weight;
+}
+
+
+/*
+ * Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest* ip_vs_wrr_schedule(struct ip_vs_service *svc,
+					     struct iphdr *iph)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+	IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
+
+	/*
+	 * This loop will always terminate, because 0 < mark->cw <= max_weight,
+	 * and at least one server has its weight equal to max_weight.
+	 */
+	while (1) {
+		if (mark->cl == &svc->destinations) {
+			/* it is at the head of the destination list */
+
+			if (mark->cl == mark->cl->next)
+				/* no dest entry */
+				return NULL;
+
+			mark->cl = svc->destinations.next;
+			mark->cw--;
+			if (mark->cw <= 0) {
+				mark->cw = ip_vs_wrr_max_weight(svc);
+				/*
+				 * Still zero, which means no available servers.
+				 */
+				if (mark->cw == 0) {
+					mark->cl = &svc->destinations;
+					IP_VS_INFO("ip_vs_wrr_schedule(): "
+						   "no available servers\n");
+					return NULL;
+				}
+			}
+		}
+		else mark->cl = mark->cl->next;
+
+		if (mark->cl != &svc->destinations) {
+			/* not at the head of the list */
+			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+			if (dest->weight >= mark->cw)
+				break;
+		}
+	}
+
+	IP_VS_DBG(6, "WRR: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d\n",
+		  NIPQUAD(dest->addr), ntohs(dest->port),
+		  atomic_read(&dest->activeconns),
+		  atomic_read(&dest->refcnt), dest->weight);
+
+	return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+	{0},			/* n_list */
+	"wrr",			/* name */
+	ATOMIC_INIT(0),		/* refcnt */
+	ip_vs_wrr_init_svc,	/* service initializer */
+	ip_vs_wrr_done_svc,	/* service done */
+	ip_vs_wrr_update_svc,	/* service updater */
+	ip_vs_wrr_schedule,	/* select a server from the destination list */
+};
+
+
+__initfunc(int ip_vs_wrr_init(void))
+{
+	IP_VS_INFO("Initializing WRR scheduling\n");
+	INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+	INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
+
+	/* module initialization by 'request_module' */
+	if(register_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
+		return -EIO;
+
+	IP_VS_INFO("WRR scheduling module loaded.\n");
+
+	return 0;
+}
+
+void cleanup_module(void)
+{
+	/* module cleanup by 'release_module' */
+	if(unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
+		IP_VS_INFO("cannot remove WRR scheduling module\n");
+	else
+		IP_VS_INFO("WRR scheduling module unloaded.\n");
}
+
+#endif /* MODULE */
diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/sysctl_net_ipv4.c linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c
--- linux-2.2.19/net/ipv4/sysctl_net_ipv4.c	Tue Mar 27 09:33:49 2001
+++ linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c	Tue Mar 27 09:32:21 2001
@@ -69,6 +69,9 @@
 struct ipv4_config ipv4_config;
 
 extern ctl_table ipv4_route_table[];
+#ifdef CONFIG_IP_MASQUERADE_VS
+extern ctl_table ipv4_vs_table[];
+#endif
 
 #ifdef CONFIG_SYSCTL
 
@@ -198,7 +201,10 @@
 	{NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships", &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec},
 #endif
+#ifdef CONFIG_IP_MASQUERADE_VS
+	{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table},
+#endif
 	{0}
 };
-
+ 
 #endif /* CONFIG_SYSCTL */
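
The round-robin scheduler in ip_vs_rr.c above keeps its position in svc->sched_data and simply walks the destination list, skipping the list head and any destination whose weight is 0 (quiesced). The following stand-alone user-space sketch mirrors that walk with an array instead of the kernel's doubly linked list; the array, the weights, and the head sentinel are illustrative assumptions, not part of the patch.

/*
 * Illustrative sketch of the round-robin selection in ip_vs_rr_schedule():
 * advance from the saved position, skip the list head and quiesced
 * (weight 0) servers, and remember where the scan stopped.
 */
#include <stdio.h>

static int weights[] = { 1, 0, 1 };	/* hypothetical weights; server 1 is quiesced */
#define NSERVERS ((int)(sizeof(weights) / sizeof(weights[0])))

static int pos = NSERVERS;		/* slot NSERVERS plays the role of the list head */

static int rr_next(void)
{
	int i, q = pos;

	for (i = 0; i <= NSERVERS; i++) {
		q = (q + 1) % (NSERVERS + 1);
		if (q == NSERVERS)
			continue;	/* skip the list head */
		if (weights[q] > 0) {
			pos = q;	/* remember the position, like svc->sched_data */
			return q;
		}
	}
	return -1;			/* every destination is quiesced */
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		printf("connection %d -> server %d\n", i, rr_next());
	return 0;
}

With the weights above, consecutive calls alternate between servers 0 and 2; the quiesced server 1 is never returned, just as a weight-0 destination is never returned by the kernel scheduler.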
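The comment in ip_vs_wlc_schedule() above estimates the overhead of a destination as activeconns*50 + inactconns and compares loads by cross-multiplying with the weights, so no division (and no floating point) is needed in kernel mode. A minimal user-space sketch of that selection rule, with a hypothetical server table standing in for struct ip_vs_dest and the destination list, might look like this:

/*
 * Illustrative sketch of the weighted least-connection choice: pick the
 * server minimizing (activeconns*50 + inactconns) / weight, comparing
 * best_overhead * weight_i > overhead_i * best_weight instead of dividing,
 * and treating weight 0 as "quiesced, never selected".
 */
#include <stdio.h>

struct fake_dest {
	const char *name;
	int weight;
	int activeconns;
	int inactconns;
};

static int overhead(const struct fake_dest *d)
{
	return d->activeconns * 50 + d->inactconns;
}

int main(void)
{
	struct fake_dest dests[] = {
		{ "10.0.0.1", 1, 10, 300 },
		{ "10.0.0.2", 3, 25, 100 },
		{ "10.0.0.3", 0,  0,   0 },	/* weight 0: quiesced */
	};
	const int n = sizeof(dests) / sizeof(dests[0]);
	struct fake_dest *least = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (dests[i].weight <= 0)
			continue;
		if (!least ||
		    overhead(least) * dests[i].weight >
		    overhead(&dests[i]) * least->weight)
			least = &dests[i];
	}

	if (least)
		printf("WLC would pick %s (overhead %d, weight %d)\n",
		       least->name, overhead(least), least->weight);
	return 0;
}

Here 10.0.0.1 has overhead 800 at weight 1 and 10.0.0.2 has overhead 1350 at weight 3, so the cross-multiplied comparison (800*3 > 1350*1) selects 10.0.0.2, the server with the lower normalized load.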
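The weighted round-robin walk in ip_vs_wrr_schedule() above lowers the current weight mark->cw by one each time the scan wraps back to the head of the list, resets it to the maximum weight when it reaches zero, and picks a destination once its weight is at least mark->cw, so a server with weight W is chosen W times per full cycle. A rough user-space rendering of that loop (array indices stand in for the list pointers, and -1 plays the role of the list head) could be:

/*
 * Illustrative sketch of the WRR current-weight walk: cw sweeps from the
 * maximum weight down to 1, and on each sweep every server whose weight is
 * at least cw is selected once.
 */
#include <stdio.h>

static int weights[] = { 4, 3, 2 };	/* hypothetical server weights */
#define NSERVERS ((int)(sizeof(weights) / sizeof(weights[0])))

static int cl = -1;	/* current position; -1 plays the role of the list head */
static int cw = 0;	/* current weight, as in struct ip_vs_wrr_mark */

static int max_weight(void)
{
	int i, max = 0;

	for (i = 0; i < NSERVERS; i++)
		if (weights[i] > max)
			max = weights[i];
	return max;
}

static int wrr_next(void)
{
	while (1) {
		if (cl == -1) {
			/* back at the head: step to the first server and
			 * lower the current weight, resetting it from the
			 * maximum weight when it runs out */
			cl = 0;
			cw--;
			if (cw <= 0) {
				cw = max_weight();
				if (cw == 0) {
					cl = -1;
					return -1;	/* all weights are zero */
				}
			}
		} else if (++cl == NSERVERS) {
			cl = -1;		/* wrapped around: back to the head */
			continue;
		}
		if (weights[cl] >= cw)
			return cl;
	}
}

int main(void)
{
	int i, s;

	for (i = 0; i < 9; i++) {
		s = wrr_next();
		printf("connection %d -> server %d (weight %d)\n",
		       i, s, s >= 0 ? weights[s] : 0);
	}
	return 0;
}

With the weights 4, 3 and 2 above, nine consecutive calls yield servers 0, 0, 1, 0, 1, 2, 0, 1, 2: four, three and two picks respectively, in proportion to the configured weights.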