Merge branch 'lvs-next-2.6' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-2.6

Conflicts:

	net/netfilter/Kconfig
Author: David S. Miller
Date:   2008-10-08 14:26:36 -07:00

29 changed files with 272 additions and 5 deletions

net/netfilter/Kconfig

@@ -858,3 +858,5 @@ config NETFILTER_XT_MATCH_U32
endif # NETFILTER_XTABLES
endmenu
source "net/netfilter/ipvs/Kconfig"

net/netfilter/Makefile

@@ -89,3 +89,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/

net/netfilter/ipvs/Kconfig (new file, 239 lines)

@@ -0,0 +1,239 @@
#
# IP Virtual Server configuration
#
menuconfig IP_VS
tristate "IP virtual server support (EXPERIMENTAL)"
depends on NETFILTER
---help---
IP Virtual Server support will let you build a high-performance
virtual server based on a cluster of two or more real servers. This
option must be enabled for at least one of the clustered computers
that will take care of intercepting incoming connections to a
single IP address and scheduling them to real servers.
Three request dispatching techniques are implemented: virtual
server via NAT, virtual server via tunneling, and virtual server
via direct routing. Several scheduling algorithms can be used to
choose which server a connection is directed to, so load can be
balanced among the servers. For more information and the
administration program, see <http://www.linuxvirtualserver.org/>.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS (DANGEROUS)"
depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
---help---
Add IPv6 support to IPVS. This is incomplete and might be dangerous.
Say N if unsure.
config IP_VS_DEBUG
bool "IP virtual server debugging"
---help---
Say Y here if you want to get additional messages useful for
debugging the IP virtual server code. You can change the debug
level in /proc/sys/net/ipv4/vs/debug_level.
config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
range 8 20
default 12
---help---
The IPVS connection hash table uses the chaining scheme to handle
hash collisions. Using a big IPVS connection hash table will greatly
reduce conflicts when there are hundreds of thousands of connections
in the hash table.
Note that the table size must be a power of 2; the value entered
here is the exponent, from 8 to 20. The default of 12 gives a table
size of 4096. Don't pick a number that is too small, or you will
lose performance. You can adapt the table size to your virtual
server application: a good rule of thumb is a table size not far
less than the number of connections per second multiplied by the
average time a connection stays in the table. For example, if your
virtual server gets 200 connections per second and a connection
stays in the table for 200 seconds on average, the table size
should be not far below 200x200 = 40000, so 32768 (2**15) is a
good choice.
Also note that each connection effectively occupies 128 bytes and
each hash entry uses 8 bytes, so you can estimate how much memory
your box needs.
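As a quick sanity check of the sizing rule above, here is a small user-space C sketch (hypothetical, not part of this patch; the rates and the 8-byte/128-byte figures are the example numbers from this help text):

#include <stdio.h>

int main(void)
{
	unsigned long conns_per_sec = 200;	/* example rate from the help text */
	unsigned long avg_lifetime = 200;	/* seconds a connection stays tabled */
	unsigned long target = conns_per_sec * avg_lifetime;	/* 40000 */
	unsigned int bits;

	/* largest exponent in 8..20 whose next power of two still fits */
	for (bits = 8; bits < 20 && (1UL << (bits + 1)) <= target; bits++)
		;
	printf("IP_VS_TAB_BITS = %u (table size %lu)\n", bits, 1UL << bits);

	/* rough memory estimate: 8 bytes per hash entry + 128 bytes per connection */
	printf("approx. memory: %lu bytes\n", (1UL << bits) * 8 + target * 128);
	return 0;
}

This prints IP_VS_TAB_BITS = 15 (table size 32768), matching the 2**15 recommendation above.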
comment "IPVS transport protocol load balancing support"
config IP_VS_PROTO_TCP
bool "TCP load balancing support"
---help---
This option enables support for load balancing the TCP transport
protocol. Say Y if unsure.
config IP_VS_PROTO_UDP
bool "UDP load balancing support"
---help---
This option enables support for load balancing the UDP transport
protocol. Say Y if unsure.
config IP_VS_PROTO_AH_ESP
bool
depends on UNDEFINED
config IP_VS_PROTO_ESP
bool "ESP load balancing support"
select IP_VS_PROTO_AH_ESP
---help---
This option enables support for load balancing the ESP
(Encapsulating Security Payload) transport protocol. Say Y if
unsure.
config IP_VS_PROTO_AH
bool "AH load balancing support"
select IP_VS_PROTO_AH_ESP
---help---
This option enables support for load balancing the AH
(Authentication Header) transport protocol. Say Y if unsure.
comment "IPVS scheduler"
config IP_VS_RR
tristate "round-robin scheduling"
---help---
The round-robin scheduling algorithm simply directs network
connections to different real servers in a round-robin manner.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_WRR
tristate "weighted round-robin scheduling"
---help---
The weighted round-robin scheduling algorithm directs network
connections to different real servers based on server weights
in a round-robin manner. Servers with higher weights receive
new connections before those with lower weights, and get more
connections overall; servers with equal weights get an equal
share of new connections.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_LC
tristate "least-connection scheduling"
---help---
The least-connection scheduling algorithm directs network
connections to the server with the least number of active
connections.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_WLC
tristate "weighted least-connection scheduling"
---help---
The weighted least-connection scheduling algorithm directs network
connections to the server with the least active connections
normalized by the server weight.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_LBLC
tristate "locality-based least-connection scheduling"
---help---
The locality-based least-connection scheduling algorithm is for
destination IP load balancing. It is usually used in cache clusters.
This algorithm usually directs packets destined for an IP address
to its assigned server if that server is alive and not overloaded.
If the server is overloaded (its number of active connections is
larger than its weight) and there is a server at half of its load,
then the weighted least-connection server is assigned to that IP
address.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_LBLCR
tristate "locality-based least-connection with replication scheduling"
---help---
The locality-based least-connection with replication scheduling
algorithm is also for destination IP load balancing. It is
usually used in cache clusters. It differs from the LBLC scheduling
as follows: the load balancer maintains mappings from a target
to a set of server nodes that can serve the target. Requests for
a target are assigned to the least-connection node in the target's
server set. If all the nodes in the server set are overloaded,
it picks the least-connection node in the cluster and adds it
to the server set for the target. If the server set has not been
modified for the specified time, the most loaded node is removed
from the server set, in order to avoid a high degree of replication.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_DH
tristate "destination hashing scheduling"
---help---
The destination hashing scheduling algorithm assigns network
connections to the servers by looking up a statically assigned
hash table keyed on their destination IP addresses.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_SH
tristate "source hashing scheduling"
---help---
The source hashing scheduling algorithm assigns network
connections to the servers by looking up a statically assigned
hash table keyed on their source IP addresses.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_SED
tristate "shortest expected delay scheduling"
---help---
The shortest expected delay scheduling algorithm assigns network
connections to the server with the shortest expected delay. The
expected delay that a job will experience is (Ci + 1) / Ui if
sent to the i-th server, where Ci is the number of connections
on the i-th server and Ui is the fixed service rate (weight) of
the i-th server.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
config IP_VS_NQ
tristate "never queue scheduling"
---help---
The never queue scheduling algorithm adopts a two-speed model.
When there is an idle server available, the job will be sent to
the idle server, instead of waiting for a fast one. When there
is no idle server available, the job will be sent to the server
that minimizes its expected delay (the Shortest Expected Delay
scheduling algorithm).
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
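To make the SED formula and the NQ two-speed rule above concrete, here is a hedged user-space C sketch (struct and function names are illustrative, not from the patch; weights are assumed positive so the division can be replaced by integer cross-multiplication, the same trick the kernel schedulers use):

struct server {
	int conns;	/* Ci: active connections */
	int weight;	/* Ui: fixed service rate */
};

/* SED: pick the server minimizing (Ci + 1) / Ui, comparing
 * (Ci + 1) / Ui < (Cj + 1) / Uj as (Ci + 1) * Uj < (Cj + 1) * Ui. */
static int sed_pick(const struct server *s, int n)
{
	int best = 0, i;

	for (i = 1; i < n; i++)
		if ((s[i].conns + 1) * s[best].weight <
		    (s[best].conns + 1) * s[i].weight)
			best = i;
	return best;
}

/* NQ: an idle server wins immediately; otherwise fall back to SED. */
static int nq_pick(const struct server *s, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (s[i].conns == 0)
			return i;
	return sed_pick(s, n);
}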
comment "IPVS application helper"
config IP_VS_FTP
tristate "FTP protocol helper"
depends on IP_VS_PROTO_TCP
---help---
FTP is a protocol that transfers IP addresses and/or port numbers
in its payload. With virtual server via Network Address Translation,
the IP address and port number of the real servers cannot be sent
to clients in FTP connections directly, so an FTP protocol helper
is required to track the connection and mangle it back to that of
the virtual service.
If you want to compile it into the kernel, say Y. To compile it
as a module, choose M here. If unsure, say N.
endif # IP_VS

net/netfilter/ipvs/Makefile

@@ -0,0 +1,33 @@
#
# Makefile for the IPVS modules on top of IPv4.
#
# IPVS transport protocol load balancing support
ip_vs_proto-objs-y :=
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
ip_vs_est.o ip_vs_proto.o \
$(ip_vs_proto-objs-y)
# IPVS core
obj-$(CONFIG_IP_VS) += ip_vs.o
# IPVS schedulers
obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o

net/netfilter/ipvs/ip_vs_app.c

@@ -0,0 +1,622 @@
/*
* ip_vs_app.c: Application module support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
* is that the ip_vs_app module handles the reverse direction (incoming requests
* and outgoing responses).
*
* IP_MASQ_APP application masquerading module
*
* Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <net/ip_vs.h>
EXPORT_SYMBOL(register_ip_vs_app);
EXPORT_SYMBOL(unregister_ip_vs_app);
EXPORT_SYMBOL(register_ip_vs_app_inc);
/* ipvs application list head */
static LIST_HEAD(ip_vs_app_list);
static DEFINE_MUTEX(__ip_vs_app_mutex);
/*
* Get an ip_vs_app object
*/
static inline int ip_vs_app_get(struct ip_vs_app *app)
{
return try_module_get(app->module);
}
static inline void ip_vs_app_put(struct ip_vs_app *app)
{
module_put(app->module);
}
/*
* Allocate/initialize app incarnation and register it in proto apps.
*/
static int
ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
{
struct ip_vs_protocol *pp;
struct ip_vs_app *inc;
int ret;
if (!(pp = ip_vs_proto_get(proto)))
return -EPROTONOSUPPORT;
if (!pp->unregister_app)
return -EOPNOTSUPP;
inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
if (!inc)
return -ENOMEM;
INIT_LIST_HEAD(&inc->p_list);
INIT_LIST_HEAD(&inc->incs_list);
inc->app = app;
inc->port = htons(port);
atomic_set(&inc->usecnt, 0);
if (app->timeouts) {
inc->timeout_table =
ip_vs_create_timeout_table(app->timeouts,
app->timeouts_size);
if (!inc->timeout_table) {
ret = -ENOMEM;
goto out;
}
}
ret = pp->register_app(inc);
if (ret)
goto out;
list_add(&inc->a_list, &app->incs_list);
IP_VS_DBG(9, "%s application %s:%u registered\n",
pp->name, inc->name, inc->port);
return 0;
out:
kfree(inc->timeout_table);
kfree(inc);
return ret;
}
/*
* Release app incarnation
*/
static void
ip_vs_app_inc_release(struct ip_vs_app *inc)
{
struct ip_vs_protocol *pp;
if (!(pp = ip_vs_proto_get(inc->protocol)))
return;
if (pp->unregister_app)
pp->unregister_app(inc);
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
pp->name, inc->name, inc->port);
list_del(&inc->a_list);
kfree(inc->timeout_table);
kfree(inc);
}
/*
* Get reference to app inc (only called from softirq)
*
*/
int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
atomic_inc(&inc->usecnt);
if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
atomic_dec(&inc->usecnt);
return result;
}
/*
* Put the app inc (only called from timer or net softirq)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
}
/*
* Register an application incarnation in protocol applications
*/
int
register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
{
int result;
mutex_lock(&__ip_vs_app_mutex);
result = ip_vs_app_inc_new(app, proto, port);
mutex_unlock(&__ip_vs_app_mutex);
return result;
}
/*
* ip_vs_app registration routine
*/
int register_ip_vs_app(struct ip_vs_app *app)
{
/* increase the module use count */
ip_vs_use_count_inc();
mutex_lock(&__ip_vs_app_mutex);
list_add(&app->a_list, &ip_vs_app_list);
mutex_unlock(&__ip_vs_app_mutex);
return 0;
}
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
*/
void unregister_ip_vs_app(struct ip_vs_app *app)
{
struct ip_vs_app *inc, *nxt;
mutex_lock(&__ip_vs_app_mutex);
list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
ip_vs_app_inc_release(inc);
}
list_del(&app->a_list);
mutex_unlock(&__ip_vs_app_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
}
/*
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
*/
int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
{
return pp->app_conn_bind(cp);
}
/*
* Unbind cp from application incarnation (called by cp destructor)
*/
void ip_vs_unbind_app(struct ip_vs_conn *cp)
{
struct ip_vs_app *inc = cp->app;
if (!inc)
return;
if (inc->unbind_conn)
inc->unbind_conn(inc, cp);
if (inc->done_conn)
inc->done_conn(inc, cp);
ip_vs_app_inc_put(inc);
cp->app = NULL;
}
/*
* Fixes th->seq based on ip_vs_seq info.
*/
static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
{
__u32 seq = ntohl(th->seq);
/*
* Adjust seq with delta-offset for all packets after
* the most recent resized pkt seq and with previous_delta offset
* for all packets before most recent resized pkt seq.
*/
if (vseq->delta || vseq->previous_delta) {
if(after(seq, vseq->init_seq)) {
th->seq = htonl(seq + vseq->delta);
IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
vseq->delta);
} else {
th->seq = htonl(seq + vseq->previous_delta);
IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
"(%d) to seq\n", vseq->previous_delta);
}
}
}
/*
* Fixes th->ack_seq based on ip_vs_seq info.
*/
static inline void
vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
{
__u32 ack_seq = ntohl(th->ack_seq);
/*
* Adjust ack_seq with delta-offset for
* the packets AFTER most recent resized pkt has caused a shift
* for packets before most recent resized pkt, use previous_delta
*/
if (vseq->delta || vseq->previous_delta) {
/* ack_seq is the sequence number of the next octet the sender
   expects to receive, so compare it with init_seq+delta */
if(after(ack_seq, vseq->init_seq+vseq->delta)) {
th->ack_seq = htonl(ack_seq - vseq->delta);
IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
"(%d) from ack_seq\n", vseq->delta);
} else {
th->ack_seq = htonl(ack_seq - vseq->previous_delta);
IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
"previous_delta (%d) from ack_seq\n",
vseq->previous_delta);
}
}
}
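/*
 * A worked example of the bookkeeping above (the numbers are
 * illustrative, not from the patch): suppose an application helper
 * rewrite grows a payload by 3 bytes at sequence number S.
 * vs_seq_update() then records init_seq = S and delta = 3 (the old
 * delta moves to previous_delta).  From then on vs_fix_seq() adds 3
 * to th->seq of every packet after S, while vs_fix_ack_seq()
 * subtracts 3 from th->ack_seq of ACKs covering data past S, so both
 * endpoints keep seeing a consistent sequence space:
 *
 *	seq  > S            : th->seq     += delta           (= 3)
 *	seq <= S            : th->seq     += previous_delta
 *	ack_seq > S + delta : th->ack_seq -= delta
 *	ack_seq otherwise   : th->ack_seq -= previous_delta
 */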
/*
* Updates ip_vs_seq if pkt has been resized
* Assumes already checked proto==IPPROTO_TCP and diff!=0.
*/
static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
unsigned flag, __u32 seq, int diff)
{
/* spinlock is to keep updating cp->flags atomic */
spin_lock(&cp->lock);
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
vseq->previous_delta = vseq->delta;
vseq->delta += diff;
vseq->init_seq = seq;
cp->flags |= flag;
}
spin_unlock(&cp->lock);
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
struct ip_vs_app *app)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
struct tcphdr *th;
__u32 seq;
if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
/*
* Remember seq number in case this pkt gets resized
*/
seq = ntohl(th->seq);
/*
* Fix seq stuff if flagged as so.
*/
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
vs_fix_seq(&cp->out_seq, th);
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
vs_fix_ack_seq(&cp->in_seq, th);
/*
* Call private output hook function
*/
if (app->pkt_out == NULL)
return 1;
if (!app->pkt_out(app, cp, skb, &diff))
return 0;
/*
* Update ip_vs seq stuff if len has changed.
*/
if (diff != 0)
vs_seq_update(cp, &cp->out_seq,
IP_VS_CONN_F_OUT_SEQ, seq, diff);
return 1;
}
/*
* Output pkt hook. Will call the bound ip_vs_app specific function.
* Called by the ipvs packet handler; assumes cp!=NULL was checked
* previously. Returns false if it can't handle the packet (OOM).
*/
int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_app *app;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 1;
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
return app_tcp_pkt_out(cp, skb, app);
/*
* Call private output hook function
*/
if (app->pkt_out == NULL)
return 1;
return app->pkt_out(app, cp, skb, NULL);
}
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
struct ip_vs_app *app)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
struct tcphdr *th;
__u32 seq;
if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
/*
* Remember seq number in case this pkt gets resized
*/
seq = ntohl(th->seq);
/*
* Fix seq stuff if flagged as so.
*/
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
vs_fix_seq(&cp->in_seq, th);
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
vs_fix_ack_seq(&cp->out_seq, th);
/*
* Call private input hook function
*/
if (app->pkt_in == NULL)
return 1;
if (!app->pkt_in(app, cp, skb, &diff))
return 0;
/*
* Update ip_vs seq stuff if len has changed.
*/
if (diff != 0)
vs_seq_update(cp, &cp->in_seq,
IP_VS_CONN_F_IN_SEQ, seq, diff);
return 1;
}
/*
* Input pkt hook. Will call the bound ip_vs_app specific function.
* Called by the ipvs packet handler; assumes cp!=NULL was checked
* previously. Returns false if it can't handle the packet (OOM).
*/
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
{
struct ip_vs_app *app;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 1;
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
return app_tcp_pkt_in(cp, skb, app);
/*
* Call private input hook function
*/
if (app->pkt_in == NULL)
return 1;
return app->pkt_in(app, cp, skb, NULL);
}
#ifdef CONFIG_PROC_FS
/*
* /proc/net/ip_vs_app entry function
*/
static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
{
struct ip_vs_app *app, *inc;
list_for_each_entry(app, &ip_vs_app_list, a_list) {
list_for_each_entry(inc, &app->incs_list, a_list) {
if (pos-- == 0)
return inc;
}
}
return NULL;
}
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
{
mutex_lock(&__ip_vs_app_mutex);
return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_app *inc, *app;
struct list_head *e;
++*pos;
if (v == SEQ_START_TOKEN)
return ip_vs_app_idx(0);
inc = v;
app = inc->app;
if ((e = inc->a_list.next) != &app->incs_list)
return list_entry(e, struct ip_vs_app, a_list);
/* go on to next application */
for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
app = list_entry(e, struct ip_vs_app, a_list);
list_for_each_entry(inc, &app->incs_list, a_list) {
return inc;
}
}
return NULL;
}
static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
{
mutex_unlock(&__ip_vs_app_mutex);
}
static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
seq_puts(seq, "prot port usecnt name\n");
else {
const struct ip_vs_app *inc = v;
seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
ip_vs_proto_name(inc->protocol),
ntohs(inc->port),
atomic_read(&inc->usecnt),
inc->name);
}
return 0;
}
static const struct seq_operations ip_vs_app_seq_ops = {
.start = ip_vs_app_seq_start,
.next = ip_vs_app_seq_next,
.stop = ip_vs_app_seq_stop,
.show = ip_vs_app_seq_show,
};
static int ip_vs_app_open(struct inode *inode, struct file *file)
{
return seq_open(file, &ip_vs_app_seq_ops);
}
static const struct file_operations ip_vs_app_fops = {
.owner = THIS_MODULE,
.open = ip_vs_app_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
#endif
/*
* Replace a segment of data with a new segment
*/
int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
char *o_buf, int o_len, char *n_buf, int n_len)
{
int diff;
int o_offset;
int o_left;
EnterFunction(9);
diff = n_len - o_len;
o_offset = o_buf - (char *)skb->data;
/* The length of left data after o_buf+o_len in the skb data */
o_left = skb->len - (o_offset + o_len);
if (diff <= 0) {
memmove(o_buf + n_len, o_buf + o_len, o_left);
memcpy(o_buf, n_buf, n_len);
skb_trim(skb, skb->len + diff);
} else if (diff <= skb_tailroom(skb)) {
skb_put(skb, diff);
memmove(o_buf + n_len, o_buf + o_len, o_left);
memcpy(o_buf, n_buf, n_len);
} else {
if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
return -ENOMEM;
skb_put(skb, diff);
memmove(skb->data + o_offset + n_len,
skb->data + o_offset + o_len, o_left);
skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
}
/* must update the iph total length here */
ip_hdr(skb)->tot_len = htons(skb->len);
LeaveFunction(9);
return 0;
}
int __init ip_vs_app_init(void)
{
/* we will replace it with proc_net_ipvs_create() soon */
proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
return 0;
}
void ip_vs_app_cleanup(void)
{
proc_net_remove(&init_net, "ip_vs_app");
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

net/netfilter/ipvs/ip_vs_dh.c

@@ -0,0 +1,261 @@
/*
* IPVS: Destination Hashing scheduling module
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* Inspired by the consistent hashing scheduler patch from
* Thomas Proell <proellt@gmx.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The dh algorithm is to select server by the hash key of destination IP
* address. The pseudo code is as follows:
*
* n <- servernode[dest_ip];
* if (n is dead) OR
* (n is overloaded) OR (n.weight <= 0) then
* return NULL;
*
* return n;
*
* Note that servernode is a 256-bucket hash table that maps the hash
* index derived from the packet destination IP address to the current
* server array. If the dh scheduler is used in a cache cluster, it is
* good to combine it with the cache_bypass feature. When the statically
* assigned server is dead or overloaded, the load balancer can bypass
* the cache server and send requests to the original server directly.
*
*/
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/ip_vs.h>
/*
* IPVS DH bucket
*/
struct ip_vs_dh_bucket {
struct ip_vs_dest *dest; /* real server (cache) */
};
/*
* for IPVS DH entry hash table
*/
#ifndef CONFIG_IP_VS_DH_TAB_BITS
#define CONFIG_IP_VS_DH_TAB_BITS 8
#endif
#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
/*
* Returns hash value for IPVS DH entry
*/
static inline unsigned ip_vs_dh_hashkey(__be32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
}
/*
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr)
{
return (tbl[ip_vs_dh_hashkey(addr)]).dest;
}
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
{
int i;
struct ip_vs_dh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
b = tbl;
p = &svc->destinations;
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
if (list_empty(p)) {
b->dest = NULL;
} else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
atomic_inc(&dest->refcnt);
b->dest = dest;
p = p->next;
}
b++;
}
return 0;
}
/*
* Flush all the hash buckets of the specified table.
*/
static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
{
int i;
struct ip_vs_dh_bucket *b;
b = tbl;
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
if (b->dest) {
atomic_dec(&b->dest->refcnt);
b->dest = NULL;
}
b++;
}
}
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_dh_bucket *tbl;
/* allocate the DH table for this service */
tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
/* assign the hash buckets with the updated service */
ip_vs_dh_assign(tbl, svc);
return 0;
}
static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_dh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_dh_flush(tbl);
/* release the table itself */
kfree(svc->sched_data);
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
return 0;
}
static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
{
struct ip_vs_dh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_dh_flush(tbl);
/* assign the hash buckets with the updated service */
ip_vs_dh_assign(tbl, svc);
return 0;
}
/*
* If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
* consider that the server is overloaded here.
*/
static inline int is_overloaded(struct ip_vs_dest *dest)
{
return dest->flags & IP_VS_DEST_F_OVERLOAD;
}
/*
* Destination hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_dh_bucket *tbl;
struct iphdr *iph = ip_hdr(skb);
IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
dest = ip_vs_dh_get(tbl, iph->daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
return NULL;
}
IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(iph->daddr),
NIPQUAD(dest->addr.ip),
ntohs(dest->port));
return dest;
}
/*
* IPVS DH Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_dh_scheduler =
{
.name = "dh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 0,
#endif
.init_service = ip_vs_dh_init_svc,
.done_service = ip_vs_dh_done_svc,
.update_service = ip_vs_dh_update_svc,
.schedule = ip_vs_dh_schedule,
};
static int __init ip_vs_dh_init(void)
{
return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
}
static void __exit ip_vs_dh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
}
module_init(ip_vs_dh_init);
module_exit(ip_vs_dh_cleanup);
MODULE_LICENSE("GPL");

net/netfilter/ipvs/ip_vs_est.c

@@ -0,0 +1,166 @@
/*
* ip_vs_est.c: simple rate estimator for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <net/ip_vs.h>
/*
This code estimates the rate over a short interval (such as 8
seconds) for virtual services and real servers. For measuring the
rate over a long interval, it is easy to implement a user-level
daemon which periodically reads those statistical counters and
measures the rate. Currently, the measurement is activated by a
slow timer handler; hopefully it does not introduce too much load.
We measure the rate during the last 8 seconds every 2 seconds:
avgrate = avgrate*(1-W) + rate*W
where W = 2^(-2)
NOTES.
* The stored value for average bps is scaled by 2^5, so that the
maximal rate is ~2.15Gbits/s; average pps and cps are scaled by 2^10.
* A lot of code is taken from net/sched/estimator.c
*/
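/*
 * A hypothetical user-space sketch (the name is illustrative, not
 * from the patch) of the fixed-point EWMA used below.  With W = 2^-2
 * the update avgrate = avgrate*(1-W) + rate*W reduces to
 * avg += (rate - avg) >> 2, as long as both values stay in the same
 * scaled domain (2^10 for cps/pps, 2^5 for bps):
 */
static inline long ewma_update(long avg_scaled, long rate_scaled)
{
	/* avg += (rate - avg) / 4, all in the scaled domain */
	return avg_scaled + ((rate_scaled - avg_scaled) >> 2);
}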
static void estimation_timer(unsigned long arg);
static LIST_HEAD(est_list);
static DEFINE_SPINLOCK(est_lock);
static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
static void estimation_timer(unsigned long arg)
{
struct ip_vs_estimator *e;
struct ip_vs_stats *s;
u32 n_conns;
u32 n_inpkts, n_outpkts;
u64 n_inbytes, n_outbytes;
u32 rate;
spin_lock(&est_lock);
list_for_each_entry(e, &est_list, list) {
s = container_of(e, struct ip_vs_stats, est);
spin_lock(&s->lock);
n_conns = s->ustats.conns;
n_inpkts = s->ustats.inpkts;
n_outpkts = s->ustats.outpkts;
n_inbytes = s->ustats.inbytes;
n_outbytes = s->ustats.outbytes;
/* scaled by 2^10, but divided 2 seconds */
rate = (n_conns - e->last_conns)<<9;
e->last_conns = n_conns;
e->cps += ((long)rate - (long)e->cps)>>2;
s->ustats.cps = (e->cps+0x1FF)>>10;
rate = (n_inpkts - e->last_inpkts)<<9;
e->last_inpkts = n_inpkts;
e->inpps += ((long)rate - (long)e->inpps)>>2;
s->ustats.inpps = (e->inpps+0x1FF)>>10;
rate = (n_outpkts - e->last_outpkts)<<9;
e->last_outpkts = n_outpkts;
e->outpps += ((long)rate - (long)e->outpps)>>2;
s->ustats.outpps = (e->outpps+0x1FF)>>10;
rate = (n_inbytes - e->last_inbytes)<<4;
e->last_inbytes = n_inbytes;
e->inbps += ((long)rate - (long)e->inbps)>>2;
s->ustats.inbps = (e->inbps+0xF)>>5;
rate = (n_outbytes - e->last_outbytes)<<4;
e->last_outbytes = n_outbytes;
e->outbps += ((long)rate - (long)e->outbps)>>2;
s->ustats.outbps = (e->outbps+0xF)>>5;
spin_unlock(&s->lock);
}
spin_unlock(&est_lock);
mod_timer(&est_timer, jiffies + 2*HZ);
}
void ip_vs_new_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
INIT_LIST_HEAD(&est->list);
est->last_conns = stats->ustats.conns;
est->cps = stats->ustats.cps<<10;
est->last_inpkts = stats->ustats.inpkts;
est->inpps = stats->ustats.inpps<<10;
est->last_outpkts = stats->ustats.outpkts;
est->outpps = stats->ustats.outpps<<10;
est->last_inbytes = stats->ustats.inbytes;
est->inbps = stats->ustats.inbps<<5;
est->last_outbytes = stats->ustats.outbytes;
est->outbps = stats->ustats.outbps<<5;
spin_lock_bh(&est_lock);
list_add(&est->list, &est_list);
spin_unlock_bh(&est_lock);
}
void ip_vs_kill_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
spin_lock_bh(&est_lock);
list_del(&est->list);
spin_unlock_bh(&est_lock);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
/* set counters zero, caller must hold the stats->lock lock */
est->last_inbytes = 0;
est->last_outbytes = 0;
est->last_conns = 0;
est->last_inpkts = 0;
est->last_outpkts = 0;
est->cps = 0;
est->inpps = 0;
est->outpps = 0;
est->inbps = 0;
est->outbps = 0;
}
int __init ip_vs_estimator_init(void)
{
mod_timer(&est_timer, jiffies + 2 * HZ);
return 0;
}
void ip_vs_estimator_cleanup(void)
{
del_timer_sync(&est_timer);
}

net/netfilter/ipvs/ip_vs_ftp.c

@@ -0,0 +1,410 @@
/*
* ip_vs_ftp.c: IPVS ftp application module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* Changes:
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
* is that the ip_vs_ftp module handles the reverse direction of ip_masq_ftp.
*
* IP_MASQ_FTP ftp masquerading module
*
* Version: @(#)ip_masq_ftp.c 0.04 02/05/96
*
* Author: Wouter Gadeyne
*
*/
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <asm/unaligned.h>
#include <net/ip_vs.h>
#define SERVER_STRING "227 Entering Passive Mode ("
#define CLIENT_STRING "PORT "
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
*/
static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
module_param_array(ports, ushort, NULL, 0);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
/* Dummy variable */
static int ip_vs_ftp_pasv;
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
return 0;
}
static int
ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
return 0;
}
/*
* Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", starting
* with the "pattern" and terminated with the "term" character.
* <addr,port> is in network order.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
const char *pattern, size_t plen, char term,
__be32 *addr, __be16 *port,
char **start, char **end)
{
unsigned char p[6];
int i = 0;
if (data_limit - data < plen) {
/* check if there is partial match */
if (strnicmp(data, pattern, data_limit - data) == 0)
return -1;
else
return 0;
}
if (strnicmp(data, pattern, plen) != 0) {
return 0;
}
*start = data + plen;
for (data = *start; *data != term; data++) {
if (data == data_limit)
return -1;
}
*end = data;
memset(p, 0, sizeof(p));
for (data = *start; data != *end; data++) {
if (*data >= '0' && *data <= '9') {
p[i] = p[i]*10 + *data - '0';
} else if (*data == ',' && i < 5) {
i++;
} else {
/* unexpected character */
return -1;
}
}
if (i != 5)
return -1;
*addr = get_unaligned((__be32 *)p);
*port = get_unaligned((__be16 *)(p + 4));
return 1;
}
/*
* Look at outgoing ftp packets to catch the response to a PASV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
* client port 0 (unknown at the moment), the server address and the
* server port. Mark the current connection entry as a control channel
* of the new entry. All this work is just so that the data
* connection can be scheduled to the right server later.
*
* The outgoing packet should be something like
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
*/
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct sk_buff *skb, int *diff)
{
struct iphdr *iph;
struct tcphdr *th;
char *data, *data_limit;
char *start, *end;
union nf_inet_addr from;
__be16 port;
struct ip_vs_conn *n_cp;
char buf[24]; /* xxx,xxx,xxx,xxx,ppp,ppp\000 */
unsigned buf_len;
int ret;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
* so turn this into a no-op for IPv6 packets
*/
if (cp->af == AF_INET6)
return 1;
#endif
*diff = 0;
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
/* Linear packets are much easier to deal with. */
if (!skb_make_writable(skb, skb->len))
return 0;
if (cp->app_data == &ip_vs_ftp_pasv) {
iph = ip_hdr(skb);
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
data = (char *)th + (th->doff << 2);
data_limit = skb_tail_pointer(skb);
if (ip_vs_ftp_get_addrport(data, data_limit,
SERVER_STRING,
sizeof(SERVER_STRING)-1, ')',
&from.ip, &port,
&start, &end) != 1)
return 1;
IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
"%u.%u.%u.%u:%d detected\n",
NIPQUAD(from.ip), ntohs(port),
NIPQUAD(cp->caddr.ip), 0);
/*
* Now update or create a connection entry for it
*/
n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
&cp->caddr, 0);
if (!n_cp) {
n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
&cp->caddr, 0,
&cp->vaddr, port,
&from, port,
IP_VS_CONN_F_NO_CPORT,
cp->dest);
if (!n_cp)
return 0;
/* add its controller */
ip_vs_control_add(n_cp, cp);
}
/*
* Replace the old passive address with the new one
*/
from.ip = n_cp->vaddr.ip;
port = n_cp->vport;
sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
(ntohs(port)>>8)&255, ntohs(port)&255);
buf_len = strlen(buf);
/*
* Calculate required delta-offset to keep TCP happy
*/
*diff = buf_len - (end-start);
if (*diff == 0) {
/* simply replace it with new passive address */
memcpy(start, buf, buf_len);
ret = 1;
} else {
ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
end-start, buf, buf_len);
}
cp->app_data = NULL;
ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return ret;
}
return 1;
}
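/*
 * A hypothetical user-space sketch (not part of the patch, assumes
 * <stdio.h>) of the "h1,h2,h3,h4,p1,p2" encoding handled above,
 * where the port is transferred as p1*256 + p2:
 */
static void print_ftp_addr(unsigned char a, unsigned char b,
			   unsigned char c, unsigned char d,
			   unsigned short port)
{
	/* 192.168.0.10:20000 -> "192,168,0,10,78,32" (20000 = 78*256 + 32) */
	printf("%u,%u,%u,%u,%u,%u\n", a, b, c, d, port >> 8, port & 255);
}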
/*
* Look at incoming ftp packets to catch the PASV/PORT command
* (outside-to-inside).
*
* The incoming packet having the PORT command should be something like
* "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
* xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
* In this case, we create a connection entry using the client address and
* port, so that the active ftp data connection from the server can reach
* the client.
*/
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct sk_buff *skb, int *diff)
{
struct iphdr *iph;
struct tcphdr *th;
char *data, *data_start, *data_limit;
char *start, *end;
union nf_inet_addr to;
__be16 port;
struct ip_vs_conn *n_cp;
#ifdef CONFIG_IP_VS_IPV6
/* This application helper doesn't work with IPv6 yet,
* so turn this into a no-op for IPv6 packets
*/
if (cp->af == AF_INET6)
return 1;
#endif
/* no diff required for incoming packets */
*diff = 0;
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
/* Linear packets are much easier to deal with. */
if (!skb_make_writable(skb, skb->len))
return 0;
/*
* Detecting whether it is passive
*/
iph = ip_hdr(skb);
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/* Since there may be options in the TCP header and HLEN (doff) is
   the header length in 32-bit words, the data address is accurately
   computed as th + HLEN*4 */
data = data_start = (char *)th + (th->doff << 2);
data_limit = skb_tail_pointer(skb);
while (data <= data_limit - 6) {
if (strnicmp(data, "PASV\r\n", 6) == 0) {
/* Passive mode on */
IP_VS_DBG(7, "got PASV at %td of %td\n",
data - data_start,
data_limit - data_start);
cp->app_data = &ip_vs_ftp_pasv;
return 1;
}
data++;
}
/*
* To support a virtual FTP server, the scenario is as follows:
* FTP client ----> Load Balancer ----> FTP server
* First detect the port number in the application data,
* then create a new connection entry for the coming data
* connection.
*/
if (ip_vs_ftp_get_addrport(data_start, data_limit,
CLIENT_STRING, sizeof(CLIENT_STRING)-1,
'\r', &to.ip, &port,
&start, &end) != 1)
return 1;
IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
NIPQUAD(to.ip), ntohs(port));
/* Passive mode off */
cp->app_data = NULL;
/*
* Now update or create a connection entry for it
*/
IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
ip_vs_proto_name(iph->protocol),
NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
&to, port,
&cp->vaddr, htons(ntohs(cp->vport)-1));
if (!n_cp) {
n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
&to, port,
&cp->vaddr, htons(ntohs(cp->vport)-1),
&cp->daddr, htons(ntohs(cp->dport)-1),
0,
cp->dest);
if (!n_cp)
return 0;
/* add its controller */
ip_vs_control_add(n_cp, cp);
}
/*
* Move tunnel to listen state
*/
ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return 1;
}
static struct ip_vs_app ip_vs_ftp = {
.name = "ftp",
.type = IP_VS_APP_TYPE_FTP,
.protocol = IPPROTO_TCP,
.module = THIS_MODULE,
.incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
.init_conn = ip_vs_ftp_init_conn,
.done_conn = ip_vs_ftp_done_conn,
.bind_conn = NULL,
.unbind_conn = NULL,
.pkt_out = ip_vs_ftp_out,
.pkt_in = ip_vs_ftp_in,
};
/*
* ip_vs_ftp initialization
*/
static int __init ip_vs_ftp_init(void)
{
int i, ret;
struct ip_vs_app *app = &ip_vs_ftp;
ret = register_ip_vs_app(app);
if (ret)
return ret;
for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
if (!ports[i])
continue;
ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
if (ret)
break;
IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
app->name, i, ports[i]);
}
if (ret)
unregister_ip_vs_app(app);
return ret;
}
/*
* ip_vs_ftp finish.
*/
static void __exit ip_vs_ftp_exit(void)
{
unregister_ip_vs_app(&ip_vs_ftp);
}
module_init(ip_vs_ftp_init);
module_exit(ip_vs_ftp_exit);
MODULE_LICENSE("GPL");

net/netfilter/ipvs/ip_vs_lblc.c

@@ -0,0 +1,555 @@
/*
* IPVS: Locality-Based Least-Connection scheduling module
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Martin Hamilton : fixed the terrible locking bugs
* *lock(tbl->lock) ==> *lock(&tbl->lock)
* Wensong Zhang : fixed the uninitialized tbl->lock bug
* Wensong Zhang : added doing full expiration check to
* collect stale entries of 24+ hours when
* no partial expire check in a half hour
* Julian Anastasov : replaced del_timer call with del_timer_sync
* to avoid the possible race between timer
* handler and del_timer thread in SMP
*
*/
/*
* The lblc algorithm is as follows (pseudo code):
*
* if cachenode[dest_ip] is null then
* n, cachenode[dest_ip] <- {weighted least-conn node};
* else
* n <- cachenode[dest_ip];
* if (n is dead) OR
* (n.conns>n.weight AND
* there is a node m with m.conns<m.weight/2) then
* n, cachenode[dest_ip] <- {weighted least-conn node};
*
* return n;
*
* Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
* me to write this module.
*/
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <net/ip_vs.h>
/*
* It is for garbage collection of stale IPVS lblc entries,
* when the table is full.
*/
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
* in a half hour, do a full expiration check to collect stale
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
/*
* for IPVS lblc entry hash table
*/
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS 10
#endif
#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
/*
* IPVS lblc entry represents an association between destination
* IP address and its destination server
*/
struct ip_vs_lblc_entry {
struct list_head list;
__be32 addr; /* destination IP address */
struct ip_vs_dest *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
};
/*
* IPVS lblc hash table
*/
struct ip_vs_lblc_table {
struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
};
/*
* IPVS LBLC sysctl table
*/
static ctl_table vs_vars_table[] = {
{
.procname = "lblc_expiration",
.data = &sysctl_ip_vs_lblc_expiration,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_jiffies,
},
{ .ctl_name = 0 }
};
static struct ctl_table_header * sysctl_header;
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
list_del(&en->list);
/*
* We don't kfree dest because it is referred to either by its
* service or by the trash dest list.
*/
atomic_dec(&en->dest->refcnt);
kfree(en);
}
/*
* Returns hash value for IPVS LBLC entry
*/
static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}
/*
* Hash an entry into the ip_vs_lblc_table.
*/
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
unsigned hash = ip_vs_lblc_hashkey(en->addr);
list_add(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
/*
* Get ip_vs_lblc_entry associated with supplied parameters. Called under read
* lock
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
{
unsigned hash = ip_vs_lblc_hashkey(addr);
struct ip_vs_lblc_entry *en;
list_for_each_entry(en, &tbl->bucket[hash], list)
if (en->addr == addr)
return en;
return NULL;
}
/*
* Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
* address to a server. Called under write lock.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
struct ip_vs_dest *dest)
{
struct ip_vs_lblc_entry *en;
en = ip_vs_lblc_get(tbl, daddr);
if (!en) {
en = kmalloc(sizeof(*en), GFP_ATOMIC);
if (!en) {
IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
return NULL;
}
en->addr = daddr;
en->lastuse = jiffies;
atomic_inc(&dest->refcnt);
en->dest = dest;
ip_vs_lblc_hash(tbl, en);
} else if (en->dest != dest) {
atomic_dec(&en->dest->refcnt);
atomic_inc(&dest->refcnt);
en->dest = dest;
}
return en;
}
/*
* Flush all the entries of the specified table.
*/
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
struct ip_vs_lblc_entry *en, *nxt;
int i;
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
}
}
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
struct ip_vs_lblc_entry *en, *nxt;
unsigned long now = jiffies;
int i, j;
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now,
en->lastuse + sysctl_ip_vs_lblc_expiration))
continue;
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
write_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
/*
* Periodical timer handler for IPVS lblc table
* It is used to collect stale entries when the number of entries
* exceeds the maximum size of the table.
*
* Fixme: we probably need more complicated algorithm to collect
* entries that have not been used for a long time even
* if the number of entries doesn't exceed the maximum size
* of the table.
* The full expiration check is for this purpose now.
*/
static void ip_vs_lblc_check_expire(unsigned long data)
{
struct ip_vs_service *svc = (struct ip_vs_service *) data;
struct ip_vs_lblc_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int goal;
int i, j;
struct ip_vs_lblc_entry *en, *nxt;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
ip_vs_lblc_full_check(svc);
tbl->counter = 1;
goto out;
}
if (atomic_read(&tbl->entries) <= tbl->max_size) {
tbl->counter++;
goto out;
}
goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
continue;
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
goal--;
}
write_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
tbl->rover = j;
out:
mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
int i;
struct ip_vs_lblc_table *tbl;
/*
* Allocate the ip_vs_lblc_table for this service
*/
tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
"current service\n", sizeof(*tbl));
/*
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
INIT_LIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
/*
* Hook periodic timer for garbage collection
*/
setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
(unsigned long)svc);
mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
return 0;
}
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
/* remove periodic timer */
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblc_flush(tbl);
/* release the table itself */
kfree(tbl);
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
return 0;
}
static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest, *least;
int loh, doh;
/*
* We think the overhead of processing active connections is fifty
* times higher than that of inactive connections on average. (This
* factor of fifty may not be accurate; we will tune it later.) We
* use the following formula to estimate the overhead:
* dest->activeconns*50 + dest->inactconns
* and the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
list_for_each_entry(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr.ip), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
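/*
 * A worked example of the integer comparison above (the numbers are
 * illustrative): with overheads h1 = 120, h2 = 200 and weights
 * w1 = 2, w2 = 3, comparing the loads h1/w1 = 60 and h2/w2 = 66.7
 * without floating point is done as h1*w2 = 360 versus h2*w1 = 400;
 * since 360 < 400, the first server is less loaded, matching
 * 60 < 66.7.
 */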
/*
* If this destination server is overloaded and there is a less loaded
* server, then return true.
*/
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
list_for_each_entry(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
}
}
}
return 0;
}
/*
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
struct iphdr *iph = ip_hdr(skb);
struct ip_vs_dest *dest = NULL;
struct ip_vs_lblc_entry *en;
IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
/* First look in our cache */
read_lock(&svc->sched_lock);
en = ip_vs_lblc_get(tbl, iph->daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
/*
* If the destination is not available, i.e. it's in the trash,
* we must ignore it, as it may be removed from under our feet,
* if someone drops our reference count. Our caller only makes
* sure that destinations, that are not in the trash, are not
* moved to the trash, while we are scheduling. But anyone can
* free up entries from the trash at any time.
*/
if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
dest = en->dest;
}
read_unlock(&svc->sched_lock);
/* If the destination has a weight and is not overloaded, use it */
if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
goto out;
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc, iph);
if (!dest) {
IP_VS_DBG(1, "no destination available\n");
return NULL;
}
/* If we fail to create a cache entry, we'll just use the valid dest */
write_lock(&svc->sched_lock);
ip_vs_lblc_new(tbl, iph->daddr, dest);
write_unlock(&svc->sched_lock);
out:
IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(iph->daddr),
NIPQUAD(dest->addr.ip),
ntohs(dest->port));
return dest;
}
/*
* IPVS LBLC Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
.name = "lblc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 0,
#endif
.init_service = ip_vs_lblc_init_svc,
.done_service = ip_vs_lblc_done_svc,
.schedule = ip_vs_lblc_schedule,
};
static int __init ip_vs_lblc_init(void)
{
int ret;
sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
if (ret)
unregister_sysctl_table(sysctl_header);
return ret;
}
static void __exit ip_vs_lblc_cleanup(void)
{
unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");

net/netfilter/ipvs/ip_vs_lblcr.c

@@ -0,0 +1,755 @@
/*
* IPVS: Locality-Based Least-Connection with Replication scheduler
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Julian Anastasov : Added the missing (dest->weight>0)
* condition in the ip_vs_dest_set_max.
*
*/
/*
* The lblc/r algorithm is as follows (pseudo code):
*
* if serverSet[dest_ip] is null then
* n, serverSet[dest_ip] <- {weighted least-conn node};
* else
* n <- {least-conn (alive) node in serverSet[dest_ip]};
* if (n is null) OR
* (n.conns>n.weight AND
* there is a node m with m.conns<m.weight/2) then
* n <- {weighted least-conn node};
* add n to serverSet[dest_ip];
* if |serverSet[dest_ip]| > 1 AND
* now - serverSet[dest_ip].lastMod > T then
* m <- {most conn node in serverSet[dest_ip]};
* remove m from serverSet[dest_ip];
* if serverSet[dest_ip] changed then
* serverSet[dest_ip].lastMod <- now;
*
* return n;
*
*/
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <net/net_namespace.h>
#include <net/ip_vs.h>
/*
* It is for garbage collection of stale IPVS lblcr entries,
* when the table is full.
*/
#define CHECK_EXPIRE_INTERVAL (60*HZ)
#define ENTRY_TIMEOUT (6*60*HZ)
/*
* It is for full expiration check.
* When there is no partial expiration check (garbage collection)
* in a half hour, do a full expiration check to collect stale
* entries that haven't been touched for a day.
*/
#define COUNT_FOR_FULL_EXPIRATION 30
static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
/*
* for IPVS lblcr entry hash table
*/
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
#endif
#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
/*
* IPVS destination set structure and operations
*/
struct ip_vs_dest_list {
struct ip_vs_dest_list *next; /* list link */
struct ip_vs_dest *dest; /* destination server */
};
struct ip_vs_dest_set {
atomic_t size; /* set size */
unsigned long lastmod; /* last modified time */
struct ip_vs_dest_list *list; /* destination list */
rwlock_t lock; /* lock for this list */
};
static struct ip_vs_dest_list *
ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_list *e;
for (e=set->list; e!=NULL; e=e->next) {
if (e->dest == dest)
			/* already in the set */
return NULL;
}
e = kmalloc(sizeof(*e), GFP_ATOMIC);
if (e == NULL) {
IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
return NULL;
}
atomic_inc(&dest->refcnt);
e->dest = dest;
/* link it to the list */
e->next = set->list;
set->list = e;
atomic_inc(&set->size);
set->lastmod = jiffies;
return e;
}
static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
struct ip_vs_dest_list *e, **ep;
for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
if (e->dest == dest) {
/* HIT */
*ep = e->next;
atomic_dec(&set->size);
set->lastmod = jiffies;
atomic_dec(&e->dest->refcnt);
kfree(e);
break;
}
ep = &e->next;
}
}
static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
struct ip_vs_dest_list *e, **ep;
write_lock(&set->lock);
for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
*ep = e->next;
/*
		 * We don't kfree dest because it is referenced either
* by its service or by the trash dest list.
*/
atomic_dec(&e->dest->refcnt);
kfree(e);
}
write_unlock(&set->lock);
}
/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
register struct ip_vs_dest_list *e;
struct ip_vs_dest *dest, *least;
int loh, doh;
if (set == NULL)
return NULL;
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
least = e->dest;
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
goto nextstage;
}
}
return NULL;
/* find the destination with the weighted least load */
nextstage:
for (e=e->next; e!=NULL; e=e->next) {
dest = e->dest;
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if ((loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr.ip), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
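/*
 * Worked example for the cross-multiplied comparison above (purely
 * illustrative numbers): with least = {activeconns 4, inactconns 10,
 * weight 1} and dest = {activeconns 10, inactconns 0, weight 3},
 * loh = 4*50 + 10 = 210 and doh = 10*50 + 0 = 500.  Then
 * loh*dest->weight = 630 > doh*least->weight = 500, so dest wins:
 * comparing h1*w2 > h2*w1 is the integer-safe form of h1/w1 > h2/w2.
 */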
/* get weighted most-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
register struct ip_vs_dest_list *e;
struct ip_vs_dest *dest, *most;
int moh, doh;
if (set == NULL)
return NULL;
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
most = e->dest;
if (atomic_read(&most->weight) > 0) {
moh = atomic_read(&most->activeconns) * 50
+ atomic_read(&most->inactconns);
goto nextstage;
}
}
return NULL;
/* find the destination with the weighted most load */
nextstage:
for (e=e->next; e!=NULL; e=e->next) {
dest = e->dest;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
doh * atomic_read(&most->weight))
&& (atomic_read(&dest->weight) > 0)) {
most = dest;
moh = doh;
}
}
IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(most->addr.ip), ntohs(most->port),
atomic_read(&most->activeconns),
atomic_read(&most->refcnt),
atomic_read(&most->weight), moh);
return most;
}
/*
* IPVS lblcr entry represents an association between destination
* IP address and its destination server set
*/
struct ip_vs_lblcr_entry {
struct list_head list;
__be32 addr; /* destination IP address */
struct ip_vs_dest_set set; /* destination server set */
unsigned long lastuse; /* last used time */
};
/*
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
};
/*
* IPVS LBLCR sysctl table
*/
static ctl_table vs_vars_table[] = {
{
.procname = "lblcr_expiration",
.data = &sysctl_ip_vs_lblcr_expiration,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_jiffies,
},
{ .ctl_name = 0 }
};
static struct ctl_table_header * sysctl_header;
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
list_del(&en->list);
ip_vs_dest_set_eraseall(&en->set);
kfree(en);
}
/*
* Returns hash value for IPVS LBLCR entry
*/
static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
}
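/*
 * 2654435761 is the 32-bit golden-ratio multiplicative hashing
 * constant (roughly 2^32 / phi), so runs of nearby addresses spread
 * across the table instead of clustering in one bucket; the mask
 * then keeps the low IP_VS_LBLCR_TAB_BITS bits as the bucket index.
 */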
/*
* Hash an entry in the ip_vs_lblcr_table.
*/
static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
unsigned hash = ip_vs_lblcr_hashkey(en->addr);
list_add(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
/*
* Get ip_vs_lblcr_entry associated with supplied parameters. Called under
* read lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
{
unsigned hash = ip_vs_lblcr_hashkey(addr);
struct ip_vs_lblcr_entry *en;
list_for_each_entry(en, &tbl->bucket[hash], list)
if (en->addr == addr)
return en;
return NULL;
}
/*
* Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
* IP address to a server. Called under write lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
struct ip_vs_dest *dest)
{
struct ip_vs_lblcr_entry *en;
en = ip_vs_lblcr_get(tbl, daddr);
if (!en) {
en = kmalloc(sizeof(*en), GFP_ATOMIC);
if (!en) {
IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
return NULL;
}
en->addr = daddr;
en->lastuse = jiffies;
		/* initialize its dest set */
atomic_set(&(en->set.size), 0);
en->set.list = NULL;
rwlock_init(&en->set.lock);
ip_vs_lblcr_hash(tbl, en);
}
write_lock(&en->set.lock);
ip_vs_dest_set_insert(&en->set, dest);
write_unlock(&en->set.lock);
return en;
}
/*
* Flush all the entries of the specified table.
*/
static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
{
int i;
struct ip_vs_lblcr_entry *en, *nxt;
/* No locking required, only called during cleanup. */
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
}
}
}
static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
now))
continue;
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
write_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
/*
 *	Periodic timer handler for the IPVS lblcr table.
 *	It is used to collect stale entries when the number of entries
 *	exceeds the maximum size of the table.
 *
 *	Fixme: we probably need a more complicated algorithm to collect
 *	       entries that have not been used for a long time even
 *	       if the number of entries doesn't exceed the maximum size
 *	       of the table.
 *	The full expiration check serves this purpose for now.
 */
static void ip_vs_lblcr_check_expire(unsigned long data)
{
struct ip_vs_service *svc = (struct ip_vs_service *) data;
struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int goal;
int i, j;
struct ip_vs_lblcr_entry *en, *nxt;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
ip_vs_lblcr_full_check(svc);
tbl->counter = 1;
goto out;
}
if (atomic_read(&tbl->entries) <= tbl->max_size) {
tbl->counter++;
goto out;
}
goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
if (goal > tbl->max_size/2)
goal = tbl->max_size/2;
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
write_lock(&svc->sched_lock);
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
goal--;
}
write_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
tbl->rover = j;
out:
mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
int i;
struct ip_vs_lblcr_table *tbl;
/*
* Allocate the ip_vs_lblcr_table for this service
*/
tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zd bytes) allocated for "
"current service\n", sizeof(*tbl));
/*
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
INIT_LIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
/*
* Hook periodic timer for garbage collection
*/
setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
(unsigned long)svc);
mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
return 0;
}
static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
/* remove periodic timer */
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblcr_flush(tbl);
/* release the table itself */
kfree(tbl);
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zd bytes) released\n",
sizeof(*tbl));
return 0;
}
static inline struct ip_vs_dest *
__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
struct ip_vs_dest *dest, *least;
int loh, doh;
/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections on average. (This
	 * factor of fifty may not be accurate; we will tune it later.) We
* use the following formula to estimate the overhead:
* dest->activeconns*50 + dest->inactconns
* and the load:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
list_for_each_entry(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
"activeconns %d refcnt %d weight %d overhead %d\n",
NIPQUAD(least->addr.ip), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
/*
* If this destination server is overloaded and there is a less loaded
* server, then return true.
*/
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
list_for_each_entry(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
}
}
}
return 0;
}
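/*
 * Illustrative case: dest has weight 10 but 15 active connections,
 * while some other server in svc holds 2 active connections against
 * weight 10 (2*2 < 10).  is_overloaded() then returns 1 and the
 * caller falls back to __ip_vs_lblcr_schedule() instead of reusing
 * the cached, saturated destination.
 */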
/*
 *	Locality-Based (weighted) Least-Connection with Replication scheduling
*/
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
struct iphdr *iph = ip_hdr(skb);
struct ip_vs_dest *dest = NULL;
struct ip_vs_lblcr_entry *en;
IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
/* First look in our cache */
read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(tbl, iph->daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
/* Get the least loaded destination */
read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
read_unlock(&en->set.lock);
		/* More than one destination and enough time passed: clean up */
if (atomic_read(&en->set.size) > 1 &&
time_after(jiffies, en->set.lastmod +
sysctl_ip_vs_lblcr_expiration)) {
struct ip_vs_dest *m;
write_lock(&en->set.lock);
m = ip_vs_dest_set_max(&en->set);
if (m)
ip_vs_dest_set_erase(&en->set, m);
write_unlock(&en->set.lock);
}
/* If the destination is not overloaded, use it */
if (dest && !is_overloaded(dest, svc)) {
read_unlock(&svc->sched_lock);
goto out;
}
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc, iph);
if (!dest) {
IP_VS_DBG(1, "no destination available\n");
read_unlock(&svc->sched_lock);
return NULL;
}
/* Update our cache entry */
write_lock(&en->set.lock);
ip_vs_dest_set_insert(&en->set, dest);
write_unlock(&en->set.lock);
}
read_unlock(&svc->sched_lock);
if (dest)
goto out;
/* No cache entry, time to schedule */
dest = __ip_vs_lblcr_schedule(svc, iph);
if (!dest) {
IP_VS_DBG(1, "no destination available\n");
return NULL;
}
/* If we fail to create a cache entry, we'll just use the valid dest */
write_lock(&svc->sched_lock);
ip_vs_lblcr_new(tbl, iph->daddr, dest);
write_unlock(&svc->sched_lock);
out:
IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(iph->daddr),
NIPQUAD(dest->addr.ip),
ntohs(dest->port));
return dest;
}
/*
* IPVS LBLCR Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
{
.name = "lblcr",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 0,
#endif
.init_service = ip_vs_lblcr_init_svc,
.done_service = ip_vs_lblcr_done_svc,
.schedule = ip_vs_lblcr_schedule,
};
static int __init ip_vs_lblcr_init(void)
{
int ret;
sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
if (ret)
unregister_sysctl_table(sysctl_header);
return ret;
}
static void __exit ip_vs_lblcr_cleanup(void)
{
unregister_sysctl_table(sysctl_header);
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
}
module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,103 @@
/*
* IPVS: Least-Connection Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Wensong Zhang : added the ip_vs_lc_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <net/ip_vs.h>
static inline unsigned int
ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We think the overhead of processing active connections is 256
	 * times higher than that of inactive connections on average. (This
	 * factor of 256 may not be accurate; we will tune it later.) We
* use the following formula to estimate the overhead now:
* dest->activeconns*256 + dest->inactconns
*/
return (atomic_read(&dest->activeconns) << 8) +
atomic_read(&dest->inactconns);
}
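/*
 * Example (illustrative numbers): a server with 3 active and 10
 * inactive connections gets overhead (3 << 8) + 10 = 778, so one
 * established connection outweighs hundreds of lingering ones.
 */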
/*
* Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
	/*
	 * Simply select the server with the least number of
	 *        (activeconns<<8) + inactconns
	 * except those whose weight is equal to zero.
	 * If the weight is equal to zero, it means that the server is
	 * quiesced: existing connections to the server still get
	 * served, but no new connection is assigned to it.
	 */
list_for_each_entry(dest, &svc->destinations, n_list) {
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
doh = ip_vs_lc_dest_overhead(dest);
if (!least || doh < loh) {
least = dest;
loh = doh;
}
}
if (least)
IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->inactconns));
return least;
}
static struct ip_vs_scheduler ip_vs_lc_scheduler = {
.name = "lc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 1,
#endif
.schedule = ip_vs_lc_schedule,
};
static int __init ip_vs_lc_init(void)
{
	return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
}
static void __exit ip_vs_lc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
}
module_init(ip_vs_lc_init);
module_exit(ip_vs_lc_cleanup);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,138 @@
/*
* IPVS: Never Queue scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The NQ algorithm adopts a two-speed model. When there is an idle server
* available, the job will be sent to the idle server, instead of waiting
* for a fast one. When there is no idle server available, the job will be
 * sent to the server that minimizes its expected delay (the Shortest
 * Expected Delay scheduling algorithm).
*
* See the following paper for more information:
* A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
* in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
* pages 986-994, 1988.
*
* Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
*
* The difference between NQ and SED is that NQ can improve overall
* system utilization.
*
*/
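/*
 * Sketch of the two-speed behaviour with made-up numbers: for
 * server A {activeconns 0, weight 1} and server B {activeconns 5,
 * weight 10}, SED would compare (0+1)/1 = 1.0 against (5+1)/10 = 0.6
 * and pick B, while NQ short-circuits to the idle A -- a new job is
 * never queued behind others as long as an idle server exists.
 */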
#include <linux/module.h>
#include <linux/kernel.h>
#include <net/ip_vs.h>
static inline unsigned int
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We only use the active connection number in the cost
* calculation here.
*/
return atomic_read(&dest->activeconns) + 1;
}
/*
 *	Never Queue scheduling
*/
static struct ip_vs_dest *
ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
/*
* We calculate the load of each dest server as follows:
* (server expected overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
list_for_each_entry(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
!atomic_read(&dest->weight))
continue;
doh = ip_vs_nq_dest_overhead(dest);
/* return the server directly if it is idle */
if (atomic_read(&dest->activeconns) == 0) {
least = dest;
loh = doh;
goto out;
}
if (!least ||
(loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight))) {
least = dest;
loh = doh;
}
}
if (!least)
return NULL;
out:
IP_VS_DBG_BUF(6, "NQ: server %s:%u "
"activeconns %d refcnt %d weight %d overhead %d\n",
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
static struct ip_vs_scheduler ip_vs_nq_scheduler =
{
.name = "nq",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 1,
#endif
.schedule = ip_vs_nq_schedule,
};
static int __init ip_vs_nq_init(void)
{
return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
}
static void __exit ip_vs_nq_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
}
module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,288 @@
/*
* ip_vs_proto.c: transport protocol load balancing support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/system.h>
#include <linux/stat.h>
#include <linux/proc_fs.h>
#include <net/ip_vs.h>
/*
* IPVS protocols can only be registered/unregistered when the ipvs
* module is loaded/unloaded, so no lock is needed in accessing the
* ipvs protocol table.
*/
#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
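/*
 * With a 32-entry table the hash is simply the low five bits of the
 * protocol number: TCP (6) -> bucket 6, UDP (17) -> 17, ESP (50) ->
 * 18, AH (51) -> 19, so the handful of supported protocols never
 * collide in practice.
 */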
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
/*
* register an ipvs protocol
*/
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
{
unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
pp->next = ip_vs_proto_table[hash];
ip_vs_proto_table[hash] = pp;
if (pp->init != NULL)
pp->init(pp);
return 0;
}
/*
* unregister an ipvs protocol
*/
static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
{
struct ip_vs_protocol **pp_p;
unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
pp_p = &ip_vs_proto_table[hash];
for (; *pp_p; pp_p = &(*pp_p)->next) {
if (*pp_p == pp) {
*pp_p = pp->next;
if (pp->exit != NULL)
pp->exit(pp);
return 0;
}
}
return -ESRCH;
}
/*
* get ip_vs_protocol object by its proto.
*/
struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
{
struct ip_vs_protocol *pp;
unsigned hash = IP_VS_PROTO_HASH(proto);
for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
if (pp->protocol == proto)
return pp;
}
return NULL;
}
/*
* Propagate event for state change to all protocols
*/
void ip_vs_protocol_timeout_change(int flags)
{
struct ip_vs_protocol *pp;
int i;
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
if (pp->timeout_change)
pp->timeout_change(pp, flags);
}
}
}
int *
ip_vs_create_timeout_table(int *table, int size)
{
return kmemdup(table, size, GFP_ATOMIC);
}
/*
* Set timeout value for state specified by name
*/
int
ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
{
int i;
if (!table || !name || !to)
return -EINVAL;
for (i = 0; i < num; i++) {
if (strcmp(names[i], name))
continue;
table[i] = to * HZ;
return 0;
}
return -ENOENT;
}
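/*
 * Illustrative call (this mirrors how tcp_set_state_timeout() in
 * ip_vs_proto_tcp.c invokes it):
 *	ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
 *				tcp_state_name_table, "ESTABLISHED", 900);
 * matches names[IP_VS_TCP_S_ESTABLISHED] and stores 900*HZ, i.e. a
 * 15 minute established timeout.
 */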
const char * ip_vs_state_name(__u16 proto, int state)
{
struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
if (pp == NULL || pp->state_name == NULL)
return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
return pp->state_name(state);
}
static void
ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
char buf[128];
struct iphdr _iph, *ih;
ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
if (ih == NULL)
sprintf(buf, "%s TRUNCATED", pp->name);
else if (ih->frag_off & htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(ih->saddr),
NIPQUAD(ih->daddr));
else {
		__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, offset + ih->ihl*4,
sizeof(_ports), _ports);
if (pptr == NULL)
sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
pp->name,
NIPQUAD(ih->saddr),
NIPQUAD(ih->daddr));
else
sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
pp->name,
NIPQUAD(ih->saddr),
ntohs(pptr[0]),
NIPQUAD(ih->daddr),
ntohs(pptr[1]));
}
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
#ifdef CONFIG_IP_VS_IPV6
static void
ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
char buf[192];
struct ipv6hdr _iph, *ih;
ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
if (ih == NULL)
sprintf(buf, "%s TRUNCATED", pp->name);
else if (ih->nexthdr == IPPROTO_FRAGMENT)
sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
pp->name, NIP6(ih->saddr),
NIP6(ih->daddr));
else {
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
sizeof(_ports), _ports);
if (pptr == NULL)
sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
pp->name,
NIP6(ih->saddr),
NIP6(ih->daddr));
else
sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
pp->name,
NIP6(ih->saddr),
ntohs(pptr[0]),
NIP6(ih->daddr),
ntohs(pptr[1]));
}
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
#endif
void
ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
if (skb->protocol == htons(ETH_P_IPV6))
ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
else
#endif
ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
int __init ip_vs_protocol_init(void)
{
char protocols[64];
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
strcat(protocols, ", "); \
strcat(protocols, (p)->name); \
} while (0)
protocols[0] = '\0';
protocols[2] = '\0';
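	/*
	 * Note: protocols[2] is cleared too so that &protocols[2],
	 * printed below, is still a valid empty string if no
	 * REGISTER_PROTOCOL() invocation runs; each registration
	 * appends ", <name>" and the print skips that leading
	 * separator.
	 */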
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
return 0;
}
void ip_vs_protocol_cleanup(void)
{
struct ip_vs_protocol *pp;
int i;
/* unregister all the ipvs protocols */
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
while ((pp = ip_vs_proto_table[i]) != NULL)
unregister_ip_vs_protocol(pp);
}
}

View File

@@ -0,0 +1,235 @@
/*
* ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
*
* Authors: Julian Anastasov <ja@ssi.bg>, February 2002
* Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation;
*
*/
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
/* TODO:
struct isakmp_hdr {
__u8 icookie[8];
__u8 rcookie[8];
__u8 np;
__u8 version;
__u8 xchgtype;
__u8 flags;
__u32 msgid;
__u32 length;
};
*/
#define PORT_ISAKMP 500
static struct ip_vs_conn *
ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
&iph->saddr,
htons(PORT_ISAKMP),
&iph->daddr,
htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
&iph->daddr,
htons(PORT_ISAKMP),
&iph->saddr,
htons(PORT_ISAKMP));
}
if (!cp) {
/*
* We are not sure if the packet is from our
* service, so our conn_schedule hook should return NF_ACCEPT
*/
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
pp->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
return cp;
}
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph,
unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
&iph->saddr,
htons(PORT_ISAKMP),
&iph->daddr,
htons(PORT_ISAKMP));
} else {
cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
&iph->daddr,
htons(PORT_ISAKMP),
&iph->saddr,
htons(PORT_ISAKMP));
}
if (!cp) {
IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
"%s%s %s->%s\n",
inverse ? "ICMP+" : "",
pp->name,
IP_VS_DBG_ADDR(af, &iph->saddr),
IP_VS_DBG_ADDR(af, &iph->daddr));
}
return cp;
}
static int
ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
{
/*
	 * AH/ESP is handled as related traffic only; pass the packet
	 * on to the IP stack.
*/
*verdict = NF_ACCEPT;
return 0;
}
static void
ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{
char buf[256];
struct iphdr _iph, *ih;
ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
if (ih == NULL)
sprintf(buf, "%s TRUNCATED", pp->name);
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(ih->saddr),
NIPQUAD(ih->daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
#ifdef CONFIG_IP_VS_IPV6
static void
ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{
char buf[256];
struct ipv6hdr _iph, *ih;
ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
if (ih == NULL)
sprintf(buf, "%s TRUNCATED", pp->name);
else
sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
pp->name, NIP6(ih->saddr),
NIP6(ih->daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
#endif
static void
ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
if (skb->protocol == htons(ETH_P_IPV6))
ah_esp_debug_packet_v6(pp, skb, offset, msg);
else
#endif
ah_esp_debug_packet_v4(pp, skb, offset, msg);
}
static void ah_esp_init(struct ip_vs_protocol *pp)
{
/* nothing to do now */
}
static void ah_esp_exit(struct ip_vs_protocol *pp)
{
/* nothing to do now */
}
#ifdef CONFIG_IP_VS_PROTO_AH
struct ip_vs_protocol ip_vs_protocol_ah = {
.name = "AH",
.protocol = IPPROTO_AH,
.num_states = 1,
.dont_defrag = 1,
.init = ah_esp_init,
.exit = ah_esp_exit,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
.csum_check = NULL,
.state_transition = NULL,
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
.debug_packet = ah_esp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
.set_state_timeout = NULL,
};
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
struct ip_vs_protocol ip_vs_protocol_esp = {
.name = "ESP",
.protocol = IPPROTO_ESP,
.num_states = 1,
.dont_defrag = 1,
.init = ah_esp_init,
.exit = ah_esp_exit,
.conn_schedule = ah_esp_conn_schedule,
.conn_in_get = ah_esp_conn_in_get,
.conn_out_get = ah_esp_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
.csum_check = NULL,
.state_transition = NULL,
.register_app = NULL,
.unregister_app = NULL,
.app_conn_bind = NULL,
.debug_packet = ah_esp_debug_packet,
.timeout_change = NULL, /* ISAKMP */
};
#endif

View File

@@ -0,0 +1,732 @@
/*
* ip_vs_proto_tcp.c: TCP load balancing support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/ip6_checksum.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
static struct ip_vs_conn *
tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return NULL;
if (likely(!inverse)) {
return ip_vs_conn_in_get(af, iph->protocol,
&iph->saddr, pptr[0],
&iph->daddr, pptr[1]);
} else {
return ip_vs_conn_in_get(af, iph->protocol,
&iph->daddr, pptr[1],
&iph->saddr, pptr[0]);
}
}
static struct ip_vs_conn *
tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return NULL;
if (likely(!inverse)) {
return ip_vs_conn_out_get(af, iph->protocol,
&iph->saddr, pptr[0],
&iph->daddr, pptr[1]);
} else {
return ip_vs_conn_out_get(af, iph->protocol,
&iph->daddr, pptr[1],
&iph->saddr, pptr[0]);
}
}
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
{
struct ip_vs_service *svc;
struct tcphdr _tcph, *th;
struct ip_vs_iphdr iph;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
if (th == NULL) {
*verdict = NF_DROP;
return 0;
}
if (th->syn &&
(svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
th->dest))) {
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb);
if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
ip_vs_service_put(svc);
}
return 1;
}
static inline void
tcp_fast_csum_update(int af, struct tcphdr *tcph,
const union nf_inet_addr *oldip,
const union nf_inet_addr *newip,
__be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
tcph->check =
csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldport, newport,
~csum_unfold(tcph->check))));
else
#endif
tcph->check =
csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldport, newport,
~csum_unfold(tcph->check))));
}
static inline void
tcp_partial_csum_update(int af, struct tcphdr *tcph,
const union nf_inet_addr *oldip,
const union nf_inet_addr *newip,
__be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
tcph->check =
csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldlen, newlen,
~csum_unfold(tcph->check))));
else
#endif
tcph->check =
csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldlen, newlen,
~csum_unfold(tcph->check))));
}
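/*
 * Both helpers above are incremental (RFC 1624 style) updates: only
 * the 16-bit one's-complement delta between the old and new
 * address/port (or length) words is folded into the existing
 * checksum, so the packet payload never has to be re-summed on the
 * fast path.
 */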
static int
tcp_snat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
struct tcphdr *tcph;
unsigned int tcphoff;
int oldlen;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
tcphoff = sizeof(struct ipv6hdr);
else
#endif
tcphoff = ip_hdrlen(skb);
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
if (!ip_vs_app_pkt_out(cp, skb))
return 0;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
tcph->source = cp->vport;
/* Adjust TCP checksums */
if (skb->ip_summed == CHECKSUM_PARTIAL) {
tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
htonl(oldlen),
htonl(skb->len - tcphoff));
} else if (!cp->app) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
tcph->check = 0;
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
&cp->caddr.in6,
skb->len - tcphoff,
cp->protocol, skb->csum);
else
#endif
tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
cp->caddr.ip,
skb->len - tcphoff,
cp->protocol,
skb->csum);
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
pp->name, tcph->check,
(char*)&(tcph->check) - (char*)tcph);
}
return 1;
}
static int
tcp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
struct tcphdr *tcph;
unsigned int tcphoff;
int oldlen;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
tcphoff = sizeof(struct ipv6hdr);
else
#endif
tcphoff = ip_hdrlen(skb);
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/*
* Attempt ip_vs_app call.
		 * It will fix the ip_vs_conn and adjust the TCP ack_seq
*/
if (!ip_vs_app_pkt_in(cp, skb))
return 0;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
tcph->dest = cp->dport;
/*
* Adjust TCP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
htonl(oldlen),
htonl(skb->len - tcphoff));
} else if (!cp->app) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
tcph->check = 0;
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
tcph->check = csum_ipv6_magic(&cp->caddr.in6,
&cp->daddr.in6,
skb->len - tcphoff,
cp->protocol, skb->csum);
else
#endif
tcph->check = csum_tcpudp_magic(cp->caddr.ip,
cp->daddr.ip,
skb->len - tcphoff,
cp->protocol,
skb->csum);
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
return 1;
}
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
unsigned int tcphoff;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
tcphoff = sizeof(struct ipv6hdr);
else
#endif
tcphoff = ip_hdrlen(skb);
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
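		/* fall through: verify the freshly computed sum */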
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr,
skb->len - tcphoff,
ipv6_hdr(skb)->nexthdr,
skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for");
return 0;
}
} else
#endif
if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr,
skb->len - tcphoff,
ip_hdr(skb)->protocol,
skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for");
return 0;
}
break;
default:
/* No need to checksum. */
break;
}
return 1;
}
#define TCP_DIR_INPUT 0
#define TCP_DIR_OUTPUT 4
#define TCP_DIR_INPUT_ONLY 8
static const int tcp_state_off[IP_VS_DIR_LAST] = {
[IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
[IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
[IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
};
/*
* Timeout table[state]
*/
static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = 2*HZ,
[IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
[IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
[IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
[IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
[IP_VS_TCP_S_CLOSE] = 10*HZ,
[IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
[IP_VS_TCP_S_LAST_ACK] = 30*HZ,
[IP_VS_TCP_S_LISTEN] = 2*60*HZ,
[IP_VS_TCP_S_SYNACK] = 120*HZ,
[IP_VS_TCP_S_LAST] = 2*HZ,
};
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
[IP_VS_TCP_S_NONE] = "NONE",
[IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
[IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
[IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
[IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
[IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
[IP_VS_TCP_S_CLOSE] = "CLOSE",
[IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
[IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
[IP_VS_TCP_S_LISTEN] = "LISTEN",
[IP_VS_TCP_S_SYNACK] = "SYNACK",
[IP_VS_TCP_S_LAST] = "BUG!",
};
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK
struct tcp_states_t {
int next_state[IP_VS_TCP_S_LAST];
};
static const char * tcp_state_name(int state)
{
if (state >= IP_VS_TCP_S_LAST)
return "ERR!";
return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
}
static struct tcp_states_t tcp_states [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
static struct tcp_states_t tcp_states_dos [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
/* OUTPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
static struct tcp_states_t *tcp_state_table = tcp_states;
static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
{
int on = (flags & 1); /* secure_tcp */
/*
** FIXME: change secure_tcp to independent sysctl var
** or make it per-service or per-app because it is valid
** for most if not for all of the applications. Something
** like "capabilities" (flags) for each object.
*/
	tcp_state_table = (on ? tcp_states_dos : tcp_states);
}
static int
tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
tcp_state_name_table, sname, to);
}
static inline int tcp_state_idx(struct tcphdr *th)
{
if (th->rst)
return 3;
if (th->syn)
return 0;
if (th->fin)
return 1;
if (th->ack)
return 2;
return -1;
}
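/*
 * Example walk of the state machine: an input SYN (tcp_state_idx()
 * returns 0) on a fresh connection indexes
 * tcp_state_table[TCP_DIR_INPUT + 0].next_state[IP_VS_TCP_S_NONE],
 * which is sSR, moving the connection to SYN_RECV with the 60 s
 * timeout from tcp_timeouts[].
 */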
static inline void
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
int direction, struct tcphdr *th)
{
int state_idx;
int new_state = IP_VS_TCP_S_CLOSE;
int state_off = tcp_state_off[direction];
/*
* Update state offset to INPUT_ONLY if necessary
* or delete NO_OUTPUT flag if output packet detected
*/
if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
if (state_off == TCP_DIR_OUTPUT)
cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
else
state_off = TCP_DIR_INPUT_ONLY;
}
if ((state_idx = tcp_state_idx(th)) < 0) {
IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
goto tcp_state_out;
}
new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
"%s:%d state: %s->%s conn->refcnt:%d\n",
pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
th->syn ? 'S' : '.',
th->fin ? 'F' : '.',
th->ack ? 'A' : '.',
th->rst ? 'R' : '.',
IP_VS_DBG_ADDR(cp->af, &cp->daddr),
ntohs(cp->dport),
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
tcp_state_name(cp->state),
tcp_state_name(new_state),
atomic_read(&cp->refcnt));
if (dest) {
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
(new_state != IP_VS_TCP_S_ESTABLISHED)) {
atomic_dec(&dest->activeconns);
atomic_inc(&dest->inactconns);
cp->flags |= IP_VS_CONN_F_INACTIVE;
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
(new_state == IP_VS_TCP_S_ESTABLISHED)) {
atomic_inc(&dest->activeconns);
atomic_dec(&dest->inactconns);
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
}
cp->timeout = pp->timeout_table[cp->state = new_state];
}
/*
* Handle state transitions
*/
static int
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
struct tcphdr _tcph, *th;
#ifdef CONFIG_IP_VS_IPV6
int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
#else
int ihl = ip_hdrlen(skb);
#endif
th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
if (th == NULL)
return 0;
spin_lock(&cp->lock);
set_tcp_state(pp, cp, direction, th);
spin_unlock(&cp->lock);
return 1;
}
/*
* Hash table for TCP application incarnations
*/
#define TCP_APP_TAB_BITS 4
#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
static DEFINE_SPINLOCK(tcp_app_lock);
static inline __u16 tcp_app_hashkey(__be16 port)
{
return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
& TCP_APP_TAB_MASK;
}
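/*
 * The 16-bit port is folded onto itself (high bits XORed down by
 * TCP_APP_TAB_BITS) before masking, so ports sharing low nibbles
 * still spread across buckets better than a plain mask would; the
 * __force casts only tell sparse that we deliberately hash the raw
 * byte-order-dependent bit pattern.
 */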
static int tcp_register_app(struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
hash = tcp_app_hashkey(port);
spin_lock_bh(&tcp_app_lock);
list_for_each_entry(i, &tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
list_add(&inc->p_list, &tcp_apps[hash]);
atomic_inc(&ip_vs_protocol_tcp.appcnt);
out:
spin_unlock_bh(&tcp_app_lock);
return ret;
}
static void
tcp_unregister_app(struct ip_vs_app *inc)
{
spin_lock_bh(&tcp_app_lock);
atomic_dec(&ip_vs_protocol_tcp.appcnt);
list_del(&inc->p_list);
spin_unlock_bh(&tcp_app_lock);
}
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
int hash;
struct ip_vs_app *inc;
int result = 0;
/* Default binding: bind app only for NAT */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return 0;
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
spin_lock(&tcp_app_lock);
list_for_each_entry(inc, &tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
spin_unlock(&tcp_app_lock);
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
__func__,
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
ntohs(cp->vport),
inc->name, ntohs(inc->port));
cp->app = inc;
if (inc->init_conn)
result = inc->init_conn(inc, cp);
goto out;
}
}
spin_unlock(&tcp_app_lock);
out:
return result;
}
/*
 *	Set LISTEN timeout. (ip_vs_conn_put will set up the timer)
*/
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
spin_lock(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
spin_unlock(&cp->lock);
}
static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(tcp_apps);
pp->timeout_table = tcp_timeouts;
}
static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
{
}
struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP",
.protocol = IPPROTO_TCP,
.num_states = IP_VS_TCP_S_LAST,
.dont_defrag = 0,
.appcnt = ATOMIC_INIT(0),
.init = ip_vs_tcp_init,
.exit = ip_vs_tcp_exit,
.register_app = tcp_register_app,
.unregister_app = tcp_unregister_app,
.conn_schedule = tcp_conn_schedule,
.conn_in_get = tcp_conn_in_get,
.conn_out_get = tcp_conn_out_get,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
.csum_check = tcp_csum_check,
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
.set_state_timeout = tcp_set_state_timeout,
};

View File

@@ -0,0 +1,533 @@
/*
* ip_vs_proto_udp.c: UDP load balancing support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/udp.h>
#include <net/ip_vs.h>
#include <net/ip.h>
#include <net/ip6_checksum.h>
static struct ip_vs_conn *
udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return NULL;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(af, iph->protocol,
&iph->saddr, pptr[0],
&iph->daddr, pptr[1]);
} else {
cp = ip_vs_conn_in_get(af, iph->protocol,
&iph->daddr, pptr[1],
&iph->saddr, pptr[0]);
}
return cp;
}
static struct ip_vs_conn *
udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct ip_vs_iphdr *iph, unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
__be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
if (pptr == NULL)
return NULL;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(af, iph->protocol,
&iph->saddr, pptr[0],
&iph->daddr, pptr[1]);
} else {
cp = ip_vs_conn_out_get(af, iph->protocol,
&iph->daddr, pptr[1],
&iph->saddr, pptr[0]);
}
return cp;
}
static int
udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
{
struct ip_vs_service *svc;
struct udphdr _udph, *uh;
struct ip_vs_iphdr iph;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
if (uh == NULL) {
*verdict = NF_DROP;
return 0;
}
svc = ip_vs_service_get(af, skb->mark, iph.protocol,
&iph.daddr, uh->dest);
if (svc) {
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
* We have to drop this packet :(
*/
ip_vs_service_put(svc);
*verdict = NF_DROP;
return 0;
}
/*
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, skb);
if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
ip_vs_service_put(svc);
}
return 1;
}
static inline void
udp_fast_csum_update(int af, struct udphdr *uhdr,
const union nf_inet_addr *oldip,
const union nf_inet_addr *newip,
__be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
uhdr->check =
csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldport, newport,
~csum_unfold(uhdr->check))));
else
#endif
uhdr->check =
csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldport, newport,
~csum_unfold(uhdr->check))));
if (!uhdr->check)
uhdr->check = CSUM_MANGLED_0;
}
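/*
 * RFC 768 reserves a transmitted checksum of zero to mean "no
 * checksum", so when the recomputed sum happens to be 0 it is sent
 * as CSUM_MANGLED_0 (0xffff) instead -- both encode the same
 * one's-complement value.
 */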
static inline void
udp_partial_csum_update(int af, struct udphdr *uhdr,
const union nf_inet_addr *oldip,
const union nf_inet_addr *newip,
__be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
uhdr->check =
csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
ip_vs_check_diff2(oldlen, newlen,
~csum_unfold(uhdr->check))));
else
#endif
uhdr->check =
csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
ip_vs_check_diff2(oldlen, newlen,
~csum_unfold(uhdr->check))));
}
static int
udp_snat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
struct udphdr *udph;
unsigned int udphoff;
int oldlen;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
udphoff = sizeof(struct ipv6hdr);
else
#endif
udphoff = ip_hdrlen(skb);
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/*
* Call application helper if needed
*/
if (!ip_vs_app_pkt_out(cp, skb))
return 0;
}
udph = (void *)skb_network_header(skb) + udphoff;
udph->source = cp->vport;
/*
* Adjust UDP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
htonl(oldlen),
htonl(skb->len - udphoff));
} else if (!cp->app && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
udph->check = 0;
skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
udph->check = csum_ipv6_magic(&cp->vaddr.in6,
&cp->caddr.in6,
skb->len - udphoff,
cp->protocol, skb->csum);
else
#endif
udph->check = csum_tcpudp_magic(cp->vaddr.ip,
cp->caddr.ip,
skb->len - udphoff,
cp->protocol,
skb->csum);
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
pp->name, udph->check,
(char*)&(udph->check) - (char*)udph);
}
return 1;
}
static int
udp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
struct udphdr *udph;
unsigned int udphoff;
int oldlen;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
udphoff = sizeof(struct ipv6hdr);
else
#endif
udphoff = ip_hdrlen(skb);
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
return 0;
/*
* Attempt ip_vs_app call.
* It will fix ip_vs_conn
*/
if (!ip_vs_app_pkt_in(cp, skb))
return 0;
}
udph = (void *)skb_network_header(skb) + udphoff;
udph->dest = cp->dport;
/*
* Adjust UDP checksums
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
htonl(oldlen),
htonl(skb->len - udphoff));
} else if (!cp->app && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
udph->check = 0;
skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
udph->check = csum_ipv6_magic(&cp->caddr.in6,
&cp->daddr.in6,
skb->len - udphoff,
cp->protocol, skb->csum);
else
#endif
udph->check = csum_tcpudp_magic(cp->caddr.ip,
cp->daddr.ip,
skb->len - udphoff,
cp->protocol,
skb->csum);
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
return 1;
}
static int
udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
struct udphdr _udph, *uh;
unsigned int udphoff;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
udphoff = sizeof(struct ipv6hdr);
else
#endif
udphoff = ip_hdrlen(skb);
uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
if (uh == NULL)
return 0;
if (uh->check != 0) {
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, udphoff,
skb->len - udphoff, 0);
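/* fall through: verify the checksum just computed */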
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr,
skb->len - udphoff,
ipv6_hdr(skb)->nexthdr,
skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for");
return 0;
}
} else
#endif
if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr,
skb->len - udphoff,
ip_hdr(skb)->protocol,
skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for");
return 0;
}
break;
default:
/* No need to checksum. */
break;
}
}
return 1;
}
/*
* Note: the caller guarantees that only one of register_app,
* unregister_app or app_conn_bind is called each time.
*/
#define UDP_APP_TAB_BITS 4
#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
static struct list_head udp_apps[UDP_APP_TAB_SIZE];
static DEFINE_SPINLOCK(udp_app_lock);
static inline __u16 udp_app_hashkey(__be16 port)
{
return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
& UDP_APP_TAB_MASK;
}
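/*
 * Illustrative sketch (not part of this module): the hash above XORs
 * the upper bits of the port into the low bits before masking, so
 * ports that differ only above bit 4 still spread over the 16 buckets.
 * The kernel version hashes the network-byte-order port; this demo
 * uses host-order values, so the buckets shown are only indicative.
 */
#include <stdint.h>
#include <stdio.h>

#define APP_TAB_BITS 4
#define APP_TAB_MASK ((1 << APP_TAB_BITS) - 1)

int main(void)
{
	uint16_t ports[] = { 53, 69, 123, 161, 514 };
	unsigned i;

	for (i = 0; i < sizeof(ports) / sizeof(ports[0]); i++)
		printf("port %u -> bucket %u\n", ports[i],
		       ((ports[i] >> APP_TAB_BITS) ^ ports[i]) & APP_TAB_MASK);
	return 0;
}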
static int udp_register_app(struct ip_vs_app *inc)
{
struct ip_vs_app *i;
__u16 hash;
__be16 port = inc->port;
int ret = 0;
hash = udp_app_hashkey(port);
spin_lock_bh(&udp_app_lock);
list_for_each_entry(i, &udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
list_add(&inc->p_list, &udp_apps[hash]);
atomic_inc(&ip_vs_protocol_udp.appcnt);
out:
spin_unlock_bh(&udp_app_lock);
return ret;
}
static void
udp_unregister_app(struct ip_vs_app *inc)
{
spin_lock_bh(&udp_app_lock);
atomic_dec(&ip_vs_protocol_udp.appcnt);
list_del(&inc->p_list);
spin_unlock_bh(&udp_app_lock);
}
static int udp_app_conn_bind(struct ip_vs_conn *cp)
{
int hash;
struct ip_vs_app *inc;
int result = 0;
/* Default binding: bind app only for NAT */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return 0;
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
spin_lock(&udp_app_lock);
list_for_each_entry(inc, &udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
spin_unlock(&udp_app_lock);
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
__func__,
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
ntohs(cp->vport),
inc->name, ntohs(inc->port));
cp->app = inc;
if (inc->init_conn)
result = inc->init_conn(inc, cp);
goto out;
}
}
spin_unlock(&udp_app_lock);
out:
return result;
}
static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
[IP_VS_UDP_S_LAST] = 2*HZ,
};
static char *udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
[IP_VS_UDP_S_NORMAL] = "UDP",
[IP_VS_UDP_S_LAST] = "BUG!",
};
static int
udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
{
return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
udp_state_name_table, sname, to);
}
static const char *udp_state_name(int state)
{
if (state >= IP_VS_UDP_S_LAST)
return "ERR!";
return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
}
static int
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
return 1;
}
static void udp_init(struct ip_vs_protocol *pp)
{
IP_VS_INIT_HASH_TABLE(udp_apps);
pp->timeout_table = udp_timeouts;
}
static void udp_exit(struct ip_vs_protocol *pp)
{
}
struct ip_vs_protocol ip_vs_protocol_udp = {
.name = "UDP",
.protocol = IPPROTO_UDP,
.num_states = IP_VS_UDP_S_LAST,
.dont_defrag = 0,
.init = udp_init,
.exit = udp_exit,
.conn_schedule = udp_conn_schedule,
.conn_in_get = udp_conn_in_get,
.conn_out_get = udp_conn_out_get,
.snat_handler = udp_snat_handler,
.dnat_handler = udp_dnat_handler,
.csum_check = udp_csum_check,
.state_transition = udp_state_transition,
.state_name = udp_state_name,
.register_app = udp_register_app,
.unregister_app = udp_unregister_app,
.app_conn_bind = udp_app_conn_bind,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL,
.set_state_timeout = udp_set_state_timeout,
};


@@ -0,0 +1,112 @@
/*
* IPVS: Round-Robin Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes/Changes:
* Wensong Zhang : changed the ip_vs_rr_schedule to return dest
* Julian Anastasov : fixed the NULL pointer access bug in debugging
* Wensong Zhang : changed some cosmetic things for debugging
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_rr_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <net/ip_vs.h>
static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
{
svc->sched_data = &svc->destinations;
return 0;
}
static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
{
svc->sched_data = &svc->destinations;
return 0;
}
/*
* Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct list_head *p, *q;
struct ip_vs_dest *dest;
IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
write_lock(&svc->sched_lock);
p = (struct list_head *)svc->sched_data;
p = p->next;
q = p;
do {
/* skip list head */
if (q == &svc->destinations) {
q = q->next;
continue;
}
dest = list_entry(q, struct ip_vs_dest, n_list);
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0)
/* HIT */
goto out;
q = q->next;
} while (q != p);
write_unlock(&svc->sched_lock);
return NULL;
out:
svc->sched_data = q;
write_unlock(&svc->sched_lock);
IP_VS_DBG_BUF(6, "RR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt), atomic_read(&dest->weight));
return dest;
}
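/*
 * Illustrative sketch (not part of this module): stripped of locking
 * and the kernel list primitives, the scheduler above is a cursor
 * walking a circular list, skipping quiesced (weight 0) and overloaded
 * entries and giving up after one full lap. Arrays stand in for
 * list_head here.
 */
#include <stdio.h>

struct dest { int weight; int overloaded; };

/* Return the index of the next usable dest, or -1 after a full lap. */
static int rr_pick(const struct dest *d, int n, int *cursor)
{
	int tried;

	for (tried = 0; tried < n; tried++) {
		*cursor = (*cursor + 1) % n;
		if (!d[*cursor].overloaded && d[*cursor].weight > 0)
			return *cursor;
	}
	return -1;
}

int main(void)
{
	struct dest d[] = { { 1, 0 }, { 0, 0 }, { 2, 0 } }; /* middle one quiesced */
	int cur = -1, i;

	for (i = 0; i < 6; i++)
		printf("pick: %d\n", rr_pick(d, 3, &cur));	/* 0 2 0 2 0 2 */
	return 0;
}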
static struct ip_vs_scheduler ip_vs_rr_scheduler = {
.name = "rr", /* name */
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 1,
#endif
.init_service = ip_vs_rr_init_svc,
.update_service = ip_vs_rr_update_svc,
.schedule = ip_vs_rr_schedule,
};
static int __init ip_vs_rr_init(void)
{
return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
}
static void __exit ip_vs_rr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
}
module_init(ip_vs_rr_init);
module_exit(ip_vs_rr_cleanup);
MODULE_LICENSE("GPL");


@@ -0,0 +1,251 @@
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the Netfilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <asm/string.h>
#include <linux/kmod.h>
#include <linux/sysctl.h>
#include <net/ip_vs.h>
/*
* IPVS scheduler list
*/
static LIST_HEAD(ip_vs_schedulers);
/* lock for the scheduler list */
static DEFINE_RWLOCK(__ip_vs_sched_lock);
/*
* Bind a service to a scheduler
*/
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
struct ip_vs_scheduler *scheduler)
{
int ret;
if (svc == NULL) {
IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
return -EINVAL;
}
if (scheduler == NULL) {
IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
return -EINVAL;
}
svc->scheduler = scheduler;
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
return ret;
}
}
return 0;
}
/*
* Unbind a service from its scheduler
*/
int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
{
struct ip_vs_scheduler *sched;
if (svc == NULL) {
IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
return -EINVAL;
}
sched = svc->scheduler;
if (sched == NULL) {
IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
return -EINVAL;
}
if (sched->done_service) {
if (sched->done_service(svc) != 0) {
IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
return -EINVAL;
}
}
svc->scheduler = NULL;
return 0;
}
/*
* Get scheduler in the scheduler list by name
*/
static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
{
struct ip_vs_scheduler *sched;
IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
sched_name);
read_lock_bh(&__ip_vs_sched_lock);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
* Test and get the module atomically
*/
if (sched->module && !try_module_get(sched->module)) {
/*
* This scheduler is just being deleted
*/
continue;
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
read_unlock_bh(&__ip_vs_sched_lock);
return sched;
}
if (sched->module)
module_put(sched->module);
}
read_unlock_bh(&__ip_vs_sched_lock);
return NULL;
}
/*
* Lookup scheduler and try to load it if it doesn't exist
*/
struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
{
struct ip_vs_scheduler *sched;
/*
* Search for the scheduler by sched_name
*/
sched = ip_vs_sched_getbyname(sched_name);
/*
* If scheduler not found, load the module and search again
*/
if (sched == NULL) {
request_module("ip_vs_%s", sched_name);
sched = ip_vs_sched_getbyname(sched_name);
}
return sched;
}
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
{
if (scheduler->module)
module_put(scheduler->module);
}
/*
* Register a scheduler in the scheduler list
*/
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
struct ip_vs_scheduler *sched;
if (!scheduler) {
IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
return -EINVAL;
}
if (!scheduler->name) {
IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
return -EINVAL;
}
/* increase the module use count */
ip_vs_use_count_inc();
write_lock_bh(&__ip_vs_sched_lock);
if (!list_empty(&scheduler->n_list)) {
write_unlock_bh(&__ip_vs_sched_lock);
ip_vs_use_count_dec();
IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
"already linked\n", scheduler->name);
return -EINVAL;
}
/*
* Make sure that the scheduler with this name doesn't exist
* in the scheduler list.
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
write_unlock_bh(&__ip_vs_sched_lock);
ip_vs_use_count_dec();
IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
"already existed in the system\n",
scheduler->name);
return -EINVAL;
}
}
/*
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
write_unlock_bh(&__ip_vs_sched_lock);
IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
return 0;
}
/*
* Unregister a scheduler from the scheduler list
*/
int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
{
if (!scheduler) {
IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
return -EINVAL;
}
write_lock_bh(&__ip_vs_sched_lock);
if (list_empty(&scheduler->n_list)) {
write_unlock_bh(&__ip_vs_sched_lock);
IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
"is not in the list. failed\n", scheduler->name);
return -EINVAL;
}
/*
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
write_unlock_bh(&__ip_vs_sched_lock);
/* decrease the module use count */
ip_vs_use_count_dec();
IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
return 0;
}


@@ -0,0 +1,140 @@
/*
* IPVS: Shortest Expected Delay scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The SED algorithm attempts to minimize each job's expected delay until
* completion. The expected delay that the job will experience is
* (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
* jobs on the ith server and Ui is the fixed service rate (weight) of
* the ith server. The SED algorithm adopts a greedy policy under which
* each job does what is in its own best interest, i.e. joins the queue
* that would minimize its expected delay of completion.
*
* See the following paper for more information:
* A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
* in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
* pages 986-994, 1988.
*
* Thanks must go to Marko Buuri <marko@buuri.name> for talking SED over with me.
*
* The difference between SED and WLC is that SED includes the incoming
* job in the cost function (the increment of 1). SED may outperform
* WLC when scheduling big jobs in large heterogeneous systems where
* the server weights vary a lot.
*
*/
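/*
 * Illustrative sketch (not part of this module, hypothetical numbers):
 * the scheduler below never divides; it compares the costs
 * (Ci + 1) / Ui by cross-multiplying, since h1/w1 > h2/w2 iff
 * h1*w2 > h2*w1 whenever both weights are positive.
 */
#include <stdio.h>

int main(void)
{
	/* server A: 3 active conns, weight 2 -> (3+1)/2 = 2.0 expected delay
	 * server B: 5 active conns, weight 4 -> (5+1)/4 = 1.5 expected delay */
	unsigned ca = 3, wa = 2, cb = 5, wb = 4;
	unsigned ha = ca + 1, hb = cb + 1;

	/* ha/wa > hb/wb  <=>  ha*wb > hb*wa */
	if (ha * wb > hb * wa)
		printf("pick B: 4*4=%u > 6*2=%u\n", ha * wb, hb * wa);
	return 0;
}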
#include <linux/module.h>
#include <linux/kernel.h>
#include <net/ip_vs.h>
static inline unsigned int
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We only use the active connection number in the cost
* calculation here.
*/
return atomic_read(&dest->activeconns) + 1;
}
/*
* Shortest Expected Delay scheduling
*/
static struct ip_vs_dest *
ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
/*
* We calculate the load of each dest server as follows:
* (server expected overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
list_for_each_entry(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
loh = ip_vs_sed_dest_overhead(least);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG_BUF(6, "SED: server %s:%u "
"activeconns %d refcnt %d weight %d overhead %d\n",
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
static struct ip_vs_scheduler ip_vs_sed_scheduler =
{
.name = "sed",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 1,
#endif
.schedule = ip_vs_sed_schedule,
};
static int __init ip_vs_sed_init(void)
{
return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
}
static void __exit ip_vs_sed_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
}
module_init(ip_vs_sed_init);
module_exit(ip_vs_sed_cleanup);
MODULE_LICENSE("GPL");


@@ -0,0 +1,258 @@
/*
* IPVS: Source Hashing scheduling module
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
*
*/
/*
* The sh algorithm is to select server by the hash key of source IP
* address. The pseudo code is as follows:
*
* n <- servernode[src_ip];
* if (n is dead) OR
* (n is overloaded) OR (n.weight <= 0) then
* return NULL;
*
* return n;
*
* Note that servernode is a 256-bucket hash table that maps the hash
* index derived from the packet source IP address to the current server
* array. If the sh scheduler is used in a cache cluster, it is good to
* combine it with cache_bypass feature. When the statically assigned
* server is dead or overloaded, the load balancer can bypass the cache
* server and send requests to the original server directly.
*
*/
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/ip_vs.h>
/*
* IPVS SH bucket
*/
struct ip_vs_sh_bucket {
struct ip_vs_dest *dest; /* real server (cache) */
};
/*
* for IPVS SH entry hash table
*/
#ifndef CONFIG_IP_VS_SH_TAB_BITS
#define CONFIG_IP_VS_SH_TAB_BITS 8
#endif
#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
/*
* Returns hash value for IPVS SH entry
*/
static inline unsigned ip_vs_sh_hashkey(__be32 addr)
{
return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
}
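/*
 * Illustrative sketch (not part of this module): 2654435761 is
 * 2^32 divided by the golden ratio, the multiplier from Knuth's
 * multiplicative hashing; the multiply scrambles the address bits
 * before the mask picks one of the 256 buckets. The kernel version
 * converts with ntohl() first, so this demo feeds host-order values.
 */
#include <stdint.h>
#include <stdio.h>

#define SH_TAB_MASK 0xff			/* 256 buckets, as above */

static unsigned sh_hashkey(uint32_t addr)	/* host byte order */
{
	return (addr * 2654435761UL) & SH_TAB_MASK;
}

int main(void)
{
	uint32_t base = (192u << 24) | (168u << 16) | (1u << 8); /* 192.168.1.0 */
	uint32_t i;

	/* adjacent client addresses land in well-spread buckets */
	for (i = 1; i <= 4; i++)
		printf("192.168.1.%u -> bucket %u\n", i, sh_hashkey(base + i));
	return 0;
}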
/*
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
{
return (tbl[ip_vs_sh_hashkey(addr)]).dest;
}
/*
* Assign all the hash buckets of the specified table with the service.
*/
static int
ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
b = tbl;
p = &svc->destinations;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
if (list_empty(p)) {
b->dest = NULL;
} else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
atomic_inc(&dest->refcnt);
b->dest = dest;
p = p->next;
}
b++;
}
return 0;
}
/*
* Flush all the hash buckets of the specified table.
*/
static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
{
int i;
struct ip_vs_sh_bucket *b;
b = tbl;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
if (b->dest) {
atomic_dec(&b->dest->refcnt);
b->dest = NULL;
}
b++;
}
}
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_bucket *tbl;
/* allocate the SH table for this service */
tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
GFP_ATOMIC);
if (tbl == NULL) {
IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
return -ENOMEM;
}
svc->sched_data = tbl;
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
/* assign the hash buckets with the updated service */
ip_vs_sh_assign(tbl, svc);
return 0;
}
static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_bucket *tbl = svc->sched_data;
/* got to clean up hash buckets here */
ip_vs_sh_flush(tbl);
/* release the table itself */
kfree(svc->sched_data);
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
return 0;
}
static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
{
struct ip_vs_sh_bucket *tbl = svc->sched_data;
/* we must clean up the hash buckets here */
ip_vs_sh_flush(tbl);
/* assign the hash buckets with the updated service */
ip_vs_sh_assign(tbl, svc);
return 0;
}
/*
* If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
* consider that the server is overloaded here.
*/
static inline int is_overloaded(struct ip_vs_dest *dest)
{
return dest->flags & IP_VS_DEST_F_OVERLOAD;
}
/*
* Source Hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_bucket *tbl;
struct iphdr *iph = ip_hdr(skb);
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
dest = ip_vs_sh_get(tbl, iph->saddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
return NULL;
}
IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
"--> server %u.%u.%u.%u:%d\n",
NIPQUAD(iph->saddr),
NIPQUAD(dest->addr.ip),
ntohs(dest->port));
return dest;
}
/*
* IPVS SH Scheduler structure
*/
static struct ip_vs_scheduler ip_vs_sh_scheduler =
{
.name = "sh",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 0,
#endif
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
.update_service = ip_vs_sh_update_svc,
.schedule = ip_vs_sh_schedule,
};
static int __init ip_vs_sh_init(void)
{
return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
static void __exit ip_vs_sh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
}
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");


@@ -0,0 +1,942 @@
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the NetFilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* ip_vs_sync: sync connection info from master load balancer to backups
* through multicast
*
* Changes:
* Alexandre Cassen : Added master & backup support at a time.
* Alexandre Cassen : Added SyncID support for incoming sync
* messages filtering.
* Justin Ossevoort : Fix endian problem on sync message size.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h> /* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/ip_vs.h>
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT 8848 /* multicast port */
/*
* IPVS sync connection entry
*/
struct ip_vs_sync_conn {
__u8 reserved;
/* Protocol, addresses and port numbers */
__u8 protocol; /* Which protocol (TCP/UDP) */
__be16 cport;
__be16 vport;
__be16 dport;
__be32 caddr; /* client address */
__be32 vaddr; /* virtual address */
__be32 daddr; /* destination address */
/* Flags and state transition */
__be16 flags; /* status flags */
__be16 state; /* state info */
/* The sequence options start here */
};
struct ip_vs_sync_conn_options {
struct ip_vs_seq in_seq; /* incoming seq. struct */
struct ip_vs_seq out_seq; /* outgoing seq. struct */
};
struct ip_vs_sync_thread_data {
struct socket *sock;
char *buf;
};
#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE \
(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
/*
The master multicasts messages to the backup load balancers in the
following format.
0 1 2 3
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| Count Conns | SyncID | Size |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (1) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| . |
| . |
| . |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
| |
| IPVS Sync Connection (n) |
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
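/*
 * Illustrative sketch (not part of this module): the 4-byte header
 * shown above carries the connection count, the SyncID and the total
 * size in network byte order; a simple connection entry, as laid out
 * below, is 24 bytes. A minimal userspace encoder for the header:
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned char hdr[4];
	uint16_t size = htons(4 + 2 * 24);  /* header plus two simple entries */

	hdr[0] = 2;			    /* nr_conns */
	hdr[1] = 7;			    /* syncid */
	memcpy(&hdr[2], &size, sizeof(size));

	printf("header: %02x %02x %02x %02x\n", hdr[0], hdr[1], hdr[2], hdr[3]);
	return 0;
}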
#define SYNC_MESG_HEADER_LEN 4
#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
struct ip_vs_sync_mesg {
__u8 nr_conns;
__u8 syncid;
__u16 size;
/* ip_vs_sync_conn entries start here */
};
/* the maximum length of sync (sending/receiving) message */
static int sync_send_mesg_maxlen;
static int sync_recv_mesg_maxlen;
struct ip_vs_sync_buff {
struct list_head list;
unsigned long firstuse;
/* pointers for the message data */
struct ip_vs_sync_mesg *mesg;
unsigned char *head;
unsigned char *end;
};
/* the sync_buff list head and the lock */
static LIST_HEAD(ip_vs_sync_queue);
static DEFINE_SPINLOCK(ip_vs_sync_lock);
/* current sync_buff for accepting new conn entries */
static struct ip_vs_sync_buff *curr_sb = NULL;
static DEFINE_SPINLOCK(curr_sb_lock);
/* ipvs sync daemon state */
volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
volatile int ip_vs_master_syncid = 0;
volatile int ip_vs_backup_syncid = 0;
/* multicast interface name */
char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
/* sync daemon tasks */
static struct task_struct *sync_master_thread;
static struct task_struct *sync_backup_thread;
/* multicast addr */
static struct sockaddr_in mcast_addr = {
.sin_family = AF_INET,
.sin_port = __constant_htons(IP_VS_SYNC_PORT),
.sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP),
};
static inline struct ip_vs_sync_buff *sb_dequeue(void)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ip_vs_sync_lock);
if (list_empty(&ip_vs_sync_queue)) {
sb = NULL;
} else {
sb = list_entry(ip_vs_sync_queue.next,
struct ip_vs_sync_buff,
list);
list_del(&sb->list);
}
spin_unlock_bh(&ip_vs_sync_lock);
return sb;
}
static inline struct ip_vs_sync_buff *ip_vs_sync_buff_create(void)
{
struct ip_vs_sync_buff *sb;
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
return NULL;
if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
kfree(sb);
return NULL;
}
sb->mesg->nr_conns = 0;
sb->mesg->syncid = ip_vs_master_syncid;
sb->mesg->size = 4;
sb->head = (unsigned char *)sb->mesg + 4;
sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
sb->firstuse = jiffies;
return sb;
}
static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
{
kfree(sb->mesg);
kfree(sb);
}
static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
{
spin_lock(&ip_vs_sync_lock);
if (ip_vs_sync_state & IP_VS_STATE_MASTER)
list_add_tail(&sb->list, &ip_vs_sync_queue);
else
ip_vs_sync_buff_release(sb);
spin_unlock(&ip_vs_sync_lock);
}
/*
* Get the current sync buffer if it has been created for more
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
get_curr_sync_buff(unsigned long time)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&curr_sb_lock);
if (curr_sb && (time == 0 ||
time_before(jiffies - curr_sb->firstuse, time))) {
sb = curr_sb;
curr_sb = NULL;
} else
sb = NULL;
spin_unlock_bh(&curr_sb_lock);
return sb;
}
/*
* Add an ip_vs_conn information into the current sync_buff.
* Called by ip_vs_in.
*/
void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
struct ip_vs_sync_mesg *m;
struct ip_vs_sync_conn *s;
int len;
spin_lock(&curr_sb_lock);
if (!curr_sb) {
if (!(curr_sb=ip_vs_sync_buff_create())) {
spin_unlock(&curr_sb_lock);
IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
return;
}
}
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
m = curr_sb->mesg;
s = (struct ip_vs_sync_conn *)curr_sb->head;
/* copy members */
s->protocol = cp->protocol;
s->cport = cp->cport;
s->vport = cp->vport;
s->dport = cp->dport;
s->caddr = cp->caddr.ip;
s->vaddr = cp->vaddr.ip;
s->daddr = cp->daddr.ip;
s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
s->state = htons(cp->state);
if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
struct ip_vs_sync_conn_options *opt =
(struct ip_vs_sync_conn_options *)&s[1];
memcpy(opt, &cp->in_seq, sizeof(*opt));
}
m->nr_conns++;
m->size += len;
curr_sb->head += len;
/* check if there is room for the next entry */
if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
sb_queue_tail(curr_sb);
curr_sb = NULL;
}
spin_unlock(&curr_sb_lock);
/* synchronize its controller if it has one */
if (cp->control)
ip_vs_sync_conn(cp->control);
}
/*
* Process received multicast message and create the corresponding
* ip_vs_conn entries.
*/
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
struct ip_vs_sync_conn *s;
struct ip_vs_sync_conn_options *opt;
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
struct ip_vs_dest *dest;
char *p;
int i;
if (buflen < sizeof(struct ip_vs_sync_mesg)) {
IP_VS_ERR_RL("sync message header too short\n");
return;
}
/* Convert size back to host byte order */
m->size = ntohs(m->size);
if (buflen != m->size) {
IP_VS_ERR_RL("bogus sync message size\n");
return;
}
/* SyncID sanity check */
if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
m->syncid);
return;
}
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
for (i=0; i<m->nr_conns; i++) {
unsigned flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
IP_VS_ERR_RL("bogus conn in sync message\n");
return;
}
s = (struct ip_vs_sync_conn *) p;
flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
flags &= ~IP_VS_CONN_F_HASHED;
if (flags & IP_VS_CONN_F_SEQ_MASK) {
opt = (struct ip_vs_sync_conn_options *)&s[1];
p += FULL_CONN_SIZE;
if (p > buffer+buflen) {
IP_VS_ERR_RL("bogus conn options in sync message\n");
return;
}
} else {
opt = NULL;
p += SIMPLE_CONN_SIZE;
}
state = ntohs(s->state);
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
pp = ip_vs_proto_get(s->protocol);
if (!pp) {
IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
s->protocol);
continue;
}
if (state >= pp->num_states) {
IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
pp->name, state);
continue;
}
} else {
/* protocol in templates is not used for state/timeout */
pp = NULL;
if (state > 0) {
IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
state);
state = 0;
}
}
if (!(flags & IP_VS_CONN_F_TEMPLATE))
cp = ip_vs_conn_in_get(AF_INET, s->protocol,
(union nf_inet_addr *)&s->caddr,
s->cport,
(union nf_inet_addr *)&s->vaddr,
s->vport);
else
cp = ip_vs_ct_in_get(AF_INET, s->protocol,
(union nf_inet_addr *)&s->caddr,
s->cport,
(union nf_inet_addr *)&s->vaddr,
s->vport);
if (!cp) {
/*
* Find the appropriate destination for the connection.
* If it is not found the connection will remain unbound
* but still handled.
*/
dest = ip_vs_find_dest(AF_INET,
(union nf_inet_addr *)&s->daddr,
s->dport,
(union nf_inet_addr *)&s->vaddr,
s->vport,
s->protocol);
/* Set the appropriate activity flag */
if (s->protocol == IPPROTO_TCP) {
if (state != IP_VS_TCP_S_ESTABLISHED)
flags |= IP_VS_CONN_F_INACTIVE;
else
flags &= ~IP_VS_CONN_F_INACTIVE;
}
cp = ip_vs_conn_new(AF_INET, s->protocol,
(union nf_inet_addr *)&s->caddr,
s->cport,
(union nf_inet_addr *)&s->vaddr,
s->vport,
(union nf_inet_addr *)&s->daddr,
s->dport,
flags, dest);
if (dest)
atomic_dec(&dest->refcnt);
if (!cp) {
IP_VS_ERR("ip_vs_conn_new failed\n");
return;
}
} else if (!cp->dest) {
dest = ip_vs_try_bind_dest(cp);
if (dest)
atomic_dec(&dest->refcnt);
} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
(cp->state != state)) {
/* update active/inactive flag for the connection */
dest = cp->dest;
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
(state != IP_VS_TCP_S_ESTABLISHED)) {
atomic_dec(&dest->activeconns);
atomic_inc(&dest->inactconns);
cp->flags |= IP_VS_CONN_F_INACTIVE;
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
(state == IP_VS_TCP_S_ESTABLISHED)) {
atomic_inc(&dest->activeconns);
atomic_dec(&dest->inactconns);
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
if (opt)
memcpy(&cp->in_seq, opt, sizeof(*opt));
atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
cp->state = state;
cp->old_state = cp->state;
/*
* We cannot recover the right timeout for templates
* in all cases, because we cannot find the right fwmark
* virtual service. If needed, we could do it for
* non-fwmark persistent services.
*/
if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
cp->timeout = pp->timeout_table[state];
else
cp->timeout = (3*60*HZ);
ip_vs_conn_put(cp);
}
}
/*
* Setup loopback of outgoing multicasts on a sending socket
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
{
struct inet_sock *inet = inet_sk(sk);
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
lock_sock(sk);
inet->mc_loop = loop ? 1 : 0;
release_sock(sk);
}
/*
* Specify TTL for outgoing multicasts on a sending socket
*/
static void set_mcast_ttl(struct sock *sk, u_char ttl)
{
struct inet_sock *inet = inet_sk(sk);
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
inet->mc_ttl = ttl;
release_sock(sk);
}
/*
* Specify the default interface for outgoing multicasts
*/
static int set_mcast_if(struct sock *sk, char *ifname)
{
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
lock_sock(sk);
inet->mc_index = dev->ifindex;
/* inet->mc_addr = 0; */
release_sock(sk);
return 0;
}
/*
* Set the maximum length of sync message according to the
* specified interface's MTU.
*/
static int set_sync_mesg_maxlen(int sync_state)
{
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
sizeof(struct udphdr) -
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
IP_VS_DBG(7, "setting the maximum length of sync sending "
"message %d.\n", sync_send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
return -ENODEV;
sync_recv_mesg_maxlen = dev->mtu -
sizeof(struct iphdr) - sizeof(struct udphdr);
IP_VS_DBG(7, "setting the maximum length of sync receiving "
"message %d.\n", sync_recv_mesg_maxlen);
}
return 0;
}
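/*
 * Illustrative sketch (not part of this module): worked through for a
 * typical Ethernet MTU of 1500 on the master side, 1500 - 20 (iphdr)
 * - 8 (udphdr) - 4 (sync header) - 20 (headroom, presumably for IP
 * options) leaves 1448 bytes, i.e. 60 simple 24-byte entries, so the
 * send buffer is sized to 4 + 60*24 = 1444 bytes.
 */
#include <stdio.h>

int main(void)
{
	int mtu = 1500;			/* assumed Ethernet MTU */
	int conn_size = 24;		/* sizeof(struct ip_vs_sync_conn) */
	int num = (mtu - 20 - 8 - 4 - 20) / conn_size;

	if (num > 255)			/* nr_conns is only 8 bits wide */
		num = 255;
	printf("entries per datagram: %d, maxlen: %d\n",
	       num, 4 + conn_size * num);
	return 0;
}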
/*
* Join a multicast group.
* the group is specified by a class D multicast address (224.0.0.0/4)
* in the in_addr structure passed in as a parameter.
*/
static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
struct ip_mreqn mreq;
struct net_device *dev;
int ret;
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
mreq.imr_ifindex = dev->ifindex;
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
return ret;
}
static int bind_mcastif_addr(struct socket *sock, char *ifname)
{
struct net_device *dev;
__be32 addr;
struct sockaddr_in sin;
if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
if (!addr)
IP_VS_ERR("You probably need to specify IP address on "
"multicast interface.\n");
IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
ifname, NIPQUAD(addr));
/* Now bind the socket with the address of multicast interface */
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = addr;
sin.sin_port = 0;
return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
}
/*
* Set up sending multicast socket over UDP
*/
static struct socket *make_send_sock(void)
{
struct socket *sock;
int result;
/* First create a socket */
result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
if (result < 0) {
IP_VS_ERR("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
if (result < 0) {
IP_VS_ERR("Error setting outbound mcast interface\n");
goto error;
}
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
if (result < 0) {
IP_VS_ERR("Error binding address of the mcast interface\n");
goto error;
}
result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
sizeof(struct sockaddr), 0);
if (result < 0) {
IP_VS_ERR("Error connecting to the multicast addr\n");
goto error;
}
return sock;
error:
sock_release(sock);
return ERR_PTR(result);
}
/*
* Set up receiving multicast socket over UDP
*/
static struct socket *make_receive_sock(void)
{
struct socket *sock;
int result;
/* First create a socket */
result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
if (result < 0) {
IP_VS_ERR("Error during creation of socket; terminating\n");
return ERR_PTR(result);
}
/* it is equivalent to the REUSEADDR option in user-space */
sock->sk->sk_reuse = 1;
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
sizeof(struct sockaddr));
if (result < 0) {
IP_VS_ERR("Error binding to the multicast addr\n");
goto error;
}
/* join the multicast group */
result = join_mcast_group(sock->sk,
(struct in_addr *) &mcast_addr.sin_addr,
ip_vs_backup_mcast_ifn);
if (result < 0) {
IP_VS_ERR("Error joining to the multicast group\n");
goto error;
}
return sock;
error:
sock_release(sock);
return ERR_PTR(result);
}
static int
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
{
struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
struct kvec iov;
int len;
EnterFunction(7);
iov.iov_base = (void *)buffer;
iov.iov_len = length;
len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
LeaveFunction(7);
return len;
}
static void
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
int msize;
msize = msg->size;
/* Put size in network byte order */
msg->size = htons(msg->size);
if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
IP_VS_ERR("ip_vs_send_async error\n");
}
static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
struct msghdr msg = {NULL,};
struct kvec iov;
int len;
EnterFunction(7);
/* Receive a packet */
iov.iov_base = buffer;
iov.iov_len = (size_t)buflen;
len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
if (len < 0)
return -1;
LeaveFunction(7);
return len;
}
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct ip_vs_sync_buff *sb;
IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
"syncid = %d\n",
ip_vs_master_mcast_ifn, ip_vs_master_syncid);
while (!kthread_should_stop()) {
while ((sb = sb_dequeue())) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
}
/* check if entries stay in curr_sb for 2 seconds */
sb = get_curr_sync_buff(2 * HZ);
if (sb) {
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
ip_vs_sync_buff_release(sb);
}
schedule_timeout_interruptible(HZ);
}
/* clean up the sync_buff queue */
while ((sb=sb_dequeue())) {
ip_vs_sync_buff_release(sb);
}
/* clean up the current sync_buff */
if ((sb = get_curr_sync_buff(0))) {
ip_vs_sync_buff_release(sb);
}
/* release the sending multicast socket */
sock_release(tinfo->sock);
kfree(tinfo);
return 0;
}
static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
int len;
IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
"syncid = %d\n",
ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
while (!kthread_should_stop()) {
wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
!skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
|| kthread_should_stop());
/* do we have data now? */
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
sync_recv_mesg_maxlen);
if (len <= 0) {
IP_VS_ERR("receiving message error\n");
break;
}
/* disable bottom halves, because this path accesses data
shared with softirq context while getting/creating conns */
local_bh_disable();
ip_vs_process_message(tinfo->buf, len);
local_bh_enable();
}
}
/* release the receiving multicast socket */
sock_release(tinfo->sock);
kfree(tinfo->buf);
kfree(tinfo);
return 0;
}
int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
struct task_struct **realtask, *task;
struct socket *sock;
char *name, *buf = NULL;
int (*threadfn)(void *data);
int result = -ENOMEM;
IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
sizeof(struct ip_vs_sync_conn));
if (state == IP_VS_STATE_MASTER) {
if (sync_master_thread)
return -EEXIST;
strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
sizeof(ip_vs_master_mcast_ifn));
ip_vs_master_syncid = syncid;
realtask = &sync_master_thread;
name = "ipvs_syncmaster";
threadfn = sync_thread_master;
sock = make_send_sock();
} else if (state == IP_VS_STATE_BACKUP) {
if (sync_backup_thread)
return -EEXIST;
strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
sizeof(ip_vs_backup_mcast_ifn));
ip_vs_backup_syncid = syncid;
realtask = &sync_backup_thread;
name = "ipvs_syncbackup";
threadfn = sync_thread_backup;
sock = make_receive_sock();
} else {
return -EINVAL;
}
if (IS_ERR(sock)) {
result = PTR_ERR(sock);
goto out;
}
set_sync_mesg_maxlen(state);
if (state == IP_VS_STATE_BACKUP) {
buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
if (!buf)
goto outsocket;
}
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
goto outbuf;
tinfo->sock = sock;
tinfo->buf = buf;
task = kthread_run(threadfn, tinfo, name);
if (IS_ERR(task)) {
result = PTR_ERR(task);
goto outtinfo;
}
/* mark as active */
*realtask = task;
ip_vs_sync_state |= state;
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
outtinfo:
kfree(tinfo);
outbuf:
kfree(buf);
outsocket:
sock_release(sock);
out:
return result;
}
int stop_sync_thread(int state)
{
IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
if (!sync_master_thread)
return -ESRCH;
IP_VS_INFO("stopping master sync thread %d ...\n",
task_pid_nr(sync_master_thread));
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
* add sync buffers to the queue, when we are already in
* progress of stopping the master sync daemon.
*/
spin_lock_bh(&ip_vs_sync_lock);
ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
spin_unlock_bh(&ip_vs_sync_lock);
kthread_stop(sync_master_thread);
sync_master_thread = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
if (!sync_backup_thread)
return -ESRCH;
IP_VS_INFO("stopping backup sync thread %d ...\n",
task_pid_nr(sync_backup_thread));
ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
kthread_stop(sync_backup_thread);
sync_backup_thread = NULL;
} else {
return -EINVAL;
}
/* decrease the module use count */
ip_vs_use_count_dec();
return 0;
}


@@ -0,0 +1,128 @@
/*
* IPVS: Weighted Least-Connection Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
* Wensong Zhang : changed to use the inactconns in scheduling
* Wensong Zhang : changed some cosmetic things for debugging
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_wlc_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <net/ip_vs.h>
static inline unsigned int
ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
{
/*
* We estimate the overhead of processing an active connection to be
* 256 times that of an inactive one, on average. (This factor of 256
* may not be accurate; it can be tuned later.) We use the following
* formula to estimate the overhead now:
* dest->activeconns*256 + dest->inactconns
*/
return (atomic_read(&dest->activeconns) << 8) +
atomic_read(&dest->inactconns);
}
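/*
 * Illustrative sketch (not part of this module, sample numbers): with
 * the 256:1 weighting above, 10 active and 300 inactive connections
 * give an overhead of 10*256 + 300 = 2860.
 */
#include <stdio.h>

static unsigned wlc_overhead(unsigned active, unsigned inact)
{
	return (active << 8) + inact;	/* active conns weighted 256:1 */
}

int main(void)
{
	printf("overhead: %u\n", wlc_overhead(10, 300));	/* 2860 */
	return 0;
}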
/*
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
/*
* We calculate the load of each dest server as follows:
* (dest overhead) / dest->weight
*
* Remember -- no floats in kernel mode!!!
* The comparison of h1*w2 > h2*w1 is equivalent to that of
* h1/w1 > h2/w2
* if every weight is larger than zero.
*
* The server with weight=0 is quiesced and will not receive any
* new connections.
*/
list_for_each_entry(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
loh = ip_vs_wlc_dest_overhead(least);
goto nextstage;
}
}
return NULL;
/*
* Find the destination with the least load.
*/
nextstage:
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_wlc_dest_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
loh = doh;
}
}
IP_VS_DBG_BUF(6, "WLC: server %s:%u "
"activeconns %d refcnt %d weight %d overhead %d\n",
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
atomic_read(&least->activeconns),
atomic_read(&least->refcnt),
atomic_read(&least->weight), loh);
return least;
}
static struct ip_vs_scheduler ip_vs_wlc_scheduler =
{
.name = "wlc",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 1,
#endif
.schedule = ip_vs_wlc_schedule,
};
static int __init ip_vs_wlc_init(void)
{
return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
}
static void __exit ip_vs_wlc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
}
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");


@@ -0,0 +1,237 @@
/*
* IPVS: Weighted Round-Robin Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Changes:
* Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
* Wensong Zhang : changed some cosmetic things for debugging
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_wrr_update_svc
* Julian Anastasov : fixed the bug of returning destination
* with weight 0 when all weights are zero
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <net/ip_vs.h>
/*
* current destination pointer for weighted round-robin scheduling
*/
struct ip_vs_wrr_mark {
struct list_head *cl; /* current list head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
};
/*
* Get the gcd of server weights
*/
static int gcd(int a, int b)
{
int c;
while ((c = a % b)) {
a = b;
b = c;
}
return b;
}
static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
int weight;
int g = 0;
list_for_each_entry(dest, &svc->destinations, n_list) {
weight = atomic_read(&dest->weight);
if (weight > 0) {
if (g > 0)
g = gcd(weight, g);
else
g = weight;
}
}
return g ? g : 1;
}
/*
* Get the maximum weight of the service destinations.
*/
static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
int weight = 0;
list_for_each_entry(dest, &svc->destinations, n_list) {
if (atomic_read(&dest->weight) > weight)
weight = atomic_read(&dest->weight);
}
return weight;
}
static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_wrr_mark *mark;
/*
* Allocate the mark variable for WRR scheduling
*/
mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
if (mark == NULL) {
IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
return -ENOMEM;
}
mark->cl = &svc->destinations;
mark->cw = 0;
mark->mw = ip_vs_wrr_max_weight(svc);
mark->di = ip_vs_wrr_gcd_weight(svc);
svc->sched_data = mark;
return 0;
}
static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
/*
* Release the mark variable
*/
kfree(svc->sched_data);
return 0;
}
static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
{
struct ip_vs_wrr_mark *mark = svc->sched_data;
mark->cl = &svc->destinations;
mark->mw = ip_vs_wrr_max_weight(svc);
mark->di = ip_vs_wrr_gcd_weight(svc);
if (mark->cw > mark->mw)
mark->cw = 0;
return 0;
}
/*
* Weighted Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_wrr_mark *mark = svc->sched_data;
struct list_head *p;
IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
/*
* This loop will always terminate, because mark->cw stays in (0, max_weight]
* and at least one server has its weight equal to max_weight.
*/
write_lock(&svc->sched_lock);
p = mark->cl;
while (1) {
if (mark->cl == &svc->destinations) {
/* it is at the head of the destination list */
if (mark->cl == mark->cl->next) {
/* no dest entry */
dest = NULL;
goto out;
}
mark->cl = svc->destinations.next;
mark->cw -= mark->di;
if (mark->cw <= 0) {
mark->cw = mark->mw;
/*
* Still zero, which means no available servers.
*/
if (mark->cw == 0) {
mark->cl = &svc->destinations;
IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
"no available servers\n");
dest = NULL;
goto out;
}
}
} else
mark->cl = mark->cl->next;
if (mark->cl != &svc->destinations) {
/* not at the head of the list */
dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) >= mark->cw) {
/* got it */
break;
}
}
if (mark->cl == p && mark->cw == mark->di) {
/* back to the start and no dest has been found;
it is only possible when all dests are OVERLOADED */
dest = NULL;
goto out;
}
}
IP_VS_DBG_BUF(6, "WRR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
out:
write_unlock(&svc->sched_lock);
return dest;
}
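/*
 * Illustrative sketch (not part of this module): with weights {4, 3, 2}
 * the loop above has mw = 4 and di = gcd = 1, and one full cycle of
 * nine picks yields A A B A B C A B C -- A four times, B three, C two.
 * Arrays stand in for list_head, and the no-available-server and
 * overload guards are omitted.
 */
#include <stdio.h>

struct wrr_state { int cl, cw, mw, di; };

static int gcd(int a, int b)
{
	while (b) {
		int t = a % b;
		a = b;
		b = t;
	}
	return a;
}

/* Slot n plays the list head; entry i has weight w[i] > 0. */
static int wrr_pick(struct wrr_state *m, const int *w, int n)
{
	for (;;) {
		m->cl = (m->cl + 1) % (n + 1);
		if (m->cl == n) {		/* wrapped past the head */
			m->cw -= m->di;
			if (m->cw <= 0)
				m->cw = m->mw;
			continue;
		}
		if (w[m->cl] >= m->cw)
			return m->cl;
	}
}

int main(void)
{
	int w[] = { 4, 3, 2 }, i;
	/* cl positioned so the first advance lands on the head slot */
	struct wrr_state m = { 2, 0, 4, 1 };

	m.di = gcd(gcd(w[0], w[1]), w[2]);	/* = 1 */
	for (i = 0; i < 9; i++)
		printf("%c ", 'A' + wrr_pick(&m, w, 3));
	printf("\n");				/* A A B A B C A B C */
	return 0;
}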
static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
.name = "wrr",
.refcnt = ATOMIC_INIT(0),
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
.supports_ipv6 = 1,
#endif
.init_service = ip_vs_wrr_init_svc,
.done_service = ip_vs_wrr_done_svc,
.update_service = ip_vs_wrr_update_svc,
.schedule = ip_vs_wrr_schedule,
};
static int __init ip_vs_wrr_init(void)
{
return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
}
static void __exit ip_vs_wrr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
}
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");

File diff suppressed because it is too large.