Merge branch 'lvs-next-2.6' of git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-2.6
Conflicts: net/netfilter/Kconfig
This commit is contained in:
@@ -858,3 +858,5 @@ config NETFILTER_XT_MATCH_U32
|
||||
endif # NETFILTER_XTABLES
|
||||
|
||||
endmenu
|
||||
|
||||
source "net/netfilter/ipvs/Kconfig"
|
||||
|
@@ -89,3 +89,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
|
||||
obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
|
||||
obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
|
||||
obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
|
||||
|
||||
# IPVS
|
||||
obj-$(CONFIG_IP_VS) += ipvs/
|
||||
|
239
net/netfilter/ipvs/Kconfig
Normal file
239
net/netfilter/ipvs/Kconfig
Normal file
@@ -0,0 +1,239 @@
|
||||
#
|
||||
# IP Virtual Server configuration
|
||||
#
|
||||
menuconfig IP_VS
|
||||
tristate "IP virtual server support (EXPERIMENTAL)"
|
||||
depends on NETFILTER
|
||||
---help---
|
||||
IP Virtual Server support will let you build a high-performance
|
||||
virtual server based on cluster of two or more real servers. This
|
||||
option must be enabled for at least one of the clustered computers
|
||||
that will take care of intercepting incoming connections to a
|
||||
single IP address and scheduling them to real servers.
|
||||
|
||||
Three request dispatching techniques are implemented, they are
|
||||
virtual server via NAT, virtual server via tunneling and virtual
|
||||
server via direct routing. The several scheduling algorithms can
|
||||
be used to choose which server the connection is directed to,
|
||||
thus load balancing can be achieved among the servers. For more
|
||||
information and its administration program, please visit the
|
||||
following URL: <http://www.linuxvirtualserver.org/>.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
if IP_VS
|
||||
|
||||
config IP_VS_IPV6
|
||||
bool "IPv6 support for IPVS (DANGEROUS)"
|
||||
depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
|
||||
---help---
|
||||
Add IPv6 support to IPVS. This is incomplete and might be dangerous.
|
||||
|
||||
Say N if unsure.
|
||||
|
||||
config IP_VS_DEBUG
|
||||
bool "IP virtual server debugging"
|
||||
---help---
|
||||
Say Y here if you want to get additional messages useful in
|
||||
debugging the IP virtual server code. You can change the debug
|
||||
level in /proc/sys/net/ipv4/vs/debug_level
|
||||
|
||||
config IP_VS_TAB_BITS
|
||||
int "IPVS connection table size (the Nth power of 2)"
|
||||
range 8 20
|
||||
default 12
|
||||
---help---
|
||||
The IPVS connection hash table uses the chaining scheme to handle
|
||||
hash collisions. Using a big IPVS connection hash table will greatly
|
||||
reduce conflicts when there are hundreds of thousands of connections
|
||||
in the hash table.
|
||||
|
||||
Note the table size must be power of 2. The table size will be the
|
||||
value of 2 to the your input number power. The number to choose is
|
||||
from 8 to 20, the default number is 12, which means the table size
|
||||
is 4096. Don't input the number too small, otherwise you will lose
|
||||
performance on it. You can adapt the table size yourself, according
|
||||
to your virtual server application. It is good to set the table size
|
||||
not far less than the number of connections per second multiplying
|
||||
average lasting time of connection in the table. For example, your
|
||||
virtual server gets 200 connections per second, the connection lasts
|
||||
for 200 seconds in average in the connection table, the table size
|
||||
should be not far less than 200x200, it is good to set the table
|
||||
size 32768 (2**15).
|
||||
|
||||
Another note that each connection occupies 128 bytes effectively and
|
||||
each hash entry uses 8 bytes, so you can estimate how much memory is
|
||||
needed for your box.
|
||||
|
||||
comment "IPVS transport protocol load balancing support"
|
||||
|
||||
config IP_VS_PROTO_TCP
|
||||
bool "TCP load balancing support"
|
||||
---help---
|
||||
This option enables support for load balancing TCP transport
|
||||
protocol. Say Y if unsure.
|
||||
|
||||
config IP_VS_PROTO_UDP
|
||||
bool "UDP load balancing support"
|
||||
---help---
|
||||
This option enables support for load balancing UDP transport
|
||||
protocol. Say Y if unsure.
|
||||
|
||||
config IP_VS_PROTO_AH_ESP
|
||||
bool
|
||||
depends on UNDEFINED
|
||||
|
||||
config IP_VS_PROTO_ESP
|
||||
bool "ESP load balancing support"
|
||||
select IP_VS_PROTO_AH_ESP
|
||||
---help---
|
||||
This option enables support for load balancing ESP (Encapsulation
|
||||
Security Payload) transport protocol. Say Y if unsure.
|
||||
|
||||
config IP_VS_PROTO_AH
|
||||
bool "AH load balancing support"
|
||||
select IP_VS_PROTO_AH_ESP
|
||||
---help---
|
||||
This option enables support for load balancing AH (Authentication
|
||||
Header) transport protocol. Say Y if unsure.
|
||||
|
||||
comment "IPVS scheduler"
|
||||
|
||||
config IP_VS_RR
|
||||
tristate "round-robin scheduling"
|
||||
---help---
|
||||
The robin-robin scheduling algorithm simply directs network
|
||||
connections to different real servers in a round-robin manner.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_WRR
|
||||
tristate "weighted round-robin scheduling"
|
||||
---help---
|
||||
The weighted robin-robin scheduling algorithm directs network
|
||||
connections to different real servers based on server weights
|
||||
in a round-robin manner. Servers with higher weights receive
|
||||
new connections first than those with less weights, and servers
|
||||
with higher weights get more connections than those with less
|
||||
weights and servers with equal weights get equal connections.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_LC
|
||||
tristate "least-connection scheduling"
|
||||
---help---
|
||||
The least-connection scheduling algorithm directs network
|
||||
connections to the server with the least number of active
|
||||
connections.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_WLC
|
||||
tristate "weighted least-connection scheduling"
|
||||
---help---
|
||||
The weighted least-connection scheduling algorithm directs network
|
||||
connections to the server with the least active connections
|
||||
normalized by the server weight.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_LBLC
|
||||
tristate "locality-based least-connection scheduling"
|
||||
---help---
|
||||
The locality-based least-connection scheduling algorithm is for
|
||||
destination IP load balancing. It is usually used in cache cluster.
|
||||
This algorithm usually directs packet destined for an IP address to
|
||||
its server if the server is alive and under load. If the server is
|
||||
overloaded (its active connection numbers is larger than its weight)
|
||||
and there is a server in its half load, then allocate the weighted
|
||||
least-connection server to this IP address.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_LBLCR
|
||||
tristate "locality-based least-connection with replication scheduling"
|
||||
---help---
|
||||
The locality-based least-connection with replication scheduling
|
||||
algorithm is also for destination IP load balancing. It is
|
||||
usually used in cache cluster. It differs from the LBLC scheduling
|
||||
as follows: the load balancer maintains mappings from a target
|
||||
to a set of server nodes that can serve the target. Requests for
|
||||
a target are assigned to the least-connection node in the target's
|
||||
server set. If all the node in the server set are over loaded,
|
||||
it picks up a least-connection node in the cluster and adds it
|
||||
in the sever set for the target. If the server set has not been
|
||||
modified for the specified time, the most loaded node is removed
|
||||
from the server set, in order to avoid high degree of replication.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_DH
|
||||
tristate "destination hashing scheduling"
|
||||
---help---
|
||||
The destination hashing scheduling algorithm assigns network
|
||||
connections to the servers through looking up a statically assigned
|
||||
hash table by their destination IP addresses.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_SH
|
||||
tristate "source hashing scheduling"
|
||||
---help---
|
||||
The source hashing scheduling algorithm assigns network
|
||||
connections to the servers through looking up a statically assigned
|
||||
hash table by their source IP addresses.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_SED
|
||||
tristate "shortest expected delay scheduling"
|
||||
---help---
|
||||
The shortest expected delay scheduling algorithm assigns network
|
||||
connections to the server with the shortest expected delay. The
|
||||
expected delay that the job will experience is (Ci + 1) / Ui if
|
||||
sent to the ith server, in which Ci is the number of connections
|
||||
on the ith server and Ui is the fixed service rate (weight)
|
||||
of the ith server.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_VS_NQ
|
||||
tristate "never queue scheduling"
|
||||
---help---
|
||||
The never queue scheduling algorithm adopts a two-speed model.
|
||||
When there is an idle server available, the job will be sent to
|
||||
the idle server, instead of waiting for a fast one. When there
|
||||
is no idle server available, the job will be sent to the server
|
||||
that minimize its expected delay (The Shortest Expected Delay
|
||||
scheduling algorithm).
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
comment 'IPVS application helper'
|
||||
|
||||
config IP_VS_FTP
|
||||
tristate "FTP protocol helper"
|
||||
depends on IP_VS_PROTO_TCP
|
||||
---help---
|
||||
FTP is a protocol that transfers IP address and/or port number in
|
||||
the payload. In the virtual server via Network Address Translation,
|
||||
the IP address and port number of real servers cannot be sent to
|
||||
clients in ftp connections directly, so FTP protocol helper is
|
||||
required for tracking the connection and mangling it back to that of
|
||||
virtual service.
|
||||
|
||||
If you want to compile it in kernel, say Y. To compile it as a
|
||||
module, choose M here. If unsure, say N.
|
||||
|
||||
endif # IP_VS
|
33
net/netfilter/ipvs/Makefile
Normal file
33
net/netfilter/ipvs/Makefile
Normal file
@@ -0,0 +1,33 @@
|
||||
#
|
||||
# Makefile for the IPVS modules on top of IPv4.
|
||||
#
|
||||
|
||||
# IPVS transport protocol load balancing support
|
||||
ip_vs_proto-objs-y :=
|
||||
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
|
||||
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
|
||||
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
|
||||
|
||||
ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
|
||||
ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
|
||||
ip_vs_est.o ip_vs_proto.o \
|
||||
$(ip_vs_proto-objs-y)
|
||||
|
||||
|
||||
# IPVS core
|
||||
obj-$(CONFIG_IP_VS) += ip_vs.o
|
||||
|
||||
# IPVS schedulers
|
||||
obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
|
||||
obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
|
||||
obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
|
||||
obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
|
||||
obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
|
||||
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
|
||||
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
|
||||
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
|
||||
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
|
||||
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
|
||||
|
||||
# IPVS application helpers
|
||||
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
|
622
net/netfilter/ipvs/ip_vs_app.c
Normal file
622
net/netfilter/ipvs/ip_vs_app.c
Normal file
@@ -0,0 +1,622 @@
|
||||
/*
|
||||
* ip_vs_app.c: Application module support for IPVS
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
|
||||
* is that ip_vs_app module handles the reverse direction (incoming requests
|
||||
* and outgoing responses).
|
||||
*
|
||||
* IP_MASQ_APP application masquerading module
|
||||
*
|
||||
* Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/tcp.h>
|
||||
#include <asm/system.h>
|
||||
#include <linux/stat.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/mutex.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
EXPORT_SYMBOL(register_ip_vs_app);
|
||||
EXPORT_SYMBOL(unregister_ip_vs_app);
|
||||
EXPORT_SYMBOL(register_ip_vs_app_inc);
|
||||
|
||||
/* ipvs application list head */
|
||||
static LIST_HEAD(ip_vs_app_list);
|
||||
static DEFINE_MUTEX(__ip_vs_app_mutex);
|
||||
|
||||
|
||||
/*
|
||||
* Get an ip_vs_app object
|
||||
*/
|
||||
static inline int ip_vs_app_get(struct ip_vs_app *app)
|
||||
{
|
||||
return try_module_get(app->module);
|
||||
}
|
||||
|
||||
|
||||
static inline void ip_vs_app_put(struct ip_vs_app *app)
|
||||
{
|
||||
module_put(app->module);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Allocate/initialize app incarnation and register it in proto apps.
|
||||
*/
|
||||
static int
|
||||
ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
|
||||
{
|
||||
struct ip_vs_protocol *pp;
|
||||
struct ip_vs_app *inc;
|
||||
int ret;
|
||||
|
||||
if (!(pp = ip_vs_proto_get(proto)))
|
||||
return -EPROTONOSUPPORT;
|
||||
|
||||
if (!pp->unregister_app)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
|
||||
if (!inc)
|
||||
return -ENOMEM;
|
||||
INIT_LIST_HEAD(&inc->p_list);
|
||||
INIT_LIST_HEAD(&inc->incs_list);
|
||||
inc->app = app;
|
||||
inc->port = htons(port);
|
||||
atomic_set(&inc->usecnt, 0);
|
||||
|
||||
if (app->timeouts) {
|
||||
inc->timeout_table =
|
||||
ip_vs_create_timeout_table(app->timeouts,
|
||||
app->timeouts_size);
|
||||
if (!inc->timeout_table) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = pp->register_app(inc);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
list_add(&inc->a_list, &app->incs_list);
|
||||
IP_VS_DBG(9, "%s application %s:%u registered\n",
|
||||
pp->name, inc->name, inc->port);
|
||||
|
||||
return 0;
|
||||
|
||||
out:
|
||||
kfree(inc->timeout_table);
|
||||
kfree(inc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Release app incarnation
|
||||
*/
|
||||
static void
|
||||
ip_vs_app_inc_release(struct ip_vs_app *inc)
|
||||
{
|
||||
struct ip_vs_protocol *pp;
|
||||
|
||||
if (!(pp = ip_vs_proto_get(inc->protocol)))
|
||||
return;
|
||||
|
||||
if (pp->unregister_app)
|
||||
pp->unregister_app(inc);
|
||||
|
||||
IP_VS_DBG(9, "%s App %s:%u unregistered\n",
|
||||
pp->name, inc->name, inc->port);
|
||||
|
||||
list_del(&inc->a_list);
|
||||
|
||||
kfree(inc->timeout_table);
|
||||
kfree(inc);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get reference to app inc (only called from softirq)
|
||||
*
|
||||
*/
|
||||
int ip_vs_app_inc_get(struct ip_vs_app *inc)
|
||||
{
|
||||
int result;
|
||||
|
||||
atomic_inc(&inc->usecnt);
|
||||
if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
|
||||
atomic_dec(&inc->usecnt);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Put the app inc (only called from timer or net softirq)
|
||||
*/
|
||||
void ip_vs_app_inc_put(struct ip_vs_app *inc)
|
||||
{
|
||||
ip_vs_app_put(inc->app);
|
||||
atomic_dec(&inc->usecnt);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register an application incarnation in protocol applications
|
||||
*/
|
||||
int
|
||||
register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
|
||||
{
|
||||
int result;
|
||||
|
||||
mutex_lock(&__ip_vs_app_mutex);
|
||||
|
||||
result = ip_vs_app_inc_new(app, proto, port);
|
||||
|
||||
mutex_unlock(&__ip_vs_app_mutex);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ip_vs_app registration routine
|
||||
*/
|
||||
int register_ip_vs_app(struct ip_vs_app *app)
|
||||
{
|
||||
/* increase the module use count */
|
||||
ip_vs_use_count_inc();
|
||||
|
||||
mutex_lock(&__ip_vs_app_mutex);
|
||||
|
||||
list_add(&app->a_list, &ip_vs_app_list);
|
||||
|
||||
mutex_unlock(&__ip_vs_app_mutex);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ip_vs_app unregistration routine
|
||||
* We are sure there are no app incarnations attached to services
|
||||
*/
|
||||
void unregister_ip_vs_app(struct ip_vs_app *app)
|
||||
{
|
||||
struct ip_vs_app *inc, *nxt;
|
||||
|
||||
mutex_lock(&__ip_vs_app_mutex);
|
||||
|
||||
list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
|
||||
ip_vs_app_inc_release(inc);
|
||||
}
|
||||
|
||||
list_del(&app->a_list);
|
||||
|
||||
mutex_unlock(&__ip_vs_app_mutex);
|
||||
|
||||
/* decrease the module use count */
|
||||
ip_vs_use_count_dec();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
|
||||
*/
|
||||
int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
|
||||
{
|
||||
return pp->app_conn_bind(cp);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Unbind cp from application incarnation (called by cp destructor)
|
||||
*/
|
||||
void ip_vs_unbind_app(struct ip_vs_conn *cp)
|
||||
{
|
||||
struct ip_vs_app *inc = cp->app;
|
||||
|
||||
if (!inc)
|
||||
return;
|
||||
|
||||
if (inc->unbind_conn)
|
||||
inc->unbind_conn(inc, cp);
|
||||
if (inc->done_conn)
|
||||
inc->done_conn(inc, cp);
|
||||
ip_vs_app_inc_put(inc);
|
||||
cp->app = NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Fixes th->seq based on ip_vs_seq info.
|
||||
*/
|
||||
static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
|
||||
{
|
||||
__u32 seq = ntohl(th->seq);
|
||||
|
||||
/*
|
||||
* Adjust seq with delta-offset for all packets after
|
||||
* the most recent resized pkt seq and with previous_delta offset
|
||||
* for all packets before most recent resized pkt seq.
|
||||
*/
|
||||
if (vseq->delta || vseq->previous_delta) {
|
||||
if(after(seq, vseq->init_seq)) {
|
||||
th->seq = htonl(seq + vseq->delta);
|
||||
IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
|
||||
vseq->delta);
|
||||
} else {
|
||||
th->seq = htonl(seq + vseq->previous_delta);
|
||||
IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
|
||||
"(%d) to seq\n", vseq->previous_delta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Fixes th->ack_seq based on ip_vs_seq info.
|
||||
*/
|
||||
static inline void
|
||||
vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
|
||||
{
|
||||
__u32 ack_seq = ntohl(th->ack_seq);
|
||||
|
||||
/*
|
||||
* Adjust ack_seq with delta-offset for
|
||||
* the packets AFTER most recent resized pkt has caused a shift
|
||||
* for packets before most recent resized pkt, use previous_delta
|
||||
*/
|
||||
if (vseq->delta || vseq->previous_delta) {
|
||||
/* since ack_seq is the number of octet that is expected
|
||||
to receive next, so compare it with init_seq+delta */
|
||||
if(after(ack_seq, vseq->init_seq+vseq->delta)) {
|
||||
th->ack_seq = htonl(ack_seq - vseq->delta);
|
||||
IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
|
||||
"(%d) from ack_seq\n", vseq->delta);
|
||||
|
||||
} else {
|
||||
th->ack_seq = htonl(ack_seq - vseq->previous_delta);
|
||||
IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
|
||||
"previous_delta (%d) from ack_seq\n",
|
||||
vseq->previous_delta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Updates ip_vs_seq if pkt has been resized
|
||||
* Assumes already checked proto==IPPROTO_TCP and diff!=0.
|
||||
*/
|
||||
static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
|
||||
unsigned flag, __u32 seq, int diff)
|
||||
{
|
||||
/* spinlock is to keep updating cp->flags atomic */
|
||||
spin_lock(&cp->lock);
|
||||
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
|
||||
vseq->previous_delta = vseq->delta;
|
||||
vseq->delta += diff;
|
||||
vseq->init_seq = seq;
|
||||
cp->flags |= flag;
|
||||
}
|
||||
spin_unlock(&cp->lock);
|
||||
}
|
||||
|
||||
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
|
||||
struct ip_vs_app *app)
|
||||
{
|
||||
int diff;
|
||||
const unsigned int tcp_offset = ip_hdrlen(skb);
|
||||
struct tcphdr *th;
|
||||
__u32 seq;
|
||||
|
||||
if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
|
||||
return 0;
|
||||
|
||||
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
|
||||
|
||||
/*
|
||||
* Remember seq number in case this pkt gets resized
|
||||
*/
|
||||
seq = ntohl(th->seq);
|
||||
|
||||
/*
|
||||
* Fix seq stuff if flagged as so.
|
||||
*/
|
||||
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
|
||||
vs_fix_seq(&cp->out_seq, th);
|
||||
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
|
||||
vs_fix_ack_seq(&cp->in_seq, th);
|
||||
|
||||
/*
|
||||
* Call private output hook function
|
||||
*/
|
||||
if (app->pkt_out == NULL)
|
||||
return 1;
|
||||
|
||||
if (!app->pkt_out(app, cp, skb, &diff))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Update ip_vs seq stuff if len has changed.
|
||||
*/
|
||||
if (diff != 0)
|
||||
vs_seq_update(cp, &cp->out_seq,
|
||||
IP_VS_CONN_F_OUT_SEQ, seq, diff);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Output pkt hook. Will call bound ip_vs_app specific function
|
||||
* called by ipvs packet handler, assumes previously checked cp!=NULL
|
||||
* returns false if it can't handle packet (oom)
|
||||
*/
|
||||
int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_app *app;
|
||||
|
||||
/*
|
||||
* check if application module is bound to
|
||||
* this ip_vs_conn.
|
||||
*/
|
||||
if ((app = cp->app) == NULL)
|
||||
return 1;
|
||||
|
||||
/* TCP is complicated */
|
||||
if (cp->protocol == IPPROTO_TCP)
|
||||
return app_tcp_pkt_out(cp, skb, app);
|
||||
|
||||
/*
|
||||
* Call private output hook function
|
||||
*/
|
||||
if (app->pkt_out == NULL)
|
||||
return 1;
|
||||
|
||||
return app->pkt_out(app, cp, skb, NULL);
|
||||
}
|
||||
|
||||
|
||||
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
|
||||
struct ip_vs_app *app)
|
||||
{
|
||||
int diff;
|
||||
const unsigned int tcp_offset = ip_hdrlen(skb);
|
||||
struct tcphdr *th;
|
||||
__u32 seq;
|
||||
|
||||
if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
|
||||
return 0;
|
||||
|
||||
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
|
||||
|
||||
/*
|
||||
* Remember seq number in case this pkt gets resized
|
||||
*/
|
||||
seq = ntohl(th->seq);
|
||||
|
||||
/*
|
||||
* Fix seq stuff if flagged as so.
|
||||
*/
|
||||
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
|
||||
vs_fix_seq(&cp->in_seq, th);
|
||||
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
|
||||
vs_fix_ack_seq(&cp->out_seq, th);
|
||||
|
||||
/*
|
||||
* Call private input hook function
|
||||
*/
|
||||
if (app->pkt_in == NULL)
|
||||
return 1;
|
||||
|
||||
if (!app->pkt_in(app, cp, skb, &diff))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Update ip_vs seq stuff if len has changed.
|
||||
*/
|
||||
if (diff != 0)
|
||||
vs_seq_update(cp, &cp->in_seq,
|
||||
IP_VS_CONN_F_IN_SEQ, seq, diff);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Input pkt hook. Will call bound ip_vs_app specific function
|
||||
* called by ipvs packet handler, assumes previously checked cp!=NULL.
|
||||
* returns false if can't handle packet (oom).
|
||||
*/
|
||||
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_app *app;
|
||||
|
||||
/*
|
||||
* check if application module is bound to
|
||||
* this ip_vs_conn.
|
||||
*/
|
||||
if ((app = cp->app) == NULL)
|
||||
return 1;
|
||||
|
||||
/* TCP is complicated */
|
||||
if (cp->protocol == IPPROTO_TCP)
|
||||
return app_tcp_pkt_in(cp, skb, app);
|
||||
|
||||
/*
|
||||
* Call private input hook function
|
||||
*/
|
||||
if (app->pkt_in == NULL)
|
||||
return 1;
|
||||
|
||||
return app->pkt_in(app, cp, skb, NULL);
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
/*
|
||||
* /proc/net/ip_vs_app entry function
|
||||
*/
|
||||
|
||||
static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
|
||||
{
|
||||
struct ip_vs_app *app, *inc;
|
||||
|
||||
list_for_each_entry(app, &ip_vs_app_list, a_list) {
|
||||
list_for_each_entry(inc, &app->incs_list, a_list) {
|
||||
if (pos-- == 0)
|
||||
return inc;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
{
|
||||
mutex_lock(&__ip_vs_app_mutex);
|
||||
|
||||
return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
|
||||
}
|
||||
|
||||
static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
struct ip_vs_app *inc, *app;
|
||||
struct list_head *e;
|
||||
|
||||
++*pos;
|
||||
if (v == SEQ_START_TOKEN)
|
||||
return ip_vs_app_idx(0);
|
||||
|
||||
inc = v;
|
||||
app = inc->app;
|
||||
|
||||
if ((e = inc->a_list.next) != &app->incs_list)
|
||||
return list_entry(e, struct ip_vs_app, a_list);
|
||||
|
||||
/* go on to next application */
|
||||
for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
|
||||
app = list_entry(e, struct ip_vs_app, a_list);
|
||||
list_for_each_entry(inc, &app->incs_list, a_list) {
|
||||
return inc;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
|
||||
{
|
||||
mutex_unlock(&__ip_vs_app_mutex);
|
||||
}
|
||||
|
||||
static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
if (v == SEQ_START_TOKEN)
|
||||
seq_puts(seq, "prot port usecnt name\n");
|
||||
else {
|
||||
const struct ip_vs_app *inc = v;
|
||||
|
||||
seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
|
||||
ip_vs_proto_name(inc->protocol),
|
||||
ntohs(inc->port),
|
||||
atomic_read(&inc->usecnt),
|
||||
inc->name);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations ip_vs_app_seq_ops = {
|
||||
.start = ip_vs_app_seq_start,
|
||||
.next = ip_vs_app_seq_next,
|
||||
.stop = ip_vs_app_seq_stop,
|
||||
.show = ip_vs_app_seq_show,
|
||||
};
|
||||
|
||||
static int ip_vs_app_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &ip_vs_app_seq_ops);
|
||||
}
|
||||
|
||||
static const struct file_operations ip_vs_app_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = ip_vs_app_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Replace a segment of data with a new segment
|
||||
*/
|
||||
int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
|
||||
char *o_buf, int o_len, char *n_buf, int n_len)
|
||||
{
|
||||
int diff;
|
||||
int o_offset;
|
||||
int o_left;
|
||||
|
||||
EnterFunction(9);
|
||||
|
||||
diff = n_len - o_len;
|
||||
o_offset = o_buf - (char *)skb->data;
|
||||
/* The length of left data after o_buf+o_len in the skb data */
|
||||
o_left = skb->len - (o_offset + o_len);
|
||||
|
||||
if (diff <= 0) {
|
||||
memmove(o_buf + n_len, o_buf + o_len, o_left);
|
||||
memcpy(o_buf, n_buf, n_len);
|
||||
skb_trim(skb, skb->len + diff);
|
||||
} else if (diff <= skb_tailroom(skb)) {
|
||||
skb_put(skb, diff);
|
||||
memmove(o_buf + n_len, o_buf + o_len, o_left);
|
||||
memcpy(o_buf, n_buf, n_len);
|
||||
} else {
|
||||
if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
|
||||
return -ENOMEM;
|
||||
skb_put(skb, diff);
|
||||
memmove(skb->data + o_offset + n_len,
|
||||
skb->data + o_offset + o_len, o_left);
|
||||
skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
|
||||
}
|
||||
|
||||
/* must update the iph total length here */
|
||||
ip_hdr(skb)->tot_len = htons(skb->len);
|
||||
|
||||
LeaveFunction(9);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int __init ip_vs_app_init(void)
|
||||
{
|
||||
/* we will replace it with proc_net_ipvs_create() soon */
|
||||
proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void ip_vs_app_cleanup(void)
|
||||
{
|
||||
proc_net_remove(&init_net, "ip_vs_app");
|
||||
}
|
1110
net/netfilter/ipvs/ip_vs_conn.c
Normal file
1110
net/netfilter/ipvs/ip_vs_conn.c
Normal file
File diff suppressed because it is too large
Load Diff
1542
net/netfilter/ipvs/ip_vs_core.c
Normal file
1542
net/netfilter/ipvs/ip_vs_core.c
Normal file
File diff suppressed because it is too large
Load Diff
3443
net/netfilter/ipvs/ip_vs_ctl.c
Normal file
3443
net/netfilter/ipvs/ip_vs_ctl.c
Normal file
File diff suppressed because it is too large
Load Diff
261
net/netfilter/ipvs/ip_vs_dh.c
Normal file
261
net/netfilter/ipvs/ip_vs_dh.c
Normal file
@@ -0,0 +1,261 @@
|
||||
/*
|
||||
* IPVS: Destination Hashing scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@gnuchina.org>
|
||||
*
|
||||
* Inspired by the consistent hashing scheduler patch from
|
||||
* Thomas Proell <proellt@gmx.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* The dh algorithm is to select server by the hash key of destination IP
|
||||
* address. The pseudo code is as follows:
|
||||
*
|
||||
* n <- servernode[dest_ip];
|
||||
* if (n is dead) OR
|
||||
* (n is overloaded) OR (n.weight <= 0) then
|
||||
* return NULL;
|
||||
*
|
||||
* return n;
|
||||
*
|
||||
* Notes that servernode is a 256-bucket hash table that maps the hash
|
||||
* index derived from packet destination IP address to the current server
|
||||
* array. If the dh scheduler is used in cache cluster, it is good to
|
||||
* combine it with cache_bypass feature. When the statically assigned
|
||||
* server is dead or overloaded, the load balancer can bypass the cache
|
||||
* server and send requests to the original server directly.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/ip.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
/*
|
||||
* IPVS DH bucket
|
||||
*/
|
||||
struct ip_vs_dh_bucket {
|
||||
struct ip_vs_dest *dest; /* real server (cache) */
|
||||
};
|
||||
|
||||
/*
|
||||
* for IPVS DH entry hash table
|
||||
*/
|
||||
#ifndef CONFIG_IP_VS_DH_TAB_BITS
|
||||
#define CONFIG_IP_VS_DH_TAB_BITS 8
|
||||
#endif
|
||||
#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
|
||||
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
|
||||
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
|
||||
|
||||
|
||||
/*
|
||||
* Returns hash value for IPVS DH entry
|
||||
*/
|
||||
static inline unsigned ip_vs_dh_hashkey(__be32 addr)
|
||||
{
|
||||
return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get ip_vs_dest associated with supplied parameters.
|
||||
*/
|
||||
static inline struct ip_vs_dest *
|
||||
ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr)
|
||||
{
|
||||
return (tbl[ip_vs_dh_hashkey(addr)]).dest;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Assign all the hash buckets of the specified table with the service.
|
||||
*/
|
||||
static int
|
||||
ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
|
||||
{
|
||||
int i;
|
||||
struct ip_vs_dh_bucket *b;
|
||||
struct list_head *p;
|
||||
struct ip_vs_dest *dest;
|
||||
|
||||
b = tbl;
|
||||
p = &svc->destinations;
|
||||
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
|
||||
if (list_empty(p)) {
|
||||
b->dest = NULL;
|
||||
} else {
|
||||
if (p == &svc->destinations)
|
||||
p = p->next;
|
||||
|
||||
dest = list_entry(p, struct ip_vs_dest, n_list);
|
||||
atomic_inc(&dest->refcnt);
|
||||
b->dest = dest;
|
||||
|
||||
p = p->next;
|
||||
}
|
||||
b++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Flush all the hash buckets of the specified table.
|
||||
*/
|
||||
static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
|
||||
{
|
||||
int i;
|
||||
struct ip_vs_dh_bucket *b;
|
||||
|
||||
b = tbl;
|
||||
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
|
||||
if (b->dest) {
|
||||
atomic_dec(&b->dest->refcnt);
|
||||
b->dest = NULL;
|
||||
}
|
||||
b++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_dh_bucket *tbl;
|
||||
|
||||
/* allocate the DH table for this service */
|
||||
tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
|
||||
GFP_ATOMIC);
|
||||
if (tbl == NULL) {
|
||||
IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
svc->sched_data = tbl;
|
||||
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
|
||||
"current service\n",
|
||||
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
|
||||
|
||||
/* assign the hash buckets with the updated service */
|
||||
ip_vs_dh_assign(tbl, svc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_dh_bucket *tbl = svc->sched_data;
|
||||
|
||||
/* got to clean up hash buckets here */
|
||||
ip_vs_dh_flush(tbl);
|
||||
|
||||
/* release the table itself */
|
||||
kfree(svc->sched_data);
|
||||
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
|
||||
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_dh_bucket *tbl = svc->sched_data;
|
||||
|
||||
/* got to clean up hash buckets here */
|
||||
ip_vs_dh_flush(tbl);
|
||||
|
||||
/* assign the hash buckets with the updated service */
|
||||
ip_vs_dh_assign(tbl, svc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
|
||||
* consider that the server is overloaded here.
|
||||
*/
|
||||
static inline int is_overloaded(struct ip_vs_dest *dest)
|
||||
{
|
||||
return dest->flags & IP_VS_DEST_F_OVERLOAD;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Destination hashing scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_dest *dest;
|
||||
struct ip_vs_dh_bucket *tbl;
|
||||
struct iphdr *iph = ip_hdr(skb);
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
|
||||
|
||||
tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
|
||||
dest = ip_vs_dh_get(tbl, iph->daddr);
|
||||
if (!dest
|
||||
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|
||||
|| atomic_read(&dest->weight) <= 0
|
||||
|| is_overloaded(dest)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
|
||||
"--> server %u.%u.%u.%u:%d\n",
|
||||
NIPQUAD(iph->daddr),
|
||||
NIPQUAD(dest->addr.ip),
|
||||
ntohs(dest->port));
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* IPVS DH Scheduler structure
|
||||
*/
|
||||
static struct ip_vs_scheduler ip_vs_dh_scheduler =
|
||||
{
|
||||
.name = "dh",
|
||||
.refcnt = ATOMIC_INIT(0),
|
||||
.module = THIS_MODULE,
|
||||
.n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
.supports_ipv6 = 0,
|
||||
#endif
|
||||
.init_service = ip_vs_dh_init_svc,
|
||||
.done_service = ip_vs_dh_done_svc,
|
||||
.update_service = ip_vs_dh_update_svc,
|
||||
.schedule = ip_vs_dh_schedule,
|
||||
};
|
||||
|
||||
|
||||
static int __init ip_vs_dh_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
|
||||
}
|
||||
|
||||
|
||||
static void __exit ip_vs_dh_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
|
||||
}
|
||||
|
||||
|
||||
module_init(ip_vs_dh_init);
|
||||
module_exit(ip_vs_dh_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
166
net/netfilter/ipvs/ip_vs_est.c
Normal file
166
net/netfilter/ipvs/ip_vs_est.c
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* ip_vs_est.c: simple rate estimator for IPVS
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/list.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
/*
|
||||
This code is to estimate rate in a shorter interval (such as 8
|
||||
seconds) for virtual services and real servers. For measure rate in a
|
||||
long interval, it is easy to implement a user level daemon which
|
||||
periodically reads those statistical counters and measure rate.
|
||||
|
||||
Currently, the measurement is activated by slow timer handler. Hope
|
||||
this measurement will not introduce too much load.
|
||||
|
||||
We measure rate during the last 8 seconds every 2 seconds:
|
||||
|
||||
avgrate = avgrate*(1-W) + rate*W
|
||||
|
||||
where W = 2^(-2)
|
||||
|
||||
NOTES.
|
||||
|
||||
* The stored value for average bps is scaled by 2^5, so that maximal
|
||||
rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
|
||||
|
||||
* A lot code is taken from net/sched/estimator.c
|
||||
*/
|
||||
|
||||
|
||||
static void estimation_timer(unsigned long arg);
|
||||
|
||||
static LIST_HEAD(est_list);
|
||||
static DEFINE_SPINLOCK(est_lock);
|
||||
static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
|
||||
|
||||
static void estimation_timer(unsigned long arg)
|
||||
{
|
||||
struct ip_vs_estimator *e;
|
||||
struct ip_vs_stats *s;
|
||||
u32 n_conns;
|
||||
u32 n_inpkts, n_outpkts;
|
||||
u64 n_inbytes, n_outbytes;
|
||||
u32 rate;
|
||||
|
||||
spin_lock(&est_lock);
|
||||
list_for_each_entry(e, &est_list, list) {
|
||||
s = container_of(e, struct ip_vs_stats, est);
|
||||
|
||||
spin_lock(&s->lock);
|
||||
n_conns = s->ustats.conns;
|
||||
n_inpkts = s->ustats.inpkts;
|
||||
n_outpkts = s->ustats.outpkts;
|
||||
n_inbytes = s->ustats.inbytes;
|
||||
n_outbytes = s->ustats.outbytes;
|
||||
|
||||
/* scaled by 2^10, but divided 2 seconds */
|
||||
rate = (n_conns - e->last_conns)<<9;
|
||||
e->last_conns = n_conns;
|
||||
e->cps += ((long)rate - (long)e->cps)>>2;
|
||||
s->ustats.cps = (e->cps+0x1FF)>>10;
|
||||
|
||||
rate = (n_inpkts - e->last_inpkts)<<9;
|
||||
e->last_inpkts = n_inpkts;
|
||||
e->inpps += ((long)rate - (long)e->inpps)>>2;
|
||||
s->ustats.inpps = (e->inpps+0x1FF)>>10;
|
||||
|
||||
rate = (n_outpkts - e->last_outpkts)<<9;
|
||||
e->last_outpkts = n_outpkts;
|
||||
e->outpps += ((long)rate - (long)e->outpps)>>2;
|
||||
s->ustats.outpps = (e->outpps+0x1FF)>>10;
|
||||
|
||||
rate = (n_inbytes - e->last_inbytes)<<4;
|
||||
e->last_inbytes = n_inbytes;
|
||||
e->inbps += ((long)rate - (long)e->inbps)>>2;
|
||||
s->ustats.inbps = (e->inbps+0xF)>>5;
|
||||
|
||||
rate = (n_outbytes - e->last_outbytes)<<4;
|
||||
e->last_outbytes = n_outbytes;
|
||||
e->outbps += ((long)rate - (long)e->outbps)>>2;
|
||||
s->ustats.outbps = (e->outbps+0xF)>>5;
|
||||
spin_unlock(&s->lock);
|
||||
}
|
||||
spin_unlock(&est_lock);
|
||||
mod_timer(&est_timer, jiffies + 2*HZ);
|
||||
}
|
||||
|
||||
void ip_vs_new_estimator(struct ip_vs_stats *stats)
|
||||
{
|
||||
struct ip_vs_estimator *est = &stats->est;
|
||||
|
||||
INIT_LIST_HEAD(&est->list);
|
||||
|
||||
est->last_conns = stats->ustats.conns;
|
||||
est->cps = stats->ustats.cps<<10;
|
||||
|
||||
est->last_inpkts = stats->ustats.inpkts;
|
||||
est->inpps = stats->ustats.inpps<<10;
|
||||
|
||||
est->last_outpkts = stats->ustats.outpkts;
|
||||
est->outpps = stats->ustats.outpps<<10;
|
||||
|
||||
est->last_inbytes = stats->ustats.inbytes;
|
||||
est->inbps = stats->ustats.inbps<<5;
|
||||
|
||||
est->last_outbytes = stats->ustats.outbytes;
|
||||
est->outbps = stats->ustats.outbps<<5;
|
||||
|
||||
spin_lock_bh(&est_lock);
|
||||
list_add(&est->list, &est_list);
|
||||
spin_unlock_bh(&est_lock);
|
||||
}
|
||||
|
||||
void ip_vs_kill_estimator(struct ip_vs_stats *stats)
|
||||
{
|
||||
struct ip_vs_estimator *est = &stats->est;
|
||||
|
||||
spin_lock_bh(&est_lock);
|
||||
list_del(&est->list);
|
||||
spin_unlock_bh(&est_lock);
|
||||
}
|
||||
|
||||
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
|
||||
{
|
||||
struct ip_vs_estimator *est = &stats->est;
|
||||
|
||||
/* set counters zero, caller must hold the stats->lock lock */
|
||||
est->last_inbytes = 0;
|
||||
est->last_outbytes = 0;
|
||||
est->last_conns = 0;
|
||||
est->last_inpkts = 0;
|
||||
est->last_outpkts = 0;
|
||||
est->cps = 0;
|
||||
est->inpps = 0;
|
||||
est->outpps = 0;
|
||||
est->inbps = 0;
|
||||
est->outbps = 0;
|
||||
}
|
||||
|
||||
int __init ip_vs_estimator_init(void)
|
||||
{
|
||||
mod_timer(&est_timer, jiffies + 2 * HZ);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ip_vs_estimator_cleanup(void)
|
||||
{
|
||||
del_timer_sync(&est_timer);
|
||||
}
|
410
net/netfilter/ipvs/ip_vs_ftp.c
Normal file
410
net/netfilter/ipvs/ip_vs_ftp.c
Normal file
@@ -0,0 +1,410 @@
|
||||
/*
|
||||
* ip_vs_ftp.c: IPVS ftp application module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
|
||||
* is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
|
||||
*
|
||||
* IP_MASQ_FTP ftp masquerading module
|
||||
*
|
||||
* Version: @(#)ip_masq_ftp.c 0.04 02/05/96
|
||||
*
|
||||
* Author: Wouter Gadeyne
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/tcp.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
#define SERVER_STRING "227 Entering Passive Mode ("
|
||||
#define CLIENT_STRING "PORT "
|
||||
|
||||
|
||||
/*
|
||||
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
|
||||
* First port is set to the default port.
|
||||
*/
|
||||
static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
|
||||
module_param_array(ports, ushort, NULL, 0);
|
||||
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
|
||||
|
||||
|
||||
/* Dummy variable */
|
||||
static int ip_vs_ftp_pasv;
|
||||
|
||||
|
||||
static int
|
||||
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
|
||||
* with the "pattern" and terminated with the "term" character.
|
||||
* <addr,port> is in network order.
|
||||
*/
|
||||
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
|
||||
const char *pattern, size_t plen, char term,
|
||||
__be32 *addr, __be16 *port,
|
||||
char **start, char **end)
|
||||
{
|
||||
unsigned char p[6];
|
||||
int i = 0;
|
||||
|
||||
if (data_limit - data < plen) {
|
||||
/* check if there is partial match */
|
||||
if (strnicmp(data, pattern, data_limit - data) == 0)
|
||||
return -1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (strnicmp(data, pattern, plen) != 0) {
|
||||
return 0;
|
||||
}
|
||||
*start = data + plen;
|
||||
|
||||
for (data = *start; *data != term; data++) {
|
||||
if (data == data_limit)
|
||||
return -1;
|
||||
}
|
||||
*end = data;
|
||||
|
||||
memset(p, 0, sizeof(p));
|
||||
for (data = *start; data != *end; data++) {
|
||||
if (*data >= '0' && *data <= '9') {
|
||||
p[i] = p[i]*10 + *data - '0';
|
||||
} else if (*data == ',' && i < 5) {
|
||||
i++;
|
||||
} else {
|
||||
/* unexpected character */
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (i != 5)
|
||||
return -1;
|
||||
|
||||
*addr = get_unaligned((__be32 *)p);
|
||||
*port = get_unaligned((__be16 *)(p + 4));
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Look at outgoing ftp packets to catch the response to a PASV command
|
||||
* from the server (inside-to-outside).
|
||||
* When we see one, we build a connection entry with the client address,
|
||||
* client port 0 (unknown at the moment), the server address and the
|
||||
* server port. Mark the current connection entry as a control channel
|
||||
* of the new entry. All this work is just to make the data connection
|
||||
* can be scheduled to the right server later.
|
||||
*
|
||||
* The outgoing packet should be something like
|
||||
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
|
||||
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
|
||||
*/
|
||||
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
|
||||
struct sk_buff *skb, int *diff)
|
||||
{
|
||||
struct iphdr *iph;
|
||||
struct tcphdr *th;
|
||||
char *data, *data_limit;
|
||||
char *start, *end;
|
||||
union nf_inet_addr from;
|
||||
__be16 port;
|
||||
struct ip_vs_conn *n_cp;
|
||||
char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
|
||||
unsigned buf_len;
|
||||
int ret;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
/* This application helper doesn't work with IPv6 yet,
|
||||
* so turn this into a no-op for IPv6 packets
|
||||
*/
|
||||
if (cp->af == AF_INET6)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
*diff = 0;
|
||||
|
||||
/* Only useful for established sessions */
|
||||
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
|
||||
return 1;
|
||||
|
||||
/* Linear packets are much easier to deal with. */
|
||||
if (!skb_make_writable(skb, skb->len))
|
||||
return 0;
|
||||
|
||||
if (cp->app_data == &ip_vs_ftp_pasv) {
|
||||
iph = ip_hdr(skb);
|
||||
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
|
||||
data = (char *)th + (th->doff << 2);
|
||||
data_limit = skb_tail_pointer(skb);
|
||||
|
||||
if (ip_vs_ftp_get_addrport(data, data_limit,
|
||||
SERVER_STRING,
|
||||
sizeof(SERVER_STRING)-1, ')',
|
||||
&from.ip, &port,
|
||||
&start, &end) != 1)
|
||||
return 1;
|
||||
|
||||
IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
|
||||
"%u.%u.%u.%u:%d detected\n",
|
||||
NIPQUAD(from.ip), ntohs(port),
|
||||
NIPQUAD(cp->caddr.ip), 0);
|
||||
|
||||
/*
|
||||
* Now update or create an connection entry for it
|
||||
*/
|
||||
n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
|
||||
&cp->caddr, 0);
|
||||
if (!n_cp) {
|
||||
n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
|
||||
&cp->caddr, 0,
|
||||
&cp->vaddr, port,
|
||||
&from, port,
|
||||
IP_VS_CONN_F_NO_CPORT,
|
||||
cp->dest);
|
||||
if (!n_cp)
|
||||
return 0;
|
||||
|
||||
/* add its controller */
|
||||
ip_vs_control_add(n_cp, cp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Replace the old passive address with the new one
|
||||
*/
|
||||
from.ip = n_cp->vaddr.ip;
|
||||
port = n_cp->vport;
|
||||
sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
|
||||
(ntohs(port)>>8)&255, ntohs(port)&255);
|
||||
buf_len = strlen(buf);
|
||||
|
||||
/*
|
||||
* Calculate required delta-offset to keep TCP happy
|
||||
*/
|
||||
*diff = buf_len - (end-start);
|
||||
|
||||
if (*diff == 0) {
|
||||
/* simply replace it with new passive address */
|
||||
memcpy(start, buf, buf_len);
|
||||
ret = 1;
|
||||
} else {
|
||||
ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
|
||||
end-start, buf, buf_len);
|
||||
}
|
||||
|
||||
cp->app_data = NULL;
|
||||
ip_vs_tcp_conn_listen(n_cp);
|
||||
ip_vs_conn_put(n_cp);
|
||||
return ret;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Look at incoming ftp packets to catch the PASV/PORT command
|
||||
* (outside-to-inside).
|
||||
*
|
||||
* The incoming packet having the PORT command should be something like
|
||||
* "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
|
||||
* xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
|
||||
* In this case, we create a connection entry using the client address and
|
||||
* port, so that the active ftp data connection from the server can reach
|
||||
* the client.
|
||||
*/
|
||||
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
|
||||
struct sk_buff *skb, int *diff)
|
||||
{
|
||||
struct iphdr *iph;
|
||||
struct tcphdr *th;
|
||||
char *data, *data_start, *data_limit;
|
||||
char *start, *end;
|
||||
union nf_inet_addr to;
|
||||
__be16 port;
|
||||
struct ip_vs_conn *n_cp;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
/* This application helper doesn't work with IPv6 yet,
|
||||
* so turn this into a no-op for IPv6 packets
|
||||
*/
|
||||
if (cp->af == AF_INET6)
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
/* no diff required for incoming packets */
|
||||
*diff = 0;
|
||||
|
||||
/* Only useful for established sessions */
|
||||
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
|
||||
return 1;
|
||||
|
||||
/* Linear packets are much easier to deal with. */
|
||||
if (!skb_make_writable(skb, skb->len))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Detecting whether it is passive
|
||||
*/
|
||||
iph = ip_hdr(skb);
|
||||
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
|
||||
|
||||
/* Since there may be OPTIONS in the TCP packet and the HLEN is
|
||||
the length of the header in 32-bit multiples, it is accurate
|
||||
to calculate data address by th+HLEN*4 */
|
||||
data = data_start = (char *)th + (th->doff << 2);
|
||||
data_limit = skb_tail_pointer(skb);
|
||||
|
||||
while (data <= data_limit - 6) {
|
||||
if (strnicmp(data, "PASV\r\n", 6) == 0) {
|
||||
/* Passive mode on */
|
||||
IP_VS_DBG(7, "got PASV at %td of %td\n",
|
||||
data - data_start,
|
||||
data_limit - data_start);
|
||||
cp->app_data = &ip_vs_ftp_pasv;
|
||||
return 1;
|
||||
}
|
||||
data++;
|
||||
}
|
||||
|
||||
/*
|
||||
* To support virtual FTP server, the scenerio is as follows:
|
||||
* FTP client ----> Load Balancer ----> FTP server
|
||||
* First detect the port number in the application data,
|
||||
* then create a new connection entry for the coming data
|
||||
* connection.
|
||||
*/
|
||||
if (ip_vs_ftp_get_addrport(data_start, data_limit,
|
||||
CLIENT_STRING, sizeof(CLIENT_STRING)-1,
|
||||
'\r', &to.ip, &port,
|
||||
&start, &end) != 1)
|
||||
return 1;
|
||||
|
||||
IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
|
||||
NIPQUAD(to.ip), ntohs(port));
|
||||
|
||||
/* Passive mode off */
|
||||
cp->app_data = NULL;
|
||||
|
||||
/*
|
||||
* Now update or create a connection entry for it
|
||||
*/
|
||||
IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
|
||||
ip_vs_proto_name(iph->protocol),
|
||||
NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
|
||||
|
||||
n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
|
||||
&to, port,
|
||||
&cp->vaddr, htons(ntohs(cp->vport)-1));
|
||||
if (!n_cp) {
|
||||
n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
|
||||
&to, port,
|
||||
&cp->vaddr, htons(ntohs(cp->vport)-1),
|
||||
&cp->daddr, htons(ntohs(cp->dport)-1),
|
||||
0,
|
||||
cp->dest);
|
||||
if (!n_cp)
|
||||
return 0;
|
||||
|
||||
/* add its controller */
|
||||
ip_vs_control_add(n_cp, cp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Move tunnel to listen state
|
||||
*/
|
||||
ip_vs_tcp_conn_listen(n_cp);
|
||||
ip_vs_conn_put(n_cp);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
static struct ip_vs_app ip_vs_ftp = {
|
||||
.name = "ftp",
|
||||
.type = IP_VS_APP_TYPE_FTP,
|
||||
.protocol = IPPROTO_TCP,
|
||||
.module = THIS_MODULE,
|
||||
.incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
|
||||
.init_conn = ip_vs_ftp_init_conn,
|
||||
.done_conn = ip_vs_ftp_done_conn,
|
||||
.bind_conn = NULL,
|
||||
.unbind_conn = NULL,
|
||||
.pkt_out = ip_vs_ftp_out,
|
||||
.pkt_in = ip_vs_ftp_in,
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* ip_vs_ftp initialization
|
||||
*/
|
||||
static int __init ip_vs_ftp_init(void)
|
||||
{
|
||||
int i, ret;
|
||||
struct ip_vs_app *app = &ip_vs_ftp;
|
||||
|
||||
ret = register_ip_vs_app(app);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
|
||||
if (!ports[i])
|
||||
continue;
|
||||
ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
|
||||
if (ret)
|
||||
break;
|
||||
IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
|
||||
app->name, i, ports[i]);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
unregister_ip_vs_app(app);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ip_vs_ftp finish.
|
||||
*/
|
||||
static void __exit ip_vs_ftp_exit(void)
|
||||
{
|
||||
unregister_ip_vs_app(&ip_vs_ftp);
|
||||
}
|
||||
|
||||
|
||||
module_init(ip_vs_ftp_init);
|
||||
module_exit(ip_vs_ftp_exit);
|
||||
MODULE_LICENSE("GPL");
|
555
net/netfilter/ipvs/ip_vs_lblc.c
Normal file
555
net/netfilter/ipvs/ip_vs_lblc.c
Normal file
@@ -0,0 +1,555 @@
|
||||
/*
|
||||
* IPVS: Locality-Based Least-Connection scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@gnuchina.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
* Martin Hamilton : fixed the terrible locking bugs
|
||||
* *lock(tbl->lock) ==> *lock(&tbl->lock)
|
||||
* Wensong Zhang : fixed the uninitilized tbl->lock bug
|
||||
* Wensong Zhang : added doing full expiration check to
|
||||
* collect stale entries of 24+ hours when
|
||||
* no partial expire check in a half hour
|
||||
* Julian Anastasov : replaced del_timer call with del_timer_sync
|
||||
* to avoid the possible race between timer
|
||||
* handler and del_timer thread in SMP
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* The lblc algorithm is as follows (pseudo code):
|
||||
*
|
||||
* if cachenode[dest_ip] is null then
|
||||
* n, cachenode[dest_ip] <- {weighted least-conn node};
|
||||
* else
|
||||
* n <- cachenode[dest_ip];
|
||||
* if (n is dead) OR
|
||||
* (n.conns>n.weight AND
|
||||
* there is a node m with m.conns<m.weight/2) then
|
||||
* n, cachenode[dest_ip] <- {weighted least-conn node};
|
||||
*
|
||||
* return n;
|
||||
*
|
||||
* Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
|
||||
* me to write this module.
|
||||
*/
|
||||
|
||||
#include <linux/ip.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/jiffies.h>
|
||||
|
||||
/* for sysctl */
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sysctl.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
/*
|
||||
* It is for garbage collection of stale IPVS lblc entries,
|
||||
* when the table is full.
|
||||
*/
|
||||
#define CHECK_EXPIRE_INTERVAL (60*HZ)
|
||||
#define ENTRY_TIMEOUT (6*60*HZ)
|
||||
|
||||
/*
|
||||
* It is for full expiration check.
|
||||
* When there is no partial expiration check (garbage collection)
|
||||
* in a half hour, do a full expiration check to collect stale
|
||||
* entries that haven't been touched for a day.
|
||||
*/
|
||||
#define COUNT_FOR_FULL_EXPIRATION 30
|
||||
static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
|
||||
|
||||
|
||||
/*
|
||||
* for IPVS lblc entry hash table
|
||||
*/
|
||||
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
|
||||
#define CONFIG_IP_VS_LBLC_TAB_BITS 10
|
||||
#endif
|
||||
#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
|
||||
#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
|
||||
#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
|
||||
|
||||
|
||||
/*
|
||||
* IPVS lblc entry represents an association between destination
|
||||
* IP address and its destination server
|
||||
*/
|
||||
struct ip_vs_lblc_entry {
|
||||
struct list_head list;
|
||||
__be32 addr; /* destination IP address */
|
||||
struct ip_vs_dest *dest; /* real server (cache) */
|
||||
unsigned long lastuse; /* last used time */
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* IPVS lblc hash table
|
||||
*/
|
||||
struct ip_vs_lblc_table {
|
||||
struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
|
||||
atomic_t entries; /* number of entries */
|
||||
int max_size; /* maximum size of entries */
|
||||
struct timer_list periodic_timer; /* collect stale entries */
|
||||
int rover; /* rover for expire check */
|
||||
int counter; /* counter for no expire */
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* IPVS LBLC sysctl table
|
||||
*/
|
||||
|
||||
static ctl_table vs_vars_table[] = {
|
||||
{
|
||||
.procname = "lblc_expiration",
|
||||
.data = &sysctl_ip_vs_lblc_expiration,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dointvec_jiffies,
|
||||
},
|
||||
{ .ctl_name = 0 }
|
||||
};
|
||||
|
||||
static struct ctl_table_header * sysctl_header;
|
||||
|
||||
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
|
||||
{
|
||||
list_del(&en->list);
|
||||
/*
|
||||
* We don't kfree dest because it is refered either by its service
|
||||
* or the trash dest list.
|
||||
*/
|
||||
atomic_dec(&en->dest->refcnt);
|
||||
kfree(en);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Returns hash value for IPVS LBLC entry
|
||||
*/
|
||||
static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
|
||||
{
|
||||
return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Hash an entry in the ip_vs_lblc_table.
|
||||
* returns bool success.
|
||||
*/
|
||||
static void
|
||||
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
|
||||
{
|
||||
unsigned hash = ip_vs_lblc_hashkey(en->addr);
|
||||
|
||||
list_add(&en->list, &tbl->bucket[hash]);
|
||||
atomic_inc(&tbl->entries);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get ip_vs_lblc_entry associated with supplied parameters. Called under read
|
||||
* lock
|
||||
*/
|
||||
static inline struct ip_vs_lblc_entry *
|
||||
ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
|
||||
{
|
||||
unsigned hash = ip_vs_lblc_hashkey(addr);
|
||||
struct ip_vs_lblc_entry *en;
|
||||
|
||||
list_for_each_entry(en, &tbl->bucket[hash], list)
|
||||
if (en->addr == addr)
|
||||
return en;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
|
||||
* address to a server. Called under write lock.
|
||||
*/
|
||||
static inline struct ip_vs_lblc_entry *
|
||||
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
|
||||
struct ip_vs_dest *dest)
|
||||
{
|
||||
struct ip_vs_lblc_entry *en;
|
||||
|
||||
en = ip_vs_lblc_get(tbl, daddr);
|
||||
if (!en) {
|
||||
en = kmalloc(sizeof(*en), GFP_ATOMIC);
|
||||
if (!en) {
|
||||
IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
en->addr = daddr;
|
||||
en->lastuse = jiffies;
|
||||
|
||||
atomic_inc(&dest->refcnt);
|
||||
en->dest = dest;
|
||||
|
||||
ip_vs_lblc_hash(tbl, en);
|
||||
} else if (en->dest != dest) {
|
||||
atomic_dec(&en->dest->refcnt);
|
||||
atomic_inc(&dest->refcnt);
|
||||
en->dest = dest;
|
||||
}
|
||||
|
||||
return en;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Flush all the entries of the specified table.
|
||||
*/
|
||||
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
|
||||
{
|
||||
struct ip_vs_lblc_entry *en, *nxt;
|
||||
int i;
|
||||
|
||||
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
|
||||
list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
|
||||
ip_vs_lblc_free(en);
|
||||
atomic_dec(&tbl->entries);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_lblc_table *tbl = svc->sched_data;
|
||||
struct ip_vs_lblc_entry *en, *nxt;
|
||||
unsigned long now = jiffies;
|
||||
int i, j;
|
||||
|
||||
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
|
||||
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
|
||||
|
||||
write_lock(&svc->sched_lock);
|
||||
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
|
||||
if (time_before(now,
|
||||
en->lastuse + sysctl_ip_vs_lblc_expiration))
|
||||
continue;
|
||||
|
||||
ip_vs_lblc_free(en);
|
||||
atomic_dec(&tbl->entries);
|
||||
}
|
||||
write_unlock(&svc->sched_lock);
|
||||
}
|
||||
tbl->rover = j;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Periodical timer handler for IPVS lblc table
|
||||
* It is used to collect stale entries when the number of entries
|
||||
* exceeds the maximum size of the table.
|
||||
*
|
||||
* Fixme: we probably need more complicated algorithm to collect
|
||||
* entries that have not been used for a long time even
|
||||
* if the number of entries doesn't exceed the maximum size
|
||||
* of the table.
|
||||
* The full expiration check is for this purpose now.
|
||||
*/
|
||||
static void ip_vs_lblc_check_expire(unsigned long data)
|
||||
{
|
||||
struct ip_vs_service *svc = (struct ip_vs_service *) data;
|
||||
struct ip_vs_lblc_table *tbl = svc->sched_data;
|
||||
unsigned long now = jiffies;
|
||||
int goal;
|
||||
int i, j;
|
||||
struct ip_vs_lblc_entry *en, *nxt;
|
||||
|
||||
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
|
||||
/* do full expiration check */
|
||||
ip_vs_lblc_full_check(svc);
|
||||
tbl->counter = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (atomic_read(&tbl->entries) <= tbl->max_size) {
|
||||
tbl->counter++;
|
||||
goto out;
|
||||
}
|
||||
|
||||
goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
|
||||
if (goal > tbl->max_size/2)
|
||||
goal = tbl->max_size/2;
|
||||
|
||||
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
|
||||
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
|
||||
|
||||
write_lock(&svc->sched_lock);
|
||||
list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
|
||||
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
|
||||
continue;
|
||||
|
||||
ip_vs_lblc_free(en);
|
||||
atomic_dec(&tbl->entries);
|
||||
goal--;
|
||||
}
|
||||
write_unlock(&svc->sched_lock);
|
||||
if (goal <= 0)
|
||||
break;
|
||||
}
|
||||
tbl->rover = j;
|
||||
|
||||
out:
|
||||
mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Scheduler init_service hook: allocate and initialize the per-service
 * LBLC hash table and start the periodic garbage-collection timer.
 * Returns 0 on success or -ENOMEM.
 */
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	/* Soft limit: 16 entries per bucket on average before GC kicks in */
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
			(unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}
|
||||
|
||||
|
||||
/*
 * Scheduler done_service hook: stop the GC timer, free all cached
 * entries and release the table itself.  Always returns 0.
 */
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer; _sync so the callback cannot still be
	 * running when the table is freed below */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(tbl);

	/* release the table itself */
	kfree(tbl);
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));

	return 0;
}
|
||||
|
||||
|
||||
/*
 * Weighted least-connection selection over the service's destination
 * list; used when the LBLC cache has no usable entry.  Returns the
 * chosen destination, or NULL if no server with weight > 0 exists.
 * Caller is expected to hold the appropriate service lock (all callers
 * in this file do).
 */
static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections in average. (This
	 * fifty times might not be accurate, we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *                dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* cross-multiplied load comparison: doh/dw < loh/lw */
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr.ip), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}
|
||||
|
||||
|
||||
/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
/*
 * "Overloaded" here means activeconns exceeds the weight while some
 * other destination of the service has activeconns*2 < weight, i.e.
 * a clearly less-loaded alternative exists.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
|
||||
|
||||
|
||||
/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
/*
 * Main scheduler entry point.  Looks up the packet's destination IP in
 * the per-service cache; if the cached server is available, weighted
 * and not overloaded it is reused, otherwise a new server is picked by
 * weighted least-connection and cached.  Returns the chosen destination
 * or NULL when no server is available.
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct iphdr *iph = ip_hdr(skb);
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");

	/* First look in our cache */
	read_lock(&svc->sched_lock);
	en = ip_vs_lblc_get(tbl, iph->daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the trash,
		 * we must ignore it, as it may be removed from under our feet,
		 * if someone drops our reference count. Our caller only makes
		 * sure that destinations, that are not in the trash, are not
		 * moved to the trash, while we are scheduling. But anyone can
		 * free up entries from the trash at any time.
		 */

		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
			dest = en->dest;
	}
	read_unlock(&svc->sched_lock);

	/* If the destination has a weight and is not overloaded, use it */
	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
		goto out;

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc, iph);
	if (!dest) {
		IP_VS_DBG(1, "no destination available\n");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	write_lock(&svc->sched_lock);
	ip_vs_lblc_new(tbl, iph->daddr, dest);
	write_unlock(&svc->sched_lock);

out:
	/* dest is guaranteed non-NULL on both paths reaching here */
	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(iph->daddr),
		  NIPQUAD(dest->addr.ip),
		  ntohs(dest->port));

	return dest;
}
|
||||
|
||||
|
||||
/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
	/* IPv4 only: the scheduler reads iph->daddr directly */
	.supports_ipv6 =	0,
#endif
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};
|
||||
|
||||
|
||||
/*
 * Module init: register the lblc sysctl entries, then the scheduler.
 * On scheduler-registration failure the sysctl table is rolled back.
 */
static int __init ip_vs_lblc_init(void)
{
	int ret;

	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_sysctl_table(sysctl_header);
	return ret;
}
|
||||
|
||||
|
||||
/*
 * Module exit: undo ip_vs_lblc_init() — drop the sysctl entries and
 * unregister the scheduler.
 */
static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
}


module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
|
755
net/netfilter/ipvs/ip_vs_lblcr.c
Normal file
755
net/netfilter/ipvs/ip_vs_lblcr.c
Normal file
@@ -0,0 +1,755 @@
|
||||
/*
|
||||
* IPVS: Locality-Based Least-Connection with Replication scheduler
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@gnuchina.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
* Julian Anastasov : Added the missing (dest->weight>0)
|
||||
* condition in the ip_vs_dest_set_max.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* The lblc/r algorithm is as follows (pseudo code):
|
||||
*
|
||||
* if serverSet[dest_ip] is null then
|
||||
* n, serverSet[dest_ip] <- {weighted least-conn node};
|
||||
* else
|
||||
* n <- {least-conn (alive) node in serverSet[dest_ip]};
|
||||
* if (n is null) OR
|
||||
* (n.conns>n.weight AND
|
||||
* there is a node m with m.conns<m.weight/2) then
|
||||
* n <- {weighted least-conn node};
|
||||
* add n to serverSet[dest_ip];
|
||||
* if |serverSet[dest_ip]| > 1 AND
|
||||
* now - serverSet[dest_ip].lastMod > T then
|
||||
* m <- {most conn node in serverSet[dest_ip]};
|
||||
* remove m from serverSet[dest_ip];
|
||||
* if serverSet[dest_ip] changed then
|
||||
* serverSet[dest_ip].lastMod <- now;
|
||||
*
|
||||
* return n;
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/ip.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/jiffies.h>
|
||||
|
||||
/* for sysctl */
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <net/net_namespace.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
/*
 *    It is for garbage collection of stale IPVS lblcr entries,
 *    when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)		/* GC timer period */
#define ENTRY_TIMEOUT           (6*60*HZ)	/* idle age before partial GC frees an entry */

/*
 *    It is for full expiration check.
 *    When there is no partial expiration check (garbage collection)
 *    in a half hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
/* default idle age used by the full check; tunable via the
 * "lblcr_expiration" sysctl below (jiffies) */
static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;


/*
 *     for IPVS lblcr entry hash table
 */
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
#endif
#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
|
||||
|
||||
|
||||
/*
 *      IPVS destination set structure and operations
 */
/* singly-linked node holding one destination of a set; each node owns a
 * reference on its dest (taken in ip_vs_dest_set_insert) */
struct ip_vs_dest_list {
	struct ip_vs_dest_list  *next;          /* list link */
	struct ip_vs_dest       *dest;          /* destination server */
};

/* the per-cache-entry set of servers for one destination IP */
struct ip_vs_dest_set {
	atomic_t                size;           /* set size */
	unsigned long           lastmod;        /* last modified time */
	struct ip_vs_dest_list  *list;          /* destination list */
	rwlock_t	        lock;           /* lock for this list */
};
|
||||
|
||||
|
||||
/*
 * Add a destination to the set (no-op if already present).  Takes a
 * reference on dest and stamps set->lastmod.  Returns the new node, or
 * NULL if the dest was already in the set or allocation failed.
 * Caller must hold set->lock for writing.
 */
static struct ip_vs_dest_list *
ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_list *e;

	for (e=set->list; e!=NULL; e=e->next) {
		if (e->dest == dest)
			/* already existed */
			return NULL;
	}

	e = kmalloc(sizeof(*e), GFP_ATOMIC);
	if (e == NULL) {
		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
		return NULL;
	}

	atomic_inc(&dest->refcnt);
	e->dest = dest;

	/* link it to the list (push front) */
	e->next = set->list;
	set->list = e;
	atomic_inc(&set->size);

	set->lastmod = jiffies;
	return e;
}
|
||||
|
||||
/*
 * Remove one destination from the set, dropping the reference taken at
 * insert time.  Silently does nothing if dest is not in the set.
 * Caller must hold set->lock for writing.
 */
static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_list *e, **ep;

	/* ep walks the link pointers so unlinking needs no prev node */
	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
		if (e->dest == dest) {
			/* HIT */
			*ep = e->next;
			atomic_dec(&set->size);
			set->lastmod = jiffies;
			atomic_dec(&e->dest->refcnt);
			kfree(e);
			break;
		}
		ep = &e->next;
	}
}
|
||||
|
||||
/*
 * Empty the set: unlink and free every node, dropping each node's dest
 * reference.  Takes set->lock itself (unlike insert/erase).
 */
static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
	struct ip_vs_dest_list *e, **ep;

	write_lock(&set->lock);
	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
		*ep = e->next;
		/*
		 * We don't kfree dest because it is refered either
		 * by its service or by the trash dest list.
		 */
		atomic_dec(&e->dest->refcnt);
		kfree(e);
	}
	write_unlock(&set->lock);
}
|
||||
|
||||
/* get weighted least-connection node in the destination set */
/*
 * Load is estimated as activeconns*50 + inactconns and compared
 * cross-multiplied by weight (no floats in the kernel).  Only servers
 * that are available, not overloaded and weighted > 0 qualify.
 * Returns NULL if the set is empty or no server qualifies.
 * Caller holds set->lock for reading.
 */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_list *e;
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	if (set == NULL)
		return NULL;

	/* select the first destination server, whose weight > 0 */
	for (e=set->list; e!=NULL; e=e->next) {
		least = e->dest;
		if (least->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if ((atomic_read(&least->weight) > 0)
		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/* find the destination with the weighted least load */
  nextstage:
	for (e=e->next; e!=NULL; e=e->next) {
		dest = e->dest;
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		if ((loh * atomic_read(&dest->weight) >
		     doh * atomic_read(&least->weight))
		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr.ip), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);
	return least;
}
|
||||
|
||||
|
||||
/* get weighted most-connection node in the destination set */
/*
 * Mirror of ip_vs_dest_set_min: picks the MOST loaded weighted server,
 * used to shrink a replicated set.  Returns NULL if the set is empty or
 * contains no server with weight > 0.  Caller holds set->lock.
 */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_list *e;
	struct ip_vs_dest *dest, *most;
	int moh, doh;

	if (set == NULL)
		return NULL;

	/* select the first destination server, whose weight > 0 */
	for (e=set->list; e!=NULL; e=e->next) {
		most = e->dest;
		if (atomic_read(&most->weight) > 0) {
			moh = atomic_read(&most->activeconns) * 50
				+ atomic_read(&most->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/* find the destination with the weighted most load */
  nextstage:
	for (e=e->next; e!=NULL; e=e->next) {
		dest = e->dest;
		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
		if ((moh * atomic_read(&dest->weight) <
		     doh * atomic_read(&most->weight))
		    && (atomic_read(&dest->weight) > 0)) {
			most = dest;
			moh = doh;
		}
	}

	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(most->addr.ip), ntohs(most->port),
		  atomic_read(&most->activeconns),
		  atomic_read(&most->refcnt),
		  atomic_read(&most->weight), moh);
	return most;
}
|
||||
|
||||
|
||||
/*
 *      IPVS lblcr entry represents an association between destination
 *      IP address and its destination server set
 */
struct ip_vs_lblcr_entry {
	struct list_head        list;		/* hash-bucket chain */
	__be32                   addr;          /* destination IP address */
	struct ip_vs_dest_set   set;            /* destination server set */
	unsigned long           lastuse;        /* last used time */
};


/*
 *      IPVS lblcr hash table
 */
struct ip_vs_lblcr_table {
	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	struct timer_list       periodic_timer; /* collect stale entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
};
|
||||
|
||||
|
||||
/*
 *      IPVS LBLCR sysctl table
 */
/* exposes sysctl_ip_vs_lblcr_expiration as "lblcr_expiration",
 * converted to/from seconds by proc_dointvec_jiffies */
static ctl_table vs_vars_table[] = {
	{
		.procname	= "lblcr_expiration",
		.data		= &sysctl_ip_vs_lblcr_expiration,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
	},
	{ .ctl_name = 0 }
};

/* handle returned at module init, needed for unregister at exit */
static struct ctl_table_header * sysctl_header;
|
||||
|
||||
/*
 * Unlink a cache entry from its bucket, release its whole server set
 * (drops the per-node dest references) and free the entry itself.
 */
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
	list_del(&en->list);
	ip_vs_dest_set_eraseall(&en->set);
	kfree(en);
}
|
||||
|
||||
|
||||
/*
|
||||
* Returns hash value for IPVS LBLCR entry
|
||||
*/
|
||||
static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
|
||||
{
|
||||
return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
|
||||
}
|
||||
|
||||
|
||||
/*
 *	Hash an entry in the ip_vs_lblcr_table.
 *	returns bool success.
 */
/* Caller must hold the service's sched_lock for writing; the entry is
 * pushed onto the bucket selected by its destination address. */
static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
	unsigned hash = ip_vs_lblcr_hashkey(en->addr);

	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}
|
||||
|
||||
|
||||
/*
 *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
 *  read lock.
 */
/* Linear scan of the single bucket for addr; returns NULL on miss. */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
{
	unsigned hash = ip_vs_lblcr_hashkey(addr);
	struct ip_vs_lblcr_entry *en;

	list_for_each_entry(en, &tbl->bucket[hash], list)
		if (en->addr == addr)
			return en;

	return NULL;
}
|
||||
|
||||
|
||||
/*
 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
 * IP address to a server. Called under write lock.
 */
/*
 * Looks up (or allocates and hashes) the entry for daddr, then adds
 * dest to its server set.  Returns the entry, or NULL on allocation
 * failure of a new entry.
 */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
		struct ip_vs_dest *dest)
{
	struct ip_vs_lblcr_entry *en;

	en = ip_vs_lblcr_get(tbl, daddr);
	if (!en) {
		en = kmalloc(sizeof(*en), GFP_ATOMIC);
		if (!en) {
			IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
			return NULL;
		}

		en->addr = daddr;
		en->lastuse = jiffies;

		/* initilize its dest set */
		atomic_set(&(en->set.size), 0);
		en->set.list = NULL;
		rwlock_init(&en->set.lock);

		ip_vs_lblcr_hash(tbl, en);
	}

	/* insert is a no-op when dest is already in the set */
	write_lock(&en->set.lock);
	ip_vs_dest_set_insert(&en->set, dest);
	write_unlock(&en->set.lock);

	return en;
}
|
||||
|
||||
|
||||
/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
{
	int i;
	struct ip_vs_lblcr_entry *en, *nxt;

	/* No locking required, only called during cleanup. */
	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
			ip_vs_lblcr_free(en);
		}
	}
}
|
||||
|
||||
|
||||
/*
 * Full expiration sweep: walk every bucket (round-robin from the rover)
 * and free entries idle longer than sysctl_ip_vs_lblcr_expiration.
 * Locking is per bucket so scheduling can interleave with the sweep.
 */
static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int i, j;
	struct ip_vs_lblcr_entry *en, *nxt;

	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			/* entry still fresh: keep it */
			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
				       now))
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}
|
||||
|
||||
|
||||
/*
 *      Periodical timer handler for IPVS lblcr table
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check is for this purpose now.
 */
/*
 * Timer callback; "data" is the ip_vs_service.  Mirrors the lblc
 * version: every COUNT_FOR_FULL_EXPIRATION-th run does a full sweep,
 * otherwise entries are collected only while the table exceeds
 * max_size.  Always re-arms itself.
 */
static void ip_vs_lblcr_check_expire(unsigned long data)
{
	struct ip_vs_service *svc = (struct ip_vs_service *) data;
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblcr_entry *en, *nxt;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblcr_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	/* free ~4/3 of the overshoot, capped at half the table */
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
|
||||
|
||||
/*
 * Scheduler init_service hook: allocate and initialize the per-service
 * LBLCR hash table and start the periodic GC timer.  Returns 0 or
 * -ENOMEM.
 */
static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblcr_table *tbl;

	/*
	 *    Allocate the ip_vs_lblcr_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
	if (tbl == NULL) {
		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	/* Soft limit: 16 entries per bucket on average before GC kicks in */
	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
			(unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}
|
||||
|
||||
|
||||
/*
 * Scheduler done_service hook: stop the GC timer, flush all cached
 * entries and free the table.  Always returns 0.
 */
static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;

	/* remove periodic timer; _sync so the callback is not running
	 * when the table is freed below */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblcr_flush(tbl);

	/* release the table itself */
	kfree(tbl);
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));

	return 0;
}
|
||||
|
||||
|
||||
/*
 * Weighted least-connection fallback over the whole destination list;
 * used when the replicated cache yields no usable server.  Returns the
 * chosen destination or NULL if no server with weight > 0 exists.
 */
static inline struct ip_vs_dest *
__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections in average. (This
	 * fifty times might not be accurate, we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *                dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* cross-multiplied load comparison: doh/dw < loh/lw */
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
		  "activeconns %d refcnt %d weight %d overhead %d\n",
		  NIPQUAD(least->addr.ip), ntohs(least->port),
		  atomic_read(&least->activeconns),
		  atomic_read(&least->refcnt),
		  atomic_read(&least->weight), loh);

	return least;
}
|
||||
|
||||
|
||||
/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
/*
 * Same heuristic as the lblc module: dest counts as overloaded when
 * its activeconns exceed its weight AND some other destination has
 * activeconns*2 < weight.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
|
||||
|
||||
|
||||
/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
/*
 * Main scheduler entry point for lblcr.  Per destination IP a SET of
 * servers is cached: the least-loaded member is reused when possible;
 * the set is grown (insert) when no member is usable, and shrunk
 * (erase most-loaded) once it has >1 member and lastmod is older than
 * the lblcr_expiration sysctl.  Returns the chosen destination or NULL.
 */
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	struct iphdr *iph = ip_hdr(skb);
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblcr_entry *en;

	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");

	/* First look in our cache */
	read_lock(&svc->sched_lock);
	en = ip_vs_lblcr_get(tbl, iph->daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/* Get the least loaded destination */
		read_lock(&en->set.lock);
		dest = ip_vs_dest_set_min(&en->set);
		read_unlock(&en->set.lock);

		/* More than one destination + enough time passed by, cleanup */
		if (atomic_read(&en->set.size) > 1 &&
				time_after(jiffies, en->set.lastmod +
				sysctl_ip_vs_lblcr_expiration)) {
			struct ip_vs_dest *m;

			write_lock(&en->set.lock);
			m = ip_vs_dest_set_max(&en->set);
			if (m)
				ip_vs_dest_set_erase(&en->set, m);
			write_unlock(&en->set.lock);
		}

		/* If the destination is not overloaded, use it */
		if (dest && !is_overloaded(dest, svc)) {
			read_unlock(&svc->sched_lock);
			goto out;
		}

		/* The cache entry is invalid, time to schedule */
		dest = __ip_vs_lblcr_schedule(svc, iph);
		if (!dest) {
			IP_VS_DBG(1, "no destination available\n");
			read_unlock(&svc->sched_lock);
			return NULL;
		}

		/* Update our cache entry */
		write_lock(&en->set.lock);
		ip_vs_dest_set_insert(&en->set, dest);
		write_unlock(&en->set.lock);
	}
	read_unlock(&svc->sched_lock);

	if (dest)
		goto out;

	/* No cache entry, time to schedule */
	dest = __ip_vs_lblcr_schedule(svc, iph);
	if (!dest) {
		IP_VS_DBG(1, "no destination available\n");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	write_lock(&svc->sched_lock);
	ip_vs_lblcr_new(tbl, iph->daddr, dest);
	write_unlock(&svc->sched_lock);

out:
	/* dest is guaranteed non-NULL on every path reaching here */
	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
		  "--> server %u.%u.%u.%u:%d\n",
		  NIPQUAD(iph->daddr),
		  NIPQUAD(dest->addr.ip),
		  ntohs(dest->port));

	return dest;
}
|
||||
|
||||
|
||||
/*
 *      IPVS LBLCR Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
{
	.name =			"lblcr",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
	/* IPv4 only: the scheduler reads iph->daddr directly */
	.supports_ipv6 =	0,
#endif
	.init_service =		ip_vs_lblcr_init_svc,
	.done_service =		ip_vs_lblcr_done_svc,
	.schedule =		ip_vs_lblcr_schedule,
};
|
||||
|
||||
|
||||
/*
 * Module init: register the lblcr sysctl entries, then the scheduler.
 * On scheduler-registration failure the sysctl table is rolled back.
 */
static int __init ip_vs_lblcr_init(void)
{
	int ret;

	sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
	if (ret)
		unregister_sysctl_table(sysctl_header);
	return ret;
}
|
||||
|
||||
|
||||
/*
 * Module exit: undo ip_vs_lblcr_init() — drop the sysctl entries and
 * unregister the scheduler.
 */
static void __exit ip_vs_lblcr_cleanup(void)
{
	unregister_sysctl_table(sysctl_header);
	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
}


module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");
|
103
net/netfilter/ipvs/ip_vs_lc.c
Normal file
103
net/netfilter/ipvs/ip_vs_lc.c
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* IPVS: Least-Connection Scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
* Wensong Zhang : added the ip_vs_lc_update_svc
|
||||
* Wensong Zhang : added any dest with weight=0 is quiesced
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
static inline unsigned int
|
||||
ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
|
||||
{
|
||||
/*
|
||||
* We think the overhead of processing active connections is 256
|
||||
* times higher than that of inactive connections in average. (This
|
||||
* 256 times might not be accurate, we will change it later) We
|
||||
* use the following formula to estimate the overhead now:
|
||||
* dest->activeconns*256 + dest->inactconns
|
||||
*/
|
||||
return (atomic_read(&dest->activeconns) << 8) +
|
||||
atomic_read(&dest->inactconns);
|
||||
}
|
||||
|
||||
|
||||
/*
 *	Least Connection scheduling
 */
/*
 * Pick the destination with the smallest overhead
 * (activeconns<<8 + inactconns), skipping overloaded and
 * zero-weight (quiesced) servers.  Returns NULL when no server
 * qualifies.
 */
static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_dest *dest, *least = NULL;
	unsigned int loh = 0, doh;

	IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");

	/*
	 * Simply select the server with the least number of
	 *        (activeconns<<5) + inactconns
	 * Except whose weight is equal to zero.
	 * If the weight is equal to zero, it means that the server is
	 * quiesced, the existing connections to the server still get
	 * served, but no new connection is assigned to the server.
	 */

	list_for_each_entry(dest, &svc->destinations, n_list) {
		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
		    atomic_read(&dest->weight) == 0)
			continue;
		doh = ip_vs_lc_dest_overhead(dest);
		/* !least makes the first qualifying server the baseline */
		if (!least || doh < loh) {
			least = dest;
			loh = doh;
		}
	}

	if (least)
	IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->inactconns));

	return least;
}
|
||||
|
||||
|
||||
/* Scheduler descriptor for "lc"; registered from ip_vs_lc_init(). */
static struct ip_vs_scheduler ip_vs_lc_scheduler = {
	.name =			"lc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
	.supports_ipv6 =	1,
#endif
	.schedule =		ip_vs_lc_schedule,
};
|
||||
|
||||
|
||||
static int __init ip_vs_lc_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
|
||||
}
|
||||
|
||||
static void __exit ip_vs_lc_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
|
||||
}
|
||||
|
||||
module_init(ip_vs_lc_init);
|
||||
module_exit(ip_vs_lc_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
138
net/netfilter/ipvs/ip_vs_nq.c
Normal file
138
net/netfilter/ipvs/ip_vs_nq.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* IPVS: Never Queue scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* The NQ algorithm adopts a two-speed model. When there is an idle server
|
||||
* available, the job will be sent to the idle server, instead of waiting
|
||||
* for a fast one. When there is no idle server available, the job will be
|
||||
* sent to the server that minimize its expected delay (The Shortest
|
||||
* Expected Delay scheduling algorithm).
|
||||
*
|
||||
* See the following paper for more information:
|
||||
* A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
|
||||
* in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
|
||||
* pages 986-994, 1988.
|
||||
*
|
||||
* Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
|
||||
*
|
||||
* The difference between NQ and SED is that NQ can improve overall
|
||||
* system utilization.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
static inline unsigned int
|
||||
ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
|
||||
{
|
||||
/*
|
||||
* We only use the active connection number in the cost
|
||||
* calculation here.
|
||||
*/
|
||||
return atomic_read(&dest->activeconns) + 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Weighted Least Connection scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_dest *dest, *least = NULL;
|
||||
unsigned int loh = 0, doh;
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
|
||||
|
||||
/*
|
||||
* We calculate the load of each dest server as follows:
|
||||
* (server expected overhead) / dest->weight
|
||||
*
|
||||
* Remember -- no floats in kernel mode!!!
|
||||
* The comparison of h1*w2 > h2*w1 is equivalent to that of
|
||||
* h1/w1 > h2/w2
|
||||
* if every weight is larger than zero.
|
||||
*
|
||||
* The server with weight=0 is quiesced and will not receive any
|
||||
* new connections.
|
||||
*/
|
||||
|
||||
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||
|
||||
if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
|
||||
!atomic_read(&dest->weight))
|
||||
continue;
|
||||
|
||||
doh = ip_vs_nq_dest_overhead(dest);
|
||||
|
||||
/* return the server directly if it is idle */
|
||||
if (atomic_read(&dest->activeconns) == 0) {
|
||||
least = dest;
|
||||
loh = doh;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!least ||
|
||||
(loh * atomic_read(&dest->weight) >
|
||||
doh * atomic_read(&least->weight))) {
|
||||
least = dest;
|
||||
loh = doh;
|
||||
}
|
||||
}
|
||||
|
||||
if (!least)
|
||||
return NULL;
|
||||
|
||||
out:
|
||||
IP_VS_DBG_BUF(6, "NQ: server %s:%u "
|
||||
"activeconns %d refcnt %d weight %d overhead %d\n",
|
||||
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
|
||||
atomic_read(&least->activeconns),
|
||||
atomic_read(&least->refcnt),
|
||||
atomic_read(&least->weight), loh);
|
||||
|
||||
return least;
|
||||
}
|
||||
|
||||
|
||||
/* Scheduler descriptor for "nq"; registered from ip_vs_nq_init(). */
static struct ip_vs_scheduler ip_vs_nq_scheduler =
{
	.name =			"nq",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
#ifdef CONFIG_IP_VS_IPV6
	.supports_ipv6 =	1,
#endif
	.schedule =		ip_vs_nq_schedule,
};

/* Register the "nq" scheduler with IPVS at module load. */
static int __init ip_vs_nq_init(void)
{
	return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
}

/* Unregister the "nq" scheduler at module unload. */
static void __exit ip_vs_nq_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
}

module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");
|
288
net/netfilter/ipvs/ip_vs_proto.c
Normal file
288
net/netfilter/ipvs/ip_vs_proto.c
Normal file
@@ -0,0 +1,288 @@
|
||||
/*
|
||||
* ip_vs_proto.c: transport protocol load balancing support for IPVS
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
* Julian Anastasov <ja@ssi.bg>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
#include <asm/system.h>
|
||||
#include <linux/stat.h>
|
||||
#include <linux/proc_fs.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
/*
 * IPVS protocols can only be registered/unregistered when the ipvs
 * module is loaded/unloaded, so no lock is needed in accessing the
 * ipvs protocol table.
 */

#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))

/* Hash table of registered protocols, chained through pp->next. */
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
|
||||
|
||||
|
||||
/*
|
||||
* register an ipvs protocol
|
||||
*/
|
||||
static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
|
||||
{
|
||||
unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
|
||||
|
||||
pp->next = ip_vs_proto_table[hash];
|
||||
ip_vs_proto_table[hash] = pp;
|
||||
|
||||
if (pp->init != NULL)
|
||||
pp->init(pp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* unregister an ipvs protocol
|
||||
*/
|
||||
static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
|
||||
{
|
||||
struct ip_vs_protocol **pp_p;
|
||||
unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
|
||||
|
||||
pp_p = &ip_vs_proto_table[hash];
|
||||
for (; *pp_p; pp_p = &(*pp_p)->next) {
|
||||
if (*pp_p == pp) {
|
||||
*pp_p = pp->next;
|
||||
if (pp->exit != NULL)
|
||||
pp->exit(pp);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return -ESRCH;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* get ip_vs_protocol object by its proto.
|
||||
*/
|
||||
struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
|
||||
{
|
||||
struct ip_vs_protocol *pp;
|
||||
unsigned hash = IP_VS_PROTO_HASH(proto);
|
||||
|
||||
for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
|
||||
if (pp->protocol == proto)
|
||||
return pp;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Propagate event for state change to all protocols
|
||||
*/
|
||||
void ip_vs_protocol_timeout_change(int flags)
|
||||
{
|
||||
struct ip_vs_protocol *pp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
|
||||
for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
|
||||
if (pp->timeout_change)
|
||||
pp->timeout_change(pp, flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * Duplicate a protocol timeout table (`size` is in bytes) so each user
 * owns a private, modifiable copy.  Returns NULL on allocation failure.
 */
int *
ip_vs_create_timeout_table(int *table, int size)
{
	return kmemdup(table, size, GFP_ATOMIC);
}
|
||||
|
||||
|
||||
/*
 *	Set timeout value for state specified by name
 *
 *	Looks up `name` among the `num` state names and stores the new
 *	timeout (`to` seconds, converted to jiffies) in the matching
 *	table slot.  Returns 0 on success, -EINVAL for a missing table,
 *	missing name or zero timeout, -ENOENT if the name is unknown.
 */
int
ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
{
	int idx;

	if (table == NULL || name == NULL || to == 0)
		return -EINVAL;

	for (idx = 0; idx < num; idx++) {
		if (strcmp(names[idx], name) == 0) {
			table[idx] = to * HZ;	/* seconds -> jiffies */
			return 0;
		}
	}

	return -ENOENT;
}
|
||||
|
||||
|
||||
/*
 * Human-readable name for a protocol state.  Falls back to "NONE" for
 * IPPROTO_IP (no protocol) and "ERR!" when no handler or name table
 * exists for `proto`.
 */
const char * ip_vs_state_name(__u16 proto, int state)
{
	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);

	if (pp && pp->state_name)
		return pp->state_name(state);

	return (proto == IPPROTO_IP) ? "NONE" : "ERR!";
}
|
||||
|
||||
|
||||
/*
 * Format a one-line IPv4 TCP/UDP packet summary into `buf` and emit it
 * at KERN_DEBUG.  Handles three cases: truncated header, non-first
 * fragment (ports unavailable), and the normal addr:port->addr:port.
 */
static void
ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
			     const struct sk_buff *skb,
			     int offset,
			     const char *msg)
{
	char buf[128];
	struct iphdr _iph, *ih;

	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
	if (ih == NULL)
		sprintf(buf, "%s TRUNCATED", pp->name);
	else if (ih->frag_off & htons(IP_OFFSET))
		/* Not the first fragment: transport header absent. */
		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
			pp->name, NIPQUAD(ih->saddr),
			NIPQUAD(ih->daddr));
	else {
		__be16 _ports[2], *pptr;

		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
					  sizeof(_ports), _ports);
		if (pptr == NULL)
			sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
				pp->name,
				NIPQUAD(ih->saddr),
				NIPQUAD(ih->daddr));
		else
			sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
				pp->name,
				NIPQUAD(ih->saddr),
				ntohs(pptr[0]),
				NIPQUAD(ih->daddr),
				ntohs(pptr[1]));
	}

	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}

#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 variant of the packet summary.  Note: only checks the immediate
 * nexthdr for IPPROTO_FRAGMENT; the port lookup assumes the transport
 * header follows the fixed IPv6 header (extension headers not walked).
 */
static void
ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
			     const struct sk_buff *skb,
			     int offset,
			     const char *msg)
{
	char buf[192];
	struct ipv6hdr _iph, *ih;

	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
	if (ih == NULL)
		sprintf(buf, "%s TRUNCATED", pp->name);
	else if (ih->nexthdr == IPPROTO_FRAGMENT)
		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
			pp->name, NIP6(ih->saddr),
			NIP6(ih->daddr));
	else {
		__be16 _ports[2], *pptr;

		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
					  sizeof(_ports), _ports);
		if (pptr == NULL)
			sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
				pp->name,
				NIP6(ih->saddr),
				NIP6(ih->daddr));
		else
			sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
				pp->name,
				NIP6(ih->saddr),
				ntohs(pptr[0]),
				NIP6(ih->daddr),
				ntohs(pptr[1]));
	}

	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
#endif
|
||||
|
||||
|
||||
/*
 * Dispatch packet debug logging to the address-family specific helper,
 * keyed on skb->protocol.
 */
void
ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
			  const struct sk_buff *skb,
			  int offset,
			  const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb->protocol == htons(ETH_P_IPV6)) {
		ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
		return;
	}
#endif
	ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
}
|
||||
|
||||
|
||||
/*
 * Register every compiled-in IPVS transport protocol and log the list.
 * Always returns 0 (register_ip_vs_protocol cannot fail).
 */
int __init ip_vs_protocol_init(void)
{
	char protocols[64];
	/* Register p and append ", <name>" to the protocols string. */
#define REGISTER_PROTOCOL(p)			\
	do {					\
		register_ip_vs_protocol(p);	\
		strcat(protocols, ", ");	\
		strcat(protocols, (p)->name);	\
	} while (0)

	protocols[0] = '\0';
	/* The log below prints from &protocols[2] to skip the leading
	 * ", "; terminate at [2] too so it is valid even when no
	 * protocol gets registered. */
	protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
#endif
#ifdef CONFIG_IP_VS_PROTO_AH
	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
#endif
	IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);

	return 0;
}
|
||||
|
||||
|
||||
void ip_vs_protocol_cleanup(void)
|
||||
{
|
||||
struct ip_vs_protocol *pp;
|
||||
int i;
|
||||
|
||||
/* unregister all the ipvs protocols */
|
||||
for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
|
||||
while ((pp = ip_vs_proto_table[i]) != NULL)
|
||||
unregister_ip_vs_protocol(pp);
|
||||
}
|
||||
}
|
235
net/netfilter/ipvs/ip_vs_proto_ah_esp.c
Normal file
235
net/netfilter/ipvs/ip_vs_proto_ah_esp.c
Normal file
@@ -0,0 +1,235 @@
|
||||
/*
|
||||
* ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
|
||||
*
|
||||
* Authors: Julian Anastasov <ja@ssi.bg>, February 2002
|
||||
* Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 2 as published by the Free Software Foundation;
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
/* TODO:
|
||||
|
||||
struct isakmp_hdr {
|
||||
__u8 icookie[8];
|
||||
__u8 rcookie[8];
|
||||
__u8 np;
|
||||
__u8 version;
|
||||
__u8 xchgtype;
|
||||
__u8 flags;
|
||||
__u32 msgid;
|
||||
__u32 length;
|
||||
};
|
||||
|
||||
*/
|
||||
|
||||
#define PORT_ISAKMP 500
|
||||
|
||||
|
||||
/*
 * Look up the UDP/ISAKMP (port 500) control connection that an AH/ESP
 * packet between the same pair of hosts belongs to.  `inverse` swaps
 * source and destination for reply-direction lookups.  Returns the
 * connection or NULL (a debug line is logged on miss).
 */
static struct ip_vs_conn *
ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
		   const struct ip_vs_iphdr *iph, unsigned int proto_off,
		   int inverse)
{
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
				       &iph->saddr,
				       htons(PORT_ISAKMP),
				       &iph->daddr,
				       htons(PORT_ISAKMP));
	} else {
		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
				       &iph->daddr,
				       htons(PORT_ISAKMP),
				       &iph->saddr,
				       htons(PORT_ISAKMP));
	}

	if (!cp) {
		/*
		 * We are not sure if the packet is from our
		 * service, so our conn_schedule hook should return NF_ACCEPT
		 */
		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
			      "%s%s %s->%s\n",
			      inverse ? "ICMP+" : "",
			      pp->name,
			      IP_VS_DBG_ADDR(af, &iph->saddr),
			      IP_VS_DBG_ADDR(af, &iph->daddr));
	}

	return cp;
}


/*
 * Outbound counterpart of ah_esp_conn_in_get(): resolve the ISAKMP
 * control connection for a packet leaving a real server.
 */
static struct ip_vs_conn *
ah_esp_conn_out_get(int af, const struct sk_buff *skb,
		    struct ip_vs_protocol *pp,
		    const struct ip_vs_iphdr *iph,
		    unsigned int proto_off,
		    int inverse)
{
	struct ip_vs_conn *cp;

	if (likely(!inverse)) {
		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
					&iph->saddr,
					htons(PORT_ISAKMP),
					&iph->daddr,
					htons(PORT_ISAKMP));
	} else {
		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
					&iph->daddr,
					htons(PORT_ISAKMP),
					&iph->saddr,
					htons(PORT_ISAKMP));
	}

	if (!cp) {
		IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
			      "%s%s %s->%s\n",
			      inverse ? "ICMP+" : "",
			      pp->name,
			      IP_VS_DBG_ADDR(af, &iph->saddr),
			      IP_VS_DBG_ADDR(af, &iph->daddr));
	}

	return cp;
}
|
||||
|
||||
|
||||
static int
|
||||
ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
int *verdict, struct ip_vs_conn **cpp)
|
||||
{
|
||||
/*
|
||||
* AH/ESP is only related traffic. Pass the packet to IP stack.
|
||||
*/
|
||||
*verdict = NF_ACCEPT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Log a one-line IPv4 AH/ESP packet summary at KERN_DEBUG. */
static void
ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
		       int offset, const char *msg)
{
	char buf[256];
	struct iphdr _iph, *ih;

	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
	if (ih == NULL)
		sprintf(buf, "%s TRUNCATED", pp->name);
	else
		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
			pp->name, NIPQUAD(ih->saddr),
			NIPQUAD(ih->daddr));

	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}

#ifdef CONFIG_IP_VS_IPV6
/* IPv6 variant of the AH/ESP packet summary. */
static void
ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
		       int offset, const char *msg)
{
	char buf[256];
	struct ipv6hdr _iph, *ih;

	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
	if (ih == NULL)
		sprintf(buf, "%s TRUNCATED", pp->name);
	else
		sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
			pp->name, NIP6(ih->saddr),
			NIP6(ih->daddr));

	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
#endif

/* Dispatch AH/ESP debug logging by address family. */
static void
ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
		    int offset, const char *msg)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb->protocol == htons(ETH_P_IPV6))
		ah_esp_debug_packet_v6(pp, skb, offset, msg);
	else
#endif
		ah_esp_debug_packet_v4(pp, skb, offset, msg);
}
|
||||
|
||||
|
||||
/* Protocol init hook: AH/ESP needs no per-protocol setup. */
static void ah_esp_init(struct ip_vs_protocol *pp)
{
	/* nothing to do now */
}


/* Protocol exit hook: AH/ESP needs no per-protocol teardown. */
static void ah_esp_exit(struct ip_vs_protocol *pp)
{
	/* nothing to do now */
}
|
||||
|
||||
|
||||
#ifdef CONFIG_IP_VS_PROTO_AH
/*
 * AH protocol descriptor.  Most hooks are NULL: AH packets are never
 * NATed or state-tracked here; they are matched to their ISAKMP
 * control connection by the conn_in/out_get callbacks above.
 */
struct ip_vs_protocol ip_vs_protocol_ah = {
	.name =			"AH",
	.protocol =		IPPROTO_AH,
	.num_states =		1,
	.dont_defrag =		1,
	.init =			ah_esp_init,
	.exit =			ah_esp_exit,
	.conn_schedule =	ah_esp_conn_schedule,
	.conn_in_get =		ah_esp_conn_in_get,
	.conn_out_get =		ah_esp_conn_out_get,
	.snat_handler =		NULL,
	.dnat_handler =		NULL,
	.csum_check =		NULL,
	.state_transition =	NULL,
	.register_app =		NULL,
	.unregister_app =	NULL,
	.app_conn_bind =	NULL,
	.debug_packet =		ah_esp_debug_packet,
	.timeout_change =	NULL,		/* ISAKMP */
	.set_state_timeout =	NULL,
};
#endif

#ifdef CONFIG_IP_VS_PROTO_ESP
/* ESP protocol descriptor; identical shape to the AH one above. */
struct ip_vs_protocol ip_vs_protocol_esp = {
	.name =			"ESP",
	.protocol =		IPPROTO_ESP,
	.num_states =		1,
	.dont_defrag =		1,
	.init =			ah_esp_init,
	.exit =			ah_esp_exit,
	.conn_schedule =	ah_esp_conn_schedule,
	.conn_in_get =		ah_esp_conn_in_get,
	.conn_out_get =		ah_esp_conn_out_get,
	.snat_handler =		NULL,
	.dnat_handler =		NULL,
	.csum_check =		NULL,
	.state_transition =	NULL,
	.register_app =		NULL,
	.unregister_app =	NULL,
	.app_conn_bind =	NULL,
	.debug_packet =		ah_esp_debug_packet,
	.timeout_change =	NULL,		/* ISAKMP */
};
#endif
|
732
net/netfilter/ipvs/ip_vs_proto_tcp.c
Normal file
732
net/netfilter/ipvs/ip_vs_proto_tcp.c
Normal file
@@ -0,0 +1,732 @@
|
||||
/*
|
||||
* ip_vs_proto_tcp.c: TCP load balancing support for IPVS
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
* Julian Anastasov <ja@ssi.bg>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/tcp.h> /* for tcphdr */
|
||||
#include <net/ip.h>
|
||||
#include <net/tcp.h> /* for csum_tcpudp_magic */
|
||||
#include <net/ip6_checksum.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
static struct ip_vs_conn *
|
||||
tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
const struct ip_vs_iphdr *iph, unsigned int proto_off,
|
||||
int inverse)
|
||||
{
|
||||
__be16 _ports[2], *pptr;
|
||||
|
||||
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
|
||||
if (pptr == NULL)
|
||||
return NULL;
|
||||
|
||||
if (likely(!inverse)) {
|
||||
return ip_vs_conn_in_get(af, iph->protocol,
|
||||
&iph->saddr, pptr[0],
|
||||
&iph->daddr, pptr[1]);
|
||||
} else {
|
||||
return ip_vs_conn_in_get(af, iph->protocol,
|
||||
&iph->daddr, pptr[1],
|
||||
&iph->saddr, pptr[0]);
|
||||
}
|
||||
}
|
||||
|
||||
static struct ip_vs_conn *
|
||||
tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
const struct ip_vs_iphdr *iph, unsigned int proto_off,
|
||||
int inverse)
|
||||
{
|
||||
__be16 _ports[2], *pptr;
|
||||
|
||||
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
|
||||
if (pptr == NULL)
|
||||
return NULL;
|
||||
|
||||
if (likely(!inverse)) {
|
||||
return ip_vs_conn_out_get(af, iph->protocol,
|
||||
&iph->saddr, pptr[0],
|
||||
&iph->daddr, pptr[1]);
|
||||
} else {
|
||||
return ip_vs_conn_out_get(af, iph->protocol,
|
||||
&iph->daddr, pptr[1],
|
||||
&iph->saddr, pptr[0]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * Decide whether an incoming TCP packet should be scheduled to a real
 * server.  Only SYN packets addressed to a configured virtual service
 * trigger scheduling; everything else returns 1 (continue normal
 * processing).  Returns 0 with *verdict set when IPVS consumes the
 * packet decision itself.
 */
static int
tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;
	struct ip_vs_iphdr iph;

	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
	if (th == NULL) {
		/* Truncated TCP header: drop. */
		*verdict = NF_DROP;
		return 0;
	}

	if (th->syn &&
	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
				     th->dest))) {
		if (ip_vs_todrop()) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb);
		if (!*cpp) {
			/* NOTE(review): no ip_vs_service_put() on this
			 * path — presumably ip_vs_leave() consumes the
			 * svc reference; confirm against its definition. */
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}
		ip_vs_service_put(svc);
	}
	return 1;
}
|
||||
|
||||
|
||||
/*
 * Incrementally patch the TCP checksum after an address+port rewrite,
 * folding the old/new address and port deltas into the existing
 * checksum (no full recompute).
 */
static inline void
tcp_fast_csum_update(int af, struct tcphdr *tcph,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
		     __be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
	else
#endif
		tcph->check =
			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
}


/*
 * Patch the TCP checksum for a CHECKSUM_PARTIAL skb: fold in the
 * address delta and the old/new segment-length delta (lengths are
 * passed in network byte order).
 */
static inline void
tcp_partial_csum_update(int af, struct tcphdr *tcph,
			const union nf_inet_addr *oldip,
			const union nf_inet_addr *newip,
			__be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldlen, newlen,
						~csum_unfold(tcph->check))));
	else
#endif
		tcph->check =
			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					 ip_vs_check_diff2(oldlen, newlen,
						~csum_unfold(tcph->check))));
}
|
||||
|
||||
|
||||
/*
 * Source-NAT an outbound TCP packet: rewrite the source port to the
 * virtual port and fix up the checksum by the cheapest applicable
 * method (partial / incremental / full recompute).  Returns 1 on
 * success, 0 if the skb cannot be made writable or an app helper or
 * checksum check fails.
 */
static int
tcp_snat_handler(struct sk_buff *skb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct tcphdr *tcph;
	unsigned int tcphoff;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
			return 0;

		/* Call application helper if needed */
		if (!ip_vs_app_pkt_out(cp, skb))
			return 0;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->source = cp->vport;

	/* Adjust TCP checksums */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htonl(oldlen),
					htonl(skb->len - tcphoff));
	} else if (!cp->app) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
				     cp->dport, cp->vport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
						      &cp->caddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
							cp->caddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);

		/* NOTE(review): unlike tcp_dnat_handler, this full-csum
		 * path does not set skb->ip_summed — confirm whether
		 * that asymmetry is intentional. */
		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, tcph->check,
			  (char*)&(tcph->check) - (char*)tcph);
	}
	return 1;
}
|
||||
|
||||
|
||||
/*
 * Destination-NAT an inbound TCP packet: rewrite the destination port
 * to the real server's port and fix up the checksum.  Returns 1 on
 * success, 0 if the skb cannot be made writable or an app helper or
 * checksum check fails.
 */
static int
tcp_dnat_handler(struct sk_buff *skb,
		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
	struct tcphdr *tcph;
	unsigned int tcphoff;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	if (cp->af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		/* Some checks before mangling */
		if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn and iph ack_seq stuff
		 */
		if (!ip_vs_app_pkt_in(cp, skb))
			return 0;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->dest = cp->dport;

	/*
	 *	Adjust TCP checksums
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* NOTE(review): address args here are daddr->vaddr, the
		 * same order as the SNAT path, although this handler
		 * rewrites in the vaddr->daddr direction — verify. */
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htonl(oldlen),
					htonl(skb->len - tcphoff));
	} else if (!cp->app) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
				     cp->vport, cp->dport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
						      &cp->daddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
							cp->daddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}
|
||||
|
||||
|
||||
/*
 * Verify the TCP checksum of an skb for the given address family.
 * Returns 1 if the checksum is valid or already verified by hardware,
 * 0 (with a rate-limited debug message) on failure.
 */
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
	unsigned int tcphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		/* Compute the sum, then validate it like
		 * CHECKSUM_COMPLETE below. */
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
		/* fallthrough */
	case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
		if (af == AF_INET6) {
			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len - tcphoff,
					    ipv6_hdr(skb)->nexthdr,
					    skb->csum)) {
				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		} else
#endif
			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
					      ip_hdr(skb)->daddr,
					      skb->len - tcphoff,
					      ip_hdr(skb)->protocol,
					      skb->csum)) {
				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		break;
	default:
		/* No need to checksum. */
		break;
	}

	return 1;
}
|
||||
|
||||
|
||||
/* Row offsets into the tcp_states transition table per direction. */
#define TCP_DIR_INPUT		0
#define TCP_DIR_OUTPUT		4
#define TCP_DIR_INPUT_ONLY	8

static const int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};

/*
 *	Timeout table[state]
 *	Per-state connection timeouts, in jiffies.
 */
static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};

/* Display names for the states above; indexed by state. */
static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};
|
||||
|
||||
#define sNO IP_VS_TCP_S_NONE
|
||||
#define sES IP_VS_TCP_S_ESTABLISHED
|
||||
#define sSS IP_VS_TCP_S_SYN_SENT
|
||||
#define sSR IP_VS_TCP_S_SYN_RECV
|
||||
#define sFW IP_VS_TCP_S_FIN_WAIT
|
||||
#define sTW IP_VS_TCP_S_TIME_WAIT
|
||||
#define sCL IP_VS_TCP_S_CLOSE
|
||||
#define sCW IP_VS_TCP_S_CLOSE_WAIT
|
||||
#define sLA IP_VS_TCP_S_LAST_ACK
|
||||
#define sLI IP_VS_TCP_S_LISTEN
|
||||
#define sSA IP_VS_TCP_S_SYNACK
|
||||
|
||||
struct tcp_states_t {
|
||||
int next_state[IP_VS_TCP_S_LAST];
|
||||
};
|
||||
|
||||
static const char * tcp_state_name(int state)
|
||||
{
|
||||
if (state >= IP_VS_TCP_S_LAST)
|
||||
return "ERR!";
|
||||
return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
|
||||
}
|
||||
|
||||
static struct tcp_states_t tcp_states [] = {
|
||||
/* INPUT */
|
||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
|
||||
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
|
||||
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
|
||||
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
|
||||
|
||||
/* OUTPUT */
|
||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
|
||||
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
|
||||
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
|
||||
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
|
||||
|
||||
/* INPUT-ONLY */
|
||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
|
||||
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
|
||||
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
|
||||
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
|
||||
};
|
||||
|
||||
static struct tcp_states_t tcp_states_dos [] = {
|
||||
/* INPUT */
|
||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
|
||||
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
|
||||
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
|
||||
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
|
||||
|
||||
/* OUTPUT */
|
||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
|
||||
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
|
||||
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
|
||||
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
|
||||
|
||||
/* INPUT-ONLY */
|
||||
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
|
||||
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
|
||||
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
|
||||
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
|
||||
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
|
||||
};
|
||||
|
||||
static struct tcp_states_t *tcp_state_table = tcp_states;
|
||||
|
||||
|
||||
static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
|
||||
{
|
||||
int on = (flags & 1); /* secure_tcp */
|
||||
|
||||
/*
|
||||
** FIXME: change secure_tcp to independent sysctl var
|
||||
** or make it per-service or per-app because it is valid
|
||||
** for most if not for all of the applications. Something
|
||||
** like "capabilities" (flags) for each object.
|
||||
*/
|
||||
tcp_state_table = (on? tcp_states_dos : tcp_states);
|
||||
}
|
||||
|
||||
static int
|
||||
tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
|
||||
{
|
||||
return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
|
||||
tcp_state_name_table, sname, to);
|
||||
}
|
||||
|
||||
static inline int tcp_state_idx(struct tcphdr *th)
|
||||
{
|
||||
if (th->rst)
|
||||
return 3;
|
||||
if (th->syn)
|
||||
return 0;
|
||||
if (th->fin)
|
||||
return 1;
|
||||
if (th->ack)
|
||||
return 2;
|
||||
return -1;
|
||||
}
|
||||
|
||||
static inline void
|
||||
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
|
||||
int direction, struct tcphdr *th)
|
||||
{
|
||||
int state_idx;
|
||||
int new_state = IP_VS_TCP_S_CLOSE;
|
||||
int state_off = tcp_state_off[direction];
|
||||
|
||||
/*
|
||||
* Update state offset to INPUT_ONLY if necessary
|
||||
* or delete NO_OUTPUT flag if output packet detected
|
||||
*/
|
||||
if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
|
||||
if (state_off == TCP_DIR_OUTPUT)
|
||||
cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
|
||||
else
|
||||
state_off = TCP_DIR_INPUT_ONLY;
|
||||
}
|
||||
|
||||
if ((state_idx = tcp_state_idx(th)) < 0) {
|
||||
IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
|
||||
goto tcp_state_out;
|
||||
}
|
||||
|
||||
new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
|
||||
|
||||
tcp_state_out:
|
||||
if (new_state != cp->state) {
|
||||
struct ip_vs_dest *dest = cp->dest;
|
||||
|
||||
IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
|
||||
"%s:%d state: %s->%s conn->refcnt:%d\n",
|
||||
pp->name,
|
||||
((state_off == TCP_DIR_OUTPUT) ?
|
||||
"output " : "input "),
|
||||
th->syn ? 'S' : '.',
|
||||
th->fin ? 'F' : '.',
|
||||
th->ack ? 'A' : '.',
|
||||
th->rst ? 'R' : '.',
|
||||
IP_VS_DBG_ADDR(cp->af, &cp->daddr),
|
||||
ntohs(cp->dport),
|
||||
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
|
||||
ntohs(cp->cport),
|
||||
tcp_state_name(cp->state),
|
||||
tcp_state_name(new_state),
|
||||
atomic_read(&cp->refcnt));
|
||||
|
||||
if (dest) {
|
||||
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
|
||||
(new_state != IP_VS_TCP_S_ESTABLISHED)) {
|
||||
atomic_dec(&dest->activeconns);
|
||||
atomic_inc(&dest->inactconns);
|
||||
cp->flags |= IP_VS_CONN_F_INACTIVE;
|
||||
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
|
||||
(new_state == IP_VS_TCP_S_ESTABLISHED)) {
|
||||
atomic_inc(&dest->activeconns);
|
||||
atomic_dec(&dest->inactconns);
|
||||
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cp->timeout = pp->timeout_table[cp->state = new_state];
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Handle state transitions
|
||||
*/
|
||||
static int
|
||||
tcp_state_transition(struct ip_vs_conn *cp, int direction,
|
||||
const struct sk_buff *skb,
|
||||
struct ip_vs_protocol *pp)
|
||||
{
|
||||
struct tcphdr _tcph, *th;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
|
||||
#else
|
||||
int ihl = ip_hdrlen(skb);
|
||||
#endif
|
||||
|
||||
th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
|
||||
if (th == NULL)
|
||||
return 0;
|
||||
|
||||
spin_lock(&cp->lock);
|
||||
set_tcp_state(pp, cp, direction, th);
|
||||
spin_unlock(&cp->lock);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Hash table for TCP application incarnations
|
||||
*/
|
||||
#define TCP_APP_TAB_BITS 4
|
||||
#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
|
||||
#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
|
||||
|
||||
static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
|
||||
static DEFINE_SPINLOCK(tcp_app_lock);
|
||||
|
||||
static inline __u16 tcp_app_hashkey(__be16 port)
|
||||
{
|
||||
return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
|
||||
& TCP_APP_TAB_MASK;
|
||||
}
|
||||
|
||||
|
||||
static int tcp_register_app(struct ip_vs_app *inc)
|
||||
{
|
||||
struct ip_vs_app *i;
|
||||
__u16 hash;
|
||||
__be16 port = inc->port;
|
||||
int ret = 0;
|
||||
|
||||
hash = tcp_app_hashkey(port);
|
||||
|
||||
spin_lock_bh(&tcp_app_lock);
|
||||
list_for_each_entry(i, &tcp_apps[hash], p_list) {
|
||||
if (i->port == port) {
|
||||
ret = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
list_add(&inc->p_list, &tcp_apps[hash]);
|
||||
atomic_inc(&ip_vs_protocol_tcp.appcnt);
|
||||
|
||||
out:
|
||||
spin_unlock_bh(&tcp_app_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
tcp_unregister_app(struct ip_vs_app *inc)
|
||||
{
|
||||
spin_lock_bh(&tcp_app_lock);
|
||||
atomic_dec(&ip_vs_protocol_tcp.appcnt);
|
||||
list_del(&inc->p_list);
|
||||
spin_unlock_bh(&tcp_app_lock);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
tcp_app_conn_bind(struct ip_vs_conn *cp)
|
||||
{
|
||||
int hash;
|
||||
struct ip_vs_app *inc;
|
||||
int result = 0;
|
||||
|
||||
/* Default binding: bind app only for NAT */
|
||||
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
|
||||
return 0;
|
||||
|
||||
/* Lookup application incarnations and bind the right one */
|
||||
hash = tcp_app_hashkey(cp->vport);
|
||||
|
||||
spin_lock(&tcp_app_lock);
|
||||
list_for_each_entry(inc, &tcp_apps[hash], p_list) {
|
||||
if (inc->port == cp->vport) {
|
||||
if (unlikely(!ip_vs_app_inc_get(inc)))
|
||||
break;
|
||||
spin_unlock(&tcp_app_lock);
|
||||
|
||||
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
|
||||
"%s:%u to app %s on port %u\n",
|
||||
__func__,
|
||||
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
|
||||
ntohs(cp->cport),
|
||||
IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
|
||||
ntohs(cp->vport),
|
||||
inc->name, ntohs(inc->port));
|
||||
|
||||
cp->app = inc;
|
||||
if (inc->init_conn)
|
||||
result = inc->init_conn(inc, cp);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
spin_unlock(&tcp_app_lock);
|
||||
|
||||
out:
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Set LISTEN timeout. (ip_vs_conn_put will setup timer)
|
||||
*/
|
||||
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
|
||||
{
|
||||
spin_lock(&cp->lock);
|
||||
cp->state = IP_VS_TCP_S_LISTEN;
|
||||
cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
|
||||
spin_unlock(&cp->lock);
|
||||
}
|
||||
|
||||
|
||||
static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
|
||||
{
|
||||
IP_VS_INIT_HASH_TABLE(tcp_apps);
|
||||
pp->timeout_table = tcp_timeouts;
|
||||
}
|
||||
|
||||
|
||||
static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
struct ip_vs_protocol ip_vs_protocol_tcp = {
|
||||
.name = "TCP",
|
||||
.protocol = IPPROTO_TCP,
|
||||
.num_states = IP_VS_TCP_S_LAST,
|
||||
.dont_defrag = 0,
|
||||
.appcnt = ATOMIC_INIT(0),
|
||||
.init = ip_vs_tcp_init,
|
||||
.exit = ip_vs_tcp_exit,
|
||||
.register_app = tcp_register_app,
|
||||
.unregister_app = tcp_unregister_app,
|
||||
.conn_schedule = tcp_conn_schedule,
|
||||
.conn_in_get = tcp_conn_in_get,
|
||||
.conn_out_get = tcp_conn_out_get,
|
||||
.snat_handler = tcp_snat_handler,
|
||||
.dnat_handler = tcp_dnat_handler,
|
||||
.csum_check = tcp_csum_check,
|
||||
.state_name = tcp_state_name,
|
||||
.state_transition = tcp_state_transition,
|
||||
.app_conn_bind = tcp_app_conn_bind,
|
||||
.debug_packet = ip_vs_tcpudp_debug_packet,
|
||||
.timeout_change = tcp_timeout_change,
|
||||
.set_state_timeout = tcp_set_state_timeout,
|
||||
};
|
net/netfilter/ipvs/ip_vs_proto_udp.c — new file, 533 lines (@@ -0,0 +1,533 @@)
|
||||
/*
|
||||
* ip_vs_proto_udp.c: UDP load balancing support for IPVS
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
* Julian Anastasov <ja@ssi.bg>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/udp.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/ip6_checksum.h>
|
||||
|
||||
static struct ip_vs_conn *
|
||||
udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
const struct ip_vs_iphdr *iph, unsigned int proto_off,
|
||||
int inverse)
|
||||
{
|
||||
struct ip_vs_conn *cp;
|
||||
__be16 _ports[2], *pptr;
|
||||
|
||||
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
|
||||
if (pptr == NULL)
|
||||
return NULL;
|
||||
|
||||
if (likely(!inverse)) {
|
||||
cp = ip_vs_conn_in_get(af, iph->protocol,
|
||||
&iph->saddr, pptr[0],
|
||||
&iph->daddr, pptr[1]);
|
||||
} else {
|
||||
cp = ip_vs_conn_in_get(af, iph->protocol,
|
||||
&iph->daddr, pptr[1],
|
||||
&iph->saddr, pptr[0]);
|
||||
}
|
||||
|
||||
return cp;
|
||||
}
|
||||
|
||||
|
||||
static struct ip_vs_conn *
|
||||
udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
const struct ip_vs_iphdr *iph, unsigned int proto_off,
|
||||
int inverse)
|
||||
{
|
||||
struct ip_vs_conn *cp;
|
||||
__be16 _ports[2], *pptr;
|
||||
|
||||
pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
|
||||
if (pptr == NULL)
|
||||
return NULL;
|
||||
|
||||
if (likely(!inverse)) {
|
||||
cp = ip_vs_conn_out_get(af, iph->protocol,
|
||||
&iph->saddr, pptr[0],
|
||||
&iph->daddr, pptr[1]);
|
||||
} else {
|
||||
cp = ip_vs_conn_out_get(af, iph->protocol,
|
||||
&iph->daddr, pptr[1],
|
||||
&iph->saddr, pptr[0]);
|
||||
}
|
||||
|
||||
return cp;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
|
||||
int *verdict, struct ip_vs_conn **cpp)
|
||||
{
|
||||
struct ip_vs_service *svc;
|
||||
struct udphdr _udph, *uh;
|
||||
struct ip_vs_iphdr iph;
|
||||
|
||||
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
|
||||
|
||||
uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
|
||||
if (uh == NULL) {
|
||||
*verdict = NF_DROP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
svc = ip_vs_service_get(af, skb->mark, iph.protocol,
|
||||
&iph.daddr, uh->dest);
|
||||
if (svc) {
|
||||
if (ip_vs_todrop()) {
|
||||
/*
|
||||
* It seems that we are very loaded.
|
||||
* We have to drop this packet :(
|
||||
*/
|
||||
ip_vs_service_put(svc);
|
||||
*verdict = NF_DROP;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Let the virtual server select a real server for the
|
||||
* incoming connection, and create a connection entry.
|
||||
*/
|
||||
*cpp = ip_vs_schedule(svc, skb);
|
||||
if (!*cpp) {
|
||||
*verdict = ip_vs_leave(svc, skb, pp);
|
||||
return 0;
|
||||
}
|
||||
ip_vs_service_put(svc);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
static inline void
|
||||
udp_fast_csum_update(int af, struct udphdr *uhdr,
|
||||
const union nf_inet_addr *oldip,
|
||||
const union nf_inet_addr *newip,
|
||||
__be16 oldport, __be16 newport)
|
||||
{
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6)
|
||||
uhdr->check =
|
||||
csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
|
||||
ip_vs_check_diff2(oldport, newport,
|
||||
~csum_unfold(uhdr->check))));
|
||||
else
|
||||
#endif
|
||||
uhdr->check =
|
||||
csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
|
||||
ip_vs_check_diff2(oldport, newport,
|
||||
~csum_unfold(uhdr->check))));
|
||||
if (!uhdr->check)
|
||||
uhdr->check = CSUM_MANGLED_0;
|
||||
}
|
||||
|
||||
static inline void
|
||||
udp_partial_csum_update(int af, struct udphdr *uhdr,
|
||||
const union nf_inet_addr *oldip,
|
||||
const union nf_inet_addr *newip,
|
||||
__be16 oldlen, __be16 newlen)
|
||||
{
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6)
|
||||
uhdr->check =
|
||||
csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
|
||||
ip_vs_check_diff2(oldlen, newlen,
|
||||
~csum_unfold(uhdr->check))));
|
||||
else
|
||||
#endif
|
||||
uhdr->check =
|
||||
csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
|
||||
ip_vs_check_diff2(oldlen, newlen,
|
||||
~csum_unfold(uhdr->check))));
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
udp_snat_handler(struct sk_buff *skb,
|
||||
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
|
||||
{
|
||||
struct udphdr *udph;
|
||||
unsigned int udphoff;
|
||||
int oldlen;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (cp->af == AF_INET6)
|
||||
udphoff = sizeof(struct ipv6hdr);
|
||||
else
|
||||
#endif
|
||||
udphoff = ip_hdrlen(skb);
|
||||
oldlen = skb->len - udphoff;
|
||||
|
||||
/* csum_check requires unshared skb */
|
||||
if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
|
||||
return 0;
|
||||
|
||||
if (unlikely(cp->app != NULL)) {
|
||||
/* Some checks before mangling */
|
||||
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Call application helper if needed
|
||||
*/
|
||||
if (!ip_vs_app_pkt_out(cp, skb))
|
||||
return 0;
|
||||
}
|
||||
|
||||
udph = (void *)skb_network_header(skb) + udphoff;
|
||||
udph->source = cp->vport;
|
||||
|
||||
/*
|
||||
* Adjust UDP checksums
|
||||
*/
|
||||
if (skb->ip_summed == CHECKSUM_PARTIAL) {
|
||||
udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
|
||||
htonl(oldlen),
|
||||
htonl(skb->len - udphoff));
|
||||
} else if (!cp->app && (udph->check != 0)) {
|
||||
/* Only port and addr are changed, do fast csum update */
|
||||
udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
|
||||
cp->dport, cp->vport);
|
||||
if (skb->ip_summed == CHECKSUM_COMPLETE)
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
} else {
|
||||
/* full checksum calculation */
|
||||
udph->check = 0;
|
||||
skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (cp->af == AF_INET6)
|
||||
udph->check = csum_ipv6_magic(&cp->vaddr.in6,
|
||||
&cp->caddr.in6,
|
||||
skb->len - udphoff,
|
||||
cp->protocol, skb->csum);
|
||||
else
|
||||
#endif
|
||||
udph->check = csum_tcpudp_magic(cp->vaddr.ip,
|
||||
cp->caddr.ip,
|
||||
skb->len - udphoff,
|
||||
cp->protocol,
|
||||
skb->csum);
|
||||
if (udph->check == 0)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
|
||||
pp->name, udph->check,
|
||||
(char*)&(udph->check) - (char*)udph);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
udp_dnat_handler(struct sk_buff *skb,
|
||||
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
|
||||
{
|
||||
struct udphdr *udph;
|
||||
unsigned int udphoff;
|
||||
int oldlen;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (cp->af == AF_INET6)
|
||||
udphoff = sizeof(struct ipv6hdr);
|
||||
else
|
||||
#endif
|
||||
udphoff = ip_hdrlen(skb);
|
||||
oldlen = skb->len - udphoff;
|
||||
|
||||
/* csum_check requires unshared skb */
|
||||
if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
|
||||
return 0;
|
||||
|
||||
if (unlikely(cp->app != NULL)) {
|
||||
/* Some checks before mangling */
|
||||
if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Attempt ip_vs_app call.
|
||||
* It will fix ip_vs_conn
|
||||
*/
|
||||
if (!ip_vs_app_pkt_in(cp, skb))
|
||||
return 0;
|
||||
}
|
||||
|
||||
udph = (void *)skb_network_header(skb) + udphoff;
|
||||
udph->dest = cp->dport;
|
||||
|
||||
/*
|
||||
* Adjust UDP checksums
|
||||
*/
|
||||
if (skb->ip_summed == CHECKSUM_PARTIAL) {
|
||||
udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
|
||||
htonl(oldlen),
|
||||
htonl(skb->len - udphoff));
|
||||
} else if (!cp->app && (udph->check != 0)) {
|
||||
/* Only port and addr are changed, do fast csum update */
|
||||
udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
|
||||
cp->vport, cp->dport);
|
||||
if (skb->ip_summed == CHECKSUM_COMPLETE)
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
} else {
|
||||
/* full checksum calculation */
|
||||
udph->check = 0;
|
||||
skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (cp->af == AF_INET6)
|
||||
udph->check = csum_ipv6_magic(&cp->caddr.in6,
|
||||
&cp->daddr.in6,
|
||||
skb->len - udphoff,
|
||||
cp->protocol, skb->csum);
|
||||
else
|
||||
#endif
|
||||
udph->check = csum_tcpudp_magic(cp->caddr.ip,
|
||||
cp->daddr.ip,
|
||||
skb->len - udphoff,
|
||||
cp->protocol,
|
||||
skb->csum);
|
||||
if (udph->check == 0)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
|
||||
{
|
||||
struct udphdr _udph, *uh;
|
||||
unsigned int udphoff;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6)
|
||||
udphoff = sizeof(struct ipv6hdr);
|
||||
else
|
||||
#endif
|
||||
udphoff = ip_hdrlen(skb);
|
||||
|
||||
uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
|
||||
if (uh == NULL)
|
||||
return 0;
|
||||
|
||||
if (uh->check != 0) {
|
||||
switch (skb->ip_summed) {
|
||||
case CHECKSUM_NONE:
|
||||
skb->csum = skb_checksum(skb, udphoff,
|
||||
skb->len - udphoff, 0);
|
||||
case CHECKSUM_COMPLETE:
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6) {
|
||||
if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
|
||||
&ipv6_hdr(skb)->daddr,
|
||||
skb->len - udphoff,
|
||||
ipv6_hdr(skb)->nexthdr,
|
||||
skb->csum)) {
|
||||
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
|
||||
"Failed checksum for");
|
||||
return 0;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
|
||||
ip_hdr(skb)->daddr,
|
||||
skb->len - udphoff,
|
||||
ip_hdr(skb)->protocol,
|
||||
skb->csum)) {
|
||||
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
|
||||
"Failed checksum for");
|
||||
return 0;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* No need to checksum. */
|
||||
break;
|
||||
}
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Note: the caller guarantees that only one of register_app,
|
||||
* unregister_app or app_conn_bind is called each time.
|
||||
*/
|
||||
|
||||
#define UDP_APP_TAB_BITS 4
|
||||
#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
|
||||
#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
|
||||
|
||||
static struct list_head udp_apps[UDP_APP_TAB_SIZE];
|
||||
static DEFINE_SPINLOCK(udp_app_lock);
|
||||
|
||||
static inline __u16 udp_app_hashkey(__be16 port)
|
||||
{
|
||||
return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
|
||||
& UDP_APP_TAB_MASK;
|
||||
}
|
||||
|
||||
|
||||
static int udp_register_app(struct ip_vs_app *inc)
|
||||
{
|
||||
struct ip_vs_app *i;
|
||||
__u16 hash;
|
||||
__be16 port = inc->port;
|
||||
int ret = 0;
|
||||
|
||||
hash = udp_app_hashkey(port);
|
||||
|
||||
|
||||
spin_lock_bh(&udp_app_lock);
|
||||
list_for_each_entry(i, &udp_apps[hash], p_list) {
|
||||
if (i->port == port) {
|
||||
ret = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
list_add(&inc->p_list, &udp_apps[hash]);
|
||||
atomic_inc(&ip_vs_protocol_udp.appcnt);
|
||||
|
||||
out:
|
||||
spin_unlock_bh(&udp_app_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
udp_unregister_app(struct ip_vs_app *inc)
|
||||
{
|
||||
spin_lock_bh(&udp_app_lock);
|
||||
atomic_dec(&ip_vs_protocol_udp.appcnt);
|
||||
list_del(&inc->p_list);
|
||||
spin_unlock_bh(&udp_app_lock);
|
||||
}
|
||||
|
||||
|
||||
static int udp_app_conn_bind(struct ip_vs_conn *cp)
|
||||
{
|
||||
int hash;
|
||||
struct ip_vs_app *inc;
|
||||
int result = 0;
|
||||
|
||||
/* Default binding: bind app only for NAT */
|
||||
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
|
||||
return 0;
|
||||
|
||||
/* Lookup application incarnations and bind the right one */
|
||||
hash = udp_app_hashkey(cp->vport);
|
||||
|
||||
spin_lock(&udp_app_lock);
|
||||
list_for_each_entry(inc, &udp_apps[hash], p_list) {
|
||||
if (inc->port == cp->vport) {
|
||||
if (unlikely(!ip_vs_app_inc_get(inc)))
|
||||
break;
|
||||
spin_unlock(&udp_app_lock);
|
||||
|
||||
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
|
||||
"%s:%u to app %s on port %u\n",
|
||||
__func__,
|
||||
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
|
||||
ntohs(cp->cport),
|
||||
IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
|
||||
ntohs(cp->vport),
|
||||
inc->name, ntohs(inc->port));
|
||||
|
||||
cp->app = inc;
|
||||
if (inc->init_conn)
|
||||
result = inc->init_conn(inc, cp);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
spin_unlock(&udp_app_lock);
|
||||
|
||||
out:
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
|
||||
[IP_VS_UDP_S_NORMAL] = 5*60*HZ,
|
||||
[IP_VS_UDP_S_LAST] = 2*HZ,
|
||||
};
|
||||
|
||||
static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
|
||||
[IP_VS_UDP_S_NORMAL] = "UDP",
|
||||
[IP_VS_UDP_S_LAST] = "BUG!",
|
||||
};
|
||||
|
||||
|
||||
static int
|
||||
udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
|
||||
{
|
||||
return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
|
||||
udp_state_name_table, sname, to);
|
||||
}
|
||||
|
||||
static const char * udp_state_name(int state)
|
||||
{
|
||||
if (state >= IP_VS_UDP_S_LAST)
|
||||
return "ERR!";
|
||||
return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
|
||||
}
|
||||
|
||||
static int
|
||||
udp_state_transition(struct ip_vs_conn *cp, int direction,
|
||||
const struct sk_buff *skb,
|
||||
struct ip_vs_protocol *pp)
|
||||
{
|
||||
cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void udp_init(struct ip_vs_protocol *pp)
|
||||
{
|
||||
IP_VS_INIT_HASH_TABLE(udp_apps);
|
||||
pp->timeout_table = udp_timeouts;
|
||||
}
|
||||
|
||||
static void udp_exit(struct ip_vs_protocol *pp)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
struct ip_vs_protocol ip_vs_protocol_udp = {
|
||||
.name = "UDP",
|
||||
.protocol = IPPROTO_UDP,
|
||||
.num_states = IP_VS_UDP_S_LAST,
|
||||
.dont_defrag = 0,
|
||||
.init = udp_init,
|
||||
.exit = udp_exit,
|
||||
.conn_schedule = udp_conn_schedule,
|
||||
.conn_in_get = udp_conn_in_get,
|
||||
.conn_out_get = udp_conn_out_get,
|
||||
.snat_handler = udp_snat_handler,
|
||||
.dnat_handler = udp_dnat_handler,
|
||||
.csum_check = udp_csum_check,
|
||||
.state_transition = udp_state_transition,
|
||||
.state_name = udp_state_name,
|
||||
.register_app = udp_register_app,
|
||||
.unregister_app = udp_unregister_app,
|
||||
.app_conn_bind = udp_app_conn_bind,
|
||||
.debug_packet = ip_vs_tcpudp_debug_packet,
|
||||
.timeout_change = NULL,
|
||||
.set_state_timeout = udp_set_state_timeout,
|
||||
};
|
net/netfilter/ipvs/ip_vs_rr.c — new file, 112 lines (@@ -0,0 +1,112 @@)
|
||||
/*
|
||||
* IPVS: Round-Robin Scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
* Peter Kese <peter.kese@ijs.si>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Fixes/Changes:
|
||||
* Wensong Zhang : changed the ip_vs_rr_schedule to return dest
|
||||
* Julian Anastasov : fixed the NULL pointer access bug in debugging
|
||||
* Wensong Zhang : changed some comestics things for debugging
|
||||
* Wensong Zhang : changed for the d-linked destination list
|
||||
* Wensong Zhang : added the ip_vs_rr_update_svc
|
||||
* Wensong Zhang : added any dest with weight=0 is quiesced
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
svc->sched_data = &svc->destinations;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
svc->sched_data = &svc->destinations;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Round-Robin Scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct list_head *p, *q;
|
||||
struct ip_vs_dest *dest;
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
|
||||
|
||||
write_lock(&svc->sched_lock);
|
||||
p = (struct list_head *)svc->sched_data;
|
||||
p = p->next;
|
||||
q = p;
|
||||
do {
|
||||
/* skip list head */
|
||||
if (q == &svc->destinations) {
|
||||
q = q->next;
|
||||
continue;
|
||||
}
|
||||
|
||||
dest = list_entry(q, struct ip_vs_dest, n_list);
|
||||
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
|
||||
atomic_read(&dest->weight) > 0)
|
||||
/* HIT */
|
||||
goto out;
|
||||
q = q->next;
|
||||
} while (q != p);
|
||||
write_unlock(&svc->sched_lock);
|
||||
return NULL;
|
||||
|
||||
out:
|
||||
svc->sched_data = q;
|
||||
write_unlock(&svc->sched_lock);
|
||||
IP_VS_DBG_BUF(6, "RR: server %s:%u "
|
||||
"activeconns %d refcnt %d weight %d\n",
|
||||
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
|
||||
atomic_read(&dest->activeconns),
|
||||
atomic_read(&dest->refcnt), atomic_read(&dest->weight));
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
static struct ip_vs_scheduler ip_vs_rr_scheduler = {
|
||||
.name = "rr", /* name */
|
||||
.refcnt = ATOMIC_INIT(0),
|
||||
.module = THIS_MODULE,
|
||||
.n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
.supports_ipv6 = 1,
|
||||
#endif
|
||||
.init_service = ip_vs_rr_init_svc,
|
||||
.update_service = ip_vs_rr_update_svc,
|
||||
.schedule = ip_vs_rr_schedule,
|
||||
};
|
||||
|
||||
static int __init ip_vs_rr_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
|
||||
}
|
||||
|
||||
static void __exit ip_vs_rr_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
|
||||
}
|
||||
|
||||
module_init(ip_vs_rr_init);
|
||||
module_exit(ip_vs_rr_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
net/netfilter/ipvs/ip_vs_sched.c — new file, 251 lines (@@ -0,0 +1,251 @@)
|
||||
/*
|
||||
* IPVS An implementation of the IP virtual server support for the
|
||||
* LINUX operating system. IPVS is now implemented as a module
|
||||
* over the Netfilter framework. IPVS can be used to build a
|
||||
* high-performance and highly available server based on a
|
||||
* cluster of servers.
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
* Peter Kese <peter.kese@ijs.si>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <asm/string.h>
|
||||
#include <linux/kmod.h>
|
||||
#include <linux/sysctl.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
/*
|
||||
* IPVS scheduler list
|
||||
*/
|
||||
static LIST_HEAD(ip_vs_schedulers);
|
||||
|
||||
/* lock for service table */
|
||||
static DEFINE_RWLOCK(__ip_vs_sched_lock);
|
||||
|
||||
|
||||
/*
|
||||
* Bind a service with a scheduler
|
||||
*/
|
||||
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
|
||||
struct ip_vs_scheduler *scheduler)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (svc == NULL) {
|
||||
IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (scheduler == NULL) {
|
||||
IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
svc->scheduler = scheduler;
|
||||
|
||||
if (scheduler->init_service) {
|
||||
ret = scheduler->init_service(svc);
|
||||
if (ret) {
|
||||
IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Unbind a service with its scheduler
|
||||
*/
|
||||
int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_scheduler *sched;
|
||||
|
||||
if (svc == NULL) {
|
||||
IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
sched = svc->scheduler;
|
||||
if (sched == NULL) {
|
||||
IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (sched->done_service) {
|
||||
if (sched->done_service(svc) != 0) {
|
||||
IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
svc->scheduler = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get scheduler in the scheduler list by name
|
||||
*/
|
||||
static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
|
||||
{
|
||||
struct ip_vs_scheduler *sched;
|
||||
|
||||
IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
|
||||
sched_name);
|
||||
|
||||
read_lock_bh(&__ip_vs_sched_lock);
|
||||
|
||||
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
|
||||
/*
|
||||
* Test and get the modules atomically
|
||||
*/
|
||||
if (sched->module && !try_module_get(sched->module)) {
|
||||
/*
|
||||
* This scheduler is just deleted
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
if (strcmp(sched_name, sched->name)==0) {
|
||||
/* HIT */
|
||||
read_unlock_bh(&__ip_vs_sched_lock);
|
||||
return sched;
|
||||
}
|
||||
if (sched->module)
|
||||
module_put(sched->module);
|
||||
}
|
||||
|
||||
read_unlock_bh(&__ip_vs_sched_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Lookup scheduler and try to load it if it doesn't exist
|
||||
*/
|
||||
struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
|
||||
{
|
||||
struct ip_vs_scheduler *sched;
|
||||
|
||||
/*
|
||||
* Search for the scheduler by sched_name
|
||||
*/
|
||||
sched = ip_vs_sched_getbyname(sched_name);
|
||||
|
||||
/*
|
||||
* If scheduler not found, load the module and search again
|
||||
*/
|
||||
if (sched == NULL) {
|
||||
request_module("ip_vs_%s", sched_name);
|
||||
sched = ip_vs_sched_getbyname(sched_name);
|
||||
}
|
||||
|
||||
return sched;
|
||||
}
|
||||
|
||||
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
|
||||
{
|
||||
if (scheduler->module)
|
||||
module_put(scheduler->module);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Register a scheduler in the scheduler list
|
||||
*/
|
||||
int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
|
||||
{
|
||||
struct ip_vs_scheduler *sched;
|
||||
|
||||
if (!scheduler) {
|
||||
IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!scheduler->name) {
|
||||
IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* increase the module use count */
|
||||
ip_vs_use_count_inc();
|
||||
|
||||
write_lock_bh(&__ip_vs_sched_lock);
|
||||
|
||||
if (!list_empty(&scheduler->n_list)) {
|
||||
write_unlock_bh(&__ip_vs_sched_lock);
|
||||
ip_vs_use_count_dec();
|
||||
IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
|
||||
"already linked\n", scheduler->name);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make sure that the scheduler with this name doesn't exist
|
||||
* in the scheduler list.
|
||||
*/
|
||||
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
|
||||
if (strcmp(scheduler->name, sched->name) == 0) {
|
||||
write_unlock_bh(&__ip_vs_sched_lock);
|
||||
ip_vs_use_count_dec();
|
||||
IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
|
||||
"already existed in the system\n",
|
||||
scheduler->name);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Add it into the d-linked scheduler list
|
||||
*/
|
||||
list_add(&scheduler->n_list, &ip_vs_schedulers);
|
||||
write_unlock_bh(&__ip_vs_sched_lock);
|
||||
|
||||
IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Unregister a scheduler from the scheduler list
|
||||
*/
|
||||
int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
|
||||
{
|
||||
if (!scheduler) {
|
||||
IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
write_lock_bh(&__ip_vs_sched_lock);
|
||||
if (list_empty(&scheduler->n_list)) {
|
||||
write_unlock_bh(&__ip_vs_sched_lock);
|
||||
IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
|
||||
"is not in the list. failed\n", scheduler->name);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove it from the d-linked scheduler list
|
||||
*/
|
||||
list_del(&scheduler->n_list);
|
||||
write_unlock_bh(&__ip_vs_sched_lock);
|
||||
|
||||
/* decrease the module use count */
|
||||
ip_vs_use_count_dec();
|
||||
|
||||
IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
|
||||
|
||||
return 0;
|
||||
}
|
140
net/netfilter/ipvs/ip_vs_sed.c
Normal file
140
net/netfilter/ipvs/ip_vs_sed.c
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
* IPVS: Shortest Expected Delay scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* The SED algorithm attempts to minimize each job's expected delay until
|
||||
* completion. The expected delay that the job will experience is
|
||||
* (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
|
||||
* jobs on the ith server and Ui is the fixed service rate (weight) of
|
||||
* the ith server. The SED algorithm adopts a greedy policy that each does
|
||||
* what is in its own best interest, i.e. to join the queue which would
|
||||
* minimize its expected delay of completion.
|
||||
*
|
||||
* See the following paper for more information:
|
||||
* A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
|
||||
* in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
|
||||
* pages 986-994, 1988.
|
||||
*
|
||||
* Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
|
||||
*
|
||||
* The difference between SED and WLC is that SED includes the incoming
|
||||
* job in the cost function (the increment of 1). SED may outperform
|
||||
* WLC, while scheduling big jobs under larger heterogeneous systems
|
||||
* (the server weight varies a lot).
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
static inline unsigned int
|
||||
ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
|
||||
{
|
||||
/*
|
||||
* We only use the active connection number in the cost
|
||||
* calculation here.
|
||||
*/
|
||||
return atomic_read(&dest->activeconns) + 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Weighted Least Connection scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_dest *dest, *least;
|
||||
unsigned int loh, doh;
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
|
||||
|
||||
/*
|
||||
* We calculate the load of each dest server as follows:
|
||||
* (server expected overhead) / dest->weight
|
||||
*
|
||||
* Remember -- no floats in kernel mode!!!
|
||||
* The comparison of h1*w2 > h2*w1 is equivalent to that of
|
||||
* h1/w1 > h2/w2
|
||||
* if every weight is larger than zero.
|
||||
*
|
||||
* The server with weight=0 is quiesced and will not receive any
|
||||
* new connections.
|
||||
*/
|
||||
|
||||
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
|
||||
atomic_read(&dest->weight) > 0) {
|
||||
least = dest;
|
||||
loh = ip_vs_sed_dest_overhead(least);
|
||||
goto nextstage;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Find the destination with the least load.
|
||||
*/
|
||||
nextstage:
|
||||
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
|
||||
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
|
||||
continue;
|
||||
doh = ip_vs_sed_dest_overhead(dest);
|
||||
if (loh * atomic_read(&dest->weight) >
|
||||
doh * atomic_read(&least->weight)) {
|
||||
least = dest;
|
||||
loh = doh;
|
||||
}
|
||||
}
|
||||
|
||||
IP_VS_DBG_BUF(6, "SED: server %s:%u "
|
||||
"activeconns %d refcnt %d weight %d overhead %d\n",
|
||||
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
|
||||
atomic_read(&least->activeconns),
|
||||
atomic_read(&least->refcnt),
|
||||
atomic_read(&least->weight), loh);
|
||||
|
||||
return least;
|
||||
}
|
||||
|
||||
|
||||
static struct ip_vs_scheduler ip_vs_sed_scheduler =
|
||||
{
|
||||
.name = "sed",
|
||||
.refcnt = ATOMIC_INIT(0),
|
||||
.module = THIS_MODULE,
|
||||
.n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
.supports_ipv6 = 1,
|
||||
#endif
|
||||
.schedule = ip_vs_sed_schedule,
|
||||
};
|
||||
|
||||
|
||||
static int __init ip_vs_sed_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
|
||||
}
|
||||
|
||||
static void __exit ip_vs_sed_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
|
||||
}
|
||||
|
||||
module_init(ip_vs_sed_init);
|
||||
module_exit(ip_vs_sed_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
258
net/netfilter/ipvs/ip_vs_sh.c
Normal file
258
net/netfilter/ipvs/ip_vs_sh.c
Normal file
@@ -0,0 +1,258 @@
|
||||
/*
|
||||
* IPVS: Source Hashing scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@gnuchina.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* The sh algorithm is to select server by the hash key of source IP
|
||||
* address. The pseudo code is as follows:
|
||||
*
|
||||
* n <- servernode[src_ip];
|
||||
* if (n is dead) OR
|
||||
* (n is overloaded) or (n.weight <= 0) then
|
||||
* return NULL;
|
||||
*
|
||||
* return n;
|
||||
*
|
||||
* Notes that servernode is a 256-bucket hash table that maps the hash
|
||||
* index derived from packet source IP address to the current server
|
||||
* array. If the sh scheduler is used in cache cluster, it is good to
|
||||
* combine it with cache_bypass feature. When the statically assigned
|
||||
* server is dead or overloaded, the load balancer can bypass the cache
|
||||
* server and send requests to the original server directly.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/ip.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
/*
|
||||
* IPVS SH bucket
|
||||
*/
|
||||
struct ip_vs_sh_bucket {
|
||||
struct ip_vs_dest *dest; /* real server (cache) */
|
||||
};
|
||||
|
||||
/*
|
||||
* for IPVS SH entry hash table
|
||||
*/
|
||||
#ifndef CONFIG_IP_VS_SH_TAB_BITS
|
||||
#define CONFIG_IP_VS_SH_TAB_BITS 8
|
||||
#endif
|
||||
#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
|
||||
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
|
||||
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
|
||||
|
||||
|
||||
/*
|
||||
* Returns hash value for IPVS SH entry
|
||||
*/
|
||||
static inline unsigned ip_vs_sh_hashkey(__be32 addr)
|
||||
{
|
||||
return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get ip_vs_dest associated with supplied parameters.
|
||||
*/
|
||||
static inline struct ip_vs_dest *
|
||||
ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
|
||||
{
|
||||
return (tbl[ip_vs_sh_hashkey(addr)]).dest;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Assign all the hash buckets of the specified table with the service.
|
||||
*/
|
||||
static int
|
||||
ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
|
||||
{
|
||||
int i;
|
||||
struct ip_vs_sh_bucket *b;
|
||||
struct list_head *p;
|
||||
struct ip_vs_dest *dest;
|
||||
|
||||
b = tbl;
|
||||
p = &svc->destinations;
|
||||
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
|
||||
if (list_empty(p)) {
|
||||
b->dest = NULL;
|
||||
} else {
|
||||
if (p == &svc->destinations)
|
||||
p = p->next;
|
||||
|
||||
dest = list_entry(p, struct ip_vs_dest, n_list);
|
||||
atomic_inc(&dest->refcnt);
|
||||
b->dest = dest;
|
||||
|
||||
p = p->next;
|
||||
}
|
||||
b++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Flush all the hash buckets of the specified table.
|
||||
*/
|
||||
static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
|
||||
{
|
||||
int i;
|
||||
struct ip_vs_sh_bucket *b;
|
||||
|
||||
b = tbl;
|
||||
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
|
||||
if (b->dest) {
|
||||
atomic_dec(&b->dest->refcnt);
|
||||
b->dest = NULL;
|
||||
}
|
||||
b++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_sh_bucket *tbl;
|
||||
|
||||
/* allocate the SH table for this service */
|
||||
tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
|
||||
GFP_ATOMIC);
|
||||
if (tbl == NULL) {
|
||||
IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
svc->sched_data = tbl;
|
||||
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
|
||||
"current service\n",
|
||||
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
|
||||
|
||||
/* assign the hash buckets with the updated service */
|
||||
ip_vs_sh_assign(tbl, svc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_sh_bucket *tbl = svc->sched_data;
|
||||
|
||||
/* got to clean up hash buckets here */
|
||||
ip_vs_sh_flush(tbl);
|
||||
|
||||
/* release the table itself */
|
||||
kfree(svc->sched_data);
|
||||
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
|
||||
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_sh_bucket *tbl = svc->sched_data;
|
||||
|
||||
/* got to clean up hash buckets here */
|
||||
ip_vs_sh_flush(tbl);
|
||||
|
||||
/* assign the hash buckets with the updated service */
|
||||
ip_vs_sh_assign(tbl, svc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
|
||||
* consider that the server is overloaded here.
|
||||
*/
|
||||
static inline int is_overloaded(struct ip_vs_dest *dest)
|
||||
{
|
||||
return dest->flags & IP_VS_DEST_F_OVERLOAD;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Source Hashing scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_dest *dest;
|
||||
struct ip_vs_sh_bucket *tbl;
|
||||
struct iphdr *iph = ip_hdr(skb);
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
|
||||
|
||||
tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
|
||||
dest = ip_vs_sh_get(tbl, iph->saddr);
|
||||
if (!dest
|
||||
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|
||||
|| atomic_read(&dest->weight) <= 0
|
||||
|| is_overloaded(dest)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
|
||||
"--> server %u.%u.%u.%u:%d\n",
|
||||
NIPQUAD(iph->saddr),
|
||||
NIPQUAD(dest->addr.ip),
|
||||
ntohs(dest->port));
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* IPVS SH Scheduler structure
|
||||
*/
|
||||
static struct ip_vs_scheduler ip_vs_sh_scheduler =
|
||||
{
|
||||
.name = "sh",
|
||||
.refcnt = ATOMIC_INIT(0),
|
||||
.module = THIS_MODULE,
|
||||
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
.supports_ipv6 = 0,
|
||||
#endif
|
||||
.init_service = ip_vs_sh_init_svc,
|
||||
.done_service = ip_vs_sh_done_svc,
|
||||
.update_service = ip_vs_sh_update_svc,
|
||||
.schedule = ip_vs_sh_schedule,
|
||||
};
|
||||
|
||||
|
||||
static int __init ip_vs_sh_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
|
||||
}
|
||||
|
||||
|
||||
static void __exit ip_vs_sh_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
|
||||
}
|
||||
|
||||
|
||||
module_init(ip_vs_sh_init);
|
||||
module_exit(ip_vs_sh_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
942
net/netfilter/ipvs/ip_vs_sync.c
Normal file
942
net/netfilter/ipvs/ip_vs_sync.c
Normal file
@@ -0,0 +1,942 @@
|
||||
/*
|
||||
* IPVS An implementation of the IP virtual server support for the
|
||||
* LINUX operating system. IPVS is now implemented as a module
|
||||
* over the NetFilter framework. IPVS can be used to build a
|
||||
* high-performance and highly available server based on a
|
||||
* cluster of servers.
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* ip_vs_sync: sync connection info from master load balancer to backups
|
||||
* through multicast
|
||||
*
|
||||
* Changes:
|
||||
* Alexandre Cassen : Added master & backup support at a time.
|
||||
* Alexandre Cassen : Added SyncID support for incoming sync
|
||||
* messages filtering.
|
||||
* Justin Ossevoort : Fix endian problem on sync message size.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/igmp.h> /* for ip_mc_join_group */
|
||||
#include <linux/udp.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <net/ip.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
|
||||
#define IP_VS_SYNC_PORT 8848 /* multicast port */
|
||||
|
||||
|
||||
/*
|
||||
* IPVS sync connection entry
|
||||
*/
|
||||
struct ip_vs_sync_conn {
|
||||
__u8 reserved;
|
||||
|
||||
/* Protocol, addresses and port numbers */
|
||||
__u8 protocol; /* Which protocol (TCP/UDP) */
|
||||
__be16 cport;
|
||||
__be16 vport;
|
||||
__be16 dport;
|
||||
__be32 caddr; /* client address */
|
||||
__be32 vaddr; /* virtual address */
|
||||
__be32 daddr; /* destination address */
|
||||
|
||||
/* Flags and state transition */
|
||||
__be16 flags; /* status flags */
|
||||
__be16 state; /* state info */
|
||||
|
||||
/* The sequence options start here */
|
||||
};
|
||||
|
||||
struct ip_vs_sync_conn_options {
|
||||
struct ip_vs_seq in_seq; /* incoming seq. struct */
|
||||
struct ip_vs_seq out_seq; /* outgoing seq. struct */
|
||||
};
|
||||
|
||||
struct ip_vs_sync_thread_data {
|
||||
struct socket *sock;
|
||||
char *buf;
|
||||
};
|
||||
|
||||
#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
|
||||
#define FULL_CONN_SIZE \
|
||||
(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
|
||||
|
||||
|
||||
/*
|
||||
The master mulitcasts messages to the backup load balancers in the
|
||||
following format.
|
||||
|
||||
0 1 2 3
|
||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
| Count Conns | SyncID | Size |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
| |
|
||||
| IPVS Sync Connection (1) |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
| . |
|
||||
| . |
|
||||
| . |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
| |
|
||||
| IPVS Sync Connection (n) |
|
||||
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
||||
*/
|
||||
|
||||
#define SYNC_MESG_HEADER_LEN 4
|
||||
#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
|
||||
|
||||
struct ip_vs_sync_mesg {
|
||||
__u8 nr_conns;
|
||||
__u8 syncid;
|
||||
__u16 size;
|
||||
|
||||
/* ip_vs_sync_conn entries start here */
|
||||
};
|
||||
|
||||
/* the maximum length of sync (sending/receiving) message */
|
||||
static int sync_send_mesg_maxlen;
|
||||
static int sync_recv_mesg_maxlen;
|
||||
|
||||
struct ip_vs_sync_buff {
|
||||
struct list_head list;
|
||||
unsigned long firstuse;
|
||||
|
||||
/* pointers for the message data */
|
||||
struct ip_vs_sync_mesg *mesg;
|
||||
unsigned char *head;
|
||||
unsigned char *end;
|
||||
};
|
||||
|
||||
|
||||
/* the sync_buff list head and the lock */
|
||||
static LIST_HEAD(ip_vs_sync_queue);
|
||||
static DEFINE_SPINLOCK(ip_vs_sync_lock);
|
||||
|
||||
/* current sync_buff for accepting new conn entries */
|
||||
static struct ip_vs_sync_buff *curr_sb = NULL;
|
||||
static DEFINE_SPINLOCK(curr_sb_lock);
|
||||
|
||||
/* ipvs sync daemon state */
|
||||
volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
|
||||
volatile int ip_vs_master_syncid = 0;
|
||||
volatile int ip_vs_backup_syncid = 0;
|
||||
|
||||
/* multicast interface name */
|
||||
char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
|
||||
char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
|
||||
|
||||
/* sync daemon tasks */
|
||||
static struct task_struct *sync_master_thread;
|
||||
static struct task_struct *sync_backup_thread;
|
||||
|
||||
/* multicast addr */
|
||||
static struct sockaddr_in mcast_addr = {
|
||||
.sin_family = AF_INET,
|
||||
.sin_port = __constant_htons(IP_VS_SYNC_PORT),
|
||||
.sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP),
|
||||
};
|
||||
|
||||
|
||||
static inline struct ip_vs_sync_buff *sb_dequeue(void)
|
||||
{
|
||||
struct ip_vs_sync_buff *sb;
|
||||
|
||||
spin_lock_bh(&ip_vs_sync_lock);
|
||||
if (list_empty(&ip_vs_sync_queue)) {
|
||||
sb = NULL;
|
||||
} else {
|
||||
sb = list_entry(ip_vs_sync_queue.next,
|
||||
struct ip_vs_sync_buff,
|
||||
list);
|
||||
list_del(&sb->list);
|
||||
}
|
||||
spin_unlock_bh(&ip_vs_sync_lock);
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
|
||||
{
|
||||
struct ip_vs_sync_buff *sb;
|
||||
|
||||
if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
|
||||
return NULL;
|
||||
|
||||
if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
|
||||
kfree(sb);
|
||||
return NULL;
|
||||
}
|
||||
sb->mesg->nr_conns = 0;
|
||||
sb->mesg->syncid = ip_vs_master_syncid;
|
||||
sb->mesg->size = 4;
|
||||
sb->head = (unsigned char *)sb->mesg + 4;
|
||||
sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
|
||||
sb->firstuse = jiffies;
|
||||
return sb;
|
||||
}
|
||||
|
||||
static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
|
||||
{
|
||||
kfree(sb->mesg);
|
||||
kfree(sb);
|
||||
}
|
||||
|
||||
static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
|
||||
{
|
||||
spin_lock(&ip_vs_sync_lock);
|
||||
if (ip_vs_sync_state & IP_VS_STATE_MASTER)
|
||||
list_add_tail(&sb->list, &ip_vs_sync_queue);
|
||||
else
|
||||
ip_vs_sync_buff_release(sb);
|
||||
spin_unlock(&ip_vs_sync_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the current sync buffer if it has been created for more
|
||||
* than the specified time or the specified time is zero.
|
||||
*/
|
||||
static inline struct ip_vs_sync_buff *
|
||||
get_curr_sync_buff(unsigned long time)
|
||||
{
|
||||
struct ip_vs_sync_buff *sb;
|
||||
|
||||
spin_lock_bh(&curr_sb_lock);
|
||||
if (curr_sb && (time == 0 ||
|
||||
time_before(jiffies - curr_sb->firstuse, time))) {
|
||||
sb = curr_sb;
|
||||
curr_sb = NULL;
|
||||
} else
|
||||
sb = NULL;
|
||||
spin_unlock_bh(&curr_sb_lock);
|
||||
return sb;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Add an ip_vs_conn information into the current sync_buff.
|
||||
* Called by ip_vs_in.
|
||||
*/
|
||||
void ip_vs_sync_conn(struct ip_vs_conn *cp)
|
||||
{
|
||||
struct ip_vs_sync_mesg *m;
|
||||
struct ip_vs_sync_conn *s;
|
||||
int len;
|
||||
|
||||
spin_lock(&curr_sb_lock);
|
||||
if (!curr_sb) {
|
||||
if (!(curr_sb=ip_vs_sync_buff_create())) {
|
||||
spin_unlock(&curr_sb_lock);
|
||||
IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
|
||||
SIMPLE_CONN_SIZE;
|
||||
m = curr_sb->mesg;
|
||||
s = (struct ip_vs_sync_conn *)curr_sb->head;
|
||||
|
||||
/* copy members */
|
||||
s->protocol = cp->protocol;
|
||||
s->cport = cp->cport;
|
||||
s->vport = cp->vport;
|
||||
s->dport = cp->dport;
|
||||
s->caddr = cp->caddr.ip;
|
||||
s->vaddr = cp->vaddr.ip;
|
||||
s->daddr = cp->daddr.ip;
|
||||
s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
|
||||
s->state = htons(cp->state);
|
||||
if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
|
||||
struct ip_vs_sync_conn_options *opt =
|
||||
(struct ip_vs_sync_conn_options *)&s[1];
|
||||
memcpy(opt, &cp->in_seq, sizeof(*opt));
|
||||
}
|
||||
|
||||
m->nr_conns++;
|
||||
m->size += len;
|
||||
curr_sb->head += len;
|
||||
|
||||
/* check if there is a space for next one */
|
||||
if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
|
||||
sb_queue_tail(curr_sb);
|
||||
curr_sb = NULL;
|
||||
}
|
||||
spin_unlock(&curr_sb_lock);
|
||||
|
||||
/* synchronize its controller if it has */
|
||||
if (cp->control)
|
||||
ip_vs_sync_conn(cp->control);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Process received multicast message and create the corresponding
|
||||
* ip_vs_conn entries.
|
||||
*/
|
||||
static void ip_vs_process_message(const char *buffer, const size_t buflen)
|
||||
{
|
||||
struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
|
||||
struct ip_vs_sync_conn *s;
|
||||
struct ip_vs_sync_conn_options *opt;
|
||||
struct ip_vs_conn *cp;
|
||||
struct ip_vs_protocol *pp;
|
||||
struct ip_vs_dest *dest;
|
||||
char *p;
|
||||
int i;
|
||||
|
||||
if (buflen < sizeof(struct ip_vs_sync_mesg)) {
|
||||
IP_VS_ERR_RL("sync message header too short\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Convert size back to host byte order */
|
||||
m->size = ntohs(m->size);
|
||||
|
||||
if (buflen != m->size) {
|
||||
IP_VS_ERR_RL("bogus sync message size\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* SyncID sanity check */
|
||||
if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
|
||||
IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
|
||||
m->syncid);
|
||||
return;
|
||||
}
|
||||
|
||||
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
|
||||
for (i=0; i<m->nr_conns; i++) {
|
||||
unsigned flags, state;
|
||||
|
||||
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
|
||||
IP_VS_ERR_RL("bogus conn in sync message\n");
|
||||
return;
|
||||
}
|
||||
s = (struct ip_vs_sync_conn *) p;
|
||||
flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
|
||||
flags &= ~IP_VS_CONN_F_HASHED;
|
||||
if (flags & IP_VS_CONN_F_SEQ_MASK) {
|
||||
opt = (struct ip_vs_sync_conn_options *)&s[1];
|
||||
p += FULL_CONN_SIZE;
|
||||
if (p > buffer+buflen) {
|
||||
IP_VS_ERR_RL("bogus conn options in sync message\n");
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
opt = NULL;
|
||||
p += SIMPLE_CONN_SIZE;
|
||||
}
|
||||
|
||||
state = ntohs(s->state);
|
||||
if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
|
||||
pp = ip_vs_proto_get(s->protocol);
|
||||
if (!pp) {
|
||||
IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
|
||||
s->protocol);
|
||||
continue;
|
||||
}
|
||||
if (state >= pp->num_states) {
|
||||
IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
|
||||
pp->name, state);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
/* protocol in templates is not used for state/timeout */
|
||||
pp = NULL;
|
||||
if (state > 0) {
|
||||
IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
|
||||
state);
|
||||
state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(flags & IP_VS_CONN_F_TEMPLATE))
|
||||
cp = ip_vs_conn_in_get(AF_INET, s->protocol,
|
||||
(union nf_inet_addr *)&s->caddr,
|
||||
s->cport,
|
||||
(union nf_inet_addr *)&s->vaddr,
|
||||
s->vport);
|
||||
else
|
||||
cp = ip_vs_ct_in_get(AF_INET, s->protocol,
|
||||
(union nf_inet_addr *)&s->caddr,
|
||||
s->cport,
|
||||
(union nf_inet_addr *)&s->vaddr,
|
||||
s->vport);
|
||||
if (!cp) {
|
||||
/*
|
||||
* Find the appropriate destination for the connection.
|
||||
* If it is not found the connection will remain unbound
|
||||
* but still handled.
|
||||
*/
|
||||
dest = ip_vs_find_dest(AF_INET,
|
||||
(union nf_inet_addr *)&s->daddr,
|
||||
s->dport,
|
||||
(union nf_inet_addr *)&s->vaddr,
|
||||
s->vport,
|
||||
s->protocol);
|
||||
/* Set the approprite ativity flag */
|
||||
if (s->protocol == IPPROTO_TCP) {
|
||||
if (state != IP_VS_TCP_S_ESTABLISHED)
|
||||
flags |= IP_VS_CONN_F_INACTIVE;
|
||||
else
|
||||
flags &= ~IP_VS_CONN_F_INACTIVE;
|
||||
}
|
||||
cp = ip_vs_conn_new(AF_INET, s->protocol,
|
||||
(union nf_inet_addr *)&s->caddr,
|
||||
s->cport,
|
||||
(union nf_inet_addr *)&s->vaddr,
|
||||
s->vport,
|
||||
(union nf_inet_addr *)&s->daddr,
|
||||
s->dport,
|
||||
flags, dest);
|
||||
if (dest)
|
||||
atomic_dec(&dest->refcnt);
|
||||
if (!cp) {
|
||||
IP_VS_ERR("ip_vs_conn_new failed\n");
|
||||
return;
|
||||
}
|
||||
} else if (!cp->dest) {
|
||||
dest = ip_vs_try_bind_dest(cp);
|
||||
if (dest)
|
||||
atomic_dec(&dest->refcnt);
|
||||
} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
|
||||
(cp->state != state)) {
|
||||
/* update active/inactive flag for the connection */
|
||||
dest = cp->dest;
|
||||
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
|
||||
(state != IP_VS_TCP_S_ESTABLISHED)) {
|
||||
atomic_dec(&dest->activeconns);
|
||||
atomic_inc(&dest->inactconns);
|
||||
cp->flags |= IP_VS_CONN_F_INACTIVE;
|
||||
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
|
||||
(state == IP_VS_TCP_S_ESTABLISHED)) {
|
||||
atomic_inc(&dest->activeconns);
|
||||
atomic_dec(&dest->inactconns);
|
||||
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
|
||||
}
|
||||
}
|
||||
|
||||
if (opt)
|
||||
memcpy(&cp->in_seq, opt, sizeof(*opt));
|
||||
atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
|
||||
cp->state = state;
|
||||
cp->old_state = cp->state;
|
||||
/*
|
||||
* We can not recover the right timeout for templates
|
||||
* in all cases, we can not find the right fwmark
|
||||
* virtual service. If needed, we can do it for
|
||||
* non-fwmark persistent services.
|
||||
*/
|
||||
if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
|
||||
cp->timeout = pp->timeout_table[state];
|
||||
else
|
||||
cp->timeout = (3*60*HZ);
|
||||
ip_vs_conn_put(cp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Setup loopback of outgoing multicasts on a sending socket
|
||||
*/
|
||||
static void set_mcast_loop(struct sock *sk, u_char loop)
|
||||
{
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
|
||||
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
|
||||
lock_sock(sk);
|
||||
inet->mc_loop = loop ? 1 : 0;
|
||||
release_sock(sk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Specify TTL for outgoing multicasts on a sending socket
|
||||
*/
|
||||
static void set_mcast_ttl(struct sock *sk, u_char ttl)
|
||||
{
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
|
||||
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
|
||||
lock_sock(sk);
|
||||
inet->mc_ttl = ttl;
|
||||
release_sock(sk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Specifiy default interface for outgoing multicasts
|
||||
*/
|
||||
static int set_mcast_if(struct sock *sk, char *ifname)
|
||||
{
|
||||
struct net_device *dev;
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
|
||||
if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
|
||||
return -ENODEV;
|
||||
|
||||
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
|
||||
return -EINVAL;
|
||||
|
||||
lock_sock(sk);
|
||||
inet->mc_index = dev->ifindex;
|
||||
/* inet->mc_addr = 0; */
|
||||
release_sock(sk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Set the maximum length of sync message according to the
|
||||
* specified interface's MTU.
|
||||
*/
|
||||
static int set_sync_mesg_maxlen(int sync_state)
|
||||
{
|
||||
struct net_device *dev;
|
||||
int num;
|
||||
|
||||
if (sync_state == IP_VS_STATE_MASTER) {
|
||||
if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
|
||||
return -ENODEV;
|
||||
|
||||
num = (dev->mtu - sizeof(struct iphdr) -
|
||||
sizeof(struct udphdr) -
|
||||
SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
|
||||
sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
|
||||
SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
|
||||
IP_VS_DBG(7, "setting the maximum length of sync sending "
|
||||
"message %d.\n", sync_send_mesg_maxlen);
|
||||
} else if (sync_state == IP_VS_STATE_BACKUP) {
|
||||
if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
|
||||
return -ENODEV;
|
||||
|
||||
sync_recv_mesg_maxlen = dev->mtu -
|
||||
sizeof(struct iphdr) - sizeof(struct udphdr);
|
||||
IP_VS_DBG(7, "setting the maximum length of sync receiving "
|
||||
"message %d.\n", sync_recv_mesg_maxlen);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Join a multicast group.
|
||||
* the group is specified by a class D multicast address 224.0.0.0/8
|
||||
* in the in_addr structure passed in as a parameter.
|
||||
*/
|
||||
static int
|
||||
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
|
||||
{
|
||||
struct ip_mreqn mreq;
|
||||
struct net_device *dev;
|
||||
int ret;
|
||||
|
||||
memset(&mreq, 0, sizeof(mreq));
|
||||
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
|
||||
|
||||
if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
|
||||
return -ENODEV;
|
||||
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
|
||||
return -EINVAL;
|
||||
|
||||
mreq.imr_ifindex = dev->ifindex;
|
||||
|
||||
lock_sock(sk);
|
||||
ret = ip_mc_join_group(sk, &mreq);
|
||||
release_sock(sk);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static int bind_mcastif_addr(struct socket *sock, char *ifname)
|
||||
{
|
||||
struct net_device *dev;
|
||||
__be32 addr;
|
||||
struct sockaddr_in sin;
|
||||
|
||||
if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
|
||||
return -ENODEV;
|
||||
|
||||
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
|
||||
if (!addr)
|
||||
IP_VS_ERR("You probably need to specify IP address on "
|
||||
"multicast interface.\n");
|
||||
|
||||
IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
|
||||
ifname, NIPQUAD(addr));
|
||||
|
||||
/* Now bind the socket with the address of multicast interface */
|
||||
sin.sin_family = AF_INET;
|
||||
sin.sin_addr.s_addr = addr;
|
||||
sin.sin_port = 0;
|
||||
|
||||
return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up sending multicast socket over UDP
|
||||
*/
|
||||
static struct socket * make_send_sock(void)
|
||||
{
|
||||
struct socket *sock;
|
||||
int result;
|
||||
|
||||
/* First create a socket */
|
||||
result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error during creation of socket; terminating\n");
|
||||
return ERR_PTR(result);
|
||||
}
|
||||
|
||||
result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error setting outbound mcast interface\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
set_mcast_loop(sock->sk, 0);
|
||||
set_mcast_ttl(sock->sk, 1);
|
||||
|
||||
result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error binding address of the mcast interface\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
|
||||
sizeof(struct sockaddr), 0);
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error connecting to the multicast addr\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
return sock;
|
||||
|
||||
error:
|
||||
sock_release(sock);
|
||||
return ERR_PTR(result);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Set up receiving multicast socket over UDP
|
||||
*/
|
||||
static struct socket * make_receive_sock(void)
|
||||
{
|
||||
struct socket *sock;
|
||||
int result;
|
||||
|
||||
/* First create a socket */
|
||||
result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error during creation of socket; terminating\n");
|
||||
return ERR_PTR(result);
|
||||
}
|
||||
|
||||
/* it is equivalent to the REUSEADDR option in user-space */
|
||||
sock->sk->sk_reuse = 1;
|
||||
|
||||
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
|
||||
sizeof(struct sockaddr));
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error binding to the multicast addr\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* join the multicast group */
|
||||
result = join_mcast_group(sock->sk,
|
||||
(struct in_addr *) &mcast_addr.sin_addr,
|
||||
ip_vs_backup_mcast_ifn);
|
||||
if (result < 0) {
|
||||
IP_VS_ERR("Error joining to the multicast group\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
return sock;
|
||||
|
||||
error:
|
||||
sock_release(sock);
|
||||
return ERR_PTR(result);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
|
||||
{
|
||||
struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
|
||||
struct kvec iov;
|
||||
int len;
|
||||
|
||||
EnterFunction(7);
|
||||
iov.iov_base = (void *)buffer;
|
||||
iov.iov_len = length;
|
||||
|
||||
len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
|
||||
|
||||
LeaveFunction(7);
|
||||
return len;
|
||||
}
|
||||
|
||||
static void
|
||||
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
|
||||
{
|
||||
int msize;
|
||||
|
||||
msize = msg->size;
|
||||
|
||||
/* Put size in network byte order */
|
||||
msg->size = htons(msg->size);
|
||||
|
||||
if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
|
||||
IP_VS_ERR("ip_vs_send_async error\n");
|
||||
}
|
||||
|
||||
static int
|
||||
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
|
||||
{
|
||||
struct msghdr msg = {NULL,};
|
||||
struct kvec iov;
|
||||
int len;
|
||||
|
||||
EnterFunction(7);
|
||||
|
||||
/* Receive a packet */
|
||||
iov.iov_base = buffer;
|
||||
iov.iov_len = (size_t)buflen;
|
||||
|
||||
len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
|
||||
|
||||
if (len < 0)
|
||||
return -1;
|
||||
|
||||
LeaveFunction(7);
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
static int sync_thread_master(void *data)
|
||||
{
|
||||
struct ip_vs_sync_thread_data *tinfo = data;
|
||||
struct ip_vs_sync_buff *sb;
|
||||
|
||||
IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
|
||||
"syncid = %d\n",
|
||||
ip_vs_master_mcast_ifn, ip_vs_master_syncid);
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
while ((sb = sb_dequeue())) {
|
||||
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
|
||||
ip_vs_sync_buff_release(sb);
|
||||
}
|
||||
|
||||
/* check if entries stay in curr_sb for 2 seconds */
|
||||
sb = get_curr_sync_buff(2 * HZ);
|
||||
if (sb) {
|
||||
ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
|
||||
ip_vs_sync_buff_release(sb);
|
||||
}
|
||||
|
||||
schedule_timeout_interruptible(HZ);
|
||||
}
|
||||
|
||||
/* clean up the sync_buff queue */
|
||||
while ((sb=sb_dequeue())) {
|
||||
ip_vs_sync_buff_release(sb);
|
||||
}
|
||||
|
||||
/* clean up the current sync_buff */
|
||||
if ((sb = get_curr_sync_buff(0))) {
|
||||
ip_vs_sync_buff_release(sb);
|
||||
}
|
||||
|
||||
/* release the sending multicast socket */
|
||||
sock_release(tinfo->sock);
|
||||
kfree(tinfo);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int sync_thread_backup(void *data)
|
||||
{
|
||||
struct ip_vs_sync_thread_data *tinfo = data;
|
||||
int len;
|
||||
|
||||
IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
|
||||
"syncid = %d\n",
|
||||
ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
|
||||
!skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
|
||||
|| kthread_should_stop());
|
||||
|
||||
/* do we have data now? */
|
||||
while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
|
||||
len = ip_vs_receive(tinfo->sock, tinfo->buf,
|
||||
sync_recv_mesg_maxlen);
|
||||
if (len <= 0) {
|
||||
IP_VS_ERR("receiving message error\n");
|
||||
break;
|
||||
}
|
||||
|
||||
/* disable bottom half, because it accesses the data
|
||||
shared by softirq while getting/creating conns */
|
||||
local_bh_disable();
|
||||
ip_vs_process_message(tinfo->buf, len);
|
||||
local_bh_enable();
|
||||
}
|
||||
}
|
||||
|
||||
/* release the sending multicast socket */
|
||||
sock_release(tinfo->sock);
|
||||
kfree(tinfo->buf);
|
||||
kfree(tinfo);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
|
||||
{
|
||||
struct ip_vs_sync_thread_data *tinfo;
|
||||
struct task_struct **realtask, *task;
|
||||
struct socket *sock;
|
||||
char *name, *buf = NULL;
|
||||
int (*threadfn)(void *data);
|
||||
int result = -ENOMEM;
|
||||
|
||||
IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
|
||||
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
|
||||
sizeof(struct ip_vs_sync_conn));
|
||||
|
||||
if (state == IP_VS_STATE_MASTER) {
|
||||
if (sync_master_thread)
|
||||
return -EEXIST;
|
||||
|
||||
strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
|
||||
sizeof(ip_vs_master_mcast_ifn));
|
||||
ip_vs_master_syncid = syncid;
|
||||
realtask = &sync_master_thread;
|
||||
name = "ipvs_syncmaster";
|
||||
threadfn = sync_thread_master;
|
||||
sock = make_send_sock();
|
||||
} else if (state == IP_VS_STATE_BACKUP) {
|
||||
if (sync_backup_thread)
|
||||
return -EEXIST;
|
||||
|
||||
strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
|
||||
sizeof(ip_vs_backup_mcast_ifn));
|
||||
ip_vs_backup_syncid = syncid;
|
||||
realtask = &sync_backup_thread;
|
||||
name = "ipvs_syncbackup";
|
||||
threadfn = sync_thread_backup;
|
||||
sock = make_receive_sock();
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (IS_ERR(sock)) {
|
||||
result = PTR_ERR(sock);
|
||||
goto out;
|
||||
}
|
||||
|
||||
set_sync_mesg_maxlen(state);
|
||||
if (state == IP_VS_STATE_BACKUP) {
|
||||
buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto outsocket;
|
||||
}
|
||||
|
||||
tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
|
||||
if (!tinfo)
|
||||
goto outbuf;
|
||||
|
||||
tinfo->sock = sock;
|
||||
tinfo->buf = buf;
|
||||
|
||||
task = kthread_run(threadfn, tinfo, name);
|
||||
if (IS_ERR(task)) {
|
||||
result = PTR_ERR(task);
|
||||
goto outtinfo;
|
||||
}
|
||||
|
||||
/* mark as active */
|
||||
*realtask = task;
|
||||
ip_vs_sync_state |= state;
|
||||
|
||||
/* increase the module use count */
|
||||
ip_vs_use_count_inc();
|
||||
|
||||
return 0;
|
||||
|
||||
outtinfo:
|
||||
kfree(tinfo);
|
||||
outbuf:
|
||||
kfree(buf);
|
||||
outsocket:
|
||||
sock_release(sock);
|
||||
out:
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
int stop_sync_thread(int state)
|
||||
{
|
||||
IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
|
||||
|
||||
if (state == IP_VS_STATE_MASTER) {
|
||||
if (!sync_master_thread)
|
||||
return -ESRCH;
|
||||
|
||||
IP_VS_INFO("stopping master sync thread %d ...\n",
|
||||
task_pid_nr(sync_master_thread));
|
||||
|
||||
/*
|
||||
* The lock synchronizes with sb_queue_tail(), so that we don't
|
||||
* add sync buffers to the queue, when we are already in
|
||||
* progress of stopping the master sync daemon.
|
||||
*/
|
||||
|
||||
spin_lock_bh(&ip_vs_sync_lock);
|
||||
ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
|
||||
spin_unlock_bh(&ip_vs_sync_lock);
|
||||
kthread_stop(sync_master_thread);
|
||||
sync_master_thread = NULL;
|
||||
} else if (state == IP_VS_STATE_BACKUP) {
|
||||
if (!sync_backup_thread)
|
||||
return -ESRCH;
|
||||
|
||||
IP_VS_INFO("stopping backup sync thread %d ...\n",
|
||||
task_pid_nr(sync_backup_thread));
|
||||
|
||||
ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
|
||||
kthread_stop(sync_backup_thread);
|
||||
sync_backup_thread = NULL;
|
||||
} else {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* decrease the module use count */
|
||||
ip_vs_use_count_dec();
|
||||
|
||||
return 0;
|
||||
}
|
128
net/netfilter/ipvs/ip_vs_wlc.c
Normal file
128
net/netfilter/ipvs/ip_vs_wlc.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* IPVS: Weighted Least-Connection Scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
* Peter Kese <peter.kese@ijs.si>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
* Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
|
||||
* Wensong Zhang : changed to use the inactconns in scheduling
|
||||
* Wensong Zhang : changed some comestics things for debugging
|
||||
* Wensong Zhang : changed for the d-linked destination list
|
||||
* Wensong Zhang : added the ip_vs_wlc_update_svc
|
||||
* Wensong Zhang : added any dest with weight=0 is quiesced
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
|
||||
static inline unsigned int
|
||||
ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
|
||||
{
|
||||
/*
|
||||
* We think the overhead of processing active connections is 256
|
||||
* times higher than that of inactive connections in average. (This
|
||||
* 256 times might not be accurate, we will change it later) We
|
||||
* use the following formula to estimate the overhead now:
|
||||
* dest->activeconns*256 + dest->inactconns
|
||||
*/
|
||||
return (atomic_read(&dest->activeconns) << 8) +
|
||||
atomic_read(&dest->inactconns);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Weighted Least Connection scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_dest *dest, *least;
|
||||
unsigned int loh, doh;
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
|
||||
|
||||
/*
|
||||
* We calculate the load of each dest server as follows:
|
||||
* (dest overhead) / dest->weight
|
||||
*
|
||||
* Remember -- no floats in kernel mode!!!
|
||||
* The comparison of h1*w2 > h2*w1 is equivalent to that of
|
||||
* h1/w1 > h2/w2
|
||||
* if every weight is larger than zero.
|
||||
*
|
||||
* The server with weight=0 is quiesced and will not receive any
|
||||
* new connections.
|
||||
*/
|
||||
|
||||
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
|
||||
atomic_read(&dest->weight) > 0) {
|
||||
least = dest;
|
||||
loh = ip_vs_wlc_dest_overhead(least);
|
||||
goto nextstage;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Find the destination with the least load.
|
||||
*/
|
||||
nextstage:
|
||||
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
|
||||
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
|
||||
continue;
|
||||
doh = ip_vs_wlc_dest_overhead(dest);
|
||||
if (loh * atomic_read(&dest->weight) >
|
||||
doh * atomic_read(&least->weight)) {
|
||||
least = dest;
|
||||
loh = doh;
|
||||
}
|
||||
}
|
||||
|
||||
IP_VS_DBG_BUF(6, "WLC: server %s:%u "
|
||||
"activeconns %d refcnt %d weight %d overhead %d\n",
|
||||
IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
|
||||
atomic_read(&least->activeconns),
|
||||
atomic_read(&least->refcnt),
|
||||
atomic_read(&least->weight), loh);
|
||||
|
||||
return least;
|
||||
}
|
||||
|
||||
|
||||
static struct ip_vs_scheduler ip_vs_wlc_scheduler =
|
||||
{
|
||||
.name = "wlc",
|
||||
.refcnt = ATOMIC_INIT(0),
|
||||
.module = THIS_MODULE,
|
||||
.n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
.supports_ipv6 = 1,
|
||||
#endif
|
||||
.schedule = ip_vs_wlc_schedule,
|
||||
};
|
||||
|
||||
|
||||
static int __init ip_vs_wlc_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
|
||||
}
|
||||
|
||||
static void __exit ip_vs_wlc_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
|
||||
}
|
||||
|
||||
module_init(ip_vs_wlc_init);
|
||||
module_exit(ip_vs_wlc_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
237
net/netfilter/ipvs/ip_vs_wrr.c
Normal file
237
net/netfilter/ipvs/ip_vs_wrr.c
Normal file
@@ -0,0 +1,237 @@
|
||||
/*
|
||||
* IPVS: Weighted Round-Robin Scheduling module
|
||||
*
|
||||
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Changes:
|
||||
* Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
|
||||
* Wensong Zhang : changed some comestics things for debugging
|
||||
* Wensong Zhang : changed for the d-linked destination list
|
||||
* Wensong Zhang : added the ip_vs_wrr_update_svc
|
||||
* Julian Anastasov : fixed the bug of returning destination
|
||||
* with weight 0 when all weights are zero
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/net.h>
|
||||
|
||||
#include <net/ip_vs.h>
|
||||
|
||||
/*
|
||||
* current destination pointer for weighted round-robin scheduling
|
||||
*/
|
||||
struct ip_vs_wrr_mark {
|
||||
struct list_head *cl; /* current list head */
|
||||
int cw; /* current weight */
|
||||
int mw; /* maximum weight */
|
||||
int di; /* decreasing interval */
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Get the gcd of server weights
|
||||
*/
|
||||
static int gcd(int a, int b)
|
||||
{
|
||||
int c;
|
||||
|
||||
while ((c = a % b)) {
|
||||
a = b;
|
||||
b = c;
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_dest *dest;
|
||||
int weight;
|
||||
int g = 0;
|
||||
|
||||
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||
weight = atomic_read(&dest->weight);
|
||||
if (weight > 0) {
|
||||
if (g > 0)
|
||||
g = gcd(weight, g);
|
||||
else
|
||||
g = weight;
|
||||
}
|
||||
}
|
||||
return g ? g : 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get the maximum weight of the service destinations.
|
||||
*/
|
||||
static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_dest *dest;
|
||||
int weight = 0;
|
||||
|
||||
list_for_each_entry(dest, &svc->destinations, n_list) {
|
||||
if (atomic_read(&dest->weight) > weight)
|
||||
weight = atomic_read(&dest->weight);
|
||||
}
|
||||
|
||||
return weight;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_wrr_mark *mark;
|
||||
|
||||
/*
|
||||
* Allocate the mark variable for WRR scheduling
|
||||
*/
|
||||
mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
|
||||
if (mark == NULL) {
|
||||
IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
mark->cl = &svc->destinations;
|
||||
mark->cw = 0;
|
||||
mark->mw = ip_vs_wrr_max_weight(svc);
|
||||
mark->di = ip_vs_wrr_gcd_weight(svc);
|
||||
svc->sched_data = mark;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
/*
|
||||
* Release the mark variable
|
||||
*/
|
||||
kfree(svc->sched_data);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
|
||||
{
|
||||
struct ip_vs_wrr_mark *mark = svc->sched_data;
|
||||
|
||||
mark->cl = &svc->destinations;
|
||||
mark->mw = ip_vs_wrr_max_weight(svc);
|
||||
mark->di = ip_vs_wrr_gcd_weight(svc);
|
||||
if (mark->cw > mark->mw)
|
||||
mark->cw = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Weighted Round-Robin Scheduling
|
||||
*/
|
||||
static struct ip_vs_dest *
|
||||
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
|
||||
{
|
||||
struct ip_vs_dest *dest;
|
||||
struct ip_vs_wrr_mark *mark = svc->sched_data;
|
||||
struct list_head *p;
|
||||
|
||||
IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
|
||||
|
||||
/*
|
||||
* This loop will always terminate, because mark->cw in (0, max_weight]
|
||||
* and at least one server has its weight equal to max_weight.
|
||||
*/
|
||||
write_lock(&svc->sched_lock);
|
||||
p = mark->cl;
|
||||
while (1) {
|
||||
if (mark->cl == &svc->destinations) {
|
||||
/* it is at the head of the destination list */
|
||||
|
||||
if (mark->cl == mark->cl->next) {
|
||||
/* no dest entry */
|
||||
dest = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mark->cl = svc->destinations.next;
|
||||
mark->cw -= mark->di;
|
||||
if (mark->cw <= 0) {
|
||||
mark->cw = mark->mw;
|
||||
/*
|
||||
* Still zero, which means no available servers.
|
||||
*/
|
||||
if (mark->cw == 0) {
|
||||
mark->cl = &svc->destinations;
|
||||
IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
|
||||
"no available servers\n");
|
||||
dest = NULL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
} else
|
||||
mark->cl = mark->cl->next;
|
||||
|
||||
if (mark->cl != &svc->destinations) {
|
||||
/* not at the head of the list */
|
||||
dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
|
||||
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
|
||||
atomic_read(&dest->weight) >= mark->cw) {
|
||||
/* got it */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (mark->cl == p && mark->cw == mark->di) {
|
||||
/* back to the start, and no dest is found.
|
||||
It is only possible when all dests are OVERLOADED */
|
||||
dest = NULL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
IP_VS_DBG_BUF(6, "WRR: server %s:%u "
|
||||
"activeconns %d refcnt %d weight %d\n",
|
||||
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
|
||||
atomic_read(&dest->activeconns),
|
||||
atomic_read(&dest->refcnt),
|
||||
atomic_read(&dest->weight));
|
||||
|
||||
out:
|
||||
write_unlock(&svc->sched_lock);
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
|
||||
.name = "wrr",
|
||||
.refcnt = ATOMIC_INIT(0),
|
||||
.module = THIS_MODULE,
|
||||
.n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
.supports_ipv6 = 1,
|
||||
#endif
|
||||
.init_service = ip_vs_wrr_init_svc,
|
||||
.done_service = ip_vs_wrr_done_svc,
|
||||
.update_service = ip_vs_wrr_update_svc,
|
||||
.schedule = ip_vs_wrr_schedule,
|
||||
};
|
||||
|
||||
static int __init ip_vs_wrr_init(void)
|
||||
{
|
||||
return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
|
||||
}
|
||||
|
||||
static void __exit ip_vs_wrr_cleanup(void)
|
||||
{
|
||||
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
|
||||
}
|
||||
|
||||
module_init(ip_vs_wrr_init);
|
||||
module_exit(ip_vs_wrr_cleanup);
|
||||
MODULE_LICENSE("GPL");
|
1004
net/netfilter/ipvs/ip_vs_xmit.c
Normal file
1004
net/netfilter/ipvs/ip_vs_xmit.c
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user