epoll: introduce resource usage limits
It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that lets a normal user request a pretty large amount of kernel memory, well within its maximum number of fds. To solve this problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/, and inside it there are two configuration points: max_user_instances = Maximum number of devices - per user; max_user_watches = Maximum number of "watched" fds - per user. The current default for "max_user_watches" limits the memory used by epoll to store "watches" to 1/32 of the amount of the low RAM. As an example, a 256MB 32bit machine will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, which should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: <stable@kernel.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Reported-by: Vegard Nossum <vegardno@ifi.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
committed by
Linus Torvalds
parent
b7d271df87
commit
7ef9964e6d
@@ -44,6 +44,7 @@ Table of Contents
|
|||||||
2.14 /proc/<pid>/io - Display the IO accounting fields
|
2.14 /proc/<pid>/io - Display the IO accounting fields
|
||||||
2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
|
2.15 /proc/<pid>/coredump_filter - Core dump filtering settings
|
||||||
2.16 /proc/<pid>/mountinfo - Information about mounts
|
2.16 /proc/<pid>/mountinfo - Information about mounts
|
||||||
|
2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
|
||||||
|
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
Preface
|
Preface
|
||||||
@@ -2483,4 +2484,30 @@ For more information on mount propagation see:
|
|||||||
|
|
||||||
Documentation/filesystems/sharedsubtree.txt
|
Documentation/filesystems/sharedsubtree.txt
|
||||||
|
|
||||||
|
2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface
|
||||||
|
--------------------------------------------------------
|
||||||
|
|
||||||
|
This directory contains configuration options for the epoll(7) interface.
|
||||||
|
|
||||||
|
max_user_instances
|
||||||
|
------------------
|
||||||
|
|
||||||
|
This is the maximum number of epoll file descriptors that a single user can
|
||||||
|
have open at a given time. The default value is 128, and should be enough
|
||||||
|
for normal users.
|
||||||
|
|
||||||
|
max_user_watches
|
||||||
|
----------------
|
||||||
|
|
||||||
|
Every epoll file descriptor can store a number of files to be monitored
|
||||||
|
for event readiness. Each one of these monitored files constitutes a "watch".
|
||||||
|
This configuration option sets the maximum number of "watches" that are
|
||||||
|
allowed for each user.
|
||||||
|
Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
|
||||||
|
on a 64bit one.
|
||||||
|
The current default value for max_user_watches is 1/32 of the available
|
||||||
|
low memory, divided by the "watch" cost in bytes.
|
||||||
|
|
||||||
|
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@@ -102,6 +102,8 @@
|
|||||||
|
|
||||||
#define EP_UNACTIVE_PTR ((void *) -1L)
|
#define EP_UNACTIVE_PTR ((void *) -1L)
|
||||||
|
|
||||||
|
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
|
||||||
|
|
||||||
struct epoll_filefd {
|
struct epoll_filefd {
|
||||||
struct file *file;
|
struct file *file;
|
||||||
int fd;
|
int fd;
|
||||||
@@ -200,6 +202,9 @@ struct eventpoll {
|
|||||||
* holding ->lock.
|
* holding ->lock.
|
||||||
*/
|
*/
|
||||||
struct epitem *ovflist;
|
struct epitem *ovflist;
|
||||||
|
|
||||||
|
/* The user that created the eventpoll descriptor */
|
||||||
|
struct user_struct *user;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Wait structure used by the poll hooks */
|
/* Wait structure used by the poll hooks */
|
||||||
@@ -226,10 +231,18 @@ struct ep_pqueue {
|
|||||||
struct epitem *epi;
|
struct epitem *epi;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Configuration options available inside /proc/sys/fs/epoll/
|
||||||
|
*/
|
||||||
|
/* Maximum number of epoll devices, per user */
|
||||||
|
static int max_user_instances __read_mostly;
|
||||||
|
/* Maximum number of epoll watched descriptors, per user */
|
||||||
|
static int max_user_watches __read_mostly;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This mutex is used to serialize ep_free() and eventpoll_release_file().
|
* This mutex is used to serialize ep_free() and eventpoll_release_file().
|
||||||
*/
|
*/
|
||||||
static struct mutex epmutex;
|
static DEFINE_MUTEX(epmutex);
|
||||||
|
|
||||||
/* Safe wake up implementation */
|
/* Safe wake up implementation */
|
||||||
static struct poll_safewake psw;
|
static struct poll_safewake psw;
|
||||||
@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly;
|
|||||||
/* Slab cache used to allocate "struct eppoll_entry" */
|
/* Slab cache used to allocate "struct eppoll_entry" */
|
||||||
static struct kmem_cache *pwq_cache __read_mostly;
|
static struct kmem_cache *pwq_cache __read_mostly;
|
||||||
|
|
||||||
|
#ifdef CONFIG_SYSCTL
|
||||||
|
|
||||||
|
#include <linux/sysctl.h>
|
||||||
|
|
||||||
|
static int zero;
|
||||||
|
|
||||||
|
ctl_table epoll_table[] = {
|
||||||
|
{
|
||||||
|
.procname = "max_user_instances",
|
||||||
|
.data = &max_user_instances,
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = &proc_dointvec_minmax,
|
||||||
|
.extra1 = &zero,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.procname = "max_user_watches",
|
||||||
|
.data = &max_user_watches,
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = &proc_dointvec_minmax,
|
||||||
|
.extra1 = &zero,
|
||||||
|
},
|
||||||
|
{ .ctl_name = 0 }
|
||||||
|
};
|
||||||
|
#endif /* CONFIG_SYSCTL */
|
||||||
|
|
||||||
|
|
||||||
/* Setup the structure that is used as key for the RB tree */
|
/* Setup the structure that is used as key for the RB tree */
|
||||||
static inline void ep_set_ffd(struct epoll_filefd *ffd,
|
static inline void ep_set_ffd(struct epoll_filefd *ffd,
|
||||||
@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
|
|||||||
/* At this point it is safe to free the eventpoll item */
|
/* At this point it is safe to free the eventpoll item */
|
||||||
kmem_cache_free(epi_cache, epi);
|
kmem_cache_free(epi_cache, epi);
|
||||||
|
|
||||||
|
atomic_dec(&ep->user->epoll_watches);
|
||||||
|
|
||||||
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
|
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
|
||||||
current, ep, file));
|
current, ep, file));
|
||||||
|
|
||||||
@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep)
|
|||||||
|
|
||||||
mutex_unlock(&epmutex);
|
mutex_unlock(&epmutex);
|
||||||
mutex_destroy(&ep->mtx);
|
mutex_destroy(&ep->mtx);
|
||||||
|
atomic_dec(&ep->user->epoll_devs);
|
||||||
|
free_uid(ep->user);
|
||||||
kfree(ep);
|
kfree(ep);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file)
|
|||||||
|
|
||||||
static int ep_alloc(struct eventpoll **pep)
|
static int ep_alloc(struct eventpoll **pep)
|
||||||
{
|
{
|
||||||
struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
|
int error;
|
||||||
|
struct user_struct *user;
|
||||||
|
struct eventpoll *ep;
|
||||||
|
|
||||||
if (!ep)
|
user = get_current_user();
|
||||||
return -ENOMEM;
|
error = -EMFILE;
|
||||||
|
if (unlikely(atomic_read(&user->epoll_devs) >=
|
||||||
|
max_user_instances))
|
||||||
|
goto free_uid;
|
||||||
|
error = -ENOMEM;
|
||||||
|
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
|
||||||
|
if (unlikely(!ep))
|
||||||
|
goto free_uid;
|
||||||
|
|
||||||
spin_lock_init(&ep->lock);
|
spin_lock_init(&ep->lock);
|
||||||
mutex_init(&ep->mtx);
|
mutex_init(&ep->mtx);
|
||||||
@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep)
|
|||||||
INIT_LIST_HEAD(&ep->rdllist);
|
INIT_LIST_HEAD(&ep->rdllist);
|
||||||
ep->rbr = RB_ROOT;
|
ep->rbr = RB_ROOT;
|
||||||
ep->ovflist = EP_UNACTIVE_PTR;
|
ep->ovflist = EP_UNACTIVE_PTR;
|
||||||
|
ep->user = user;
|
||||||
|
|
||||||
*pep = ep;
|
*pep = ep;
|
||||||
|
|
||||||
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
|
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
|
||||||
current, ep));
|
current, ep));
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
free_uid:
|
||||||
|
free_uid(user);
|
||||||
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
|
|||||||
struct epitem *epi;
|
struct epitem *epi;
|
||||||
struct ep_pqueue epq;
|
struct ep_pqueue epq;
|
||||||
|
|
||||||
error = -ENOMEM;
|
if (unlikely(atomic_read(&ep->user->epoll_watches) >=
|
||||||
|
max_user_watches))
|
||||||
|
return -ENOSPC;
|
||||||
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
|
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
|
||||||
goto error_return;
|
return -ENOMEM;
|
||||||
|
|
||||||
/* Item initialization follow here ... */
|
/* Item initialization follow here ... */
|
||||||
INIT_LIST_HEAD(&epi->rdllink);
|
INIT_LIST_HEAD(&epi->rdllink);
|
||||||
@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
|
|||||||
* install process. Namely an allocation for a wait queue failed due
|
* install process. Namely an allocation for a wait queue failed due
|
||||||
* high memory pressure.
|
* high memory pressure.
|
||||||
*/
|
*/
|
||||||
|
error = -ENOMEM;
|
||||||
if (epi->nwait < 0)
|
if (epi->nwait < 0)
|
||||||
goto error_unregister;
|
goto error_unregister;
|
||||||
|
|
||||||
@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
|
|||||||
|
|
||||||
spin_unlock_irqrestore(&ep->lock, flags);
|
spin_unlock_irqrestore(&ep->lock, flags);
|
||||||
|
|
||||||
|
atomic_inc(&ep->user->epoll_watches);
|
||||||
|
|
||||||
/* We have to call this outside the lock */
|
/* We have to call this outside the lock */
|
||||||
if (pwake)
|
if (pwake)
|
||||||
ep_poll_safewake(&psw, &ep->poll_wait);
|
ep_poll_safewake(&psw, &ep->poll_wait);
|
||||||
@@ -789,7 +852,7 @@ error_unregister:
|
|||||||
spin_unlock_irqrestore(&ep->lock, flags);
|
spin_unlock_irqrestore(&ep->lock, flags);
|
||||||
|
|
||||||
kmem_cache_free(epi_cache, epi);
|
kmem_cache_free(epi_cache, epi);
|
||||||
error_return:
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags)
|
|||||||
flags & O_CLOEXEC);
|
flags & O_CLOEXEC);
|
||||||
if (fd < 0)
|
if (fd < 0)
|
||||||
ep_free(ep);
|
ep_free(ep);
|
||||||
|
atomic_inc(&ep->user->epoll_devs);
|
||||||
|
|
||||||
error_return:
|
error_return:
|
||||||
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
|
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
|
||||||
@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
|
|||||||
|
|
||||||
static int __init eventpoll_init(void)
|
static int __init eventpoll_init(void)
|
||||||
{
|
{
|
||||||
mutex_init(&epmutex);
|
struct sysinfo si;
|
||||||
|
|
||||||
|
si_meminfo(&si);
|
||||||
|
max_user_instances = 128;
|
||||||
|
max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
|
||||||
|
EP_ITEM_COST;
|
||||||
|
|
||||||
/* Initialize the structure used to perform safe poll wait head wake ups */
|
/* Initialize the structure used to perform safe poll wait head wake ups */
|
||||||
ep_poll_safewake_init(&psw);
|
ep_poll_safewake_init(&psw);
|
||||||
|
@@ -630,6 +630,10 @@ struct user_struct {
|
|||||||
atomic_t inotify_watches; /* How many inotify watches does this user have? */
|
atomic_t inotify_watches; /* How many inotify watches does this user have? */
|
||||||
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
|
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef CONFIG_EPOLL
|
||||||
|
atomic_t epoll_devs; /* The number of epoll descriptors currently open */
|
||||||
|
atomic_t epoll_watches; /* The number of file descriptors currently watched */
|
||||||
|
#endif
|
||||||
#ifdef CONFIG_POSIX_MQUEUE
|
#ifdef CONFIG_POSIX_MQUEUE
|
||||||
/* protected by mq_lock */
|
/* protected by mq_lock */
|
||||||
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
|
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
|
||||||
|
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
|
|||||||
#ifdef CONFIG_INOTIFY_USER
|
#ifdef CONFIG_INOTIFY_USER
|
||||||
extern struct ctl_table inotify_table[];
|
extern struct ctl_table inotify_table[];
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef CONFIG_EPOLL
|
||||||
|
extern struct ctl_table epoll_table[];
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
|
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
|
||||||
int sysctl_legacy_va_layout;
|
int sysctl_legacy_va_layout;
|
||||||
@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = {
|
|||||||
.child = inotify_table,
|
.child = inotify_table,
|
||||||
},
|
},
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef CONFIG_EPOLL
|
||||||
|
{
|
||||||
|
.procname = "epoll",
|
||||||
|
.mode = 0555,
|
||||||
|
.child = epoll_table,
|
||||||
|
},
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
.ctl_name = KERN_SETUID_DUMPABLE,
|
.ctl_name = KERN_SETUID_DUMPABLE,
|
||||||
|
Reference in New Issue
Block a user