memcg: oom kill disable and oom status

This adds a feature to disable the oom-killer for a memcg.  If the
oom-killer is disabled, tasks under the memcg will, of course, stop
(sleep) when they hit the limit instead of being killed.

But we now have an oom-notifier for memcg, and the world around the
memcg is not under out-of-memory: memcg's out-of-memory only shows that
the memcg has hit its limit.  The administrator or a management daemon
can then recover the situation by

	- killing some process
	- enlarging the limit, adding more swap
	- migrating some tasks
	- removing file cache on tmpfs (difficult?)

Unlike with the oom-killer, this lets you gather enough information
(by gcore, ps, etc.) before killing tasks.
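
For illustration only (not part of this patch): a minimal userspace
sketch of driving the new interface.  The cgroup v1 mount point and the
group name "g1" are assumptions.

/* Sketch: assumes the memory controller is mounted at
 * /sys/fs/cgroup/memory and that a group "g1" already exists. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/fs/cgroup/memory/g1/memory.oom_control";
        char buf[128];
        ssize_t n;
        int fd;

        /* Writing "1" sets oom_kill_disable: tasks that hit the limit
         * sleep instead of being killed. */
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return 1;
        if (write(fd, "1", 1) != 1)
                perror("write oom_control");
        close(fd);

        /* A read reports both fields added by this patch:
         * "oom_kill_disable <0|1>" and "under_oom <0|1>". */
        fd = open(path, O_RDONLY);
        if (fd < 0)
                return 1;
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);
        }
        close(fd);
        return 0;
}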

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Author:    KAMEZAWA Hiroyuki
Date:      2010-05-26 14:42:37 -07:00
Committer: Linus Torvalds
Parent:    9490ff2756
Commit:    3c11ecf448
2 changed files with 117 additions and 19 deletions

mm/memcontrol.c

@@ -214,6 +214,8 @@ struct mem_cgroup {
         atomic_t refcnt;

         unsigned int swappiness;
+        /* OOM-Killer disable */
+        int oom_kill_disable;

         /* set when res.limit == memsw.limit */
         bool memsw_is_minimum;
@@ -235,7 +237,6 @@ struct mem_cgroup {
          * mem_cgroup ? And what type of charges should we move ?
          */
         unsigned long move_charge_at_immigrate;
-
         /*
          * percpu counter.
          */
@@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
 }

+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+        if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
+                memcg_wakeup_oom(mem);
+}
+
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
         struct oom_wait_info owait;
-        bool locked;
+        bool locked, need_to_kill;

         owait.mem = mem;
         owait.wait.flags = 0;
         owait.wait.func = memcg_oom_wake_function;
         owait.wait.private = current;
         INIT_LIST_HEAD(&owait.wait.task_list);
+        need_to_kill = true;
         /* At first, try to OOM lock hierarchy under mem.*/
         mutex_lock(&memcg_oom_mutex);
         locked = mem_cgroup_oom_lock(mem);
@@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
          * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
          * under OOM is always welcomed, use TASK_KILLABLE here.
          */
-        if (!locked)
-                prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-        else
+        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+        if (!locked || mem->oom_kill_disable)
+                need_to_kill = false;
+        if (locked)
                 mem_cgroup_oom_notify(mem);
         mutex_unlock(&memcg_oom_mutex);

-        if (locked)
+        if (need_to_kill) {
+                finish_wait(&memcg_oom_waitq, &owait.wait);
                 mem_cgroup_out_of_memory(mem, mask);
-        else {
+        } else {
                 schedule();
                 finish_wait(&memcg_oom_waitq, &owait.wait);
         }
@@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
         /* If swapout, usage of swap doesn't decrease */
         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                 uncharge_memsw = false;
-        /*
-         * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-         * In those cases, all pages freed continously can be expected to be in
-         * the same cgroup and we have chance to coalesce uncharges.
-         * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-         * because we want to do uncharge as soon as possible.
-         */
-        if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
-                goto direct_uncharge;

         batch = &current->memcg_batch;
         /*
@@ -2180,6 +2180,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
          */
         if (!batch->memcg)
                 batch->memcg = mem;
+        /*
+         * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+         * In those cases, all pages freed continously can be expected to be in
+         * the same cgroup and we have chance to coalesce uncharges.
+         * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+         * because we want to do uncharge as soon as possible.
+         */
+        if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+                goto direct_uncharge;
+
         /*
          * In typical case, batch->memcg == mem. This means we can
          * merge a series of uncharges to an uncharge of res_counter.
@@ -2196,6 +2207,8 @@ direct_uncharge:
         res_counter_uncharge(&mem->res, PAGE_SIZE);
         if (uncharge_memsw)
                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+        if (unlikely(batch->memcg != mem))
+                memcg_oom_recover(mem);
         return;
 }
@@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void)
         res_counter_uncharge(&batch->memcg->res, batch->bytes);
         if (batch->memsw_bytes)
                 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+        memcg_oom_recover(batch->memcg);
         /* forget this pointer (for sanity check) */
         batch->memcg = NULL;
 }
@@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                                 unsigned long long val)
 {
         int retry_count;
-        u64 memswlimit;
+        u64 memswlimit, memlimit;
         int ret = 0;
         int children = mem_cgroup_count_children(memcg);
         u64 curusage, oldusage;
+        int enlarge;

         /*
          * For keeping hierarchical_reclaim simple, how long we should retry
@@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+        enlarge = 0;
         while (retry_count) {
                 if (signal_pending(current)) {
                         ret = -EINTR;
@@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                         mutex_unlock(&set_limit_mutex);
                         break;
                 }
+
+                memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+                if (memlimit < val)
+                        enlarge = 1;
+
                 ret = res_counter_set_limit(&memcg->res, val);
                 if (!ret) {
                         if (memswlimit == val)
@@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                 else
                         oldusage = curusage;
         }
+        if (!ret && enlarge)
+                memcg_oom_recover(memcg);

         return ret;
 }
@@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                                         unsigned long long val)
 {
         int retry_count;
-        u64 memlimit, oldusage, curusage;
+        u64 memlimit, memswlimit, oldusage, curusage;
         int children = mem_cgroup_count_children(memcg);
         int ret = -EBUSY;
+        int enlarge = 0;

         /* see mem_cgroup_resize_res_limit */
         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                         mutex_unlock(&set_limit_mutex);
                         break;
                 }
+                memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+                if (memswlimit < val)
+                        enlarge = 1;
                 ret = res_counter_set_limit(&memcg->memsw, val);
                 if (!ret) {
                         if (memlimit == val)
@@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
                 else
                         oldusage = curusage;
         }
+        if (!ret && enlarge)
+                memcg_oom_recover(memcg);
         return ret;
 }
@@ -2865,6 +2894,7 @@ move_account:
                 if (ret)
                         break;
         }
+        memcg_oom_recover(mem);
         /* it seems parent cgroup doesn't have enough mem */
         if (ret == -ENOMEM)
                 goto try_to_free;
@@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
         return 0;
 }

+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+        struct cftype *cft, struct cgroup_map_cb *cb)
+{
+        struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+        cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+
+        if (atomic_read(&mem->oom_lock))
+                cb->fill(cb, "under_oom", 1);
+        else
+                cb->fill(cb, "under_oom", 0);
+        return 0;
+}
+
+/*
+ */
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+        struct cftype *cft, u64 val)
+{
+        struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+        struct mem_cgroup *parent;
+
+        /* cannot set to root cgroup and only 0 and 1 are allowed */
+        if (!cgrp->parent || !((val == 0) || (val == 1)))
+                return -EINVAL;
+
+        parent = mem_cgroup_from_cont(cgrp->parent);
+
+        cgroup_lock();
+        /* oom-kill-disable is a flag for subhierarchy. */
+        if ((parent->use_hierarchy) ||
+            (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+                cgroup_unlock();
+                return -EINVAL;
+        }
+        mem->oom_kill_disable = val;
+        cgroup_unlock();
+        return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
         {
                 .name = "usage_in_bytes",
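
Given the two cb->fill() calls in mem_cgroup_oom_control_read() above,
reading memory.oom_control produces two key/value lines, for example:

        oom_kill_disable 1
        under_oom 0

mem_cgroup_oom_control_write() returns -EINVAL for the root cgroup, for
values other than 0 and 1, and when the flag cannot apply cleanly to a
sub-hierarchy (the parent has use_hierarchy set, or the group itself
uses hierarchy and already has children).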
@@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = {
         },
         {
                 .name = "oom_control",
+                .read_map = mem_cgroup_oom_control_read,
+                .write_u64 = mem_cgroup_oom_control_write,
                 .register_event = mem_cgroup_oom_register_event,
                 .unregister_event = mem_cgroup_oom_unregister_event,
                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
@@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         } else {
                 parent = mem_cgroup_from_cont(cont->parent);
                 mem->use_hierarchy = parent->use_hierarchy;
+                mem->oom_kill_disable = parent->oom_kill_disable;
         }

         if (parent && parent->use_hierarchy) {
@@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void)
         if (mc.precharge) {
                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
                 mc.precharge = 0;
+                memcg_oom_recover(mc.to);
         }
         /*
          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void)
         if (mc.moved_charge) {
                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                 mc.moved_charge = 0;
+                memcg_oom_recover(mc.from);
         }
         /* we must fixup refcnts and charges */
         if (mc.moved_swap) {
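
Tying the pieces together: a sketch (under stated assumptions, not part
of this patch) of the management-daemon loop the changelog describes.
It registers for the memcg OOM notification added by the parent commit
(9490ff2756) via cgroup.event_control, disables the in-kernel OOM
killer, and on each event recovers by enlarging the limit, which wakes
the sleeping tasks through memcg_oom_recover().  The cgroup paths and
group name are assumptions.

/* Sketch: cgroup v1 memory controller mounted at /sys/fs/cgroup/memory,
 * group "g1"; error handling kept minimal. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define GRP "/sys/fs/cgroup/memory/g1"

int main(void)
{
        char line[64];
        uint64_t count;
        int efd, oom_fd, ctl_fd, lim_fd;

        efd = eventfd(0, 0);
        oom_fd = open(GRP "/memory.oom_control", O_RDWR);
        ctl_fd = open(GRP "/cgroup.event_control", O_WRONLY);
        if (efd < 0 || oom_fd < 0 || ctl_fd < 0)
                return 1;

        /* Register the eventfd for OOM events: "<event_fd> <oom_control_fd>" */
        snprintf(line, sizeof(line), "%d %d", efd, oom_fd);
        if (write(ctl_fd, line, strlen(line)) < 0)
                perror("event_control");

        /* Disable the in-kernel OOM killer for this memcg. */
        if (write(oom_fd, "1", 1) != 1)
                perror("oom_control");

        for (;;) {
                /* Blocks until the memcg hits its limit (under_oom). */
                if (read(efd, &count, sizeof(count)) != sizeof(count))
                        break;
                /* Recover as the changelog suggests: inspect tasks first
                 * (ps, gcore), then e.g. enlarge the limit so the sleeping
                 * tasks are woken via memcg_oom_recover(). */
                lim_fd = open(GRP "/memory.limit_in_bytes", O_WRONLY);
                if (lim_fd >= 0) {
                        write(lim_fd, "512M", 4);
                        close(lim_fd);
                }
        }
        return 0;
}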