per-zone and reclaim enhancements for memory controller: modifies vmscan.c to isolate global/cgroup lru activity
When using the memory controller, there are 2 levels of memory reclaim. 1. zone memory reclaim because of system/zone memory shortage. 2. memory cgroup memory reclaim because of hitting the limit. These two can be distinguished by the sc->mem_cgroup parameter (via the scan_global_lru() macro). This patch tries to keep the memory cgroup reclaim routine from affecting system/zone memory reclaim. This patch inserts if (scan_global_lru()) checks and hooks into the memory cgroup reclaim support functions. This patch helps isolate system lru activity from group lru activity and shows what additional functions are necessary. * mem_cgroup_calc_mapped_ratio() ... calculate mapped ratio for cgroup. * mem_cgroup_reclaim_imbalance() ... calculate active/inactive balance in cgroup. * mem_cgroup_calc_reclaim_active() ... calculate the number of active pages to be scanned at this priority in mem_cgroup. * mem_cgroup_calc_reclaim_inactive() ... calculate the number of inactive pages to be scanned at this priority in mem_cgroup. * mem_cgroup_all_unreclaimable() ... check whether all of the cgroup's pages are unreclaimable. * mem_cgroup_get_reclaim_priority() ... get the recorded reclaim priority (used in place of zone->prev_priority). * mem_cgroup_note_reclaim_priority() ... record reclaim priority (temporary) * mem_cgroup_record_reclaim_priority() ... record reclaim priority, as is done for zone->prev_priority. This value is used for calculating reclaim_mapped. [akpm@linux-foundation.org: fix unused var warning] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: "Eric W. 
Biederman" <ebiederm@xmission.com> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: David Rientjes <rientjes@google.com> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Paul Menage <menage@google.com> Cc: Pavel Emelianov <xemul@openvz.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
committed by
Linus Torvalds
parent
cc38108e1b
commit
1cfb419b39
164
mm/vmscan.c
164
mm/vmscan.c
@@ -856,6 +856,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
|
|||||||
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
|
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
|
||||||
__mod_zone_page_state(zone, NR_INACTIVE,
|
__mod_zone_page_state(zone, NR_INACTIVE,
|
||||||
-(nr_taken - nr_active));
|
-(nr_taken - nr_active));
|
||||||
|
if (scan_global_lru(sc))
|
||||||
zone->pages_scanned += nr_scan;
|
zone->pages_scanned += nr_scan;
|
||||||
spin_unlock_irq(&zone->lru_lock);
|
spin_unlock_irq(&zone->lru_lock);
|
||||||
|
|
||||||
@@ -888,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
|
|||||||
if (current_is_kswapd()) {
|
if (current_is_kswapd()) {
|
||||||
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
|
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
|
||||||
__count_vm_events(KSWAPD_STEAL, nr_freed);
|
__count_vm_events(KSWAPD_STEAL, nr_freed);
|
||||||
} else
|
} else if (scan_global_lru(sc))
|
||||||
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
|
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
|
||||||
|
|
||||||
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
|
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
|
||||||
|
|
||||||
if (nr_taken == 0)
|
if (nr_taken == 0)
|
||||||
@@ -943,49 +945,31 @@ static inline int zone_is_near_oom(struct zone *zone)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This moves pages from the active list to the inactive list.
|
* Determine we should try to reclaim mapped pages.
|
||||||
*
|
* This is called only when sc->mem_cgroup is NULL.
|
||||||
* We move them the other way if the page is referenced by one or more
|
|
||||||
* processes, from rmap.
|
|
||||||
*
|
|
||||||
* If the pages are mostly unmapped, the processing is fast and it is
|
|
||||||
* appropriate to hold zone->lru_lock across the whole operation. But if
|
|
||||||
* the pages are mapped, the processing is slow (page_referenced()) so we
|
|
||||||
* should drop zone->lru_lock around each page. It's impossible to balance
|
|
||||||
* this, so instead we remove the pages from the LRU while processing them.
|
|
||||||
* It is safe to rely on PG_active against the non-LRU pages in here because
|
|
||||||
* nobody will play with that bit on a non-LRU page.
|
|
||||||
*
|
|
||||||
* The downside is that we have to touch page->_count against each page.
|
|
||||||
* But we had to alter page->flags anyway.
|
|
||||||
*/
|
*/
|
||||||
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
|
static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
|
||||||
struct scan_control *sc, int priority)
|
int priority)
|
||||||
{
|
{
|
||||||
unsigned long pgmoved;
|
|
||||||
int pgdeactivate = 0;
|
|
||||||
unsigned long pgscanned;
|
|
||||||
LIST_HEAD(l_hold); /* The pages which were snipped off */
|
|
||||||
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
|
|
||||||
LIST_HEAD(l_active); /* Pages to go onto the active_list */
|
|
||||||
struct page *page;
|
|
||||||
struct pagevec pvec;
|
|
||||||
int reclaim_mapped = 0;
|
|
||||||
|
|
||||||
if (sc->may_swap) {
|
|
||||||
long mapped_ratio;
|
long mapped_ratio;
|
||||||
long distress;
|
long distress;
|
||||||
long swap_tendency;
|
long swap_tendency;
|
||||||
long imbalance;
|
long imbalance;
|
||||||
|
int reclaim_mapped = 0;
|
||||||
|
int prev_priority;
|
||||||
|
|
||||||
if (zone_is_near_oom(zone))
|
if (scan_global_lru(sc) && zone_is_near_oom(zone))
|
||||||
goto force_reclaim_mapped;
|
return 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* `distress' is a measure of how much trouble we're having
|
* `distress' is a measure of how much trouble we're having
|
||||||
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
|
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
|
||||||
*/
|
*/
|
||||||
distress = 100 >> min(zone->prev_priority, priority);
|
if (scan_global_lru(sc))
|
||||||
|
prev_priority = zone->prev_priority;
|
||||||
|
else
|
||||||
|
prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
|
||||||
|
|
||||||
|
distress = 100 >> min(prev_priority, priority);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The point of this algorithm is to decide when to start
|
* The point of this algorithm is to decide when to start
|
||||||
@@ -993,9 +977,12 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
|
|||||||
* how much memory
|
* how much memory
|
||||||
* is mapped.
|
* is mapped.
|
||||||
*/
|
*/
|
||||||
|
if (scan_global_lru(sc))
|
||||||
mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
|
mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
|
||||||
global_page_state(NR_ANON_PAGES)) * 100) /
|
global_page_state(NR_ANON_PAGES)) * 100) /
|
||||||
vm_total_pages;
|
vm_total_pages;
|
||||||
|
else
|
||||||
|
mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now decide how much we really want to unmap some pages. The
|
* Now decide how much we really want to unmap some pages. The
|
||||||
@@ -1023,8 +1010,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
|
|||||||
* Avoid div by zero with nr_inactive+1, and max resulting
|
* Avoid div by zero with nr_inactive+1, and max resulting
|
||||||
* value is vm_total_pages.
|
* value is vm_total_pages.
|
||||||
*/
|
*/
|
||||||
|
if (scan_global_lru(sc)) {
|
||||||
imbalance = zone_page_state(zone, NR_ACTIVE);
|
imbalance = zone_page_state(zone, NR_ACTIVE);
|
||||||
imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
|
imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
|
||||||
|
} else
|
||||||
|
imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reduce the effect of imbalance if swappiness is low,
|
* Reduce the effect of imbalance if swappiness is low,
|
||||||
@@ -1056,16 +1046,58 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
|
|||||||
* memory onto the inactive list.
|
* memory onto the inactive list.
|
||||||
*/
|
*/
|
||||||
if (swap_tendency >= 100)
|
if (swap_tendency >= 100)
|
||||||
force_reclaim_mapped:
|
|
||||||
reclaim_mapped = 1;
|
reclaim_mapped = 1;
|
||||||
}
|
|
||||||
|
return reclaim_mapped;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This moves pages from the active list to the inactive list.
|
||||||
|
*
|
||||||
|
* We move them the other way if the page is referenced by one or more
|
||||||
|
* processes, from rmap.
|
||||||
|
*
|
||||||
|
* If the pages are mostly unmapped, the processing is fast and it is
|
||||||
|
* appropriate to hold zone->lru_lock across the whole operation. But if
|
||||||
|
* the pages are mapped, the processing is slow (page_referenced()) so we
|
||||||
|
* should drop zone->lru_lock around each page. It's impossible to balance
|
||||||
|
* this, so instead we remove the pages from the LRU while processing them.
|
||||||
|
* It is safe to rely on PG_active against the non-LRU pages in here because
|
||||||
|
* nobody will play with that bit on a non-LRU page.
|
||||||
|
*
|
||||||
|
* The downside is that we have to touch page->_count against each page.
|
||||||
|
* But we had to alter page->flags anyway.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
|
||||||
|
struct scan_control *sc, int priority)
|
||||||
|
{
|
||||||
|
unsigned long pgmoved;
|
||||||
|
int pgdeactivate = 0;
|
||||||
|
unsigned long pgscanned;
|
||||||
|
LIST_HEAD(l_hold); /* The pages which were snipped off */
|
||||||
|
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
|
||||||
|
LIST_HEAD(l_active); /* Pages to go onto the active_list */
|
||||||
|
struct page *page;
|
||||||
|
struct pagevec pvec;
|
||||||
|
int reclaim_mapped = 0;
|
||||||
|
|
||||||
|
if (sc->may_swap)
|
||||||
|
reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
|
||||||
|
|
||||||
lru_add_drain();
|
lru_add_drain();
|
||||||
spin_lock_irq(&zone->lru_lock);
|
spin_lock_irq(&zone->lru_lock);
|
||||||
pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
|
pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
|
||||||
ISOLATE_ACTIVE, zone,
|
ISOLATE_ACTIVE, zone,
|
||||||
sc->mem_cgroup, 1);
|
sc->mem_cgroup, 1);
|
||||||
|
/*
|
||||||
|
* zone->pages_scanned is used for detect zone's oom
|
||||||
|
* mem_cgroup remembers nr_scan by itself.
|
||||||
|
*/
|
||||||
|
if (scan_global_lru(sc))
|
||||||
zone->pages_scanned += pgscanned;
|
zone->pages_scanned += pgscanned;
|
||||||
|
|
||||||
__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
|
__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
|
||||||
spin_unlock_irq(&zone->lru_lock);
|
spin_unlock_irq(&zone->lru_lock);
|
||||||
|
|
||||||
@@ -1155,18 +1187,14 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
|
|||||||
unsigned long nr_to_scan;
|
unsigned long nr_to_scan;
|
||||||
unsigned long nr_reclaimed = 0;
|
unsigned long nr_reclaimed = 0;
|
||||||
|
|
||||||
|
if (scan_global_lru(sc)) {
|
||||||
/*
|
/*
|
||||||
* Add one to `nr_to_scan' just to make sure that the kernel will
|
* Add one to nr_to_scan just to make sure that the kernel
|
||||||
* slowly sift through the active list.
|
* will slowly sift through the active list.
|
||||||
*/
|
*/
|
||||||
zone->nr_scan_active +=
|
zone->nr_scan_active +=
|
||||||
(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
|
(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
|
||||||
nr_active = zone->nr_scan_active;
|
nr_active = zone->nr_scan_active;
|
||||||
if (nr_active >= sc->swap_cluster_max)
|
|
||||||
zone->nr_scan_active = 0;
|
|
||||||
else
|
|
||||||
nr_active = 0;
|
|
||||||
|
|
||||||
zone->nr_scan_inactive +=
|
zone->nr_scan_inactive +=
|
||||||
(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
|
(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
|
||||||
nr_inactive = zone->nr_scan_inactive;
|
nr_inactive = zone->nr_scan_inactive;
|
||||||
@@ -1175,6 +1203,24 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
|
|||||||
else
|
else
|
||||||
nr_inactive = 0;
|
nr_inactive = 0;
|
||||||
|
|
||||||
|
if (nr_active >= sc->swap_cluster_max)
|
||||||
|
zone->nr_scan_active = 0;
|
||||||
|
else
|
||||||
|
nr_active = 0;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* This reclaim occurs not because zone memory shortage but
|
||||||
|
* because memory controller hits its limit.
|
||||||
|
* Then, don't modify zone reclaim related data.
|
||||||
|
*/
|
||||||
|
nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
|
||||||
|
zone, priority);
|
||||||
|
|
||||||
|
nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
|
||||||
|
zone, priority);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
while (nr_active || nr_inactive) {
|
while (nr_active || nr_inactive) {
|
||||||
if (nr_active) {
|
if (nr_active) {
|
||||||
nr_to_scan = min(nr_active,
|
nr_to_scan = min(nr_active,
|
||||||
@@ -1218,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
|
|||||||
unsigned long nr_reclaimed = 0;
|
unsigned long nr_reclaimed = 0;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
|
||||||
sc->all_unreclaimable = 1;
|
sc->all_unreclaimable = 1;
|
||||||
for (i = 0; zones[i] != NULL; i++) {
|
for (i = 0; zones[i] != NULL; i++) {
|
||||||
struct zone *zone = zones[i];
|
struct zone *zone = zones[i];
|
||||||
|
|
||||||
if (!populated_zone(zone))
|
if (!populated_zone(zone))
|
||||||
continue;
|
continue;
|
||||||
|
/*
|
||||||
|
* Take care memory controller reclaiming has small influence
|
||||||
|
* to global LRU.
|
||||||
|
*/
|
||||||
|
if (scan_global_lru(sc)) {
|
||||||
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
|
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
note_zone_scanning_priority(zone, priority);
|
note_zone_scanning_priority(zone, priority);
|
||||||
|
|
||||||
if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
|
if (zone_is_all_unreclaimable(zone) &&
|
||||||
|
priority != DEF_PRIORITY)
|
||||||
continue; /* Let kswapd poll it */
|
continue; /* Let kswapd poll it */
|
||||||
|
|
||||||
sc->all_unreclaimable = 0;
|
sc->all_unreclaimable = 0;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Ignore cpuset limitation here. We just want to reduce
|
||||||
|
* # of used pages by us regardless of memory shortage.
|
||||||
|
*/
|
||||||
|
sc->all_unreclaimable = 0;
|
||||||
|
mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
|
||||||
|
priority);
|
||||||
|
}
|
||||||
|
|
||||||
nr_reclaimed += shrink_zone(priority, zone, sc);
|
nr_reclaimed += shrink_zone(priority, zone, sc);
|
||||||
}
|
}
|
||||||
|
|
||||||
return nr_reclaimed;
|
return nr_reclaimed;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1264,8 +1324,12 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
|
|||||||
unsigned long lru_pages = 0;
|
unsigned long lru_pages = 0;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
if (scan_global_lru(sc))
|
||||||
count_vm_event(ALLOCSTALL);
|
count_vm_event(ALLOCSTALL);
|
||||||
|
/*
|
||||||
|
* mem_cgroup will not do shrink_slab.
|
||||||
|
*/
|
||||||
|
if (scan_global_lru(sc)) {
|
||||||
for (i = 0; zones[i] != NULL; i++) {
|
for (i = 0; zones[i] != NULL; i++) {
|
||||||
struct zone *zone = zones[i];
|
struct zone *zone = zones[i];
|
||||||
|
|
||||||
@@ -1275,6 +1339,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
|
|||||||
lru_pages += zone_page_state(zone, NR_ACTIVE)
|
lru_pages += zone_page_state(zone, NR_ACTIVE)
|
||||||
+ zone_page_state(zone, NR_INACTIVE);
|
+ zone_page_state(zone, NR_INACTIVE);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
|
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
|
||||||
sc->nr_scanned = 0;
|
sc->nr_scanned = 0;
|
||||||
@@ -1330,6 +1395,8 @@ out:
|
|||||||
*/
|
*/
|
||||||
if (priority < 0)
|
if (priority < 0)
|
||||||
priority = 0;
|
priority = 0;
|
||||||
|
|
||||||
|
if (scan_global_lru(sc)) {
|
||||||
for (i = 0; zones[i] != NULL; i++) {
|
for (i = 0; zones[i] != NULL; i++) {
|
||||||
struct zone *zone = zones[i];
|
struct zone *zone = zones[i];
|
||||||
|
|
||||||
@@ -1338,6 +1405,9 @@ out:
|
|||||||
|
|
||||||
zone->prev_priority = priority;
|
zone->prev_priority = priority;
|
||||||
}
|
}
|
||||||
|
} else
|
||||||
|
mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user