From 4b239f458c229de044d6905c2b0f9fe16ed9e01e Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 17 Dec 2010 16:58:28 -0800
Subject: [PATCH 1/7] x86-64, mm: Put early page table high

While debugging kdump, I found that the current kernel has a problem with
crashkernel=512M. It turns out that the initial mapping goes up to 512M, so
the later initial mapping up to 4G (actually 2040M on my platform) puts its
page tables near 512M, and the initial mapping up to 128g then puts its page
tables near 2g.

before this patch:
[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[    0.000000]  0000000000 - 007f600000 page 2M
[    0.000000]  007f600000 - 007f750000 page 4k
[    0.000000] kernel direct mapping tables up to 7f750000 @ [0x1fffc000-0x1fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x1fffc000-0x1fffdfff] PGTABLE
[    0.000000] init_memory_mapping: [0x00000100000000-0x0000207fffffff]
[    0.000000]  0100000000 - 2080000000 page 2M
[    0.000000] kernel direct mapping tables up to 2080000000 @ [0x7bc01000-0x7bc83fff]
[    0.000000]     memblock_x86_reserve_range: [0x7bc01000-0x7bc7efff] PGTABLE
[    0.000000] RAMDISK: 7bc84000 - 7f745000
[    0.000000] crashkernel reservation failed - No suitable area found.

after patch:
[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[    0.000000]  0000000000 - 007f600000 page 2M
[    0.000000]  007f600000 - 007f750000 page 4k
[    0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff]
[    0.000000]     memblock_x86_reserve_range: [0x7f74c000-0x7f74dfff] PGTABLE
[    0.000000] init_memory_mapping: [0x00000100000000-0x0000207fffffff]
[    0.000000]  0100000000 - 2080000000 page 2M
[    0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x207ff7d000-0x207fffafff] PGTABLE
[    0.000000] RAMDISK: 7bc84000 - 7f745000
[    0.000000]     memblock_x86_reserve_range: [0x17000000-0x36ffffff] CRASH KERNEL
[    0.000000] Reserving 512MB of memory at 368MB for crashkernel (System RAM: 133120MB)

With the patch, the page tables for [0, 2g) are placed near 2g instead of
under 512M, and the page tables for [4g, 128g) are placed near 128g instead
of under 2g.

That is good: when we have lots of memory above 4g, say 1024g, 2048g, or
16T, we no longer put the related page tables under 2g, where they could
otherwise fill up the area below 2g when 1G or 2M pages are not used.

The code change adds map_low_page() and updates unmap_low_page() for 64-bit,
and uses them to access the corresponding high memory when setting up the
page tables.

Signed-off-by: Yinghai Lu
LKML-Reference: <4D0C0734.7060900@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/init.c    |  9 +++----
 arch/x86/mm/init_64.c | 63 +++++++++++++++++++------------------------
 2 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..5863950ebe0c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -33,7 +33,7 @@ int direct_gbpages
 static void __init find_early_table_space(unsigned long end, int use_pse,
 					  int use_gbpages)
 {
-	unsigned long puds, pmds, ptes, tables, start;
+	unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
 	phys_addr_t base;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -73,12 +73,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 	 * need roughly 0.5KB per GB.
	 */
 #ifdef CONFIG_X86_32
-	start = 0x7000;
-#else
-	start = 0x8000;
+	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
+	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 	if (base == MEMBLOCK_ERROR)
 		panic("Cannot find space for the kernel page tables");
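For readers tracing the before/after logs, here is a minimal userspace sketch
of the top-down search that memblock_find_in_range() performs once good_end
points at the end of the range being mapped. The free-region list, helper
name, and sizes are invented for illustration; this is not the kernel
implementation:

/* Minimal userspace sketch of top-down range finding, mimicking what
 * memblock_find_in_range() does for the page-table allocation above.
 * The free-region list and all values here are illustrative only. */
#include <stdio.h>

struct range { unsigned long start, end; };	/* a free region [start, end) */

static unsigned long find_in_range_top_down(struct range *free, int n,
					    unsigned long start,
					    unsigned long good_end,
					    unsigned long size,
					    unsigned long align)
{
	for (int i = n - 1; i >= 0; i--) {	/* highest free region first */
		unsigned long base = free[i].start > start ? free[i].start : start;
		unsigned long limit = free[i].end < good_end ? free[i].end : good_end;
		unsigned long addr;

		if (limit < base + size)
			continue;
		addr = (limit - size) & ~(align - 1);	/* highest aligned fit */
		if (addr >= base)
			return addr;
	}
	return ~0UL;				/* like MEMBLOCK_ERROR */
}

int main(void)
{
	/* one free region covering the first 2040M, as in the changelog */
	struct range free[] = { { 0x100000, 0x7f750000 } };

	/* page tables for the 0-0x7f750000 mapping: ~16K near the top */
	unsigned long base = find_in_range_top_down(free, 1, 0,
						    0x7f750000, 0x4000, 0x1000);
	printf("page tables @ %#lx\n", base);	/* 0x7f74c000, matching the log */
	return 0;
}

The highest aligned fit inside [start, good_end) wins, which is why the
tables land at 0x7f74c000 in the "after" log instead of under 512M.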
From: Yinghai Lu
Date: Fri, 17 Dec 2010 16:58:40 -0800
Subject: [PATCH 2/7] x86-64, gart: Fix allocation with memblock

When trying to change alloc_bootmem over to memblock, to get real top-down
allocation, I found one old system:

[    0.000000] Node 0: aperture @ ac000000 size 64 MB
[    0.000000] Aperture pointing to e820 RAM. Ignoring.
[    0.000000] Your BIOS doesn't leave a aperture memory hole
[    0.000000] Please enable the IOMMU option in the BIOS setup
[    0.000000] This costs you 64 MB of RAM
[    0.000000]     memblock_x86_reserve_range: [0x2020000000-0x2023ffffff] aperture64
[    0.000000] Cannot allocate aperture memory hole (ffff882020000000,65536K)
[    0.000000]     memblock_x86_free_range: [0x2020000000-0x2023ffffff]
[    0.000000] Kernel panic - not syncing: Not enough memory for aperture
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc5-tip-yh-06229-gb792dc2-dirty #331
[    0.000000] Call Trace:
[    0.000000]  [] ? panic+0x91/0x1a3
[    0.000000]  [] ? gart_iommu_hole_init+0x3d7/0x4a3
[    0.000000]  [] ? _etext+0x0/0x3
[    0.000000]  [] ? pci_iommu_alloc+0x47/0x71
[    0.000000]  [] ? mem_init+0x19/0xec
[    0.000000]  [] ? start_kernel+0x20a/0x3e8
[    0.000000]  [] ? x86_64_start_reservations+0x9c/0xa0
[    0.000000]  [] ? x86_64_start_kernel+0x114/0x11b

It means the allocation that replaced __alloc_bootmem_nopanic() comes back
too high for the aperture. Use memblock_find_in_range() with an explicit
limit directly.

Signed-off-by: Yinghai Lu
LKML-Reference: <4D0C0740.90104@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/aperture_64.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index dcd7c83e1659..85f66b4f4fee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
 #include
 #include
 #include
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include
 #include
 #include
@@ -69,7 +69,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
 static u32 __init allocate_aperture(void)
 {
 	u32 aper_size;
-	void *p;
+	unsigned long addr;
 
 	/* aper_size should <= 1G */
 	if (fallback_aper_order > 5)
@@ -95,27 +95,26 @@ static u32 __init allocate_aperture(void)
 	 * so don't use 512M below as gart iommu, leave the space for kernel
 	 * code for safe
 	 */
-	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
+	addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
+	if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+		printk(KERN_ERR
+			"Cannot allocate aperture memory hole (%lx,%uK)\n",
+				addr, aper_size>>10);
+		return 0;
+	}
+	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
 	/*
 	 * Kmemleak should not scan this block as it may not be mapped via the
 	 * kernel direct mapping.
	 */
-	kmemleak_ignore(p);
-	if (!p || __pa(p)+aper_size > 0xffffffff) {
-		printk(KERN_ERR
-			"Cannot allocate aperture memory hole (%p,%uK)\n",
-				p, aper_size>>10);
-		if (p)
-			free_bootmem(__pa(p), aper_size);
-		return 0;
-	}
+	kmemleak_ignore(phys_to_virt(addr));
 	printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
-			aper_size >> 10, __pa(p));
-	insert_aperture_resource((u32)__pa(p), aper_size);
-	register_nosave_region((u32)__pa(p) >> PAGE_SHIFT,
-				(u32)__pa(p+aper_size) >> PAGE_SHIFT);
+			aper_size >> 10, addr);
+	insert_aperture_resource((u32)addr, aper_size);
+	register_nosave_region(addr >> PAGE_SHIFT,
+				(addr+aper_size) >> PAGE_SHIFT);
 
-	return (u32)__pa(p);
+	return (u32)addr;
 }
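The essential difference from the old __alloc_bootmem_nopanic() call is the
explicit window: the aperture must be naturally aligned and sit entirely
below 4G, since the GART aperture base is programmed as a 32-bit address.
A standalone sketch of that constraint, with made-up sizes and plain
arithmetic (no kernel API):

/* Sketch of the constraint the new code enforces: the GART aperture must be
 * aper_size-aligned, at or above 512M, and end at or below 4G.  Values are
 * illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t aper_size = 64ULL << 20;	/* 64 MB aperture */
	uint64_t floor = 512ULL << 20;		/* keep low 512M for kernel */
	uint64_t limit = 1ULL << 32;		/* must stay below 4G */

	/* highest aligned candidate that still ends at or below 4G */
	uint64_t addr = (limit - aper_size) & ~(aper_size - 1);

	if (addr < floor || addr + aper_size > 0xffffffffULL)
		printf("no suitable hole\n");
	else
		printf("aperture @ %#llx-%#llx\n",
		       (unsigned long long)addr,
		       (unsigned long long)(addr + aper_size - 1));
	return 0;
}

The unbounded top-down search in the panic log above handed back
0x2020000000, which is exactly what the new 4G limit rules out.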
From 1a4a678b12c84db9ae5dce424e0e97f0559bb57c Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Fri, 17 Dec 2010 16:59:07 -0800
Subject: [PATCH 3/7] memblock: Make find_memory_core_early() find from top-down

find_memory_core_early() is used to find RAM for a node or for bootmem-type
allocations. Make it search top-down, so that it is consistent with
memblock_find(), and so that we avoid allocating potentially valuable low
memory before we actually need it.

Suggested-by: Jeremy Fitzhardinge
Signed-off-by: Yinghai Lu
LKML-Reference: <4D0C075B.3040501@kernel.org>
Signed-off-by: H. Peter Anvin
---
 mm/page_alloc.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a654486f75..19413bfdef92 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3555,6 +3555,34 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
 	return -1;
 }
 
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+	int i;
+
+	for (i = nr_nodemap_entries - 1; i >= 0; i--)
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+
+	return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns previous region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+	for (index = index - 1; index >= 0; index--)
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+
+	return -1;
+}
+
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3606,6 +3634,10 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 	for (i = first_active_region_index_in_nid(nid); i != -1; \
 		i = next_active_region_index_in_nid(i, nid))
 
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+	for (i = last_active_region_index_in_nid(nid); i != -1; \
+		i = previous_active_region_index_in_nid(i, nid))
+
 /**
  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3644,7 +3676,7 @@ u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 	int i;
 
 	/* Need to go over early_node_map to find out good range for node */
-	for_each_active_range_index_in_nid(i, nid) {
+	for_each_active_range_index_in_nid_reverse(i, nid) {
 		u64 addr;
 		u64 ei_start, ei_last;
 		u64 final_start, final_end;
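The reverse iterator is the mirror image of the existing forward one. A
self-contained userspace rendition of the same pattern, with an invented
early_node_map, shows the highest-addressed range of a node being visited
first:

/* Userspace sketch of the iterator pair the patch adds: walking an
 * early_node_map-like array backwards so the highest range in a node is
 * considered first.  Array contents are made up for illustration. */
#include <stdio.h>

#define MAX_NUMNODES 64

struct node_active_region { unsigned long start_pfn, end_pfn; int nid; };

static struct node_active_region early_node_map[] = {
	{ 0x100,     0x7f750,   0 },
	{ 0x100000,  0x1080000, 0 },
	{ 0x1080000, 0x2080000, 1 },
};
static int nr_nodemap_entries = 3;

static int last_active_region_index_in_nid(int nid)
{
	for (int i = nr_nodemap_entries - 1; i >= 0; i--)
		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
			return i;
	return -1;
}

static int previous_active_region_index_in_nid(int index, int nid)
{
	for (index = index - 1; index >= 0; index--)
		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
			return index;
	return -1;
}

int main(void)
{
	/* node 0's ranges, highest first: prints index 1, then index 0 */
	for (int i = last_active_region_index_in_nid(0); i != -1;
	     i = previous_active_region_index_in_nid(i, 0))
		printf("range %d: %#lx-%#lx\n", i,
		       early_node_map[i].start_pfn, early_node_map[i].end_pfn);
	return 0;
}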
From 45635ab5e41bcde94a82f9a05d660ef77fe38c1b Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Mon, 27 Dec 2010 16:47:54 -0800
Subject: [PATCH 4/7] x86: Change get_max_mapped() to inline

Move it into the header file, to prepare for using it in other files.

[ hpa: added missing <linux/types.h> and changed type to phys_addr_t. ]

Signed-off-by: Yinghai Lu
LKML-Reference: <4D1933BA.8000508@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/include/asm/page_types.h | 6 ++++++
 arch/x86/kernel/setup.c           | 9 ---------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 1df66211fd1b..93626e699679 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAGE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/types.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT	12
@@ -45,6 +46,11 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+static inline phys_addr_t get_max_mapped(void)
+{
+	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
+}
+
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index df172c1e8238..3def8c9a5dc9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -669,15 +669,6 @@ static int __init parse_reservelow(char *p)
 
 early_param("reservelow", parse_reservelow);
 
-static u64 __init get_max_mapped(void)
-{
-	u64 end = max_pfn_mapped;
-
-	end <<= PAGE_SHIFT;
-
-	return end;
-}
-
 /*
  * Determine if we were loaded by an EFI loader. If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures

From dbef7b56d2fc5115f26f72a0b080283bbf972cab Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Mon, 27 Dec 2010 16:48:08 -0800
Subject: [PATCH 5/7] x86-64, numa: Allocate memnodemap under max_pfn_mapped

We need to access the memnodemap right away, so make sure it is already
mapped. This prepares for putting the page tables on the local node, since
the nodemap is used before those mappings exist.

Signed-off-by: Yinghai Lu
LKML-Reference: <4D1933C8.7060105@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/mm/numa_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7762a517d69d..02d36ff85ebd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -87,7 +87,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 
 	addr = 0x8000;
 	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
-	nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
+	nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
 				      nodemap_size, L1_CACHE_BYTES);
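Two details are worth seeing side by side: get_max_mapped() from patch 4
casts before shifting, so the byte address cannot be truncated where
unsigned long is 32 bits wide, and patch 5 then uses it as the search limit
so the nodemap lands in already-mapped memory. A userspace sketch with
made-up values:

/* Why hpa's "changed type to phys_addr_t" note matters: shifting a PFN into
 * a byte address must happen in a type wide enough for the result.
 * Userspace demonstration; the PFN value is invented. */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t phys_addr_t;		/* wide even when long is 32-bit */
#define PAGE_SHIFT 12

static unsigned long max_pfn_mapped = 0x2080000;	/* ~130 GB mapped */

static inline phys_addr_t get_max_mapped(void)
{
	/* the cast happens before the shift, so no truncation */
	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
}

int main(void)
{
	printf("max mapped address: %#llx\n",
	       (unsigned long long)get_max_mapped());	/* 0x2080000000 */
	return 0;
}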
From: Yinghai Lu
Date: Mon, 27 Dec 2010 16:48:17 -0800
Subject: [PATCH 6/7] x86-64, numa: Put pgtable to local node memory

Introduce init_memory_mapping_high(), and use it on 64-bit. It walks every
memory segment above 4g and creates the page tables for that segment inside
the segment itself. Before this patch, all page tables were on one node;
with this patch, one RED-PEN is killed.

Debug output for an 8-socket system after the patch:

[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] init_memory_mapping: [0x00000000000000-0x0000007f74ffff]
[    0.000000]  0000000000 - 007f600000 page 2M
[    0.000000]  007f600000 - 007f750000 page 4k
[    0.000000] kernel direct mapping tables up to 7f750000 @ [0x7f74c000-0x7f74ffff]
[    0.000000] RAMDISK: 7bc84000 - 7f745000
....
[    0.000000] Adding active range (0, 0x10, 0x95) 0 entries of 3200 used
[    0.000000] Adding active range (0, 0x100, 0x7f750) 1 entries of 3200 used
[    0.000000] Adding active range (0, 0x100000, 0x1080000) 2 entries of 3200 used
[    0.000000] Adding active range (1, 0x1080000, 0x2080000) 3 entries of 3200 used
[    0.000000] Adding active range (2, 0x2080000, 0x3080000) 4 entries of 3200 used
[    0.000000] Adding active range (3, 0x3080000, 0x4080000) 5 entries of 3200 used
[    0.000000] Adding active range (4, 0x4080000, 0x5080000) 6 entries of 3200 used
[    0.000000] Adding active range (5, 0x5080000, 0x6080000) 7 entries of 3200 used
[    0.000000] Adding active range (6, 0x6080000, 0x7080000) 8 entries of 3200 used
[    0.000000] Adding active range (7, 0x7080000, 0x8080000) 9 entries of 3200 used
[    0.000000] init_memory_mapping: [0x00000100000000-0x0000107fffffff]
[    0.000000]  0100000000 - 1080000000 page 2M
[    0.000000] kernel direct mapping tables up to 1080000000 @ [0x107ffbd000-0x107fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x107ffc2000-0x107fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00001080000000-0x0000207fffffff]
[    0.000000]  1080000000 - 2080000000 page 2M
[    0.000000] kernel direct mapping tables up to 2080000000 @ [0x207ff7d000-0x207fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x207ffc0000-0x207fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00002080000000-0x0000307fffffff]
[    0.000000]  2080000000 - 3080000000 page 2M
[    0.000000] kernel direct mapping tables up to 3080000000 @ [0x307ff3d000-0x307fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x307ffc0000-0x307fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00003080000000-0x0000407fffffff]
[    0.000000]  3080000000 - 4080000000 page 2M
[    0.000000] kernel direct mapping tables up to 4080000000 @ [0x407fefd000-0x407fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x407ffc0000-0x407fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00004080000000-0x0000507fffffff]
[    0.000000]  4080000000 - 5080000000 page 2M
[    0.000000] kernel direct mapping tables up to 5080000000 @ [0x507febd000-0x507fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x507ffc0000-0x507fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00005080000000-0x0000607fffffff]
[    0.000000]  5080000000 - 6080000000 page 2M
[    0.000000] kernel direct mapping tables up to 6080000000 @ [0x607fe7d000-0x607fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x607ffc0000-0x607fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00006080000000-0x0000707fffffff]
[    0.000000]  6080000000 - 7080000000 page 2M
[    0.000000] kernel direct mapping tables up to 7080000000 @ [0x707fe3d000-0x707fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x707ffc0000-0x707fffffff] PGTABLE
[    0.000000] init_memory_mapping: [0x00007080000000-0x0000807fffffff]
[    0.000000]  7080000000 - 8080000000 page 2M
[    0.000000] kernel direct mapping tables up to 8080000000 @ [0x807fdfc000-0x807fffffff]
[    0.000000]     memblock_x86_reserve_range: [0x807ffbf000-0x807fffffff] PGTABLE
[    0.000000] Initmem setup node 0 [0000000000000000-000000107fffffff]
[    0.000000]   NODE_DATA [0x0000107ffbd000-0x0000107ffc1fff]
[    0.000000] Initmem setup node 1 [0000001080000000-000000207fffffff]
[    0.000000]   NODE_DATA [0x0000207ffbb000-0x0000207ffbffff]
[    0.000000] Initmem setup node 2 [0000002080000000-000000307fffffff]
[    0.000000]   NODE_DATA [0x0000307ffbb000-0x0000307ffbffff]
[    0.000000] Initmem setup node 3 [0000003080000000-000000407fffffff]
[    0.000000]   NODE_DATA [0x0000407ffbb000-0x0000407ffbffff]
[    0.000000] Initmem setup node 4 [0000004080000000-000000507fffffff]
[    0.000000]   NODE_DATA [0x0000507ffbb000-0x0000507ffbffff]
[    0.000000] Initmem setup node 5 [0000005080000000-000000607fffffff]
[    0.000000]   NODE_DATA [0x0000607ffbb000-0x0000607ffbffff]
[    0.000000] Initmem setup node 6 [0000006080000000-000000707fffffff]
[    0.000000]   NODE_DATA [0x0000707ffbb000-0x0000707ffbffff]
[    0.000000] Initmem setup node 7 [0000007080000000-000000807fffffff]
[    0.000000]   NODE_DATA [0x0000807ffba000-0x0000807ffbefff]
Signed-off-by: Yinghai Lu
LKML-Reference: <4D1933D1.9020609@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/include/asm/page_types.h |  2 ++
 arch/x86/kernel/setup.c           |  8 -----
 arch/x86/mm/amdtopology_64.c      |  8 +++--
 arch/x86/mm/init.c                |  8 +----
 arch/x86/mm/init_64.c             | 54 +++++++++++++++++++++++++++++++
 arch/x86/mm/numa_64.c             |  6 ++--
 arch/x86/mm/srat_64.c             |  2 ++
 7 files changed, 68 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 93626e699679..731d211a1b20 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -54,6 +54,8 @@ static inline phys_addr_t get_max_mapped(void)
 extern unsigned long init_memory_mapping(unsigned long start,
 					 unsigned long end);
 
+void init_memory_mapping_high(void);
+
 extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 			 int acpi, int k8);
 extern void free_initmem(void);

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3def8c9a5dc9..fc0fe743f3a1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -931,14 +931,6 @@ void __init setup_arch(char **cmdline_p)
 
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
 
-#ifdef CONFIG_X86_64
-	if (max_pfn > max_low_pfn) {
-		max_pfn_mapped = init_memory_mapping(1UL<<32,
-						max_pfn<<PAGE_SHIFT);
-		max_low_pfn = max_pfn;
-	}
-#endif

diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
 		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
+	init_memory_mapping_high();
+	for_each_node_mask(i, node_possible_map) {
+		int j;
+
 		for (j = apicid_base; j < cores + apicid_base; j++)
 			apicid_to_node[(i << bits) + j] = i;
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 5863950ebe0c..fa6fe756d912 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -65,16 +65,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 #ifdef CONFIG_X86_32
 	/* for fixmap */
 	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-#ifdef CONFIG_X86_32
 	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
+
 	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 	if (base == MEMBLOCK_ERROR)
 		panic("Cannot find space for the kernel page tables");

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 024847dc81ab..194f2732ab77 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -607,9 +607,63 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 		int acpi, int k8)
 {
 	memblock_x86_register_active_regions(0, start_pfn, end_pfn);
+	init_memory_mapping_high();
 }
 #endif
 
+struct mapping_work_data {
+	unsigned long start;
+	unsigned long end;
+	unsigned long pfn_mapped;
+};
+
+static int __init_refok
+mapping_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax)
+{
+	struct mapping_work_data *data = datax;
+	unsigned long pfn_mapped;
+	unsigned long final_start, final_end;
+
+	final_start = max_t(unsigned long, start_pfn<<PAGE_SHIFT, data->start);
+	final_end = min_t(unsigned long, end_pfn<<PAGE_SHIFT, data->end);
+
+	if (final_end <= final_start)
+		return 0;
+
+	pfn_mapped = init_memory_mapping(final_start, final_end);
+
+	if (pfn_mapped > data->pfn_mapped)
+		data->pfn_mapped = pfn_mapped;
+
+	return 0;
+}
+
+static unsigned long __init_refok
+init_memory_mapping_active_regions(unsigned long start, unsigned long end)
+{
+	struct mapping_work_data data;
+
+	data.start = start;
+	data.end = end;
+	data.pfn_mapped = 0;
+
+	work_with_active_regions(MAX_NUMNODES, mapping_work_fn, &data);
+
+	return data.pfn_mapped;
+}
+
+void __init_refok init_memory_mapping_high(void)
+{
+	if (max_pfn > max_low_pfn) {
+		max_pfn_mapped = init_memory_mapping_active_regions(1UL<<32,
+						max_pfn<<PAGE_SHIFT);
+		max_low_pfn = max_pfn;
+	}
+}

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
-	for_each_node_mask(i, node_possible_map) {
+	for_each_node_mask(i, node_possible_map)
 		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
+	init_memory_mapping_high();
+	for_each_node_mask(i, node_possible_map)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
 	acpi_fake_nodes(nodes, num_nodes);
 	numa_init_array();
 	return 0;
@@ -645,6 +646,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	for (i = 0; i < nr_cpu_ids; i++)
 		numa_set_node(i, 0);
 	memblock_x86_register_active_regions(0, start_pfn, last_pfn);
+	init_memory_mapping_high();
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..0b961c8bffb4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -433,6 +433,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
+	init_memory_mapping_high();
+
 	/* Account for nodes with cpus and no memory */
 	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
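The heart of init_memory_mapping_high() is the clamp in mapping_work_fn():
intersect the request window with each active region and map every non-empty
intersection separately, so each node's page tables are allocated from that
node's own RAM. A userspace sketch of that intersection loop, with invented
regions:

/* Userspace sketch of the pattern init_memory_mapping_high() uses:
 * intersect the window [start, end) with each active region and handle
 * every non-empty intersection on its own.  Regions are made up. */
#include <stdio.h>

struct region { unsigned long long start, end; };

int main(void)
{
	struct region active[] = {		/* per-node RAM above 4G */
		{ 0x100000000ULL,  0x1080000000ULL },	/* node 0 */
		{ 0x1080000000ULL, 0x2080000000ULL },	/* node 1 */
	};
	unsigned long long start = 1ULL << 32;		/* window: 4G.. */
	unsigned long long end   = 0x2080000000ULL;

	for (int i = 0; i < 2; i++) {
		unsigned long long s = active[i].start > start ?
				       active[i].start : start;
		unsigned long long e = active[i].end < end ?
				       active[i].end : end;

		if (e <= s)
			continue;	/* no overlap: skip, as mapping_work_fn does */
		/* the kernel calls init_memory_mapping(s, e) here, and the
		 * page tables for [s, e) then come from inside [s, e) */
		printf("map %#llx-%#llx\n", s, e);
	}
	return 0;
}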
From f005fe12b90c5b9fe180a09209a893e09affa8aa Mon Sep 17 00:00:00 2001
From: Yinghai Lu
Date: Mon, 27 Dec 2010 16:48:32 -0800
Subject: [PATCH 7/7] x86-64: Move cleanup of highmap [_brk_end, _end) out of init_memory_mapping()

This cleanup is not related to init_memory_mapping(), and
init_memory_mapping() is getting bigger and bigger. So make the cleanup a
separate function and call it from reserve_brk(), which is the point at
which _brk_end is final.

Signed-off-by: Yinghai Lu
LKML-Reference: <4D1933E0.7090305@kernel.org>
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/setup.c | 24 ++++++++++++++++++++++++
 arch/x86/mm/init.c      | 19 -------------------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fc0fe743f3a1..635185ba4435 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -293,10 +293,32 @@ static void __init init_gbpages(void)
 	else
 		direct_gbpages = 0;
 }
+
+static void __init cleanup_highmap_brk_end(void)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	mmu_cr4_features = read_cr4();
+
+	/*
+	 * _brk_end cannot change anymore, but it and _end may be
+	 * located on different 2M pages. cleanup_highmap(), however,
+	 * can only consider _end when it runs, so destroy any
+	 * mappings beyond _brk_end here.
+	 */
+	pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
+	pmd = pmd_offset(pud, _brk_end - 1);
+	while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
+		pmd_clear(pmd);
+}
 #else
 static inline void init_gbpages(void)
 {
 }
+static inline void cleanup_highmap_brk_end(void)
+{
+}
 #endif
 
 static void __init reserve_brk(void)
@@ -307,6 +329,8 @@ static void __init reserve_brk(void)
 
 	/* Mark brk area as locked down and no longer taking any new allocations */
 	_brk_start = 0;
+
+	cleanup_highmap_brk_end();
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fa6fe756d912..35ee75d9061a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -270,25 +270,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	load_cr3(swapper_pg_dir);
 #endif
 
-#ifdef CONFIG_X86_64
-	if (!after_bootmem && !start) {
-		pud_t *pud;
-		pmd_t *pmd;
-
-		mmu_cr4_features = read_cr4();
-
-		/*
-		 * _brk_end cannot change anymore, but it and _end may be
-		 * located on different 2M pages. cleanup_highmap(), however,
-		 * can only consider _end when it runs, so destroy any
-		 * mappings beyond _brk_end here.
-		 */
-		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-		pmd = pmd_offset(pud, _brk_end - 1);
-		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-			pmd_clear(pmd);
-	}
-#endif
 	__flush_tlb_all();
 
 	if (!after_bootmem && e820_table_end > e820_table_start)
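What the moved code computes, in isolation: _brk_end and _end may fall into
different 2M pages, and every pmd slot after the one holding _brk_end, up to
the one holding _end, has to be cleared. A userspace model of that slot
walk, with invented addresses:

/* Userspace model of what cleanup_highmap_brk_end() clears: the 2M slots
 * strictly after the one containing _brk_end, up to and including the one
 * containing _end.  Addresses are made up for illustration. */
#include <stdio.h>

#define PMD_SHIFT 21			/* one pmd entry maps 2M */

int main(void)
{
	unsigned long _brk_end = 0xffffffff82345000UL;	/* brk is concluded here */
	unsigned long _end     = 0xffffffff82a00000UL;	/* kernel image end */

	unsigned long first = (_brk_end - 1) >> PMD_SHIFT;	/* keep this slot */
	unsigned long last  = (_end - 1) >> PMD_SHIFT;	/* last slot cleanup_highmap saw */

	for (unsigned long idx = first + 1; idx <= last; idx++)
		printf("pmd_clear slot for %#lx\n", idx << PMD_SHIFT);
	return 0;
}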