linux-kernel-test/arch/x86/include/asm/x86_init.h
Stefano Stabellini 279b706bf8 x86,xen: introduce x86_init.mapping.pagetable_reserve
Introduce a new x86_init hook called pagetable_reserve that at the end
of init_memory_mapping is used to reserve a range of memory addresses for
the kernel pagetable pages we used and free the other ones.

On native it just calls memblock_x86_reserve_range while on xen it also
takes care of setting the spare memory previously allocated
for kernel pagetable pages from RO to RW, so that it can be used for
other purposes.

A detailed explanation of the reason why this hook is needed follows.

As a consequence of the commit:

commit 4b239f458c
Author: Yinghai Lu <yinghai@kernel.org>
Date:   Fri Dec 17 16:58:28 2010 -0800

    x86-64, mm: Put early page table high

at some point init_memory_mapping is going to reach the pagetable pages
area and map those pages too (mapping them as normal memory that falls
in the range of addresses passed to init_memory_mapping as argument).
Some of those pages are already pagetable pages (they are in the range
pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
everything is fine.
Some of these pages are not pagetable pages yet (they fall in the range
pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
are going to be mapped RW.  When these pages become pagetable pages and
are hooked into the pagetable, xen will find that the guest has already
a RW mapping of them somewhere and fail the operation.
The reason Xen requires pagetables to be RO is that the hypervisor needs
to verify that the pagetables are valid before using them. The validation
operations are called "pinning" (more details in arch/x86/xen/mmu.c).

In order to fix the issue we mark all the pages in the entire range
pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
is completed only the range pgt_buf_start-pgt_buf_end is reserved by
init_memory_mapping. Hence the kernel is going to crash as soon as one
of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
ranges are RO).

For this reason we need a hook to reserve the kernel pagetable pages we
used and free the other ones so that they can be reused for other
purposes.
On native it just means calling memblock_x86_reserve_range, on Xen it
also means marking RW the pagetable pages that we allocated before but
that haven't been used before.

Another way to fix this is without using the hook is by adding a 'if
(xen_pv_domain)' in the 'init_memory_mapping' code and calling the Xen
counterpart, but that is just nasty.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2011-05-12 13:05:04 -04:00

188 lines
5.3 KiB
C

#ifndef _ASM_X86_PLATFORM_H
#define _ASM_X86_PLATFORM_H
#include <asm/pgtable_types.h>
#include <asm/bootparam.h>
struct mpc_bus;
struct mpc_cpu;
struct mpc_table;
/**
* struct x86_init_mpparse - platform specific mpparse ops
* @mpc_record: platform specific mpc record accounting
* @setup_ioapic_ids: platform specific ioapic id override
* @mpc_apic_id: platform specific mpc apic id assignment
* @smp_read_mpc_oem: platform specific oem mpc table setup
* @mpc_oem_pci_bus: platform specific pci bus setup (default NULL)
* @mpc_oem_bus_info: platform specific mpc bus info
* @find_smp_config: find the smp configuration
* @get_smp_config: get the smp configuration
*/
struct x86_init_mpparse {
void (*mpc_record)(unsigned int mode);
void (*setup_ioapic_ids)(void);
int (*mpc_apic_id)(struct mpc_cpu *m);
void (*smp_read_mpc_oem)(struct mpc_table *mpc);
void (*mpc_oem_pci_bus)(struct mpc_bus *m);
void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
void (*find_smp_config)(void);
void (*get_smp_config)(unsigned int early);
};
/**
* struct x86_init_resources - platform specific resource related ops
* @probe_roms: probe BIOS roms
* @reserve_resources: reserve the standard resources for the
* platform
* @memory_setup: platform specific memory setup
*
*/
struct x86_init_resources {
void (*probe_roms)(void);
void (*reserve_resources)(void);
char *(*memory_setup)(void);
};
/**
* struct x86_init_irqs - platform specific interrupt setup
* @pre_vector_init: init code to run before interrupt vectors
* are set up.
* @intr_init: interrupt init code
* @trap_init: platform specific trap setup
*/
struct x86_init_irqs {
void (*pre_vector_init)(void);
void (*intr_init)(void);
void (*trap_init)(void);
};
/**
* struct x86_init_oem - oem platform specific customizing functions
* @arch_setup: platform specific architecure setup
* @banner: print a platform specific banner
*/
struct x86_init_oem {
void (*arch_setup)(void);
void (*banner)(void);
};
/**
* struct x86_init_mapping - platform specific initial kernel pagetable setup
* @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
*
* For more details on the purpose of this hook, look in
* init_memory_mapping and the commit that added it.
*/
struct x86_init_mapping {
void (*pagetable_reserve)(u64 start, u64 end);
};
/**
* struct x86_init_paging - platform specific paging functions
* @pagetable_setup_start: platform specific pre paging_init() call
* @pagetable_setup_done: platform specific post paging_init() call
*/
struct x86_init_paging {
void (*pagetable_setup_start)(pgd_t *base);
void (*pagetable_setup_done)(pgd_t *base);
};
/**
* struct x86_init_timers - platform specific timer setup
* @setup_perpcu_clockev: set up the per cpu clock event device for the
* boot cpu
* @tsc_pre_init: platform function called before TSC init
* @timer_init: initialize the platform timer (default PIT/HPET)
* @wallclock_init: init the wallclock device
*/
struct x86_init_timers {
void (*setup_percpu_clockev)(void);
void (*tsc_pre_init)(void);
void (*timer_init)(void);
void (*wallclock_init)(void);
};
/**
* struct x86_init_iommu - platform specific iommu setup
* @iommu_init: platform specific iommu setup
*/
struct x86_init_iommu {
int (*iommu_init)(void);
};
/**
* struct x86_init_pci - platform specific pci init functions
* @arch_init: platform specific pci arch init call
* @init: platform specific pci subsystem init
* @init_irq: platform specific pci irq init
* @fixup_irqs: platform specific pci irq fixup
*/
struct x86_init_pci {
int (*arch_init)(void);
int (*init)(void);
void (*init_irq)(void);
void (*fixup_irqs)(void);
};
/**
* struct x86_init_ops - functions for platform specific setup
*
*/
struct x86_init_ops {
struct x86_init_resources resources;
struct x86_init_mpparse mpparse;
struct x86_init_irqs irqs;
struct x86_init_oem oem;
struct x86_init_mapping mapping;
struct x86_init_paging paging;
struct x86_init_timers timers;
struct x86_init_iommu iommu;
struct x86_init_pci pci;
};
/**
* struct x86_cpuinit_ops - platform specific cpu hotplug setups
* @setup_percpu_clockev: set up the per cpu clock event device
*/
struct x86_cpuinit_ops {
void (*setup_percpu_clockev)(void);
};
/**
* struct x86_platform_ops - platform specific runtime functions
* @calibrate_tsc: calibrate TSC
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
* @is_untracked_pat_range exclude from PAT logic
* @nmi_init enable NMI on cpus
* @i8042_detect pre-detect if i8042 controller exists
*/
struct x86_platform_ops {
unsigned long (*calibrate_tsc)(void);
unsigned long (*get_wallclock)(void);
int (*set_wallclock)(unsigned long nowtime);
void (*iommu_shutdown)(void);
bool (*is_untracked_pat_range)(u64 start, u64 end);
void (*nmi_init)(void);
int (*i8042_detect)(void);
};
struct pci_dev;
struct x86_msi_ops {
int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
void (*teardown_msi_irq)(unsigned int irq);
void (*teardown_msi_irqs)(struct pci_dev *dev);
};
extern struct x86_init_ops x86_init;
extern struct x86_cpuinit_ops x86_cpuinit;
extern struct x86_platform_ops x86_platform;
extern struct x86_msi_ops x86_msi;
extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
#endif