Merge branch 'x86/mem' into perf/core
Merge reason: memcpy_64.S changes an assumption perf bench has, so merge this
here so we can fix it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
@@ -15,4 +15,13 @@
 	.endm
 #endif
 
+.macro altinstruction_entry orig alt feature orig_len alt_len
+	.align 8
+	.quad \orig
+	.quad \alt
+	.word \feature
+	.byte \orig_len
+	.byte \alt_len
+.endm
+
 #endif  /*  __ASSEMBLY__  */

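The altinstruction_entry macro above emits one fixed-size record per alternative
into the .altinstructions section, which apply_alternatives() later walks as an
array. As a rough C view of the field layout the .quad/.quad/.word/.byte/.byte
directives encode (a sketch only; the real struct alt_instr in asm/alternative.h
may carry explicit padding that is glossed over here):

#include <linux/types.h>

/* Sketch of one .altinstructions record, field-for-field with the macro. */
struct alt_instr_sketch {
	u8 *instr;		/* \orig:     address of the original code          */
	u8 *replacement;	/* \alt:      address of the replacement code       */
	u16 cpuid;		/* \feature:  X86_FEATURE_* bit number              */
	u8  instrlen;		/* \orig_len: length of the original code           */
	u8  replacementlen;	/* \alt_len:  length of the replacement, <= instrlen */
};
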
@@ -195,6 +195,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 

@@ -42,7 +42,7 @@
  * Returns 0 if the range is valid, nonzero otherwise.
  *
  * This is equivalent to the following test:
- * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
+ * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
  *
  * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
  */

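The corrected comparison reflects that an access ending exactly at the limit is
still legal: the checked range is [addr, addr + size), so only a sum strictly
greater than the limit is out of bounds. A small C sketch of the same test
(hypothetical helper, written to sidestep the 33/65-bit carry the comment
refers to):

/* Hypothetical range check: nonzero when [addr, addr + size) extends past
 * 'limit'.  addr + size == limit is still valid, hence '>' rather than '>='.
 */
static inline int range_not_ok(unsigned long addr, unsigned long size,
			       unsigned long limit)
{
	if (addr > limit)
		return 1;		/* start already past the limit */
	return size > limit - addr;	/* overflow-safe addr + size > limit */
}
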
@@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	u8 insnbuf[MAX_PATCH_LEN];
 
 	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	/*
+	 * The scan order should be from start to end. A later scanned
+	 * alternative code can overwrite a previous scanned alternative code.
+	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+	 * patch code.
+	 *
+	 * So be careful if you want to change the scan order to any other
+	 * order.
+	 */
 	for (a = start; a < end; a++) {
 		u8 *instr = a->instr;
 		BUG_ON(a->replacementlen > a->instrlen);

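The ordering rule matters because two entries may patch the same location. A
stripped-down sketch of the selection loop (not the actual apply_alternatives()
body: the real code assembles the patch in a buffer and applies it with
text_poke_early(), which is simplified away here):

/* With a REP_GOOD entry followed by an ERMS entry for the same code, a CPU
 * that has both features ends up running the ERMS replacement, because the
 * later entry overwrites the earlier patch.
 */
static void apply_alternatives_sketch(struct alt_instr *start,
				      struct alt_instr *end)
{
	struct alt_instr *a;

	for (a = start; a < end; a++) {
		if (!boot_cpu_has(a->cpuid))
			continue;
		/* copy the replacement over the original... */
		memcpy(a->instr, a->replacement, a->replacementlen);
		/* ...and NOP-pad whatever is left of the original */
		add_nops(a->instr + a->replacementlen,
			 a->instrlen - a->replacementlen);
	}
}
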
@@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
 
-		if (eax > 0)
-			c->x86_capability[9] = ebx;
+		c->x86_capability[9] = ebx;
 	}
 
 	/* AMD-defined flags: level 0x80000001 */

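The dropped 'eax > 0' guard was wrong because, for CPUID leaf 7, EAX of
sub-leaf 0 reports the highest supported sub-leaf and may legitimately be 0
while EBX already carries valid word-9 feature bits (ERMS is bit 9). A small
sketch of reading that word directly, assuming a CPU that implements leaf 7:

/* Sketch: word-9 features come straight from CPUID(EAX=7, ECX=0).EBX; EAX
 * only reports the maximum sub-leaf and must not gate the EBX read.
 */
static void report_erms_sketch(void)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
	if (ebx & (1U << 9))	/* bit 9 of word 9 == X86_FEATURE_ERMS */
		pr_info("CPU supports enhanced REP MOVSB/STOSB (ERMS)\n");
}
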
@@ -29,10 +29,10 @@
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
+	u64 misc_enable;
+
 	/* Unmask CPUID levels if masked: */
 	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {

@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 	 * (model 2) with the same problem.
 	 */
 	if (c->x86 == 15) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {

@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		}
 	}
 #endif
+
+	/*
+	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+	 * clear the fast string and enhanced fast string CPU capabilities.
+	 */
+	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+		if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+			printk(KERN_INFO "Disabled fast string operations\n");
+			setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+			setup_clear_cpu_cap(X86_FEATURE_ERMS);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32

@@ -1,5 +1,6 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * Zero a page.

@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
 	CFI_ENDPROC
 ENDPROC(clear_page_c)
 
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
 ENTRY(clear_page)
 	CFI_STARTPROC
 	xorl %eax,%eax

@@ -38,21 +48,26 @@ ENTRY(clear_page)
 .Lclear_page_end:
 ENDPROC(clear_page)
 
-	/* Some CPUs run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+	 * It is recommended to use this when possible.
+	 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+	 * Otherwise, use original function.
+	 *
+	 */
 
 #include <asm/cpufeature.h>
 
 	.section .altinstr_replacement,"ax"
 1:	.byte 0xeb					/* jmp <disp8> */
 	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:
+2:	.byte 0xeb					/* jmp <disp8> */
+	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
+3:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad clear_page
-	.quad 1b
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lclear_page_end - clear_page
-	.byte 2b - 1b
+	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+			     .Lclear_page_end-clear_page, 2b-1b
+	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
+			     .Lclear_page_end-clear_page,3b-2b
 	.previous

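The replacement snippets above are short jumps (opcode 0xeb) whose 8-bit
displacement is fixed at assembly time. Because the replacement bytes are later
copied over the start of clear_page, the displacement must be measured from the
end of the patched jump at clear_page, which is what the expression
(clear_page_c - clear_page) - (2f - 1b) computes. The same arithmetic in C, as
a sketch:

/* disp8 for a 2-byte 'jmp rel8' that will be placed at 'patch_site' and
 * should land on 'target': the displacement is relative to the end of the
 * instruction, i.e. (target - patch_site) - 2.
 */
static signed char short_jmp_disp8(unsigned long patch_site, unsigned long target)
{
	return (signed char)(target - (patch_site + 2));
}
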
@@ -15,23 +15,30 @@
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
-	.macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
 	.byte 0xe9	/* 32bit jump */
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
 2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt-1b /* offset */   /* or alternatively to alt */
+	.long \alt1-1b /* offset */   /* or alternatively to alt1 */
+3:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt2-1b /* offset */   /* or alternatively to alt2 */
 	.previous
+
 	.section .altinstructions,"a"
-	.align 8
-	.quad  0b
-	.quad  2b
-	.word  \feature			/* when feature is set */
-	.byte  5
-	.byte  5
+	altinstruction_entry 0b,2b,\feature1,5,5
+	altinstruction_entry 0b,3b,\feature2,5,5
 	.previous
 	.endm
 

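The new comment describes a three-way selection. Expressed as ordinary C (a
conceptual sketch only; the real mechanism is a one-time boot patch of the
5-byte jump, not a per-call branch), the user-copy entry points behave roughly
like this:

/* Conceptual equivalent of ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 * as used by _copy_to_user/_copy_from_user.  The "dispatch" happens once, at
 * alternatives-patching time.
 */
static unsigned long
copy_user_dispatch_sketch(void *to, const void *from, unsigned len)
{
	if (boot_cpu_has(X86_FEATURE_ERMS))		/* feature2 wins...    */
		return copy_user_enhanced_fast_string(to, from, len);
	if (boot_cpu_has(X86_FEATURE_REP_GOOD))		/* ...then feature1... */
		return copy_user_generic_string(to, from, len);
	return copy_user_generic_unrolled(to, from, len);	/* ...else orig */
}
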
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
 	addq %rdx,%rcx
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_to_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 

@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
 	addq %rdx,%rcx
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
-	jae bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+	ja bad_from_user
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
+		copy_user_generic_unrolled,copy_user_generic_string,	\
+		copy_user_enhanced_fast_string
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 

@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
 	.previous
 	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+	CFI_STARTPROC
+	andl %edx,%edx
+	jz 2f
+	movl %edx,%ecx
+1:	rep
+	movsb
+2:	xorl %eax,%eax
+	ret
+
+	.section .fixup,"ax"
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	.section __ex_table,"a"
+	.align 8
+	.quad 1b,12b
+	.previous
+	CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)

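The .fixup/__ex_table pair above implements the usual user-copy contract: if
the rep movsb faults, %ecx holds the bytes not yet copied and
copy_user_handle_tail() turns that into the routine's nonzero return value. A
caller-side sketch of that contract:

/* The return value of copy_from_user()/copy_to_user() is the number of bytes
 * that could not be copied; 0 means full success.
 */
static long read_from_user_sketch(void *kbuf, const void __user *ubuf,
				  unsigned long len)
{
	if (copy_from_user(kbuf, ubuf, len))
		return -EFAULT;	/* part of [ubuf, ubuf + len) was inaccessible */
	return 0;
}
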
@@ -4,6 +4,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 /*
  * memcpy - Copy a memory block.

@@ -37,6 +38,23 @@
 .Lmemcpy_e:
 	.previous
 
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	rep movsb
+	ret
+.Lmemcpy_e_e:
+	.previous
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC

@@ -171,21 +189,22 @@ ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
 	/*
-	 * Some CPUs run faster using the string copy instructions.
-	 * It is also a lot simpler. Use this when possible:
-	 */
-
-	.section .altinstructions, "a"
-	.align 8
-	.quad memcpy
-	.quad .Lmemcpy_c
-	.word X86_FEATURE_REP_GOOD
-
-	/*
+	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+	 * If the feature is supported, memcpy_c_e() is the first choice.
+	 * If enhanced rep movsb copy is not available, use fast string copy
+	 * memcpy_c() when possible. This is faster and code is simpler than
+	 * original memcpy().
+	 * Otherwise, original memcpy() is used.
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+	 * feature to implement the right patch order.
+	 *
 	 * Replace only beginning, memcpy is used to apply alternatives,
 	 * so it is silly to overwrite itself with nops - reboot is the
 	 * only outcome...
 	 */
-	.byte .Lmemcpy_e - .Lmemcpy_c
-	.byte .Lmemcpy_e - .Lmemcpy_c
+	.section .altinstructions, "a"
+	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous

@@ -8,6 +8,7 @@
 #define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 
 #undef memmove
 

@@ -24,6 +25,7 @@
  */
 ENTRY(memmove)
 	CFI_STARTPROC
+
 	/* Handle more 32bytes in loop */
 	mov %rdi, %rax
 	cmp $0x20, %rdx

@@ -31,8 +33,13 @@ ENTRY(memmove)
 
 	/* Decide forward/backward copy mode */
 	cmp %rdi, %rsi
-	jb 2f
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
 
+.Lmemmove_begin_forward:
 	/*
 	 * movsq instruction have many startup latency
 	 * so we handle small size by general register.

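In C terms, the new prologue picks the copy direction as sketched below
(copy_forward and copy_backward are hypothetical stand-ins for the movsq-based
forward path at .Lmemmove_begin_forward and the backward path at label 2):

/* Direction decision added above:
 *   - src >= dst:        forward copy is always safe;
 *   - src + len <= dst:  regions are disjoint, forward is safe too;
 *   - otherwise:         dst overlaps the tail of src, so copy backward.
 */
static void copy_forward(char *dst, const char *src, unsigned long len);	/* hypothetical */
static void copy_backward(char *dst, const char *src, unsigned long len);	/* hypothetical */

static void memmove_direction_sketch(char *dst, const char *src, unsigned long len)
{
	if (src >= dst || src + len <= dst)
		copy_forward(dst, src, len);	/* .Lmemmove_begin_forward */
	else
		copy_backward(dst, src, len);	/* label 2: backward movsq */
}
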
@@ -78,6 +85,8 @@ ENTRY(memmove)
 	rep movsq
 	movq %r11, (%r10)
 	jmp 13f
+.Lmemmove_end_forward:
+
 	/*
 	 * Handle data backward by movsq.
 	 */

@@ -194,4 +203,22 @@ ENTRY(memmove)
 13:
 	retq
 	CFI_ENDPROC
+
+	.section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+	/* Forward moving data. */
+	movq %rdx, %rcx
+	rep movsb
+	retq
+.Lmemmove_end_forward_efs:
+	.previous
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad .Lmemmove_begin_forward
+	.quad .Lmemmove_begin_forward_efs
+	.word X86_FEATURE_ERMS
+	.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+	.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	.previous
 ENDPROC(memmove)

@@ -2,9 +2,13 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 /*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
  *
  * rdi   destination
  * rsi   value (char)

@@ -31,6 +35,28 @@
 .Lmemset_e:
 	.previous
 
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi   destination
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
+ * rax   original destination
+ */
+	.section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+	movq %rdi,%r9
+	movb %sil,%al
+	movl %edx,%ecx
+	rep stosb
+	movq %r9,%rax
+	ret
+.Lmemset_e_e:
+	.previous
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC

@@ -112,16 +138,20 @@ ENTRY(__memset)
 ENDPROC(memset)
 ENDPROC(__memset)
 
-	/* Some CPUs run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
+	 * It is recommended to use this when possible.
+	 *
+	 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+	 * instructions.
+	 *
+	 * Otherwise, use original memset function.
+	 *
+	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
+	 * feature to implement the right patch order.
+	 */
 	.section .altinstructions,"a"
-	.align 8
-	.quad memset
-	.quad .Lmemset_c
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lfinal - memset
-	.byte .Lmemset_e - .Lmemset_c
+	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
