powerpc: Improve 64bit copy_tofrom_user
Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user.
The loop now does 32 bytes at a time, as well as pairing loads and stores.

A quick test case that reads 8kB over and over shows the improvement:

POWER6: 53% faster
POWER7: 51% faster

#define _XOPEN_SOURCE 500
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

#define BUFSIZE (8 * 1024)
#define ITERATIONS 10000000

int main()
{
	char tmpfile[] = "/tmp/copy_to_user_testXXXXXX";
	int fd;
	char *buf[BUFSIZE];
	unsigned long i;

	fd = mkstemp(tmpfile);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	if (write(fd, buf, BUFSIZE) != BUFSIZE) {
		perror("open");
		exit(1);
	}

	for (i = 0; i < 10000000; i++) {
		if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) {
			perror("pread");
			exit(1);
		}
	}

	unlink(tmpfile);

	return 0;
}

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
committed by
Benjamin Herrenschmidt
parent
63e6c5b810
commit
789c299ca2
@@ -44,37 +44,55 @@ BEGIN_FTR_SECTION
|
|||||||
andi. r0,r4,7
|
andi. r0,r4,7
|
||||||
bne .Lsrc_unaligned
|
bne .Lsrc_unaligned
|
||||||
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
||||||
srdi r7,r5,4
|
blt cr1,.Ldo_tail /* if < 16 bytes to copy */
|
||||||
20: ld r9,0(r4)
|
srdi r0,r5,5
|
||||||
addi r4,r4,-8
|
cmpdi cr1,r0,0
|
||||||
mtctr r7
|
20: ld r7,0(r4)
|
||||||
andi. r5,r5,7
|
220: ld r6,8(r4)
|
||||||
bf cr7*4+0,22f
|
addi r4,r4,16
|
||||||
addi r3,r3,8
|
mtctr r0
|
||||||
addi r4,r4,8
|
andi. r0,r5,0x10
|
||||||
mr r8,r9
|
beq 22f
|
||||||
blt cr1,72f
|
|
||||||
21: ld r9,8(r4)
|
|
||||||
70: std r8,8(r3)
|
|
||||||
22: ldu r8,16(r4)
|
|
||||||
71: stdu r9,16(r3)
|
|
||||||
bdnz 21b
|
|
||||||
72: std r8,8(r3)
|
|
||||||
beq+ 3f
|
|
||||||
addi r3,r3,16
|
addi r3,r3,16
|
||||||
|
addi r4,r4,-16
|
||||||
|
mr r9,r7
|
||||||
|
mr r8,r6
|
||||||
|
beq cr1,72f
|
||||||
|
21: ld r7,16(r4)
|
||||||
|
221: ld r6,24(r4)
|
||||||
|
addi r4,r4,32
|
||||||
|
70: std r9,0(r3)
|
||||||
|
270: std r8,8(r3)
|
||||||
|
22: ld r9,0(r4)
|
||||||
|
222: ld r8,8(r4)
|
||||||
|
71: std r7,16(r3)
|
||||||
|
271: std r6,24(r3)
|
||||||
|
addi r3,r3,32
|
||||||
|
bdnz 21b
|
||||||
|
72: std r9,0(r3)
|
||||||
|
272: std r8,8(r3)
|
||||||
|
andi. r5,r5,0xf
|
||||||
|
beq+ 3f
|
||||||
|
addi r4,r4,16
|
||||||
.Ldo_tail:
|
.Ldo_tail:
|
||||||
bf cr7*4+1,1f
|
addi r3,r3,16
|
||||||
23: lwz r9,8(r4)
|
bf cr7*4+0,246f
|
||||||
|
244: ld r9,0(r4)
|
||||||
|
addi r4,r4,8
|
||||||
|
245: std r9,0(r3)
|
||||||
|
addi r3,r3,8
|
||||||
|
246: bf cr7*4+1,1f
|
||||||
|
23: lwz r9,0(r4)
|
||||||
addi r4,r4,4
|
addi r4,r4,4
|
||||||
73: stw r9,0(r3)
|
73: stw r9,0(r3)
|
||||||
addi r3,r3,4
|
addi r3,r3,4
|
||||||
1: bf cr7*4+2,2f
|
1: bf cr7*4+2,2f
|
||||||
44: lhz r9,8(r4)
|
44: lhz r9,0(r4)
|
||||||
addi r4,r4,2
|
addi r4,r4,2
|
||||||
74: sth r9,0(r3)
|
74: sth r9,0(r3)
|
||||||
addi r3,r3,2
|
addi r3,r3,2
|
||||||
2: bf cr7*4+3,3f
|
2: bf cr7*4+3,3f
|
||||||
45: lbz r9,8(r4)
|
45: lbz r9,0(r4)
|
||||||
75: stb r9,0(r3)
|
75: stb r9,0(r3)
|
||||||
3: li r3,0
|
3: li r3,0
|
||||||
blr
|
blr
|
||||||
@@ -220,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
|||||||
131:
|
131:
|
||||||
addi r3,r3,8
|
addi r3,r3,8
|
||||||
120:
|
120:
|
||||||
|
320:
|
||||||
122:
|
122:
|
||||||
|
322:
|
||||||
124:
|
124:
|
||||||
125:
|
125:
|
||||||
126:
|
126:
|
||||||
@@ -229,9 +249,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
|||||||
129:
|
129:
|
||||||
133:
|
133:
|
||||||
addi r3,r3,8
|
addi r3,r3,8
|
||||||
121:
|
|
||||||
132:
|
132:
|
||||||
addi r3,r3,8
|
addi r3,r3,8
|
||||||
|
121:
|
||||||
|
321:
|
||||||
|
344:
|
||||||
134:
|
134:
|
||||||
135:
|
135:
|
||||||
138:
|
138:
|
||||||
@@ -303,18 +325,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
|||||||
183:
|
183:
|
||||||
add r3,r3,r7
|
add r3,r3,r7
|
||||||
b 1f
|
b 1f
|
||||||
|
371:
|
||||||
180:
|
180:
|
||||||
addi r3,r3,8
|
addi r3,r3,8
|
||||||
171:
|
171:
|
||||||
177:
|
177:
|
||||||
addi r3,r3,8
|
addi r3,r3,8
|
||||||
170:
|
370:
|
||||||
172:
|
372:
|
||||||
176:
|
176:
|
||||||
178:
|
178:
|
||||||
addi r3,r3,4
|
addi r3,r3,4
|
||||||
185:
|
185:
|
||||||
addi r3,r3,4
|
addi r3,r3,4
|
||||||
|
170:
|
||||||
|
172:
|
||||||
|
345:
|
||||||
173:
|
173:
|
||||||
174:
|
174:
|
||||||
175:
|
175:
|
||||||
@@ -341,11 +367,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
|
|||||||
.section __ex_table,"a"
|
.section __ex_table,"a"
|
||||||
.align 3
|
.align 3
|
||||||
.llong 20b,120b
|
.llong 20b,120b
|
||||||
|
.llong 220b,320b
|
||||||
.llong 21b,121b
|
.llong 21b,121b
|
||||||
|
.llong 221b,321b
|
||||||
.llong 70b,170b
|
.llong 70b,170b
|
||||||
|
.llong 270b,370b
|
||||||
.llong 22b,122b
|
.llong 22b,122b
|
||||||
|
.llong 222b,322b
|
||||||
.llong 71b,171b
|
.llong 71b,171b
|
||||||
|
.llong 271b,371b
|
||||||
.llong 72b,172b
|
.llong 72b,172b
|
||||||
|
.llong 272b,372b
|
||||||
|
.llong 244b,344b
|
||||||
|
.llong 245b,345b
|
||||||
.llong 23b,123b
|
.llong 23b,123b
|
||||||
.llong 73b,173b
|
.llong 73b,173b
|
||||||
.llong 44b,144b
|
.llong 44b,144b
|
||||||
|
Reference in New Issue
Block a user