crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions
The matrix transpose macro in serpent-sse2 uses a mix of SSE2 integer and SSE floating-point instructions, which might cause a performance penalty on some CPUs. This patch replaces the transpose_4x4 macro with a version that uses only SSE2 integer instructions. Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
4c58464b80
commit
847cb7ef56
@ -463,23 +463,20 @@
|
|||||||
pand x0, x4; \
|
pand x0, x4; \
|
||||||
pxor x2, x4;
|
pxor x2, x4;
|
||||||
|
|
||||||
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
|
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
|
||||||
movdqa x2, t3; \
|
|
||||||
movdqa x0, t1; \
|
|
||||||
unpcklps x3, t3; \
|
|
||||||
movdqa x0, t2; \
|
movdqa x0, t2; \
|
||||||
unpcklps x1, t1; \
|
punpckldq x1, x0; \
|
||||||
unpckhps x1, t2; \
|
punpckhdq x1, t2; \
|
||||||
movdqa t3, x1; \
|
movdqa x2, t1; \
|
||||||
unpckhps x3, x2; \
|
punpckhdq x3, x2; \
|
||||||
movdqa t1, x0; \
|
punpckldq x3, t1; \
|
||||||
movhlps t1, x1; \
|
movdqa x0, x1; \
|
||||||
movdqa t2, t1; \
|
punpcklqdq t1, x0; \
|
||||||
movlhps t3, x0; \
|
punpckhqdq t1, x1; \
|
||||||
movlhps x2, t1; \
|
movdqa t2, x3; \
|
||||||
movhlps t2, x2; \
|
punpcklqdq x2, t2; \
|
||||||
movdqa x2, x3; \
|
punpckhqdq x2, x3; \
|
||||||
movdqa t1, x2;
|
movdqa t2, x2;
|
||||||
|
|
||||||
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
|
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
|
||||||
movdqu (0*4*4)(in), x0; \
|
movdqu (0*4*4)(in), x0; \
|
||||||
|
@ -585,23 +585,20 @@
|
|||||||
get_key(i, 1, RK1); \
|
get_key(i, 1, RK1); \
|
||||||
SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
|
SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
|
||||||
|
|
||||||
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
|
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
|
||||||
movdqa x2, t3; \
|
|
||||||
movdqa x0, t1; \
|
|
||||||
unpcklps x3, t3; \
|
|
||||||
movdqa x0, t2; \
|
movdqa x0, t2; \
|
||||||
unpcklps x1, t1; \
|
punpckldq x1, x0; \
|
||||||
unpckhps x1, t2; \
|
punpckhdq x1, t2; \
|
||||||
movdqa t3, x1; \
|
movdqa x2, t1; \
|
||||||
unpckhps x3, x2; \
|
punpckhdq x3, x2; \
|
||||||
movdqa t1, x0; \
|
punpckldq x3, t1; \
|
||||||
movhlps t1, x1; \
|
movdqa x0, x1; \
|
||||||
movdqa t2, t1; \
|
punpcklqdq t1, x0; \
|
||||||
movlhps t3, x0; \
|
punpckhqdq t1, x1; \
|
||||||
movlhps x2, t1; \
|
movdqa t2, x3; \
|
||||||
movhlps t2, x2; \
|
punpcklqdq x2, t2; \
|
||||||
movdqa x2, x3; \
|
punpckhqdq x2, x3; \
|
||||||
movdqa t1, x2;
|
movdqa t2, x2;
|
||||||
|
|
||||||
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
|
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
|
||||||
movdqu (0*4*4)(in), x0; \
|
movdqu (0*4*4)(in), x0; \
|
||||||
|
Loading…
Reference in New Issue
Block a user