Blackfin arch: Replace C version of 64 bit multiply with hand optimized assembly
Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de> Signed-off-by: Bryan Wu <cooloney@kernel.org>
This commit is contained in:
68
arch/blackfin/lib/muldi3.S
Normal file
68
arch/blackfin/lib/muldi3.S
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
/* Symbol setup for ___muldi3: alignment, global visibility and symbol type. */
/* NOTE(review): .align is emitted before the section is selected below, so it
   applies to whichever section is current at this point - confirm that this
   is the intended section. */
.align 2
|
||||||
|
.global ___muldi3;
|
||||||
|
.type ___muldi3, STT_FUNC;
|
||||||
|
|
||||||
|
/* Choose the output section: .l1.text when CONFIG_ARITHMETIC_OPS_L1 is
   defined, otherwise the default .text section. */
#ifdef CONFIG_ARITHMETIC_OPS_L1
|
||||||
|
.section .l1.text
|
||||||
|
#else
|
||||||
|
.text
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
R1:R0 * R3:R2
|
||||||
|
= R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
|
||||||
|
[X] = (R1.h * R3.h) * 2^96
|
||||||
|
[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
|
||||||
|
[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
|
||||||
|
[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
|
||||||
|
[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
|
||||||
|
[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
|
||||||
|
[T4] + (R0.l * R2.l)
|
||||||
|
|
||||||
|
We can discard the first three lines marked "X" since we produce
|
||||||
|
only a 64 bit result. So, we need ten 16-bit multiplies.
|
||||||
|
|
||||||
|
Individual mul-acc results:
|
||||||
|
[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
|
||||||
|
[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
|
||||||
|
[E3] = R0.l * R2.h + R2.l * R0.h
|
||||||
|
[E4] = R0.l * R2.l
|
||||||
|
|
||||||
|
We also need to add high parts from lower-level results to higher ones:
|
||||||
|
E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
|
||||||
|
|
||||||
|
One interesting property is that all parts of the result that depend
|
||||||
|
on the sign of the multiplication are discarded. Those would be the
|
||||||
|
multiplications involving R1.h and R3.h, but only the top 16 bits of
|
||||||
|
the 32 bit result depend on the sign, and since R1.h and R3.h only
|
||||||
|
occur in E1, the top half of these results is cut off.
|
||||||
|
So, we can just use FU mode for all of the 16-bit multiplies, and
|
||||||
|
ignore questions of when to use mixed mode. */
|
||||||
|
|
||||||
|
/* ___muldi3: 64-bit multiply, R1:R0 * R3:R2, keeping only the low 64 bits.
   On entry the code reads R0, R1 and R2 directly and loads R3 from [SP + 12]
   (presumably the high word of the second operand as passed by the caller -
   confirm against the calling convention).
   On exit R0 holds low(E3c):low(E4) and R1 holds E2c + (E1 << 16).
   R4 is used as a temporary: it is saved to [SP] and reloaded from [SP]
   before RTS.  R3 is overwritten.  A0 and A1 are used as accumulators. */
___muldi3:
|
||||||
|
/* [SP] technically is part of the caller's frame, but we can
|
||||||
|
use it as scratch space. */
|
||||||
|
A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */
|
||||||
|
A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */
|
||||||
|
A0 += A1; /* E1 */
|
||||||
|
/* Keep E1 in R4; only R4.l is consumed, in the final add into R1.h. */
R4 = A0.w;
|
||||||
|
A0 = R0.l * R3.l (FU); /* E2 */
|
||||||
|
A0 += R2.l * R1.l (FU); /* E2 */
|
||||||
|
|
||||||
|
A1 = R2.L * R0.L (FU); /* E4 */
|
||||||
|
/* Keep E4 in R3 (the operand value in R3 is not read again after this);
   R3.l becomes the low 16 bits of the result. */
R3 = A1.w;
|
||||||
|
/* Seed E3c with the upper part of E4, i.e. E4 >> 16. */
A1 = A1 >> 16; /* E3c */
|
||||||
|
A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */
|
||||||
|
A1 += R0.L * R2.H (FU); /* E3c */
|
||||||
|
/* Keep E3c in R0 (the operand value in R0 is not read again after this);
   R0.l becomes bits 16..31 of the result. */
R0 = A1.w;
|
||||||
|
/* Carry the upper part of E3c (E3c >> 16) into E2 to form E2c. */
A1 = A1 >> 16; /* E2c */
|
||||||
|
A0 += A1; /* E2c */
|
||||||
|
R1 = A0.w;
|
||||||
|
|
||||||
|
/* low(result) = low(E3c):low(E4) */
|
||||||
|
R0 = PACK (R0.l, R3.l);
|
||||||
|
/* high(result) = E2c + (E1 << 16) */
|
||||||
|
/* Only the high half of R1 is adjusted, by the low 16 bits of E1 held in
   R4.l; the same line then reloads the caller's R4 from [SP]. */
R1.h = R1.h + R4.l (NS) || R4 = [SP];
|
||||||
|
RTS;
|
||||||
|
|
||||||
|
.size ___muldi3, .-___muldi3
|
@@ -1,99 +0,0 @@
|
|||||||
/*
|
|
||||||
* File: arch/blackfin/lib/muldi3.c
|
|
||||||
* Based on:
|
|
||||||
* Author:
|
|
||||||
*
|
|
||||||
* Created:
|
|
||||||
* Description:
|
|
||||||
*
|
|
||||||
* Modified:
|
|
||||||
* Copyright 2004-2006 Analog Devices Inc.
|
|
||||||
*
|
|
||||||
* Bugs: Enter bugs at http://blackfin.uclinux.org/
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, see the file COPYING, or write
|
|
||||||
* to the Free Software Foundation, Inc.,
|
|
||||||
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Width in bits of the single-word (SI) type; defaults to 32 unless the
   includer has already defined it. */
#ifndef SI_TYPE_SIZE
|
|
||||||
#define SI_TYPE_SIZE 32
|
|
||||||
#endif
|
|
||||||
/* Half-word helpers: __ll_b is 2^(SI_TYPE_SIZE / 2); __ll_lowpart(t) is t
   modulo that base and __ll_highpart(t) is t divided by it, both computed on
   t converted to usitype.  usitype is typedef'd further down in this file,
   so these macros can only be expanded after that typedef. */
#define __ll_b (1L << (SI_TYPE_SIZE / 2))
|
|
||||||
#define __ll_lowpart(t) ((usitype) (t) % __ll_b)
|
|
||||||
#define __ll_highpart(t) ((usitype) (t) / __ll_b)
|
|
||||||
/* NOTE(review): BITS_PER_UNIT is not referenced by any other line of this
   file as shown here. */
#define BITS_PER_UNIT 8
|
|
||||||
|
|
||||||
/* umul_ppmm(w1, w0, u, v): multiply the single words u and v and store the
   double-word product as a high word in w1 and a low word in w0.  The product
   is assembled from the four half-word partial products __x0..__x3; a carry
   out of the middle sum __x1 + __x2 is added to __x3 as __ll_b.
   u and v are each expanded twice (once by __ll_lowpart, once by
   __ll_highpart), so arguments with side effects would be evaluated twice.
   Only defined when the includer has not already provided umul_ppmm. */
#if !defined(umul_ppmm)
|
|
||||||
#define umul_ppmm(w1, w0, u, v) \
|
|
||||||
do { \
|
|
||||||
usitype __x0, __x1, __x2, __x3; \
|
|
||||||
usitype __ul, __vl, __uh, __vh; \
|
|
||||||
\
|
|
||||||
__ul = __ll_lowpart (u); \
|
|
||||||
__uh = __ll_highpart (u); \
|
|
||||||
__vl = __ll_lowpart (v); \
|
|
||||||
__vh = __ll_highpart (v); \
|
|
||||||
\
|
|
||||||
__x0 = (usitype) __ul * __vl; \
|
|
||||||
__x1 = (usitype) __ul * __vh; \
|
|
||||||
__x2 = (usitype) __uh * __vl; \
|
|
||||||
__x3 = (usitype) __uh * __vh; \
|
|
||||||
\
|
|
||||||
__x1 += __ll_highpart (__x0);/* this can't give carry */ \
|
|
||||||
__x1 += __x2; /* but this indeed can */ \
|
|
||||||
if (__x1 < __x2) /* did we get it? */ \
|
|
||||||
__x3 += __ll_b; /* yes, add it in the proper pos. */ \
|
|
||||||
\
|
|
||||||
(w1) = __x3 + __ll_highpart (__x1); \
|
|
||||||
(w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0); \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* __umulsidi3(u, v): expression form of umul_ppmm.  It fills a temporary
   diunion with the high and low words of u * v and yields its double-word
   member .ll.  Written as a ({ ... }) statement expression, which is a GNU C
   extension; diunion is declared further down in this file.
   Only defined when the includer has not already provided __umulsidi3. */
#if !defined(__umulsidi3)
|
|
||||||
#define __umulsidi3(u, v) \
|
|
||||||
({diunion __w; \
|
|
||||||
umul_ppmm (__w.s.high, __w.s.low, u, v); \
|
|
||||||
__w.ll; })
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Integer types selected by GCC machine mode rather than by C type name:
   usitype / sitype use mode SI (treated as SI_TYPE_SIZE = 32 bits by the
   macros in this file), ditype uses mode DI (the double-word type that
   __muldi3 takes and returns), and word_type uses the target's word mode.
   NOTE(review): word_type is not referenced by any other line of this file
   as shown here. */
typedef unsigned int usitype __attribute__ ((mode(SI)));
|
|
||||||
typedef int sitype __attribute__ ((mode(SI)));
|
|
||||||
typedef int ditype __attribute__ ((mode(DI)));
|
|
||||||
typedef int word_type __attribute__ ((mode(__word__)));
|
|
||||||
|
|
||||||
/* The two single-word halves of a double word, with the low word declared
   first.
   NOTE(review): overlaying this on a ditype (see diunion) only yields the
   right halves if the target stores the low word at the lower address -
   confirm for the target. */
struct distruct {
|
|
||||||
sitype low, high;
|
|
||||||
};
|
|
||||||
/* View of one double word either as a whole (.ll) or as its two halves
   (.s.low / .s.high). */
typedef union {
|
|
||||||
struct distruct s;
|
|
||||||
ditype ll;
|
|
||||||
} diunion;
|
|
||||||
|
|
||||||
#ifdef CONFIG_ARITHMETIC_OPS_L1
|
|
||||||
ditype __muldi3(ditype u, ditype v)__attribute__((l1_text));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ditype __muldi3(ditype u, ditype v)
|
|
||||||
{
|
|
||||||
diunion w;
|
|
||||||
diunion uu, vv;
|
|
||||||
|
|
||||||
uu.ll = u, vv.ll = v;
|
|
||||||
w.ll = __umulsidi3(uu.s.low, vv.s.low);
|
|
||||||
w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high
|
|
||||||
+ (usitype) uu.s.high * (usitype) vv.s.low);
|
|
||||||
|
|
||||||
return w.ll;
|
|
||||||
}
|
|
Reference in New Issue
Block a user