*================================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* 32 Bit Multiply With 64 Bit Result
*
* Revision Date: 07/17/97
*
* USAGE This routine is C Callable and can be called as:
*
* void mpy3264(int a, int b, int *c)
*
* a --- first 32 bit input value
* b --- second 32 bit input value
* *c --- pointer to resultant 64 bit value
*
* If routine is not to be used as a C callable function then
* you need to initialize values for all of the values passed
* as these are assumed to be in registers as defined by the
* calling convention of the compiler, (refer to the C compiler
* reference guide).
*
* C CODE
* This is the C equivalent of the assembly code. Note that
* the assembly code is hand optimized and restrictions may
* apply.
*
* void mpy3264(int a, int b, int *c)
* {
* unsigned int c0, c1l, c2l;
* int c1h, c2h, c1, c2, c3, ch, ch1;
* Ulong40 cl;
*
* c0 = (unsigned short) a * (unsigned short) b;
* c1 = (a>>16) * (unsigned short) b;
* c2 = (b>>16) * (unsigned short) a;
* c3 = (a>>16) * (b>>16);
* c1l = c1 << 16;
* c2l = c2 << 16;
* c1h = c1 >> 16;
* c2h = c2 >> 16;
* cl = (Ulong40) c1l + (Ulong40) c2l;
* cl += (Ulong40) c0;
* ch1 = c1h + c2h;
* ch1 += c3;
* ch = ch1 + (cl >> 32);
* c[0] = cl;
* c[1] = ch;
* }
*
* DESCRIPTION
*
* This routine takes two 32 bit integer values and calculates
* their product. The inputs are 32-bit integer, and the result
* is a 64-bit integer.
*
* ASSUMPTIONS
*
* 1. Only one sum is computed with a pair of 32-bit values.
* 2. Multiple 32-bit mpys can yield 2 cycle per 32-bit mpy
* on average.
*
* CYCLES
*
* 7 (STW 8)
*
*===============================================================================
*-------------------------------------------------------------------------------
* _mpy3264 -- 32 x 32 -> 64 bit multiply, C callable as
*             void mpy3264(int a, int b, int *c)
*
* Register roles (as used by the instructions below):
*   In:      A4 = a, B4 = b, A6 = c (address of the two result words)
*            B3 = return address (target of the final branch)
*   Out:     c[0] = low 32 bits of a*b, c[1] = high 32 bits of a*b
*   Written: A0, A2, A3, A4, B0, B2, B4, B8
*
* Method: the product is assembled from four 16 x 16 partial products,
*            c0 = aL*bL,  c1 = aH*bL,  c2 = bH*aL,  c3 = aH*bH
*   low  word = (c1 << 16) + (c2 << 16) + c0      (40-bit sum in A3:A2)
*   high word = (c1 >> 16) + (c2 >> 16) + c3 + carries held in A3
*
* A line that does not begin with || opens a new execute packet; there are
* seven packets between B_START and B_END, followed by the store/branch
* packet.  Each multiply result is first read two packets after the multiply
* is issued.
*-------------------------------------------------------------------------------
        .global _mpy3264
        .text
_mpy3264:
*** BEGIN Benchmark Timing
B_START:
* Packet 1: the two cross partial products.
        MPYHSLU .M1x    A4, B4, A0          ; c1 = (a>>16)*(u short)b
||      MPYHSLU .M2x    B4, A4, B0          ; c2 = (b>>16)*(u short)a

* Packet 2: high and low partial products.  A4/B4 are overwritten here, so
* the original arguments are no longer available after this packet.
        MPYH    .M1x    A4, B4, A4          ; c3 = (a>>16)*(b>>16)
||      MPYU    .M2x    B4, A4, B4          ; c0 = (u short)a*(u short)b

* Packet 3: low-word contribution of each cross product.
* NOTE: the second SHL must be a statement of its own.  Everything after a
* ';' is comment text, so writing it on the line above would drop the
* instruction and leave B2 undefined when it is summed in packet 4.
        SHL     .S1     A0, 16, A2          ; c1L = c1 << 16
||      SHL     .S2     B0, 16, B2          ; c2L = c2 << 16

* Packet 4: high-word contribution of each cross product, start the 40-bit
* low-word sum, and form the address of the second result word.
        SHR     .S1     A0, 16, A0          ; c1H = c1 >> 16
||      SHR     .S2     B0, 16, B0          ; c2H = c2 >> 16
||      ADDU    .L1x    A2, B2, A3:A2       ; cL = c1L + c2L
||      ADD     .L2x    A6, 4, B8           ; update pointer to c[1]

* Packet 5: combine the high halves; finish the low-word sum.
        ADD     .L2x    A0, B0, B0          ; cH1 = c1H + c2H
||      ADDU    .L1x    B4, A3:A2, A3:A2    ; cL += c0

* Packets 6-7: add c3 and the carries out of the low word (held in A3).
        ADD     .L2x    B0, A4, B0          ; cH1 += c3
        ADD     .L2x    B0, A3, B0          ; cH += cL>>32
B_END:
*** END Benchmark Timing
* Store both result words and branch back to the caller in one packet.
        STW     .D1     A2, *A6             ; c[0] = cL
||      STW     .D2     B0, *B8             ; c[1] = cH
||      B       .S2     B3                  ; return to calling function
STOP:
        NOP     5                           ; pad out the branch delay slots