/* tahoma2d/thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/alpha/ztrsm_kernel_2x2_RT.S
   2016-03-24 02:47:04 +09:00
   2230 lines, 32 KiB
   NOTE: scraped page metadata wrapped in a comment so the file assembles.
   The language is DEC Alpha assembly (EV4/EV5/EV6), not ARM as the original
   "ArmAsm" page tag claimed. */

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* Build configuration and register-name aliases for the TRSM kernel below.
   common.h supplies LD/ST/MUL/ADD, SIZE, ZBASE_SHIFT and SXADDQ (their
   definitions are not visible in this file -- see common.h). */
#define ASSEMBLER
#include "common.h"
#include "version.h"
/* Exactly one Alpha CPU generation must be selected at build time. */
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."
#endif
/* PREFETCHSIZE sets the distance (in elements) of the software prefetches
   issued in the main loop; UNOP pads the EV6 issue schedule.  EV4 gets
   neither. */
#ifdef EV6
#define PREFETCHSIZE 56
#define UNOP unop
#endif
#ifdef EV5
#define PREFETCHSIZE 48
#define UNOP
#endif
#ifdef EV4
#define UNOP
#endif
.set noat
.set noreorder
.arch ev6
.text
.align 5
.globl CNAME
.ent CNAME
/* Stack bytes reserved to save the callee-saved FP registers $f2-$f9. */
#define STACKSIZE 80
/* Integer-register roles.  M, N, K, A arrive in the Alpha argument
   registers; B, C, LDC, OFFSET are loaded from the caller's stack in the
   prologue.  AO/BO are the moving A/B pointers, C1/C2 the output column
   pointers, I/J/L the loop counters. */
#define M $16
#define N $17
#define K $18
#define A $21
#define B $22
#define C $20
#define LDC $23
#define C1 $19
#define C2 $24
#define AO $at
#define BO $5
#define I $6
#define J $7
#define L $8
/* FP-register roles: a1-a6 / b1-b5 stream packed A/B elements, t1-t4 are
   software-pipeline temporaries, c01-c16 accumulate the register tile.
   Note a6/alpha_r and b5/alpha_i share registers ($f30/$f29). */
#define a1 $f16
#define a2 $f17
#define a3 $f18
#define a4 $f19
#define b1 $f20
#define b2 $f21
#define b3 $f22
#define b4 $f23
#define t1 $f24
#define t2 $f25
#define t3 $f26
#define t4 $f27
#define a5 $f28
#define a6 $f30
#define b5 $f29
#define alpha_i $f29
#define alpha_r $f30
#define c01 $f0
#define c02 $f1
#define c03 $f2
#define c04 $f3
#define c05 $f4
#define c06 $f5
#define c07 $f6
#define c08 $f7
#define c09 $f8
#define c10 $f9
#define c11 $f10
#define c12 $f11
#define c13 $f12
#define c14 $f13
#define c15 $f14
#define c16 $f15
#define TMP1 $0
#define TMP2 $1
#define KK $2
#define AORIG $3
#define OFFSET $4
/* ADD1..ADD6 select the add/subtract combinations that realize complex
   multiply-accumulate under the side (LN/LT vs RN/RT) and conjugation
   (CONJ) variant this file is compiled as. */
#if defined(LN) || defined(LT)
#ifndef CONJ
#define ADD1 ADD
#define ADD2 SUB
#define ADD3 ADD
#define ADD4 ADD
#define ADD5 SUB
#define ADD6 ADD
#else
#define ADD1 ADD
#define ADD2 ADD
#define ADD3 SUB
#define ADD4 ADD
#define ADD5 ADD
#define ADD6 SUB
#endif
#else
#ifndef CONJ
#define ADD1 ADD
#define ADD2 SUB
#define ADD3 ADD
#define ADD4 ADD
#define ADD5 SUB
#define ADD6 ADD
#else
#define ADD1 ADD
#define ADD2 ADD
#define ADD3 ADD
#define ADD4 SUB
#define ADD5 ADD
#define ADD6 SUB
#endif
#endif
/* Entry point.  Saves $f2-$f9, loads the stack-passed arguments, bails out
   early when M, N or K is <= 0, then positions A/B/C and the KK counter for
   the variant (LN/RN/RT) before entering the column loops.  Columns are
   processed odd-one-first (N & 1), then in pairs. */
CNAME:
.frame $sp, STACKSIZE, $26, 0
#ifdef PROFILE
ldgp $gp, 0($27)
lda $at, _mcount
jsr $at, ($at), _mcount
#endif
#ifndef PROFILE
.prologue 0
#else
.prologue 1
#endif
lda $sp, -STACKSIZE($sp)
/* Arguments beyond the six register-passed ones live above our new frame. */
ldq B, 0 + STACKSIZE($sp)
ldq C, 8 + STACKSIZE($sp)
ldq LDC, 16 + STACKSIZE($sp)
ldq OFFSET, 24 + STACKSIZE($sp)
/* Scale the C leading dimension from complex elements to bytes. */
sll LDC, ZBASE_SHIFT, LDC
/* Save callee-saved FP registers reused as c03..c10 accumulators. */
stt $f2, 0($sp)
stt $f3, 8($sp)
stt $f4, 16($sp)
stt $f5, 24($sp)
stt $f6, 32($sp)
stt $f7, 40($sp)
stt $f8, 48($sp)
stt $f9, 56($sp)
/* Nothing to do when any dimension is non-positive. */
cmple M, 0, $0
cmple N, 0, $1
cmple K, 0, $2
or $0, $1, $0
or $0, $2, $0
bne $0, $L999
#ifdef LN
/* LN walks A and C backwards: advance both past the end first. */
addq M, M, TMP2
mulq TMP2, K, TMP1
SXADDQ TMP1, A, A
SXADDQ TMP2, C, C
#endif
#ifdef RN
negq OFFSET, KK
#endif
#ifdef RT
/* RT walks B and the C columns backwards. */
mulq N, K, TMP1
addq TMP1, TMP1, TMP1
SXADDQ TMP1, B, B
mulq N, LDC, TMP1
addq TMP1, C, C
subq N, OFFSET, KK
#endif
/* Handle the single leftover column first when N is odd. */
and N, 1, J
ble J, $L30
#ifdef RT
sll K, ZBASE_SHIFT, TMP1
subq B, TMP1, B
subq C, LDC, C1
subq C, LDC, C
#else
mov C, C1
addq C, LDC, C
#endif
#ifdef LN
addq M, OFFSET, KK
#endif
#ifdef LT
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
mov A, AORIG
#else
mov A, AO
#endif
/* Loop over row pairs (2 complex rows of A at a time). */
sra M, 1, I
ble I, $L50
.align 4
/* $L41: head of the 2-row x 1-column tile.  Preload the first A/B elements,
   clear the t and c accumulators, and set up L = (trip count) - 2 for the
   2x-unrolled inner loop.  Under LT/RN the trip count is KK and B is read
   directly; otherwise AO/BO are recomputed from AORIG/KK and the count is
   K - KK. */
$L41:
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c05
LD b3, 2 * SIZE(B)
fclr c02
LD b4, 3 * SIZE(B)
fclr c06
lda BO, 2 * SIZE(B)
fclr c03
lda AO, 4 * SIZE(AO)
fclr c07
lda L, -2(KK)
fclr c04
fclr c08
ble KK, $L48
ble L, $L45
#else
#ifdef LN
/* Step AORIG back one 2-row panel of A. */
sll K, ZBASE_SHIFT + 1, TMP1
subq AORIG, TMP1, AORIG
#endif
/* AO/BO = start of the partial panels that still need accumulating. */
sll KK, ZBASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AO
sll KK, ZBASE_SHIFT, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c05
LD b3, 2 * SIZE(BO)
fclr c02
LD b4, 3 * SIZE(BO)
fclr c06
lda BO, 2 * SIZE(BO)
fclr c03
lda AO, 4 * SIZE(AO)
fclr c07
lda L, -2(TMP1)
fclr c04
fclr c08
ble TMP1, $L48
ble L, $L45
#endif
.align 5
/* $L42: software-pipelined inner loop, unrolled by 2 iterations of k.
   Each half issues 8 multiplies (2 complex rows x 1 complex column) whose
   products flow through t1-t4 into c01-c08 one iteration later; loads for
   the next iteration are interleaved with the arithmetic. */
$L42:
ADD4 c05, t1, c05
unop
MUL a1, b1, t1
unop
ADD2 c06, t2, c06
lda L, -2(L)
MUL a2, b1, t2
unop
ADD4 c07, t3, c07
unop
MUL a3, b1, t3
unop
ADD2 c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 2 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD3 c02, t2, c02
lda BO, 4 * SIZE(BO)
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD1 c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD3 c04, t4, c04
unop
MUL a4, b2, t4
LD a5, 3 * SIZE(AO)
ADD4 c05, t1, c05
unop
MUL a1, b3, t1
LD b2, -1 * SIZE(BO)
ADD2 c06, t2, c06
unop
MUL a2, b3, t2
unop
ADD4 c07, t3, c07
unop
MUL a3, b3, t3
lda AO, 8 * SIZE(AO)
ADD2 c08, t4, c08
unop
MUL a5, b3, t4
LD b3, 0 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b4, t1
LD a1, -4 * SIZE(AO)
ADD3 c02, t2, c02
unop
MUL a2, b4, t2
LD a2, -3 * SIZE(AO)
ADD1 c03, t3, c03
LD a4, -1 * SIZE(AO)
MUL a3, b4, t3
LD a3, -2 * SIZE(AO)
ADD3 c04, t4, c04
MUL a5, b4, t4
LD b4, 1 * SIZE(BO)
bgt L, $L42
.align 4
/* $L45: drain of the 2x1 pipeline.  If the trip count is odd (blbs tests
   the low bit) one more full k-step runs before $L47; $L47 retires the
   in-flight t1-t4 products and folds the split partial sums
   (c01/c06, c02/c05, c03/c08, c04/c07) into the four result values. */
$L45:
ADD4 c05, t1, c05
MUL b1, a1, t1
#if defined(LT) || defined(RN)
blbs KK, $L47
#else
blbs TMP1, $L47
#endif
.align 4
ADD2 c06, t2, c06
MUL a2, b1, t2
ADD4 c07, t3, c07
MUL a3, b1, t3
ADD2 c08, t4, c08
unop
MUL a4, b1, t4
LD b1, 0 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b2, t1
LD a1, 0 * SIZE(AO)
ADD3 c02, t2, c02
unop
MUL a2, b2, t2
LD a2, 1 * SIZE(AO)
ADD1 c03, t3, c03
unop
MUL a3, b2, t3
LD a3, 2 * SIZE(AO)
ADD3 c04, t4, c04
MUL a4, b2, t4
LD a4, 3 * SIZE(AO)
lda AO, 4 * SIZE(AO)
ADD4 c05, t1, c05
LD b2, 1 * SIZE(BO)
MUL a1, b1, t1
lda BO, 2 * SIZE(BO)
.align 4
$L47:
ADD2 c06, t2, c06
MUL a2, b1, t2
ADD4 c07, t3, c07
MUL a3, b1, t3
ADD2 c08, t4, c08
MUL a4, b1, t4
ADD1 c01, t1, c01
MUL a1, b2, t1
ADD3 c02, t2, c02
MUL a2, b2, t2
ADD1 c03, t3, c03
MUL a3, b2, t3
ADD3 c04, t4, c04
lda AO, 4 * SIZE(AO)
MUL a4, b2, t4
lda BO, 2 * SIZE(BO)
ADD4 c05, t1, c05
ADD2 c06, t2, c06
ADD4 c07, t3, c07
ADD2 c08, t4, c08
/* Merge the two partial accumulators per output value. */
ADD c01, c06, c01
ADD c02, c05, c02
ADD c03, c08, c03
ADD c04, c07, c04
/* $L48: substitution step for the 2x1 tile.  Point AO/BO at the tile's
   packed storage, subtract the accumulated products from the packed
   right-hand side, apply the triangular solve for this variant, then store
   the result both back into the packed buffer and into C.
   The MUL/MUL/MUL/MUL + ADD5/ADD6 groups implement one complex multiply
   (x_re,x_im) * (d_re,d_im); diagonal entries are presumed stored
   pre-inverted by the TRSM packing (NOTE(review): confirm against the
   OpenBLAS trsm_kernel packing convention). */
$L48:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 2, TMP1
#else
subq KK, 1, TMP1
#endif
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq AORIG, TMP2, AO
sll TMP1, ZBASE_SHIFT, TMP2
addq B, TMP2, BO
#else
/* LT/RN: the loops above over-advanced AO/BO; step back to the tile. */
lda AO, -4 * SIZE(AO)
lda BO, -2 * SIZE(BO)
#endif
/* rhs -= accumulated A*X (packed in BO for left cases, AO for right). */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
#endif
#ifdef LN
/* Backward substitution through the 2x2 lower-triangular A block:
   solve row 1 (offsets 6/7 = diagonal a11), eliminate into row 0,
   then solve row 0 (offsets 0/1 = diagonal a00). */
LD a1, 6 * SIZE(AO)
LD a2, 7 * SIZE(AO)
LD a3, 4 * SIZE(AO)
LD a4, 5 * SIZE(AO)
MUL a2, c04, t1
MUL a2, c03, t2
MUL a1, c03, c03
MUL a1, c04, c04
ADD5 c03, t1, c03
ADD6 c04, t2, c04
MUL a3, c03, t1
MUL a3, c04, t2
SUB c01, t1, c01
SUB c02, t2, c02
MUL a4, c04, t1
MUL a4, c03, t2
ADD6 c01, t1, c01
ADD5 c02, t2, c02
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a1, c01, c01
MUL a1, c02, c02
ADD5 c01, t1, c01
ADD6 c02, t2, c02
#endif
#ifdef LT
/* Forward substitution: solve row 0 first, eliminate into row 1. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a1, c01, c01
MUL a1, c02, c02
ADD5 c01, t1, c01
ADD6 c02, t2, c02
MUL a3, c01, t1
MUL a3, c02, t2
SUB c03, t1, c03
SUB c04, t2, c04
MUL a4, c02, t1
MUL a4, c01, t2
ADD6 c03, t1, c03
ADD5 c04, t2, c04
LD a1, 6 * SIZE(AO)
LD a2, 7 * SIZE(AO)
MUL a2, c04, t1
MUL a2, c03, t2
MUL a1, c03, c03
MUL a1, c04, c04
ADD5 c03, t1, c03
ADD6 c04, t2, c04
#endif
#if defined(RN) || defined(RT)
/* Right side, single column: scale both rows by B's 1x1 diagonal. */
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a2, c04, t3
MUL a2, c03, t4
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
MUL a1, c04, c04
ADD5 c01, t1, c01
ADD6 c02, t2, c02
ADD5 c03, t3, c03
ADD6 c04, t4, c04
#endif
/* Write the solved tile back to the packed buffer... */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c02, 1 * SIZE(BO)
ST c03, 2 * SIZE(BO)
ST c04, 3 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c03, 2 * SIZE(AO)
ST c04, 3 * SIZE(AO)
#endif
/* ...and to C (C1 moves backwards under LN, forwards otherwise). */
#ifdef LN
lda C1, -4 * SIZE(C1)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c03, 2 * SIZE(C1)
ST c04, 3 * SIZE(C1)
#ifndef LN
lda C1, 4 * SIZE(C1)
#endif
/* Pointer/KK bookkeeping for the next row pair. */
#ifdef RT
sll K, ZBASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq AO, TMP2, AO
sll TMP1, ZBASE_SHIFT, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 2, KK
#endif
#ifdef LN
subq KK, 2, KK
#endif
lda I, -1(I)
bgt I, $L41
.align 4
/* $L50: leftover single row (M odd) against the single odd column: a
   1x1 complex tile.  Same setup/loop/drain structure as $L41/$L42 but with
   only c01/c02 (+ c05/c06 partials) live. */
$L50:
and M, 1, I
ble I, $L59
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(B)
fclr c01
LD b2, 1 * SIZE(B)
fclr c05
LD b3, 2 * SIZE(B)
fclr c02
LD b4, 3 * SIZE(B)
fclr c06
lda AO, 2 * SIZE(AO)
lda BO, 2 * SIZE(B)
lda L, -2(KK)
ble KK, $L58
ble L, $L55
#else
#ifdef LN
sll K, ZBASE_SHIFT, TMP1
subq AORIG, TMP1, AORIG
#endif
sll KK, ZBASE_SHIFT, TMP1
addq AORIG, TMP1, AO
sll KK, ZBASE_SHIFT, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr t1
LD a2, 1 * SIZE(AO)
fclr t2
LD a3, 2 * SIZE(AO)
fclr t3
LD a4, 3 * SIZE(AO)
fclr t4
LD b1, 0 * SIZE(BO)
fclr c01
LD b2, 1 * SIZE(BO)
fclr c05
LD b3, 2 * SIZE(BO)
fclr c02
LD b4, 3 * SIZE(BO)
fclr c06
lda AO, 2 * SIZE(AO)
lda BO, 2 * SIZE(BO)
lda L, -2(TMP1)
ble TMP1, $L58
ble L, $L55
#endif
.align 5
/* $L52: 2x-unrolled 1x1 inner loop (4 multiplies per k-step). */
$L52:
ADD1 c01, t1, c01
unop
MUL a1, b1, t1
unop
ADD3 c02, t2, c02
lda AO, 4 * SIZE(AO)
MUL a2, b1, t2
LD b1, 2 * SIZE(BO)
ADD4 c05, t3, c05
lda L, -2(L)
MUL a1, b2, t3
LD a1, -2 * SIZE(AO)
ADD2 c06, t4, c06
unop
MUL a2, b2, t4
LD a2, -1 * SIZE(AO)
ADD1 c01, t1, c01
LD b2, 3 * SIZE(BO)
MUL a3, b3, t1
lda BO, 4 * SIZE(BO)
ADD3 c02, t2, c02
unop
MUL a4, b3, t2
LD b3, 0 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a3, b4, t3
LD a3, 0 * SIZE(AO)
ADD2 c06, t4, c06
MUL a4, b4, t4
LD b4, 1 * SIZE(BO)
unop
LD a4, 1 * SIZE(AO)
unop
unop
bgt L, $L52
.align 4
/* $L55/$L57: drain, with one extra k-step when the trip count is odd. */
$L55:
ADD1 c01, t1, c01
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L57
#else
blbs TMP1, $L57
#endif
.align 4
ADD3 c02, t2, c02
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD4 c05, t3, c05
lda BO, 2 * SIZE(BO)
MUL a1, b2, t3
LD a1, 0 * SIZE(AO)
ADD2 c06, t4, c06
unop
MUL a2, b2, t4
LD a2, 1 * SIZE(AO)
ADD1 c01, t1, c01
LD b2, -1 * SIZE(BO)
MUL a1, b1, t1
lda AO, 2 * SIZE(AO)
.align 4
$L57:
ADD3 c02, t2, c02
MUL a2, b1, t2
ADD4 c05, t3, c05
MUL a1, b2, t3
ADD2 c06, t4, c06
lda AO, 2 * SIZE(AO)
MUL a2, b2, t4
lda BO, 2 * SIZE(BO)
ADD1 c01, t1, c01
ADD3 c02, t2, c02
ADD4 c05, t3, c05
ADD2 c06, t4, c06
/* Merge real/imag partials. */
ADD c01, c06, c01
ADD c02, c05, c02
/* $L58: substitution for the 1x1 tile -- one complex multiply by the
   (presumed pre-inverted) diagonal element, stores to the packed buffer
   and C, then pointer/KK bookkeeping. */
$L58:
#if defined(LN) || defined(RT)
subq KK, 1, TMP1
sll TMP1, ZBASE_SHIFT, TMP2
addq AORIG, TMP2, AO
sll TMP1, ZBASE_SHIFT, TMP2
addq B, TMP2, BO
#else
lda AO, -2 * SIZE(AO)
lda BO, -2 * SIZE(BO)
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c02, c02
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a1, c01, c01
MUL a1, c02, c02
ADD5 c01, t1, c01
ADD6 c02, t2, c02
#endif
#if defined(RN) || defined(RT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a1, c01, c01
MUL a1, c02, c02
ADD5 c01, t1, c01
ADD6 c02, t2, c02
#endif
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c02, 1 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
#endif
#ifdef LN
lda C1, -2 * SIZE(C1)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
#ifndef LN
lda C1, 2 * SIZE(C1)
#endif
#ifdef RT
sll K, ZBASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, ZBASE_SHIFT, TMP2
addq AO, TMP2, AO
sll TMP1, ZBASE_SHIFT, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 1, KK
#endif
#ifdef LN
subq KK, 1, KK
#endif
.align 4
/* $L59: end of the odd-column pass -- advance B and KK past that column. */
$L59:
#ifdef LN
sll K, ZBASE_SHIFT, TMP1
addq B, TMP1, B
#endif
#if defined(LT) || defined(RN)
mov BO, B
#endif
#ifdef RN
addq KK, 1, KK
#endif
#ifdef RT
subq KK, 1, KK
#endif
.align 4
/* $L30: main loop over column PAIRS of B (J = N/2).  $L01 sets up the two
   C column pointers (walking backwards under RT), re-initializes KK, and
   clears the accumulators shared across the 2x2 tile loop. */
$L30:
sra N, 1, J
ble J, $L999
.align 4
$L01:
#ifdef RT
sll K, ZBASE_SHIFT + 1, TMP1
subq B, TMP1, B
subq C, LDC, C2
subq C2, LDC, C1
subq C2, LDC, C
#else
mov C, C1
addq C, LDC, C2
addq C2, LDC, C
#endif
#ifdef LN
addq M, OFFSET, KK
#endif
#ifdef LT
mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
mov A, AORIG
#else
mov A, AO
#endif
sra M, 1, I
fclr t1
fclr t2
fclr t3
fclr t4
fclr c01
fclr c05
ble I, $L20
.align 4
/* $L11: head of the 2-row x 2-column tile (the kernel's full 2x2 case,
   16 accumulators c01-c16).  The lds into $f31 (the always-zero FP
   register) touch the C lines early -- effectively a prefetch of the two
   output columns. */
$L11:
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(B)
fclr c10
LD b2, 1 * SIZE(B)
fclr c14
LD b3, 2 * SIZE(B)
fclr c03
LD b4, 3 * SIZE(B)
fclr c07
lda BO, 4 * SIZE(B)
fclr c11
lda AO, 4 * SIZE(AO)
fclr c15
lds $f31, 4 * SIZE(C1)
fclr c04
lda L, -2(KK)
fclr c08
lds $f31, 4 * SIZE(C2)
fclr c12
fclr c16
ble KK, $L18
ble L, $L15
#else
#ifdef LN
sll K, ZBASE_SHIFT + 1, TMP1
subq AORIG, TMP1, AORIG
#endif
/* A and B panels are both 2 complex wide here, so one shift serves both. */
sll KK, ZBASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AO
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(BO)
fclr c10
LD b2, 1 * SIZE(BO)
fclr c14
LD b3, 2 * SIZE(BO)
fclr c03
LD b4, 3 * SIZE(BO)
fclr c07
lda BO, 4 * SIZE(BO)
fclr c11
lda AO, 4 * SIZE(AO)
fclr c15
lds $f31, 4 * SIZE(C1)
fclr c04
lda L, -2(TMP1)
fclr c08
lds $f31, 4 * SIZE(C2)
fclr c12
fclr c16
ble TMP1, $L18
ble L, $L15
#endif
.align 5
/* $L12: main software-pipelined loop, unrolled by 2 k-steps; 16 multiplies
   per step (2 complex rows x 2 complex columns).  Loads into $31/$f31-free
   slots prefetch the next PREFETCHSIZE elements of A and B on EV5/EV6. */
$L12:
/* 1 */
ADD1 c11, t1, c11
#ifndef EV4
ldq $31, PREFETCHSIZE * SIZE(AO)
#else
unop
#endif
MUL b1, a1, t1
#ifndef EV4
ldl $31, PREFETCHSIZE * SIZE(BO)
#else
unop
#endif
ADD3 c12, t2, c12
unop
MUL b1, a2, t2
unop
ADD2 c16, t3, c16
unop
MUL b2, a2, t3
LD a5, 0 * SIZE(AO)
ADD4 c15, t4, c15
unop
MUL b2, a1, t4
LD b5, 0 * SIZE(BO)
/* 2 */
ADD1 c01, t1, c01
UNOP
MUL b1, a3, t1
UNOP
ADD3 c02, t2, c02
UNOP
MUL b1, a4, t2
UNOP
ADD2 c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD4 c05, t4, c05
unop
MUL b4, a1, t4
unop
/* 3 */
ADD1 c03, t1, c03
unop
MUL b3, a1, t1
unop
ADD3 c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD4 c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
/* 4 */
ADD1 c09, t1, c09
unop
MUL b3, a3, t1
LD a6, 2 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
LD a4, 3 * SIZE(AO)
ADD4 c07, t4, c07
unop
MUL b4, a3, t4
LD b4, 3 * SIZE(BO)
/* 5 */
ADD1 c11, t1, c11
unop
MUL b5, a5, t1
LD a1, 4 * SIZE(AO)
ADD3 c12, t2, c12
lda L, -2(L)
MUL b5, a2, t2
LD b1, 4 * SIZE(BO)
ADD2 c16, t3, c16
unop
MUL b2, a2, t3
unop
ADD4 c15, t4, c15
unop
MUL b2, a5, t4
unop
/* 6 */
ADD1 c01, t1, c01
unop
MUL b5, a6, t1
unop
ADD3 c02, t2, c02
unop
MUL b5, a4, t2
unop
ADD2 c06, t3, c06
unop
MUL b2, a4, t3
unop
ADD4 c05, t4, c05
unop
MUL b4, a5, t4
unop
/* 7 */
ADD1 c03, t1, c03
lda AO, 8 * SIZE(AO)
MUL b3, a5, t1
unop
ADD3 c04, t2, c04
lda BO, 8 * SIZE(BO)
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
LD a2, -3 * SIZE(AO)
ADD4 c13, t4, c13
unop
MUL b2, a6, t4
LD b2, -3 * SIZE(BO)
/* 8 */
ADD1 c09, t1, c09
unop
MUL b3, a6, t1
LD a3, -2 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
LD b3, -2 * SIZE(BO)
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD4 c07, t4, c07
MUL b4, a6, t4
LD b4, -1 * SIZE(BO)
bgt L, $L12
.align 4
/* $L15: drain of the 2x2 pipeline; one extra k-step when the trip count
   is odd, then $L17 retires the in-flight products and folds the partial
   accumulator pairs into c01-c04 / c09-c12. */
$L15:
ADD1 c11, t1, c11
unop
MUL b1, a1, t1
#if defined(LT) || defined(RN)
blbs KK, $L17
#else
blbs TMP1, $L17
#endif
.align 4
ADD3 c12, t2, c12
MUL b1, a2, t2
ADD2 c16, t3, c16
MUL b2, a2, t3
ADD4 c15, t4, c15
MUL b2, a1, t4
ADD1 c01, t1, c01
MUL b1, a3, t1
ADD3 c02, t2, c02
unop
MUL b1, a4, t2
LD b1, 0 * SIZE(BO)
ADD2 c06, t3, c06
MUL b2, a4, t3
ADD4 c05, t4, c05
MUL b4, a1, t4
ADD1 c03, t1, c03
unop
MUL b3, a1, t1
LD a1, 0 * SIZE(AO)
ADD3 c04, t2, c04
unop
MUL b3, a2, t2
unop
ADD2 c08, t3, c08
unop
MUL b4, a2, t3
LD a2, 1 * SIZE(AO)
ADD4 c13, t4, c13
unop
MUL b2, a3, t4
LD b2, 1 * SIZE(BO)
ADD1 c09, t1, c09
unop
MUL b3, a3, t1
lda AO, 4 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL b3, a4, t2
LD b3, 2 * SIZE(BO)
ADD2 c14, t3, c14
unop
MUL b4, a4, t3
LD a4, -1 * SIZE(AO)
ADD4 c07, t4, c07
unop
MUL b4, a3, t4
LD a3, -2 * SIZE(AO)
ADD1 c11, t1, c11
LD b4, 3 * SIZE(BO)
MUL b1, a1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L17:
ADD3 c12, t2, c12
MUL b1, a2, t2
ADD2 c16, t3, c16
MUL b2, a2, t3
ADD4 c15, t4, c15
MUL b2, a1, t4
ADD1 c01, t1, c01
MUL b1, a3, t1
ADD3 c02, t2, c02
MUL b1, a4, t2
ADD2 c06, t3, c06
MUL b2, a4, t3
ADD4 c05, t4, c05
MUL b4, a1, t4
ADD1 c03, t1, c03
MUL b3, a1, t1
ADD3 c04, t2, c04
MUL b3, a2, t2
ADD2 c08, t3, c08
MUL b4, a2, t3
ADD4 c13, t4, c13
MUL b2, a3, t4
ADD1 c09, t1, c09
MUL b3, a3, t1
ADD3 c10, t2, c10
MUL b3, a4, t2
ADD2 c14, t3, c14
MUL b4, a4, t3
ADD4 c07, t4, c07
lda AO, 4 * SIZE(AO)
MUL b4, a3, t4
lda BO, 4 * SIZE(BO)
ADD1 c11, t1, c11
ADD3 c12, t2, c12
ADD2 c16, t3, c16
ADD4 c15, t4, c15
/* Merge real/imag partials for all eight outputs. */
ADD c01, c06, c01
ADD c02, c05, c02
ADD c03, c08, c03
ADD c04, c07, c04
ADD c09, c14, c09
ADD c10, c13, c10
ADD c11, c16, c11
ADD c12, c15, c12
.align 4
/* $L18: substitution for the 2x2 tile.  Subtract the accumulated products
   from the packed right-hand side, run the 2x2 complex triangular solve
   (against A's block for LN/LT, against B's block for RN/RT -- forward or
   backward substitution depending on the transpose variant), then store to
   the packed buffer and to both C columns.  Diagonal entries are presumed
   pre-inverted by the packing (NOTE(review): confirm vs OpenBLAS trsm
   packing). */
$L18:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 2, TMP1
#else
subq KK, 2, TMP1
#endif
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq AORIG, TMP2, AO
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq B, TMP2, BO
#else
lda AO, -4 * SIZE(AO)
lda BO, -4 * SIZE(BO)
#endif
/* rhs -= accumulated products; note the packed-buffer element order
   differs between the left (BO) and right (AO) cases. */
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
LD b1, 4 * SIZE(BO)
LD b2, 5 * SIZE(BO)
LD b3, 6 * SIZE(BO)
LD b4, 7 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
SUB a4, c10, c10
SUB b1, c03, c03
SUB b2, c04, c04
SUB b3, c11, c11
SUB b4, c12, c12
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 4 * SIZE(AO)
LD b2, 5 * SIZE(AO)
LD b3, 6 * SIZE(AO)
LD b4, 7 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c03, c03
SUB a4, c04, c04
SUB b1, c09, c09
SUB b2, c10, c10
SUB b3, c11, c11
SUB b4, c12, c12
#endif
#ifdef LN
/* Backward substitution through A's 2x2 block, applied to both columns. */
LD a1, 6 * SIZE(AO)
LD a2, 7 * SIZE(AO)
LD a3, 4 * SIZE(AO)
LD a4, 5 * SIZE(AO)
MUL a2, c04, t1
MUL a2, c03, t2
MUL a2, c12, t3
MUL a2, c11, t4
MUL a1, c03, c03
MUL a1, c04, c04
MUL a1, c11, c11
MUL a1, c12, c12
ADD5 c03, t1, c03
ADD6 c04, t2, c04
ADD5 c11, t3, c11
ADD6 c12, t4, c12
MUL a3, c03, t1
MUL a3, c04, t2
MUL a3, c11, t3
MUL a3, c12, t4
SUB c01, t1, c01
SUB c02, t2, c02
SUB c09, t3, c09
SUB c10, t4, c10
MUL a4, c04, t1
MUL a4, c03, t2
MUL a4, c12, t3
MUL a4, c11, t4
ADD6 c01, t1, c01
ADD5 c02, t2, c02
ADD6 c09, t3, c09
ADD5 c10, t4, c10
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a2, c10, t3
MUL a2, c09, t4
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c09, c09
MUL a1, c10, c10
ADD5 c01, t1, c01
ADD6 c02, t2, c02
ADD5 c09, t3, c09
ADD6 c10, t4, c10
#endif
#ifdef LT
/* Forward substitution through A's 2x2 block, applied to both columns. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a2, c10, t3
MUL a2, c09, t4
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c09, c09
MUL a1, c10, c10
ADD5 c01, t1, c01
ADD6 c02, t2, c02
ADD5 c09, t3, c09
ADD6 c10, t4, c10
MUL a3, c01, t1
MUL a3, c02, t2
MUL a3, c09, t3
MUL a3, c10, t4
SUB c03, t1, c03
SUB c04, t2, c04
SUB c11, t3, c11
SUB c12, t4, c12
MUL a4, c02, t1
MUL a4, c01, t2
MUL a4, c10, t3
MUL a4, c09, t4
ADD6 c03, t1, c03
ADD5 c04, t2, c04
ADD6 c11, t3, c11
ADD5 c12, t4, c12
LD a1, 6 * SIZE(AO)
LD a2, 7 * SIZE(AO)
MUL a2, c04, t1
MUL a2, c03, t2
MUL a2, c12, t3
MUL a2, c11, t4
MUL a1, c03, c03
MUL a1, c04, c04
MUL a1, c11, c11
MUL a1, c12, c12
ADD5 c03, t1, c03
ADD6 c04, t2, c04
ADD5 c11, t3, c11
ADD6 c12, t4, c12
#endif
#ifdef RN
/* Forward substitution through B's 2x2 block, applied to both rows. */
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a2, c04, t3
MUL a2, c03, t4
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
MUL a1, c04, c04
ADD5 c01, t1, c01
ADD6 c02, t2, c02
ADD5 c03, t3, c03
ADD6 c04, t4, c04
MUL a3, c01, t1
MUL a3, c02, t2
MUL a3, c03, t3
MUL a3, c04, t4
SUB c09, t1, c09
SUB c10, t2, c10
SUB c11, t3, c11
SUB c12, t4, c12
MUL a4, c02, t1
MUL a4, c01, t2
MUL a4, c04, t3
MUL a4, c03, t4
ADD6 c09, t1, c09
ADD5 c10, t2, c10
ADD6 c11, t3, c11
ADD5 c12, t4, c12
LD a1, 6 * SIZE(BO)
LD a2, 7 * SIZE(BO)
MUL a2, c10, t1
MUL a2, c09, t2
MUL a2, c12, t3
MUL a2, c11, t4
MUL a1, c09, c09
MUL a1, c10, c10
MUL a1, c11, c11
MUL a1, c12, c12
ADD5 c09, t1, c09
ADD6 c10, t2, c10
ADD5 c11, t3, c11
ADD6 c12, t4, c12
#endif
#ifdef RT
/* Backward substitution through B's 2x2 block, applied to both rows. */
LD a1, 6 * SIZE(BO)
LD a2, 7 * SIZE(BO)
LD a3, 4 * SIZE(BO)
LD a4, 5 * SIZE(BO)
MUL a2, c10, t1
MUL a2, c09, t2
MUL a2, c12, t3
MUL a2, c11, t4
MUL a1, c09, c09
MUL a1, c10, c10
MUL a1, c11, c11
MUL a1, c12, c12
ADD5 c09, t1, c09
ADD6 c10, t2, c10
ADD5 c11, t3, c11
ADD6 c12, t4, c12
MUL a3, c09, t1
MUL a3, c10, t2
MUL a3, c11, t3
MUL a3, c12, t4
SUB c01, t1, c01
SUB c02, t2, c02
SUB c03, t3, c03
SUB c04, t4, c04
MUL a4, c10, t1
MUL a4, c09, t2
MUL a4, c12, t3
MUL a4, c11, t4
ADD6 c01, t1, c01
ADD5 c02, t2, c02
ADD6 c03, t3, c03
ADD5 c04, t4, c04
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a2, c04, t3
MUL a2, c03, t4
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c03, c03
MUL a1, c04, c04
ADD5 c01, t1, c01
ADD6 c02, t2, c02
ADD5 c03, t3, c03
ADD6 c04, t4, c04
#endif
/* Write back to the packed buffer... */
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c02, 1 * SIZE(BO)
ST c09, 2 * SIZE(BO)
ST c10, 3 * SIZE(BO)
ST c03, 4 * SIZE(BO)
ST c04, 5 * SIZE(BO)
ST c11, 6 * SIZE(BO)
ST c12, 7 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c03, 2 * SIZE(AO)
ST c04, 3 * SIZE(AO)
ST c09, 4 * SIZE(AO)
ST c10, 5 * SIZE(AO)
ST c11, 6 * SIZE(AO)
ST c12, 7 * SIZE(AO)
#endif
/* ...and to both C columns. */
#ifdef LN
lda C1, -4 * SIZE(C1)
lda C2, -4 * SIZE(C2)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c03, 2 * SIZE(C1)
ST c04, 3 * SIZE(C1)
ST c09, 0 * SIZE(C2)
ST c10, 1 * SIZE(C2)
ST c11, 2 * SIZE(C2)
ST c12, 3 * SIZE(C2)
#ifndef LN
lda C1, 4 * SIZE(C1)
lda C2, 4 * SIZE(C2)
#endif
/* Clear the pipeline temporaries for the next tile. */
fclr t1
fclr t2
fclr t3
fclr t4
#ifdef RT
sll K, ZBASE_SHIFT + 1, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, ZBASE_SHIFT + 1, TMP1
addq AO, TMP1, AO
addq BO, TMP1, BO
#endif
#ifdef LT
addq KK, 2, KK
#endif
#ifdef LN
subq KK, 2, KK
#endif
fclr c01
fclr c05
lda I, -1(I)
bgt I, $L11
.align 4
/* $L20: leftover single row (M odd) against a column pair: a 1-row x
   2-column tile with accumulators c01/c02 and c09/c10 (+ c05/c06/c13/c14
   partials).  Same setup/loop structure as the 2x2 case. */
$L20:
and M, 1, I
ble I, $L29
#if defined(LT) || defined(RN)
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(B)
fclr c10
LD b2, 1 * SIZE(B)
fclr c14
LD b3, 2 * SIZE(B)
lda AO, 2 * SIZE(AO)
LD b4, 3 * SIZE(B)
lda BO, 4 * SIZE(B)
lda L, -2(KK)
ble KK, $L28
ble L, $L25
#else
#ifdef LN
sll K, ZBASE_SHIFT + 0, TMP1
subq AORIG, TMP1, AORIG
#endif
/* A is 1 complex wide, B is 2 wide -- hence the different shifts. */
sll KK, ZBASE_SHIFT + 0, TMP1
addq AORIG, TMP1, AO
sll KK, ZBASE_SHIFT + 1, TMP1
addq B, TMP1, BO
subq K, KK, TMP1
LD a1, 0 * SIZE(AO)
fclr c09
LD a2, 1 * SIZE(AO)
fclr c13
LD a3, 2 * SIZE(AO)
fclr c02
LD a4, 3 * SIZE(AO)
fclr c06
LD b1, 0 * SIZE(BO)
fclr c10
LD b2, 1 * SIZE(BO)
fclr c14
LD b3, 2 * SIZE(BO)
lda AO, 2 * SIZE(AO)
LD b4, 3 * SIZE(BO)
lda BO, 4 * SIZE(BO)
lda L, -2(TMP1)
ble TMP1, $L28
ble L, $L25
#endif
.align 5
/* $L22: 2x-unrolled 1x2 inner loop (8 multiplies per k-step). */
$L22:
ADD1 c09, t1, c09
unop
MUL a1, b1, t1
unop
ADD3 c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD4 c13, t3, c13
unop
MUL a1, b2, t3
lda BO, 8 * SIZE(BO)
ADD2 c14, t4, c14
unop
MUL a2, b2, t4
LD b2, -7 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b3, t1
unop
ADD3 c02, t2, c02
unop
MUL a2, b3, t2
LD b3, -6 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a1, b4, t3
LD a1, 2 * SIZE(AO)
ADD2 c06, t4, c06
MUL a2, b4, t4
LD b5, -5 * SIZE(BO)
ADD1 c09, t1, c09
unop
MUL a3, b1, t1
LD a2, 3 * SIZE(AO)
ADD3 c10, t2, c10
unop
MUL a4, b1, t2
LD b1, -4 * SIZE(BO)
ADD4 c13, t3, c13
unop
MUL a3, b2, t3
lda AO, 4 * SIZE(AO)
ADD2 c14, t4, c14
MUL a4, b2, t4
LD b2, -3 * SIZE(BO)
ADD1 c01, t1, c01
lda L, -2(L)
MUL a3, b3, t1
LD b4, -1 * SIZE(BO)
ADD3 c02, t2, c02
unop
MUL a4, b3, t2
LD b3, -2 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a3, b5, t3
LD a3, 0 * SIZE(AO)
ADD2 c06, t4, c06
MUL a4, b5, t4
LD a4, 1 * SIZE(AO)
bgt L, $L22
$L25:
ADD1 c09, t1, c09
MUL a1, b1, t1
#if defined(LT) || defined(RN)
blbs KK, $L27
#else
blbs TMP1, $L27
#endif
.align 4
ADD3 c10, t2, c10
unop
MUL a2, b1, t2
LD b1, 0 * SIZE(BO)
ADD4 c13, t3, c13
unop
MUL a1, b2, t3
unop
ADD2 c14, t4, c14
unop
MUL a2, b2, t4
LD b2, 1 * SIZE(BO)
ADD1 c01, t1, c01
unop
MUL a1, b3, t1
lda AO, 2 * SIZE(AO)
ADD3 c02, t2, c02
unop
MUL a2, b3, t2
LD b3, 2 * SIZE(BO)
ADD4 c05, t3, c05
unop
MUL a1, b4, t3
LD a1, -2 * SIZE(AO)
ADD2 c06, t4, c06
unop
MUL a2, b4, t4
LD a2, -1 * SIZE(AO)
ADD1 c09, t1, c09
LD b4, 3 * SIZE(BO)
MUL a1, b1, t1
lda BO, 4 * SIZE(BO)
.align 4
$L27:
ADD3 c10, t2, c10
MUL a2, b1, t2
ADD4 c13, t3, c13
MUL a1, b2, t3
ADD2 c14, t4, c14
MUL a2, b2, t4
ADD1 c01, t1, c01
MUL a1, b3, t1
ADD3 c02, t2, c02
MUL a2, b3, t2
ADD4 c05, t3, c05
MUL a1, b4, t3
ADD2 c06, t4, c06
lda AO, 2 * SIZE(AO)
MUL a2, b4, t4
lda BO, 4 * SIZE(BO)
ADD1 c09, t1, c09
ADD3 c10, t2, c10
ADD4 c13, t3, c13
ADD2 c14, t4, c14
ADD c01, c06, c01
ADD c02, c05, c02
ADD c09, c14, c09
ADD c10, c13, c10
.align 4
/* $L28: substitution for the 1-row x 2-column tile.  For the left cases
   a single diagonal multiply per column suffices; for RN/RT the 2x2 B
   block is solved forward/backward across the two columns.  Results go to
   the packed buffer and to both C columns, then pointer/KK bookkeeping. */
$L28:
#if defined(LN) || defined(RT)
#ifdef LN
subq KK, 1, TMP1
#else
subq KK, 2, TMP1
#endif
sll TMP1, ZBASE_SHIFT + 0, TMP2
addq AORIG, TMP2, AO
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq B, TMP2, BO
#else
lda AO, -2 * SIZE(AO)
lda BO, -4 * SIZE(BO)
#endif
#if defined(LN) || defined(LT)
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
SUB a4, c10, c10
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
SUB a1, c01, c01
SUB a2, c02, c02
SUB a3, c09, c09
SUB a4, c10, c10
#endif
#if defined(LN) || defined(LT)
/* 1x1 A diagonal: one complex multiply per column. */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a2, c10, t3
MUL a2, c09, t4
MUL a1, c01, c01
MUL a1, c02, c02
MUL a1, c09, c09
MUL a1, c10, c10
ADD5 c01, t1, c01
ADD6 c02, t2, c02
ADD5 c09, t3, c09
ADD6 c10, t4, c10
#endif
#ifdef RN
/* Forward substitution through B's 2x2 block. */
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
LD a3, 2 * SIZE(BO)
LD a4, 3 * SIZE(BO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a1, c01, c01
MUL a1, c02, c02
ADD5 c01, t1, c01
ADD6 c02, t2, c02
MUL a3, c01, t1
MUL a3, c02, t2
SUB c09, t1, c09
SUB c10, t2, c10
MUL a4, c02, t1
MUL a4, c01, t2
ADD6 c09, t1, c09
ADD5 c10, t2, c10
LD a1, 6 * SIZE(BO)
LD a2, 7 * SIZE(BO)
MUL a2, c10, t1
MUL a2, c09, t2
MUL a1, c09, c09
MUL a1, c10, c10
ADD5 c09, t1, c09
ADD6 c10, t2, c10
#endif
#ifdef RT
/* Backward substitution through B's 2x2 block. */
LD a1, 6 * SIZE(BO)
LD a2, 7 * SIZE(BO)
LD a3, 4 * SIZE(BO)
LD a4, 5 * SIZE(BO)
MUL a2, c10, t1
MUL a2, c09, t2
MUL a1, c09, c09
MUL a1, c10, c10
ADD5 c09, t1, c09
ADD6 c10, t2, c10
MUL a3, c09, t1
MUL a3, c10, t2
SUB c01, t1, c01
SUB c02, t2, c02
MUL a4, c10, t1
MUL a4, c09, t2
ADD6 c01, t1, c01
ADD5 c02, t2, c02
LD a1, 0 * SIZE(BO)
LD a2, 1 * SIZE(BO)
MUL a2, c02, t1
MUL a2, c01, t2
MUL a1, c01, c01
MUL a1, c02, c02
ADD5 c01, t1, c01
ADD6 c02, t2, c02
#endif
#if defined(LN) || defined(LT)
ST c01, 0 * SIZE(BO)
ST c02, 1 * SIZE(BO)
ST c09, 2 * SIZE(BO)
ST c10, 3 * SIZE(BO)
#else
ST c01, 0 * SIZE(AO)
ST c02, 1 * SIZE(AO)
ST c09, 2 * SIZE(AO)
ST c10, 3 * SIZE(AO)
#endif
#ifdef LN
lda C1, -2 * SIZE(C1)
lda C2, -2 * SIZE(C2)
#endif
ST c01, 0 * SIZE(C1)
ST c02, 1 * SIZE(C1)
ST c09, 0 * SIZE(C2)
ST c10, 1 * SIZE(C2)
#ifndef LN
lda C1, 2 * SIZE(C1)
lda C2, 2 * SIZE(C2)
#endif
#ifdef RT
sll K, ZBASE_SHIFT, TMP1
addq AORIG, TMP1, AORIG
#endif
#if defined(LT) || defined(RN)
subq K, KK, TMP1
sll TMP1, ZBASE_SHIFT + 0, TMP2
addq AO, TMP2, AO
sll TMP1, ZBASE_SHIFT + 1, TMP2
addq BO, TMP2, BO
#endif
#ifdef LT
addq KK, 1, KK
#endif
#ifdef LN
subq KK, 1, KK
#endif
.align 4
/* $L29: end of one column-pair iteration -- advance B and KK past the two
   columns, loop while J > 0.  $L999: common exit -- restore $f2-$f9,
   return 0 in $0, pop the frame. */
$L29:
#ifdef LN
sll K, ZBASE_SHIFT + 1, TMP1
addq B, TMP1, B
#endif
#if defined(LT) || defined(RN)
mov BO, B
#endif
#ifdef RN
addq KK, 2, KK
#endif
#ifdef RT
subq KK, 2, KK
#endif
lda J, -1(J)
bgt J, $L01
.align 4
$L999:
ldt $f2, 0($sp)
ldt $f3, 8($sp)
ldt $f4, 16($sp)
ldt $f5, 24($sp)
ldt $f6, 32($sp)
ldt $f7, 40($sp)
ldt $f8, 48($sp)
ldt $f9, 56($sp)
clr $0
lda $sp, STACKSIZE($sp)
ret
.ident VERSION
.end CNAME