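/*
 * Origin: tahoma2d/thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/zgemm_kernel.S
 * Listing metadata: 2016-03-24 02:47:04 +09:00, 1286 lines, 23 KiB.
 * The listing was tagged "ArmAsm"; the path and the instructions used are MIPS64.
 */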

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* Integer-register aliases. M, N, K, A, B and C are read by the code below */
/* before anything writes them, i.e. they are inputs on entry. */
#define M $4
#define N $5
#define K $6
#define A $9
#define B $10
#define C $11
/* LDC is loaded from the caller's stack in the prologue. */
#define LDC $8
/* AO / BO: running read pointers into A and B. */
#define AO $12
#define BO $13
/* Loop counters: I is seeded from M, J from N, L from K (or TEMP for TRMM). */
#define I $2
#define J $3
#define L $7
/* CO1..CO4: output pointers into C, spaced LDC apart. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
#if defined(TRMMKERNEL)
/* TRMM-only registers: OFFSET is loaded from the caller's stack; KK and */
/* TEMP are working values derived from it. */
#define OFFSET $18
#define KK $19
#define TEMP $20
#endif
/* Floating-point aliases. a1..a4 hold values loaded through AO. */
#define a1 $f0
#define a2 $f1
#define a3 $f28
#define a4 $f29
/* b1..b7 hold values loaded through BO; b1..b8 are reused as temporaries */
/* for the C values during write-back. */
#define b1 $f2
#define b2 $f3
#define b3 $f4
#define b4 $f5
#define b5 $f6
#define b6 $f7
#define b7 $f8
#define b8 $f9
/* a5 shares b8's register; it is not referenced anywhere in this file. */
#define a5 b8
/* Accumulators. c11/c12/c21/c22 end up in the CO1 result, c31..c42 in CO2, */
/* c51..c62 in CO3 and c71..c82 in CO4. */
/* c32 uses $f17 because $f15/$f16 are taken by ALPHA_R/ALPHA_I below. */
#define c11 $f10
#define c12 $f11
#define c21 $f12
#define c22 $f13
#define c31 $f14
#define c32 $f17
#define c41 $f18
#define c42 $f19
#define c51 $f20
#define c52 $f21
#define c61 $f22
#define c62 $f23
#define c71 $f24
#define c72 $f25
#define c81 $f26
#define c82 $f27
/* The two scaling factors; only ever read in this file, never written. */
#define ALPHA_R $f15
#define ALPHA_I $f16
/* MADD1..MADD4 pick, per build variant, whether each of the four partial */
/* products is accumulated with MADD or with NMSUB. In the loops below: */
/*   MADD1 is applied to a1 * b(even offset), MADD3 to a1 * b(odd offset), */
/*   MADD2 is applied to a2 * b(even offset), MADD4 to a2 * b(odd offset), */
/* where a1/a2 come from consecutive AO slots. */
/* NOTE(review): the slots are presumably interleaved (real, imag) pairs and */
/* the two-letter macros presumably select conjugation of A and/or B - confirm. */
/* MADD and NMSUB themselves are defined outside this file. */
/* If none of the variant macros is defined, MADD1..MADD4 stay undefined. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 MADD
#define MADD4 NMSUB
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 NMSUB
#define MADD4 MADD
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 MADD
#define MADD4 MADD
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 NMSUB
#define MADD4 NMSUB
#endif
/* Routine entry: build the frame, save the registers this routine reuses, */
/* fetch the stack-passed arguments and set up the outer loop counter. */
/* NOTE(review): PROLOGUE, LDARG and SDARG are defined outside this file */
/* (presumably in common.h). */
PROLOGUE
/* LDC is read from the caller's stack before the frame is allocated. */
LDARG LDC, 0($sp)
/* Allocate a 128-byte frame; every slot written here is reloaded at .L999. */
daddiu $sp, $sp, -128
/* $16/$17 back CO3/CO4. */
SDARG $16, 0($sp)
SDARG $17, 8($sp)
/* $f24..$f29 back c71, c72, c81, c82, a3 and a4. */
sdc1 $f24, 16($sp)
sdc1 $f25, 24($sp)
sdc1 $f26, 32($sp)
sdc1 $f27, 40($sp)
sdc1 $f28, 48($sp)
sdc1 $f29, 56($sp)
#if defined(TRMMKERNEL)
/* $18..$20 back OFFSET, KK and TEMP. */
SDARG $18, 64($sp)
SDARG $19, 72($sp)
SDARG $20, 80($sp)
/* 128 + 8 compensates for the frame: OFFSET sits 8 bytes above the entry $sp. */
LDARG OFFSET, 128 + 8($sp)
#endif
#ifndef __64BIT__
/* $f20..$f23 (c51, c52, c61, c62) are saved only when __64BIT__ is undefined. */
sdc1 $f20, 88($sp)
sdc1 $f21, 96($sp)
sdc1 $f22,104($sp)
sdc1 $f23,112($sp)
#endif
/* Scale LDC so it can be added directly to the C pointers. */
/* NOTE(review): ZBASE_SHIFT is presumably log2 of the byte size of one */
/* complex element - confirm in common.h. */
dsll LDC, LDC, ZBASE_SHIFT
#if defined(TRMMKERNEL) && !defined(LEFT)
/* Non-LEFT TRMM starts KK at -OFFSET; it is bumped at .L19/.L29/.L39. */
neg KK, OFFSET
#endif
/* J = N >> 2: number of passes through the four-pointer loop at .L10. */
dsra J, N, 2
/* No complete group of four: go straight to the N & 2 case. */
blez J, .L20
nop
/* .L10: head of the outer loop. Each pass handles four output pointers */
/* (CO1..CO4, spaced LDC apart) and runs J = N >> 2 times. */
.L10:
move CO1, C
/* c11 = 0; it is the copy source for every accumulator clear that follows. */
MTC $0, c11
daddu CO2, C, LDC
/* Restart A from its base for this pass. */
move AO, A
daddu CO3, CO2, LDC
daddiu J, J, -1
daddu CO4, CO3, LDC
MOV c21, c11
MOV c31, c11
#if defined(TRMMKERNEL) && defined(LEFT)
/* LEFT TRMM restarts KK from OFFSET on every pass. */
move KK, OFFSET
#endif
MOV c41, c11
MOV c51, c11
/* I counts the iterations of the .L11 loop. */
move I, M
/* Advance C past the four pointers consumed by this pass. */
daddu C, CO4, LDC
blez I, .L19
/* Sits directly after the branch (its delay slot, assuming noreorder mode). */
MOV c61, c11
/* .L11: per-iteration setup for the four-pointer case. Clears the remaining */
/* accumulators (c71..c82, c12..c62), preloads the first A/B values and sets */
/* L to the number of 4x-unrolled k iterations. */
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip KK k-steps: AO moves by KK << ZBASE_SHIFT, BO by four times as much. */
dsll L, KK, ZBASE_SHIFT
dsll TEMP, KK, 2 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MOV c71, c11
LD b1, 0 * SIZE(BO)
MOV c81, c11
LD a3, 4 * SIZE(AO)
MOV c12, c11
LD b2, 1 * SIZE(BO)
MOV c22, c11
MOV c32, c11
LD b3, 2 * SIZE(BO)
MOV c42, c11
LD b4, 3 * SIZE(BO)
MOV c52, c11
LD b5, 4 * SIZE(BO)
MOV c62, c11
LD b6, 8 * SIZE(BO)
MOV c72, c11
LD b7, 12 * SIZE(BO)
MOV c82, c11
/* TEMP = number of k-steps this iteration multiplies. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 4
#endif
dsra L, TEMP, 2
/* Fewer than four k-steps: only the remainder loop runs. */
blez L, .L15
NOP
#else
/* GEMM path: same clears and preloads, reading B directly; L = K >> 2. */
LD a1, 0 * SIZE(AO)
MOV c71, c11
LD b1, 0 * SIZE(B)
MOV c81, c11
LD a3, 4 * SIZE(AO)
MOV c12, c11
LD b2, 1 * SIZE(B)
MOV c22, c11
dsra L, K, 2
MOV c32, c11
LD b3, 2 * SIZE(B)
MOV c42, c11
LD b4, 3 * SIZE(B)
MOV c52, c11
LD b5, 4 * SIZE(B)
MOV c62, c11
LD b6, 8 * SIZE(B)
MOV c72, c11
LD b7, 12 * SIZE(B)
MOV c82, c11
blez L, .L15
/* BO = B is set in the slot after the branch, so it holds on both paths. */
move BO, B
#endif
/* Pipeline prologue: the a1 products of the first k-step are issued here; */
/* .L12 and .L13 both begin with the matching a2 products. */
MADD1 c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD3 c21, c21, a1, b2
daddiu L, L, -1
MADD1 c31, c31, a1, b3
NOP
/* Only one unrolled iteration: skip the looping copy and run .L13 once. */
blez L, .L13
MADD3 c41, c41, a1, b4
.align 3
/* .L12: main k loop for the four-pointer case, unrolled over four k-steps. */
/* Each pass advances AO by 8 * SIZE and BO by 32 * SIZE. Loads are */
/* interleaved with the multiply-adds and run ahead of their use, so offsets */
/* past the current step belong to later steps or to the next pass. */
.L12:
/* k-step 0, a2 products for c12..c42 (the a1 products were issued earlier). */
MADD2 c12, c12, a2, b1
LD b1, 16 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
/* k-step 0, a1 then a2 products for c51..c82. */
MADD1 c51, c51, a1, b5
NOP
MADD3 c61, c61, a1, b2
LD a4, 2 * SIZE(AO)
MADD1 c71, c71, a1, b3
NOP
MADD3 c81, c81, a1, b4
/* a1 is reloaded for the next pass (offset 8 becomes 0 once AO advances). */
LD a1, 8 * SIZE(AO)
MADD2 c52, c52, a2, b5
LD b5, 20 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 9 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 10 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 11 * SIZE(BO)
/* k-step 1: A values in a4/a2 (AO offsets 2 and 3), B values from offset 8. */
MADD1 c11, c11, a4, b6
LD a2, 3 * SIZE(AO)
MADD3 c21, c21, a4, b2
NOP
MADD1 c31, c31, a4, b3
NOP
MADD3 c41, c41, a4, b4
NOP
MADD2 c12, c12, a2, b6
LD b6, 24 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 13 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 14 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 15 * SIZE(BO)
MADD1 c51, c51, a4, b7
NOP
MADD3 c61, c61, a4, b2
NOP
MADD1 c71, c71, a4, b3
NOP
MADD3 c81, c81, a4, b4
NOP
MADD2 c52, c52, a2, b7
LD b7, 28 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 17 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 18 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 19 * SIZE(BO)
/* k-step 2: A values in a3/a2 (AO offsets 4 and 5), B values from offset 16. */
MADD1 c11, c11, a3, b1
LD a2, 5 * SIZE(AO)
MADD3 c21, c21, a3, b2
NOP
MADD1 c31, c31, a3, b3
NOP
MADD3 c41, c41, a3, b4
NOP
MADD2 c12, c12, a2, b1
LD b1, 32 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 21 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 22 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 23 * SIZE(BO)
MADD1 c51, c51, a3, b5
NOP
MADD3 c61, c61, a3, b2
LD a4, 6 * SIZE(AO)
MADD1 c71, c71, a3, b3
NOP
MADD3 c81, c81, a3, b4
LD a3, 12 * SIZE(AO)
MADD2 c52, c52, a2, b5
LD b5, 36 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 25 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 26 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 27 * SIZE(BO)
/* k-step 3: A values in a4/a2 (AO offsets 6 and 7), B values from offset 24. */
MADD1 c11, c11, a4, b6
LD a2, 7 * SIZE(AO)
MADD3 c21, c21, a4, b2
NOP
MADD1 c31, c31, a4, b3
NOP
MADD3 c41, c41, a4, b4
daddiu L, L, -1
MADD2 c12, c12, a2, b6
LD b6, 40 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 29 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 30 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 31 * SIZE(BO)
MADD1 c51, c51, a4, b7
/* Pointers advance here; the loads below already use next-pass offsets. */
daddiu BO, BO, 32 * SIZE
MADD3 c61, c61, a4, b2
daddiu AO, AO, 8 * SIZE
MADD1 c71, c71, a4, b3
NOP
MADD3 c81, c81, a4, b4
NOP
MADD2 c52, c52, a2, b7
LD b7, 12 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 1 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 2 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 3 * SIZE(BO)
/* Look-ahead: a1 products of the next pass's k-step 0, matching the */
/* sequence issued before the loop was entered. */
MADD1 c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD3 c21, c21, a1, b2
NOP
MADD1 c31, c31, a1, b3
NOP
bgtz L, .L12
MADD3 c41, c41, a1, b4
.align 3
/* .L13: last unrolled iteration. Same instruction sequence as .L12 through */
/* the final B reloads, except that the counter decrement is a NOP and there */
/* is no look-ahead or backward branch; control falls through to .L15 with */
/* AO/BO advanced and a1, b1..b7 holding the values at the new position. */
.L13:
/* k-step 0, a2 products for c12..c42. */
MADD2 c12, c12, a2, b1
LD b1, 16 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
/* k-step 0, a1 then a2 products for c51..c82. */
MADD1 c51, c51, a1, b5
NOP
MADD3 c61, c61, a1, b2
LD a4, 2 * SIZE(AO)
MADD1 c71, c71, a1, b3
NOP
MADD3 c81, c81, a1, b4
LD a1, 8 * SIZE(AO)
MADD2 c52, c52, a2, b5
LD b5, 20 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 9 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 10 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 11 * SIZE(BO)
/* k-step 1 (a4/a2, b6/b7). */
MADD1 c11, c11, a4, b6
LD a2, 3 * SIZE(AO)
MADD3 c21, c21, a4, b2
NOP
MADD1 c31, c31, a4, b3
NOP
MADD3 c41, c41, a4, b4
NOP
MADD2 c12, c12, a2, b6
LD b6, 24 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 13 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 14 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 15 * SIZE(BO)
MADD1 c51, c51, a4, b7
NOP
MADD3 c61, c61, a4, b2
NOP
MADD1 c71, c71, a4, b3
NOP
MADD3 c81, c81, a4, b4
NOP
MADD2 c52, c52, a2, b7
LD b7, 28 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 17 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 18 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 19 * SIZE(BO)
/* k-step 2 (a3/a2, b1/b5). */
MADD1 c11, c11, a3, b1
LD a2, 5 * SIZE(AO)
MADD3 c21, c21, a3, b2
NOP
MADD1 c31, c31, a3, b3
NOP
MADD3 c41, c41, a3, b4
NOP
MADD2 c12, c12, a2, b1
LD b1, 32 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 21 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 22 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 23 * SIZE(BO)
MADD1 c51, c51, a3, b5
NOP
MADD3 c61, c61, a3, b2
LD a4, 6 * SIZE(AO)
MADD1 c71, c71, a3, b3
NOP
MADD3 c81, c81, a3, b4
LD a3, 12 * SIZE(AO)
MADD2 c52, c52, a2, b5
LD b5, 36 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 25 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 26 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 27 * SIZE(BO)
/* k-step 3 (a4/a2, b6/b7). */
MADD1 c11, c11, a4, b6
LD a2, 7 * SIZE(AO)
MADD3 c21, c21, a4, b2
NOP
MADD1 c31, c31, a4, b3
NOP
MADD3 c41, c41, a4, b4
NOP
MADD2 c12, c12, a2, b6
LD b6, 40 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 29 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 30 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 31 * SIZE(BO)
MADD1 c51, c51, a4, b7
/* Advance past the four k-steps just consumed. */
daddiu BO, BO, 32 * SIZE
MADD3 c61, c61, a4, b2
daddiu AO, AO, 8 * SIZE
MADD1 c71, c71, a4, b3
NOP
MADD3 c81, c81, a4, b4
NOP
MADD2 c52, c52, a2, b7
LD b7, 12 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 1 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 2 * SIZE(BO)
MADD4 c82, c82, a2, b4
LD b4, 3 * SIZE(BO)
.align 3
/* .L15: remainder of the k loop for the four-pointer case. */
/* L = (K or TEMP) & 3 single k-steps are still to be done. */
.L15:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L18
NOP
.align 3
/* .L16: one k-step per pass; AO advances by 2 * SIZE, BO by 8 * SIZE. */
/* On entry a1 and b1..b5 already hold the values for the current step. */
.L16:
MADD1 c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD3 c21, c21, a1, b2
NOP
MADD1 c31, c31, a1, b3
NOP
MADD3 c41, c41, a1, b4
NOP
MADD2 c12, c12, a2, b1
LD b1, 8 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
MADD1 c51, c51, a1, b5
daddiu L, L, -1
MADD3 c61, c61, a1, b2
daddiu AO, AO, 2 * SIZE
MADD1 c71, c71, a1, b3
daddiu BO, BO, 8 * SIZE
MADD3 c81, c81, a1, b4
/* Pointers have moved: the loads from here on fetch the next step's values. */
LD a1, 0 * SIZE(AO)
MADD2 c52, c52, a2, b5
LD b5, 4 * SIZE(BO)
MADD4 c62, c62, a2, b2
LD b2, 1 * SIZE(BO)
MADD2 c72, c72, a2, b3
LD b3, 2 * SIZE(BO)
MADD4 c82, c82, a2, b4
bgtz L, .L16
LD b4, 3 * SIZE(BO)
/* .L18: fold the partial sums and store one result pair through each of */
/* CO1..CO4, then re-zero c11..c51 and loop back to .L11 while I > 0. */
/* Per pointer (shown for CO1; assumes MADD d,c,a,b = c + a*b and */
/* NMSUB d,c,a,b = c - a*b, which are defined outside this file): */
/*   first  = c11 + c22,  second = c12 + c21 */
/*   out[0] = base0 + ALPHA_R * first  - ALPHA_I * second */
/*   out[1] = base1 + ALPHA_R * second + ALPHA_I * first */
/* base0/base1 are the current C values in the GEMM path and zero in the */
/* TRMM path. The stores use offsets -2/-1 because the CO pointers have */
/* already been advanced by 2 * SIZE. */
.L18:
#ifndef TRMMKERNEL
/* GEMM path: load the existing C values into b1..b8 while summing. */
LD b1, 0 * SIZE(CO1)
ADD c11, c11, c22
LD b2, 1 * SIZE(CO1)
ADD c12, c12, c21
LD b3, 0 * SIZE(CO2)
ADD c31, c31, c42
LD b4, 1 * SIZE(CO2)
ADD c32, c32, c41
LD b5, 0 * SIZE(CO3)
ADD c51, c51, c62
LD b6, 1 * SIZE(CO3)
ADD c52, c52, c61
LD b7, 0 * SIZE(CO4)
ADD c71, c71, c82
LD b8, 1 * SIZE(CO4)
ADD c72, c72, c81
/* ALPHA_R contribution. */
MADD b1, b1, ALPHA_R, c11
daddiu CO1,CO1, 2 * SIZE
MADD b2, b2, ALPHA_R, c12
daddiu CO2,CO2, 2 * SIZE
MADD b3, b3, ALPHA_R, c31
daddiu CO3,CO3, 2 * SIZE
MADD b4, b4, ALPHA_R, c32
daddiu CO4,CO4, 2 * SIZE
MADD b5, b5, ALPHA_R, c51
daddiu I, I, -1
MADD b6, b6, ALPHA_R, c52
NOP
MADD b7, b7, ALPHA_R, c71
NOP
MADD b8, b8, ALPHA_R, c72
NOP
/* ALPHA_I contribution. */
NMSUB b1, b1, ALPHA_I, c12
NOP
MADD b2, b2, ALPHA_I, c11
/* c11 has been consumed; zero it for the next .L11 iteration. */
MTC $0, c11
NMSUB b3, b3, ALPHA_I, c32
NOP
MADD b4, b4, ALPHA_I, c31
NOP
ST b1, -2 * SIZE(CO1)
NMSUB b5, b5, ALPHA_I, c52
ST b2, -1 * SIZE(CO1)
MADD b6, b6, ALPHA_I, c51
ST b3, -2 * SIZE(CO2)
NMSUB b7, b7, ALPHA_I, c72
ST b4, -1 * SIZE(CO2)
MADD b8, b8, ALPHA_I, c71
ST b5, -2 * SIZE(CO3)
MOV c21, c11
ST b6, -1 * SIZE(CO3)
MOV c31, c11
ST b7, -2 * SIZE(CO4)
MOV c41, c11
ST b8, -1 * SIZE(CO4)
MOV c51, c11
#else
/* TRMM path: C is not read; the result is built with MUL instead of MADD. */
ADD c11, c11, c22
daddiu CO1,CO1, 2 * SIZE
ADD c12, c12, c21
daddiu CO2,CO2, 2 * SIZE
ADD c31, c31, c42
daddiu CO3,CO3, 2 * SIZE
ADD c32, c32, c41
daddiu CO4,CO4, 2 * SIZE
ADD c51, c51, c62
daddiu I, I, -1
ADD c52, c52, c61
ADD c71, c71, c82
ADD c72, c72, c81
MUL b1, ALPHA_R, c11
MUL b2, ALPHA_R, c12
MUL b3, ALPHA_R, c31
MUL b4, ALPHA_R, c32
MUL b5, ALPHA_R, c51
MUL b6, ALPHA_R, c52
MUL b7, ALPHA_R, c71
MUL b8, ALPHA_R, c72
NMSUB b1, b1, ALPHA_I, c12
NOP
MADD b2, b2, ALPHA_I, c11
MTC $0, c11
NMSUB b3, b3, ALPHA_I, c32
NOP
MADD b4, b4, ALPHA_I, c31
NOP
ST b1, -2 * SIZE(CO1)
NMSUB b5, b5, ALPHA_I, c52
ST b2, -1 * SIZE(CO1)
MADD b6, b6, ALPHA_I, c51
ST b3, -2 * SIZE(CO2)
NMSUB b7, b7, ALPHA_I, c72
ST b4, -1 * SIZE(CO2)
MADD b8, b8, ALPHA_I, c71
ST b5, -2 * SIZE(CO3)
MOV c21, c11
ST b6, -1 * SIZE(CO3)
MOV c31, c11
ST b7, -2 * SIZE(CO4)
MOV c41, c11
ST b8, -1 * SIZE(CO4)
MOV c51, c11
/* In these variants, step AO/BO over the K - KK - (1 or 4) k-steps that */
/* were not multiplied; BO moves four times as far as AO. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -4
#endif
dsll L, TEMP, ZBASE_SHIFT
dsll TEMP, TEMP, 2 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
/* LEFT: KK grows by one per I iteration. */
daddiu KK, KK, 1
#endif
#endif
bgtz I, .L11
/* c61 is cleared in the slot after the branch, as it is before .L11 is first entered. */
MOV c61, c11
.align 3
/* .L19: end of one four-pointer pass. Non-LEFT TRMM bumps KK by 4; then */
/* loop back to .L10 while J > 0. */
.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 4
#endif
bgtz J, .L10
/* B takes the value of BO in the slot after the branch, on both paths. */
move B, BO
.align 3
/* .L20: two-pointer case, taken when bit 1 of N is set (N & 2). */
.L20:
andi J, N, 2
/* c11 = 0 is the clear source for the accumulators used at .L21. */
MTC $0, c11
blez J, .L30
/* CO1 = C is set in the slot after the branch. */
move CO1, C
daddu CO2, C, LDC
/* Advance C past the two pointers used here. */
daddu C, CO2, LDC
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move I, M
blez I, .L29
/* AO restarts from the base of A in the slot after the branch. */
move AO, A
.align 3
/* .L21: per-iteration setup for the two-pointer case. Clears c21..c42, */
/* preloads a1, a3 and b1..b5, and sets L to the unrolled iteration count. */
.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip KK k-steps: AO moves by KK << ZBASE_SHIFT, BO by twice as much. */
dsll L, KK, ZBASE_SHIFT
dsll TEMP, KK, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MOV c21, c11
LD b1, 0 * SIZE(BO)
MOV c31, c11
LD a3, 4 * SIZE(AO)
MOV c41, c11
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
MOV c12, c11
LD b4, 3 * SIZE(BO)
MOV c22, c11
LD b5, 4 * SIZE(BO)
MOV c32, c11
/* TEMP = number of k-steps this iteration multiplies. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 2
#endif
dsra L, TEMP, 2
blez L, .L25
/* c42 is cleared in the slot after the branch. */
MOV c42, c11
#else
/* GEMM path: same clears and preloads, reading B directly; L = K >> 2. */
LD a1, 0 * SIZE(AO)
MOV c21, c11
LD b1, 0 * SIZE(B)
MOV c31, c11
LD a3, 4 * SIZE(AO)
MOV c41, c11
LD b2, 1 * SIZE(B)
dsra L, K, 2
LD b3, 2 * SIZE(B)
MOV c12, c11
LD b4, 3 * SIZE(B)
MOV c22, c11
LD b5, 4 * SIZE(B)
MOV c32, c11
NOP
MOV c42, c11
blez L, .L25
/* BO = B is set in the slot after the branch, so it holds on both paths. */
move BO, B
#endif
.align 3
/* .L22: main k loop for the two-pointer case, unrolled over four k-steps. */
/* Each pass advances AO by 8 * SIZE and BO by 16 * SIZE. */
.L22:
/* k-step 0: a1/a2 from AO offsets 0/1, b1..b4 from BO offsets 0..3. */
MADD1 c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD3 c21, c21, a1, b2
daddiu L, L, -1
MADD1 c31, c31, a1, b3
NOP
MADD3 c41, c41, a1, b4
LD a1, 2 * SIZE(AO)
MADD2 c12, c12, a2, b1
LD b1, 8 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
/* k-step 1: a1/a2 from AO offsets 2/3, b5 and b2..b4 from BO offsets 4..7. */
MADD1 c11, c11, a1, b5
LD a2, 3 * SIZE(AO)
MADD3 c21, c21, a1, b2
NOP
MADD1 c31, c31, a1, b3
NOP
MADD3 c41, c41, a1, b4
LD a1, 8 * SIZE(AO)
MADD2 c12, c12, a2, b5
LD b5, 12 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 9 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 10 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 11 * SIZE(BO)
/* k-step 2: a3/a2 from AO offsets 4/5, b1..b4 from BO offsets 8..11. */
MADD1 c11, c11, a3, b1
LD a2, 5 * SIZE(AO)
MADD3 c21, c21, a3, b2
NOP
MADD1 c31, c31, a3, b3
NOP
MADD3 c41, c41, a3, b4
LD a3, 6 * SIZE(AO)
MADD2 c12, c12, a2, b1
LD b1, 16 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 13 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 14 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 15 * SIZE(BO)
/* k-step 3: a3/a2 from AO offsets 6/7, b5 and b2..b4 from BO offsets 12..15. */
MADD1 c11, c11, a3, b5
LD a2, 7 * SIZE(AO)
MADD3 c21, c21, a3, b2
/* AO advances here, so the a3 reload below uses a next-pass offset. */
daddiu AO, AO, 8 * SIZE
MADD1 c31, c31, a3, b3
NOP
MADD3 c41, c41, a3, b4
LD a3, 4 * SIZE(AO)
MADD2 c12, c12, a2, b5
LD b5, 20 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 17 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 18 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 19 * SIZE(BO)
bgtz L, .L22
/* BO advances in the slot after the branch. */
daddiu BO, BO, 16 * SIZE
.align 3
/* .L25: remainder of the k loop for the two-pointer case. */
/* L = (K or TEMP) & 3 single k-steps are still to be done. */
.L25:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L28
NOP
.align 3
/* .L26: one k-step per pass; BO advances by 4 * SIZE, AO by 2 * SIZE. */
.L26:
MADD1 c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD3 c21, c21, a1, b2
daddiu L, L, -1
MADD1 c31, c31, a1, b3
/* BO moves first, so the b reloads below use offsets 0..3. */
daddiu BO, BO, 4 * SIZE
MADD3 c41, c41, a1, b4
/* AO has not moved yet: offset 2 is the next step's first A value. */
LD a1, 2 * SIZE(AO)
MADD2 c12, c12, a2, b1
LD b1, 0 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD b2, 1 * SIZE(BO)
MADD2 c32, c32, a2, b3
LD b3, 2 * SIZE(BO)
MADD4 c42, c42, a2, b4
LD b4, 3 * SIZE(BO)
bgtz L, .L26
daddiu AO, AO, 2 * SIZE
/* .L28: fold the partial sums and store one result pair through CO1 and */
/* CO2, then loop back to .L21 while I > 0. Per pointer (shown for CO1; */
/* assumes MADD d,c,a,b = c + a*b and NMSUB d,c,a,b = c - a*b): */
/*   first  = c11 + c22,  second = c12 + c21 */
/*   out[0] = base0 + ALPHA_R * first  - ALPHA_I * second */
/*   out[1] = base1 + ALPHA_R * second + ALPHA_I * first */
/* base0/base1 are the current C values (GEMM path) or zero (TRMM path). */
.L28:
#ifndef TRMMKERNEL
LD b1, 0 * SIZE(CO1)
ADD c11, c11, c22
LD b2, 1 * SIZE(CO1)
ADD c12, c12, c21
LD b3, 0 * SIZE(CO2)
ADD c31, c31, c42
LD b4, 1 * SIZE(CO2)
ADD c32, c32, c41
MADD b1, b1, ALPHA_R, c11
daddiu CO1,CO1, 2 * SIZE
MADD b2, b2, ALPHA_R, c12
daddiu CO2,CO2, 2 * SIZE
MADD b3, b3, ALPHA_R, c31
daddiu I, I, -1
MADD b4, b4, ALPHA_R, c32
NMSUB b1, b1, ALPHA_I, c12
NOP
MADD b2, b2, ALPHA_I, c11
/* c11 has been consumed; zero it for the next .L21 iteration. */
MTC $0, c11
NMSUB b3, b3, ALPHA_I, c32
NOP
MADD b4, b4, ALPHA_I, c31
NOP
/* Offsets are negative because CO1/CO2 were already advanced. */
ST b1, -2 * SIZE(CO1)
ST b2, -1 * SIZE(CO1)
ST b3, -2 * SIZE(CO2)
#else
/* TRMM path: C is not read; MUL replaces the first MADD. */
ADD c11, c11, c22
ADD c12, c12, c21
ADD c31, c31, c42
ADD c32, c32, c41
MUL b1, ALPHA_R, c11
daddiu CO1,CO1, 2 * SIZE
MUL b2, ALPHA_R, c12
daddiu CO2,CO2, 2 * SIZE
MUL b3, ALPHA_R, c31
daddiu I, I, -1
MUL b4, ALPHA_R, c32
NMSUB b1, b1, ALPHA_I, c12
NOP
MADD b2, b2, ALPHA_I, c11
MTC $0, c11
NMSUB b3, b3, ALPHA_I, c32
NOP
MADD b4, b4, ALPHA_I, c31
NOP
ST b1, -2 * SIZE(CO1)
ST b2, -1 * SIZE(CO1)
ST b3, -2 * SIZE(CO2)
/* In these variants, step AO/BO over the K - KK - (1 or 2) k-steps that */
/* were not multiplied; BO moves twice as far as AO. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, ZBASE_SHIFT
dsll TEMP, TEMP, 1 + ZBASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
bgtz I, .L21
/* The fourth store is shared by both paths and sits in the slot after the branch. */
ST b4, -1 * SIZE(CO2)
.align 3
/* .L29: end of the two-pointer case. Non-LEFT TRMM bumps KK by 2, and B */
/* takes the value of BO before the one-pointer case. */
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
move B, BO
.align 3
/* .L30: one-pointer case, taken when bit 0 of N is set (N & 1); otherwise */
/* go straight to the epilogue. */
.L30:
andi J, N, 1
/* c11 = 0 is the clear source for the accumulators used at .L31. */
MTC $0, c11
blez J, .L999
/* CO1 = C is set in the slot after the branch. */
move CO1, C
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
move I, M
daddu C, CO1, LDC
blez I, .L39
/* AO restarts from the base of A in the slot after the branch. */
move AO, A
.align 3
/* .L31: per-iteration setup for the one-pointer case. Clears c21..c42, */
/* preloads a1..a3 and b1..b3, and sets L to the unrolled iteration count. */
/* Only c11, c12, c21 and c22 are accumulated into and stored (.L32..L38). */
.L31:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip KK k-steps; here AO and BO move by the same amount. */
dsll TEMP, KK, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MOV c21, c11
LD b1, 0 * SIZE(BO)
MOV c31, c11
LD a2, 1 * SIZE(AO)
MOV c41, c11
LD b2, 1 * SIZE(BO)
MOV c12, c11
NOP
MOV c22, c11
LD a3, 4 * SIZE(AO)
MOV c32, c11
LD b3, 4 * SIZE(BO)
/* TEMP = number of k-steps this iteration multiplies (both non-subtracting */
/* branches add 1 here). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 1
#endif
dsra L, TEMP, 2
blez L, .L35
/* c42 is cleared in the slot after the branch. */
MOV c42, c11
#else
/* GEMM path: same clears and preloads, reading B directly; L = K >> 2. */
LD a1, 0 * SIZE(AO)
MOV c21, c11
LD b1, 0 * SIZE(B)
MOV c31, c11
LD a2, 1 * SIZE(AO)
MOV c41, c11
LD b2, 1 * SIZE(B)
MOV c12, c11
dsra L, K, 2
MOV c22, c11
LD a3, 4 * SIZE(AO)
MOV c32, c11
LD b3, 4 * SIZE(B)
NOP
MOV c42, c11
blez L, .L35
/* BO = B is set in the slot after the branch, so it holds on both paths. */
move BO, B
#endif
.align 3
/* .L32: main k loop for the one-pointer case, unrolled over four k-steps. */
/* Each pass advances both AO and BO by 8 * SIZE. */
.L32:
/* k-step 0: a1/a2 with b1/b2 (offsets 0 and 1). */
MADD1 c11, c11, a1, b1
LD b4, 3 * SIZE(BO)
MADD3 c21, c21, a1, b2
LD a1, 2 * SIZE(AO)
MADD2 c12, c12, a2, b1
LD b1, 2 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD a2, 3 * SIZE(AO)
/* k-step 1: a1/a2 with b1/b4 (offsets 2 and 3). */
MADD1 c11, c11, a1, b1
LD b2, 5 * SIZE(BO)
MADD3 c21, c21, a1, b4
LD a1, 8 * SIZE(AO)
MADD2 c12, c12, a2, b1
LD b1, 8 * SIZE(BO)
MADD4 c22, c22, a2, b4
LD a2, 5 * SIZE(AO)
/* k-step 2: a3/a2 with b3/b2 (offsets 4 and 5). */
MADD1 c11, c11, a3, b3
LD b4, 7 * SIZE(BO)
MADD3 c21, c21, a3, b2
LD a3, 6 * SIZE(AO)
MADD2 c12, c12, a2, b3
LD b3, 6 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD a2, 7 * SIZE(AO)
/* k-step 3: a3/a2 with b3/b4 (offsets 6 and 7); the reloads fetch the */
/* values for the next pass (offsets 8, 9 and 12 before the pointer bump). */
MADD1 c11, c11, a3, b3
LD b2, 9 * SIZE(BO)
MADD3 c21, c21, a3, b4
LD a3, 12 * SIZE(AO)
MADD2 c12, c12, a2, b3
LD b3, 12 * SIZE(BO)
MADD4 c22, c22, a2, b4
LD a2, 9 * SIZE(AO)
daddiu AO, AO, 8 * SIZE
daddiu L, L, -1
bgtz L, .L32
/* BO advances in the slot after the branch. */
daddiu BO, BO, 8 * SIZE
.align 3
/* .L35: remainder of the k loop for the one-pointer case. */
/* L = (K or TEMP) & 3 single k-steps are still to be done. */
.L35:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L38
NOP
.align 3
/* .L36: one k-step per pass; AO and BO each advance by 2 * SIZE. The */
/* reloads at offsets 2/3 fetch the next step's values before the bump. */
.L36:
MADD1 c11, c11, a1, b1
daddiu L, L, -1
MADD3 c21, c21, a1, b2
LD a1, 2 * SIZE(AO)
MADD2 c12, c12, a2, b1
LD b1, 2 * SIZE(BO)
MADD4 c22, c22, a2, b2
LD a2, 3 * SIZE(AO)
LD b2, 3 * SIZE(BO)
daddiu BO, BO, 2 * SIZE
bgtz L, .L36
daddiu AO, AO, 2 * SIZE
/* .L38: fold the partial sums and store one result pair through CO1, then */
/* loop back to .L31 while I > 0 (assumes MADD d,c,a,b = c + a*b and */
/* NMSUB d,c,a,b = c - a*b): */
/*   first  = c11 + c22,  second = c12 + c21 */
/*   out[0] = base0 + ALPHA_R * first  - ALPHA_I * second */
/*   out[1] = base1 + ALPHA_R * second + ALPHA_I * first */
/* base0/base1 are the current C values (GEMM path) or zero (TRMM path). */
.L38:
#ifndef TRMMKERNEL
LD b1, 0 * SIZE(CO1)
ADD c11, c11, c22
LD b2, 1 * SIZE(CO1)
ADD c12, c12, c21
MADD b1, b1, ALPHA_R, c11
daddiu CO1,CO1, 2 * SIZE
MADD b2, b2, ALPHA_R, c12
daddiu I, I, -1
NMSUB b1, b1, ALPHA_I, c12
NOP
MADD b2, b2, ALPHA_I, c11
/* c11 has been consumed; zero it for the next .L31 iteration. */
MTC $0, c11
/* Offsets are negative because CO1 was already advanced. */
ST b1, -2 * SIZE(CO1)
NOP
bgtz I, .L31
/* The second store sits in the slot after the branch. */
ST b2, -1 * SIZE(CO1)
#else
/* TRMM path: C is not read; MUL replaces the first MADD. */
ADD c11, c11, c22
ADD c12, c12, c21
MUL b1, ALPHA_R, c11
daddiu CO1,CO1, 2 * SIZE
MUL b2, ALPHA_R, c12
daddiu I, I, -1
NMSUB b1, b1, ALPHA_I, c12
NOP
MADD b2, b2, ALPHA_I, c11
MTC $0, c11
/* In these variants, step AO/BO over the K - KK - 1 k-steps that were not */
/* multiplied; both pointers move by the same amount. */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -1
#endif
dsll TEMP, TEMP, ZBASE_SHIFT
daddu AO, AO, TEMP
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
ST b1, -2 * SIZE(CO1)
NOP
bgtz I, .L31
ST b2, -1 * SIZE(CO1)
#endif
.align 3
/* .L39: end of the one-pointer case. Non-LEFT TRMM bumps KK by 1, and B */
/* takes the value of BO before falling into the epilogue. */
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 1
#endif
move B, BO
.align 3
/* .L999: epilogue. Reloads every register saved in the prologue from the */
/* same frame slots, then returns through $31. */
.L999:
LDARG $16, 0($sp)
LDARG $17, 8($sp)
ldc1 $f24, 16($sp)
ldc1 $f25, 24($sp)
ldc1 $f26, 32($sp)
ldc1 $f27, 40($sp)
ldc1 $f28, 48($sp)
ldc1 $f29, 56($sp)
#if defined(TRMMKERNEL)
LDARG $18, 64($sp)
LDARG $19, 72($sp)
LDARG $20, 80($sp)
#endif
#ifndef __64BIT__
ldc1 $f20, 88($sp)
ldc1 $f21, 96($sp)
ldc1 $f22,104($sp)
ldc1 $f23,112($sp)
#endif
j $31
/* The 128-byte frame is released in the slot after the jump. */
daddiu $sp, $sp, 128
/* NOTE(review): EPILOGUE is defined outside this file (presumably in */
/* common.h) and is assumed to close the function. */
EPILOGUE