tahoma2d/thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/gemm_kernel.S
2016-03-24 02:47:04 +09:00

2250 lines
39 KiB
ArmAsm

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
/* ------------------------------------------------------------------ */
/* Register aliases used by the whole kernel.                          */
/* LD, ST, MADD, MUL, ADD, MOV, MTC, SDARG, LDARG, SIZE and            */
/* BASE_SHIFT are not defined in this file (they come from common.h).  */
/* NOTE(review): MADD d, r, s, t is presumably d = r + s * t; confirm  */
/* against common.h.                                                   */
/* ------------------------------------------------------------------ */

/* Problem sizes and operand bases. The code reads M, N and K as loop  */
/* bounds, resets AO from A and BO from B, and derives the CO1..CO8    */
/* output pointers from C in steps of LDC.                             */
#define M $4
#define N $5
#define K $6
#define A $8
#define B $9
#define C $10
#define LDC $11
/* Walking pointers into the A and B operands for the current tile. */
#define AO $12
#define BO $13
/* Loop counters: I is derived from M, J from N and L from K (or from  */
/* TEMP in the TRMM build).                                            */
#define I $2
#define J $3
#define L $7
/* NOTE(review): PREFETCHSIZE is not referenced by the kernel loops    */
/* reviewed here; confirm against the rest of the file before removal. */
#define PREFETCHSIZE (4 * 10)
/* Output pointers, one per LDC-spaced slice of C. The widest panel    */
/* uses all eight; narrower panels use only the first few.             */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
#define CO5 $18
#define CO6 $19
#define CO7 $20
#define CO8 $21
/* Address inside B used only as a prefetch target (pref 0, ..(BB)). */
#define BB $22
/* TRMM build only: OFFSET is loaded in the prologue, KK is the        */
/* running offset derived from it, TEMP is scratch that also holds the */
/* per-tile trip count of the K loop.                                  */
#if defined(TRMMKERNEL)
#define OFFSET $23
#define KK $24
#define TEMP $25
#endif
/* Values loaded from AO. The write-back code also loads old C values  */
/* into $f0..$f7 directly, overwriting a1, a2 and b1..b6 once the K    */
/* loop of a tile has finished.                                        */
#define a1 $f0
#define a2 $f1
#define a3 $f27
#define a4 $f28
/* Values loaded from BO. */
#define b1 $f2
#define b2 $f3
#define b3 $f4
#define b4 $f5
#define b5 $f6
#define b6 $f7
#define b7 $f8
#define b8 $f9
/* a5 shares $f9 with b8: the N&2 and N&1 panels load it from AO, so   */
/* a5 and b8 can never be live at the same time.                       */
#define a5 b8
/* Accumulators. In the 2x8 tile cXY is stored through CO<X> at        */
/* element Y-1 (c11/c12 -> CO1, ..., c81/c82 -> CO8). $f15 is skipped  */
/* in this list because it is ALPHA.                                   */
#define c11 $f10
#define c12 $f11
#define c21 $f12
#define c22 $f13
#define c31 $f14
#define c32 $f16
#define c41 $f17
#define c42 $f18
#define c51 $f19
#define c52 $f20
#define c61 $f21
#define c62 $f22
#define c71 $f23
#define c72 $f24
#define c81 $f25
#define c82 $f26
/* Scalar applied to every accumulator in the write-back code. */
#define ALPHA $f15
/* Kernel entry. Allocates a 160-byte frame and spills every register  */
/* that the code below overwrites and stores here:                     */
/*   0.. 48  $16-$22   (CO3..CO8, BB)                                  */
/*  56.. 88  $f24-$f28 (c72, c81, c82, a3, a4)                         */
/*  96..112  $23-$25   (OFFSET, KK, TEMP; TRMM build only)             */
/* 120..144  $f20-$f23 (c52, c61, c62, c71; non-__64BIT__ build only)  */
/* NOTE(review): the matching restores are expected at .L999, which is */
/* outside the part of the file reviewed here.                         */
PROLOGUE
daddiu $sp, $sp, -160
SDARG $16, 0($sp)
SDARG $17, 8($sp)
SDARG $18, 16($sp)
SDARG $19, 24($sp)
SDARG $20, 32($sp)
SDARG $21, 40($sp)
SDARG $22, 48($sp)
sdc1 $f24, 56($sp)
sdc1 $f25, 64($sp)
sdc1 $f26, 72($sp)
sdc1 $f27, 80($sp)
sdc1 $f28, 88($sp)
#if defined(TRMMKERNEL)
SDARG $23, 96($sp)
SDARG $24, 104($sp)
SDARG $25, 112($sp)
/* 160($sp) is the first doubleword above the frame just allocated,    */
/* i.e. memory owned by the caller. Presumably the stack-passed OFFSET */
/* argument; confirm against the C prototype. $23 was saved above      */
/* before being overwritten here.                                      */
LDARG OFFSET, 160($sp)
#endif
#ifndef __64BIT__
sdc1 $f20,120($sp)
sdc1 $f21,128($sp)
sdc1 $f22,136($sp)
sdc1 $f23,144($sp)
#endif
/* Scale LDC once so it can be added directly to the CO pointers.      */
/* NOTE(review): assumes BASE_SHIFT == log2(SIZE), i.e. this converts  */
/* an element count to bytes; confirm in common.h.                     */
dsll LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL) && !defined(LEFT)
/* Right-side TRMM: KK starts at -OFFSET and is advanced by the panel  */
/* width after each panel (see .L29, .L49, .L69).                      */
neg KK, OFFSET
#endif
/* J = N / 8: number of 8-wide panels. With none, go straight to the   */
/* N & 4 panel at .L30.                                                */
dsra J, N, 3
blez J, .L30
/* Delay-slot filler. NOTE(review): assumes PROLOGUE selects           */
/* .set noreorder, as the instruction after every branch in this file  */
/* is written to execute unconditionally.                              */
nop
/* .L10: top of the loop over 8-wide panels (J iterations).            */
/* Sets CO1..CO8 to eight output slices LDC apart, advances C past     */
/* them for the next panel, rewinds AO to A and clears the             */
/* accumulators c11..c61 (the remaining ones are cleared at .L11).     */
/* Integer and FP instructions are interleaved on purpose; keep the    */
/* order when editing.                                                 */
.L10:
move CO1, C
/* c11 receives the contents of integer register $0 (zero) and is then */
/* copied into the other accumulators.                                 */
MTC $0, c11
daddu CO2, C, LDC
move AO, A
daddu CO3, CO2, LDC
/* J is decremented here and tested at .L29. */
daddiu J, J, -1
daddu CO4, CO3, LDC
MOV c21, c11
daddu CO5, CO4, LDC
MOV c31, c11
daddu CO6, CO5, LDC
MOV c41, c11
daddu CO7, CO6, LDC
MOV c51, c11
daddu CO8, CO7, LDC
/* I = M / 2: number of 2-high tiles in this panel. */
dsra I, M, 1
daddu C, CO8, LDC
/* BB = B + (K << (2 + BASE_SHIFT)). It is used only as the address of */
/* the pref 0 instructions in the .L18 write-back, where it advances   */
/* by 16 * SIZE per tile.                                              */
dsll BB, K, 2 + BASE_SHIFT
daddu BB, B, BB
#if defined(TRMMKERNEL) && defined(LEFT)
/* Left-side TRMM: KK restarts from OFFSET for every panel. */
move KK, OFFSET
#endif
/* No 2-high tiles: handle the M & 1 tile at .L20. The MOV below sits  */
/* in the delay slot and runs on both paths.                           */
blez I, .L20
MOV c61, c11
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 3 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MOV c71, c11
LD b1, 0 * SIZE(BO)
MOV c81, c11
LD a3, 4 * SIZE(AO)
MOV c12, c11
LD b2, 1 * SIZE(BO)
MOV c22, c11
MOV c32, c11
LD b3, 2 * SIZE(BO)
MOV c42, c11
LD b4, 3 * SIZE(BO)
MOV c52, c11
LD b5, 4 * SIZE(BO)
MOV c62, c11
LD b6, 8 * SIZE(BO)
MOV c72, c11
LD b7, 12 * SIZE(BO)
MOV c82, c11
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 8
#endif
dsra L, TEMP, 2
blez L, .L15
NOP
#else
LD a1, 0 * SIZE(AO)
MOV c71, c11
LD b1, 0 * SIZE(B)
MOV c81, c11
pref 1, 3 * SIZE(CO1)
pref 1, 3 * SIZE(CO2)
LD a3, 4 * SIZE(AO)
MOV c12, c11
LD b2, 1 * SIZE(B)
MOV c22, c11
dsra L, K, 2
MOV c32, c11
LD b3, 2 * SIZE(B)
MOV c42, c11
LD b4, 3 * SIZE(B)
MOV c52, c11
LD b5, 4 * SIZE(B)
MOV c62, c11
LD b6, 8 * SIZE(B)
MOV c72, c11
LD b7, 12 * SIZE(B)
MOV c82, c11
blez L, .L15
move BO, B
#endif
MADD c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD c21, c21, a1, b2
daddiu L, L, -1
MADD c31, c31, a1, b3
blez L, .L13
MADD c41, c41, a1, b4
pref 1, 2 * SIZE(CO3)
.align 3
.L12:
MADD c12, c12, a2, b1
LD b1, 16 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
MADD c51, c51, a1, b5
LD a4, 2 * SIZE(AO)
MADD c61, c61, a1, b2
NOP
MADD c71, c71, a1, b3
NOP
MADD c81, c81, a1, b4
LD a1, 8 * SIZE(AO)
MADD c52, c52, a2, b5
LD b5, 20 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 9 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 10 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 11 * SIZE(BO)
MADD c11, c11, a4, b6
LD a2, 3 * SIZE(AO)
MADD c21, c21, a4, b2
NOP
MADD c31, c31, a4, b3
NOP
MADD c41, c41, a4, b4
NOP
MADD c12, c12, a2, b6
LD b6, 24 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 13 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 14 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 15 * SIZE(BO)
MADD c51, c51, a4, b7
NOP
MADD c61, c61, a4, b2
NOP
MADD c71, c71, a4, b3
NOP
MADD c81, c81, a4, b4
NOP
MADD c52, c52, a2, b7
LD b7, 28 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 17 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 18 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 19 * SIZE(BO)
MADD c11, c11, a3, b1
LD a2, 5 * SIZE(AO)
MADD c21, c21, a3, b2
NOP
MADD c31, c31, a3, b3
NOP
MADD c41, c41, a3, b4
NOP
MADD c12, c12, a2, b1
LD b1, 32 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 21 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 22 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 23 * SIZE(BO)
MADD c51, c51, a3, b5
LD a4, 6 * SIZE(AO)
MADD c61, c61, a3, b2
NOP
MADD c71, c71, a3, b3
NOP
MADD c81, c81, a3, b4
LD a3, 12 * SIZE(AO)
MADD c52, c52, a2, b5
LD b5, 36 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 25 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 26 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 27 * SIZE(BO)
MADD c11, c11, a4, b6
LD a2, 7 * SIZE(AO)
MADD c21, c21, a4, b2
NOP
MADD c31, c31, a4, b3
NOP
MADD c41, c41, a4, b4
daddiu L, L, -1
MADD c12, c12, a2, b6
LD b6, 40 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 29 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 30 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 31 * SIZE(BO)
MADD c51, c51, a4, b7
daddiu BO, BO, 32 * SIZE
MADD c61, c61, a4, b2
daddiu AO, AO, 8 * SIZE
MADD c71, c71, a4, b3
NOP
MADD c81, c81, a4, b4
NOP
MADD c52, c52, a2, b7
LD b7, 12 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 1 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 2 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 3 * SIZE(BO)
MADD c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD c21, c21, a1, b2
NOP
MADD c31, c31, a1, b3
bgtz L, .L12
MADD c41, c41, a1, b4
NOP
.align 3
.L13:
MADD c12, c12, a2, b1
LD b1, 16 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
MADD c51, c51, a1, b5
NOP
MADD c61, c61, a1, b2
LD a4, 2 * SIZE(AO)
MADD c71, c71, a1, b3
NOP
MADD c81, c81, a1, b4
LD a1, 8 * SIZE(AO)
MADD c52, c52, a2, b5
LD b5, 20 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 9 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 10 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 11 * SIZE(BO)
MADD c11, c11, a4, b6
LD a2, 3 * SIZE(AO)
MADD c21, c21, a4, b2
NOP
MADD c31, c31, a4, b3
pref 1, 3 * SIZE(CO4)
MADD c41, c41, a4, b4
NOP
MADD c12, c12, a2, b6
LD b6, 24 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 13 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 14 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 15 * SIZE(BO)
MADD c51, c51, a4, b7
pref 1, 3 * SIZE(CO5)
MADD c61, c61, a4, b2
NOP
MADD c71, c71, a4, b3
pref 1, 3 * SIZE(CO6)
MADD c81, c81, a4, b4
NOP
MADD c52, c52, a2, b7
LD b7, 28 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 17 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 18 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 19 * SIZE(BO)
MADD c11, c11, a3, b1
LD a2, 5 * SIZE(AO)
MADD c21, c21, a3, b2
NOP
MADD c31, c31, a3, b3
pref 1, 3 * SIZE(CO7)
MADD c41, c41, a3, b4
NOP
MADD c12, c12, a2, b1
LD b1, 32 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 21 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 22 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 23 * SIZE(BO)
MADD c51, c51, a3, b5
NOP
MADD c61, c61, a3, b2
LD a4, 6 * SIZE(AO)
MADD c71, c71, a3, b3
NOP
MADD c81, c81, a3, b4
NOP
MADD c52, c52, a2, b5
LD b5, 36 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 25 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 26 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 27 * SIZE(BO)
MADD c11, c11, a4, b6
LD a2, 7 * SIZE(AO)
MADD c21, c21, a4, b2
NOP
MADD c31, c31, a4, b3
NOP
MADD c41, c41, a4, b4
NOP
MADD c12, c12, a2, b6
LD b6, 40 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 29 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 30 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 31 * SIZE(BO)
MADD c51, c51, a4, b7
daddiu BO, BO, 32 * SIZE
MADD c61, c61, a4, b2
daddiu AO, AO, 8 * SIZE
MADD c71, c71, a4, b3
NOP
MADD c81, c81, a4, b4
NOP
MADD c52, c52, a2, b7
LD b7, 12 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 1 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 2 * SIZE(BO)
MADD c82, c82, a2, b4
LD b4, 3 * SIZE(BO)
.align 3
/* .L15/.L16: K remainder for the 2x8 tile.                            */
/* L = K & 3 (TEMP & 3 in the TRMM build) iterations are left after    */
/* the 4x unrolled loop. On entry a1 and b1..b5 already hold the       */
/* values at AO[0] and BO[0..4]; they were preloaded before the        */
/* unrolled loop or reloaded at its end.                               */
.L15:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L18
/* Delay slot: prefetch the CO8 slice with hint 1 ahead of the         */
/* write-back. Executed whether or not the branch is taken.            */
pref 1, 3 * SIZE(CO8)
.align 3
/* One pass consumes 2 values from AO and 8 from BO and issues one     */
/* MADD into each of the 16 accumulators. b2..b4 are used twice per    */
/* pass: first with BO[1..3] (c2x..c4x), then reloaded with BO[5..7]   */
/* (c6x..c8x).                                                         */
.L16:
MADD c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD c21, c21, a1, b2
NOP
MADD c31, c31, a1, b3
NOP
MADD c41, c41, a1, b4
NOP
MADD c12, c12, a2, b1
/* b1 now holds the next pass's BO[0] (offset 8 before BO is bumped). */
LD b1, 8 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
MADD c51, c51, a1, b5
daddiu L, L, -1
MADD c61, c61, a1, b2
daddiu AO, AO, 2 * SIZE
MADD c71, c71, a1, b3
/* From here on the BO and AO offsets are relative to the next pass. */
daddiu BO, BO, 8 * SIZE
MADD c81, c81, a1, b4
LD a1, 0 * SIZE(AO)
MADD c52, c52, a2, b5
LD b5, 4 * SIZE(BO)
MADD c62, c62, a2, b2
LD b2, 1 * SIZE(BO)
MADD c72, c72, a2, b3
LD b3, 2 * SIZE(BO)
MADD c82, c82, a2, b4
bgtz L, .L16
/* Delay slot: completes the b1..b5 preload for the next pass. */
LD b4, 3 * SIZE(BO)
.L18:
#ifndef TRMMKERNEL
LD $f0, 0 * SIZE(CO1)
daddiu CO3,CO3, 2 * SIZE
LD $f1, 1 * SIZE(CO1)
daddiu CO1,CO1, 2 * SIZE
LD $f2, 0 * SIZE(CO2)
daddiu CO4,CO4, 2 * SIZE
LD $f3, 1 * SIZE(CO2)
daddiu CO2,CO2, 2 * SIZE
LD $f4, -2 * SIZE(CO3)
daddiu CO5,CO5, 2 * SIZE
LD $f5, -1 * SIZE(CO3)
daddiu CO6,CO6, 2 * SIZE
LD $f6, -2 * SIZE(CO4)
daddiu CO7,CO7, 2 * SIZE
LD $f7, -1 * SIZE(CO4)
daddiu I, I, -1
MADD c11, $f0, ALPHA, c11
LD $f0,-2 * SIZE(CO5)
MADD c12, $f1, ALPHA, c12
LD $f1,-1 * SIZE(CO5)
MADD c21, $f2, ALPHA, c21
LD $f2,-2 * SIZE(CO6)
MADD c22, $f3, ALPHA, c22
LD $f3,-1 * SIZE(CO6)
MADD c31, $f4, ALPHA, c31
LD $f4,-2 * SIZE(CO7)
MADD c32, $f5, ALPHA, c32
LD $f5,-1 * SIZE(CO7)
MADD c41, $f6, ALPHA, c41
LD $f6, 0 * SIZE(CO8)
MADD c42, $f7, ALPHA, c42
LD $f7, 1 * SIZE(CO8)
pref 0, 0 * SIZE(BB)
pref 0, 8 * SIZE(BB)
ST c11, -2 * SIZE(CO1)
MTC $0, c11
ST c12, -1 * SIZE(CO1)
daddiu CO8,CO8, 2 * SIZE
ST c21, -2 * SIZE(CO2)
MOV c21, c11
ST c22, -1 * SIZE(CO2)
daddiu BB, BB, 16 * SIZE
MADD c51, $f0, ALPHA, c51
ST c31, -2 * SIZE(CO3)
MADD c52, $f1, ALPHA, c52
ST c32, -1 * SIZE(CO3)
MADD c61, $f2, ALPHA, c61
ST c41, -2 * SIZE(CO4)
MADD c62, $f3, ALPHA, c62
ST c42, -1 * SIZE(CO4)
MADD c71, $f4, ALPHA, c71
ST c51, -2 * SIZE(CO5)
MADD c72, $f5, ALPHA, c72
ST c52, -1 * SIZE(CO5)
MADD c81, $f6, ALPHA, c81
ST c61, -2 * SIZE(CO6)
MADD c82, $f7, ALPHA, c82
ST c62, -1 * SIZE(CO6)
ST c71, -2 * SIZE(CO7)
MOV c31, c11
ST c72, -1 * SIZE(CO7)
MOV c41, c11
ST c81, -2 * SIZE(CO8)
MOV c51, c11
ST c82, -1 * SIZE(CO8)
bgtz I, .L11
MOV c61, c11
#else
daddiu CO4,CO4, 2 * SIZE
daddiu CO5,CO5, 2 * SIZE
daddiu CO6,CO6, 2 * SIZE
daddiu CO7,CO7, 2 * SIZE
pref 0, 0 * SIZE(BB)
pref 0, 8 * SIZE(BB)
MUL c11, ALPHA, c11
daddiu CO1,CO1, 2 * SIZE
MUL c12, ALPHA, c12
MTC $0, a1
MUL c21, ALPHA, c21
daddiu CO2,CO2, 2 * SIZE
MUL c22, ALPHA, c22
daddiu CO3,CO3, 2 * SIZE
ST c11, -2 * SIZE(CO1)
MUL c31, ALPHA, c31
ST c12, -1 * SIZE(CO1)
MUL c32, ALPHA, c32
ST c21, -2 * SIZE(CO2)
MUL c41, ALPHA, c41
ST c22, -1 * SIZE(CO2)
MUL c42, ALPHA, c42
ST c31, -2 * SIZE(CO3)
MUL c51, ALPHA, c51
ST c32, -1 * SIZE(CO3)
MUL c52, ALPHA, c52
ST c41, -2 * SIZE(CO4)
MUL c61, ALPHA, c61
ST c42, -1 * SIZE(CO4)
MUL c62, ALPHA, c62
ST c51, -2 * SIZE(CO5)
MUL c71, ALPHA, c71
ST c52, -1 * SIZE(CO5)
MUL c72, ALPHA, c72
ST c61, -2 * SIZE(CO6)
MUL c81, ALPHA, c81
ST c62, -1 * SIZE(CO6)
MUL c82, ALPHA, c82
ST c71, -2 * SIZE(CO7)
MOV c11, a1
ST c72, -1 * SIZE(CO7)
MOV c21, a1
daddiu CO8,CO8, 2 * SIZE
daddiu BB, BB, 16 * SIZE
ST c81, -2 * SIZE(CO8)
MOV c31, a1
ST c82, -1 * SIZE(CO8)
MOV c41, a1
daddiu I, I, -1
MOV c51, a1
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -8
#endif
dsll L, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 3 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
bgtz I, .L11
MOV c61, a1
#endif
.align 3
.L20:
andi I, M, 1
MOV c61, c11
blez I, .L29
MOV c71, c11
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 3 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 8
#endif
dsra L, TEMP, 2
blez L, .L25
MOV c81, c11
#else
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(B)
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
LD b4, 3 * SIZE(B)
LD b5, 4 * SIZE(B)
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
dsra L, K, 2
MOV c81, c11
blez L, .L25
move BO, B
#endif
.align 3
.L22:
MADD c11, c11, a1, b1
LD b1, 16 * SIZE(BO)
MADD c21, c21, a1, b2
LD b2, 5 * SIZE(BO)
MADD c31, c31, a1, b3
LD b3, 6 * SIZE(BO)
MADD c41, c41, a1, b4
LD b4, 7 * SIZE(BO)
MADD c51, c51, a1, b5
LD b5, 20 * SIZE(BO)
MADD c61, c61, a1, b2
LD b2, 9 * SIZE(BO)
MADD c71, c71, a1, b3
LD b3, 10 * SIZE(BO)
MADD c81, c81, a1, b4
LD b4, 11 * SIZE(BO)
LD a1, 4 * SIZE(AO)
daddiu L, L, -1
MADD c11, c11, a2, b6
LD b6, 24 * SIZE(BO)
MADD c21, c21, a2, b2
LD b2, 13 * SIZE(BO)
MADD c31, c31, a2, b3
LD b3, 14 * SIZE(BO)
MADD c41, c41, a2, b4
LD b4, 15 * SIZE(BO)
MADD c51, c51, a2, b7
LD b7, 28 * SIZE(BO)
MADD c61, c61, a2, b2
LD b2, 17 * SIZE(BO)
MADD c71, c71, a2, b3
LD b3, 18 * SIZE(BO)
MADD c81, c81, a2, b4
LD b4, 19 * SIZE(BO)
LD a2, 5 * SIZE(AO)
daddiu AO, AO, 4 * SIZE
MADD c11, c11, a3, b1
LD b1, 32 * SIZE(BO)
MADD c21, c21, a3, b2
LD b2, 21 * SIZE(BO)
MADD c31, c31, a3, b3
LD b3, 22 * SIZE(BO)
MADD c41, c41, a3, b4
LD b4, 23 * SIZE(BO)
MADD c51, c51, a3, b5
LD b5, 36 * SIZE(BO)
MADD c61, c61, a3, b2
LD b2, 25 * SIZE(BO)
MADD c71, c71, a3, b3
LD b3, 26 * SIZE(BO)
MADD c81, c81, a3, b4
LD b4, 27 * SIZE(BO)
LD a3, 2 * SIZE(AO)
daddiu BO, BO, 32 * SIZE
MADD c11, c11, a4, b6
LD b6, 8 * SIZE(BO)
MADD c21, c21, a4, b2
LD b2, -3 * SIZE(BO)
MADD c31, c31, a4, b3
LD b3, -2 * SIZE(BO)
MADD c41, c41, a4, b4
LD b4, -1 * SIZE(BO)
MADD c51, c51, a4, b7
LD b7, 12 * SIZE(BO)
MADD c61, c61, a4, b2
LD b2, 1 * SIZE(BO)
MADD c71, c71, a4, b3
LD b3, 2 * SIZE(BO)
MADD c81, c81, a4, b4
LD b4, 3 * SIZE(BO)
bgtz L, .L22
LD a4, 3 * SIZE(AO)
.align 3
.L25:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L28
NOP
.align 3
.L26:
MADD c11, c11, a1, b1
LD b1, 8 * SIZE(BO)
MADD c21, c21, a1, b2
LD b2, 5 * SIZE(BO)
MADD c31, c31, a1, b3
LD b3, 6 * SIZE(BO)
MADD c41, c41, a1, b4
LD b4, 7 * SIZE(BO)
daddiu L, L, -1
MOV a2, a2
daddiu AO, AO, 1 * SIZE
daddiu BO, BO, 8 * SIZE
MADD c51, c51, a1, b5
LD b5, 4 * SIZE(BO)
MADD c61, c61, a1, b2
LD b2, 1 * SIZE(BO)
MADD c71, c71, a1, b3
LD b3, 2 * SIZE(BO)
MADD c81, c81, a1, b4
LD a1, 0 * SIZE(AO)
bgtz L, .L26
LD b4, 3 * SIZE(BO)
.L28:
#ifndef TRMMKERNEL
LD $f0, 0 * SIZE(CO1)
LD $f1, 0 * SIZE(CO2)
LD $f2, 0 * SIZE(CO3)
LD $f3, 0 * SIZE(CO4)
MADD c11, $f0, ALPHA, c11
LD $f4, 0 * SIZE(CO5)
MADD c21, $f1, ALPHA, c21
LD $f5, 0 * SIZE(CO6)
MADD c31, $f2, ALPHA, c31
LD $f6, 0 * SIZE(CO7)
MADD c41, $f3, ALPHA, c41
LD $f7, 0 * SIZE(CO8)
MADD c51, $f4, ALPHA, c51
ST c11, 0 * SIZE(CO1)
MADD c61, $f5, ALPHA, c61
ST c21, 0 * SIZE(CO2)
MADD c71, $f6, ALPHA, c71
ST c31, 0 * SIZE(CO3)
MADD c81, $f7, ALPHA, c81
ST c41, 0 * SIZE(CO4)
ST c51, 0 * SIZE(CO5)
ST c61, 0 * SIZE(CO6)
ST c71, 0 * SIZE(CO7)
ST c81, 0 * SIZE(CO8)
#else
MUL c11, ALPHA, c11
MUL c21, ALPHA, c21
MUL c31, ALPHA, c31
MUL c41, ALPHA, c41
ST c11, 0 * SIZE(CO1)
MUL c51, ALPHA, c51
ST c21, 0 * SIZE(CO2)
MUL c61, ALPHA, c61
ST c31, 0 * SIZE(CO3)
MUL c71, ALPHA, c71
ST c41, 0 * SIZE(CO4)
MUL c81, ALPHA, c81
ST c51, 0 * SIZE(CO5)
ST c61, 0 * SIZE(CO6)
ST c71, 0 * SIZE(CO7)
ST c81, 0 * SIZE(CO8)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -8
#endif
dsll L, TEMP, 0 + BASE_SHIFT
dsll TEMP, TEMP, 3 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.align 3
/* .L29: end of one 8-wide panel. */
.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
/* Right-side TRMM: advance KK by the panel width just processed. */
daddiu KK, KK, 8
#endif
/* J was already decremented at .L10; loop while panels remain.        */
/* Delay slot: B takes the final value of BO on both paths, so the     */
/* next panel starts where this panel's B walk ended.                  */
bgtz J, .L10
move B, BO
.align 3
/* .L30: optional 4-wide panel, taken when bit 2 of N is set.          */
/* Same structure as .L10 but with four output pointers (CO1..CO4)     */
/* and accumulators c11..c41 cleared here.                             */
.L30:
andi J, N, 4
blez J, .L50
/* Delay slot: AO is rewound to A even when the panel is skipped. */
move AO, A
move CO1, C
MTC $0, c11
daddu CO2, C, LDC
daddu CO3, CO2, LDC
daddu CO4, CO3, LDC
MOV c21, c11
/* C now points past the four slices used by this panel. */
daddu C, CO4, LDC
MOV c31, c11
#if defined(TRMMKERNEL) && defined(LEFT)
/* Left-side TRMM: KK restarts from OFFSET for this panel. */
move KK, OFFSET
#endif
/* I = M / 2 two-high tiles; with none, go to the M & 1 tile at .L40.  */
/* The MOV in the delay slot runs on both paths.                       */
dsra I, M, 1
blez I, .L40
MOV c41, c11
.L31:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 2 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
LD a3, 4 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MOV c12, c11
LD b2, 1 * SIZE(BO)
MOV c22, c11
LD b3, 2 * SIZE(BO)
MOV c32, c11
LD b4, 3 * SIZE(BO)
MOV c42, c11
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 4
#endif
dsra L, TEMP, 2
blez L, .L35
NOP
#else
LD a1, 0 * SIZE(AO)
LD a3, 4 * SIZE(AO)
LD b1, 0 * SIZE(B)
MOV c12, c11
LD b2, 1 * SIZE(B)
MOV c22, c11
LD b3, 2 * SIZE(B)
MOV c32, c11
LD b4, 3 * SIZE(B)
MOV c42, c11
LD b5, 4 * SIZE(B)
dsra L, K, 2
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
blez L, .L35
move BO, B
#endif
.align 3
.L32:
MADD c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD c21, c21, a1, b2
daddiu L, L, -1
MADD c31, c31, a1, b3
NOP
MADD c41, c41, a1, b4
LD a1, 2 * SIZE(AO)
MADD c12, c12, a2, b1
LD b1, 16 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
MADD c11, c11, a1, b5
LD a2, 3 * SIZE(AO)
MADD c21, c21, a1, b2
NOP
MADD c31, c31, a1, b3
NOP
MADD c41, c41, a1, b4
LD a1, 8 * SIZE(AO)
MADD c12, c12, a2, b5
LD b5, 20 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 9 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 10 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 11 * SIZE(BO)
MADD c11, c11, a3, b6
LD a2, 5 * SIZE(AO)
MADD c21, c21, a3, b2
NOP
MADD c31, c31, a3, b3
NOP
MADD c41, c41, a3, b4
LD a3, 6 * SIZE(AO)
MADD c12, c12, a2, b6
LD b6, 24 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 13 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 14 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 15 * SIZE(BO)
MADD c11, c11, a3, b7
LD a2, 7 * SIZE(AO)
MADD c21, c21, a3, b2
daddiu AO, AO, 8 * SIZE
MADD c31, c31, a3, b3
daddiu BO, BO, 16 * SIZE
MADD c41, c41, a3, b4
LD a3, 4 * SIZE(AO)
MADD c12, c12, a2, b7
LD b7, 12 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 1 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 2 * SIZE(BO)
MADD c42, c42, a2, b4
NOP
bgtz L, .L32
LD b4, 3 * SIZE(BO)
.align 3
.L35:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L38
NOP
.align 3
.L36:
MADD c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD c21, c21, a1, b2
daddiu L, L, -1
MADD c31, c31, a1, b3
daddiu AO, AO, 2 * SIZE
MADD c41, c41, a1, b4
LD a1, 0 * SIZE(AO)
MADD c12, c12, a2, b1
LD b1, 4 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 5 * SIZE(BO)
MADD c32, c32, a2, b3
LD b3, 6 * SIZE(BO)
MADD c42, c42, a2, b4
LD b4, 7 * SIZE(BO)
bgtz L, .L36
daddiu BO, BO, 4 * SIZE
.L38:
#ifndef TRMMKERNEL
LD $f0, 0 * SIZE(CO1)
daddiu CO3,CO3, 2 * SIZE
LD $f1, 1 * SIZE(CO1)
daddiu CO1,CO1, 2 * SIZE
LD $f2, 0 * SIZE(CO2)
daddiu CO4,CO4, 2 * SIZE
LD $f3, 1 * SIZE(CO2)
daddiu CO2,CO2, 2 * SIZE
LD $f4, -2 * SIZE(CO3)
MADD c11, $f0, ALPHA, c11
LD $f5, -1 * SIZE(CO3)
MADD c12, $f1, ALPHA, c12
LD $f6, -2 * SIZE(CO4)
MADD c21, $f2, ALPHA, c21
LD $f7, -1 * SIZE(CO4)
MADD c22, $f3, ALPHA, c22
MADD c31, $f4, ALPHA, c31
ST c11, -2 * SIZE(CO1)
MADD c32, $f5, ALPHA, c32
ST c12, -1 * SIZE(CO1)
MADD c41, $f6, ALPHA, c41
ST c21, -2 * SIZE(CO2)
MADD c42, $f7, ALPHA, c42
ST c22, -1 * SIZE(CO2)
ST c31, -2 * SIZE(CO3)
MTC $0, c11
ST c32, -1 * SIZE(CO3)
daddiu I, I, -1
ST c41, -2 * SIZE(CO4)
MOV c21, c11
ST c42, -1 * SIZE(CO4)
MOV c31, c11
#else
MUL c11, ALPHA, c11
daddiu CO3,CO3, 2 * SIZE
MUL c12, ALPHA, c12
daddiu CO1,CO1, 2 * SIZE
MUL c21, ALPHA, c21
daddiu CO4,CO4, 2 * SIZE
MUL c22, ALPHA, c22
daddiu CO2,CO2, 2 * SIZE
ST c11, -2 * SIZE(CO1)
MUL c31, ALPHA, c31
ST c12, -1 * SIZE(CO1)
MUL c32, ALPHA, c32
ST c21, -2 * SIZE(CO2)
MUL c41, ALPHA, c41
ST c22, -1 * SIZE(CO2)
MUL c42, ALPHA, c42
ST c31, -2 * SIZE(CO3)
MTC $0, c11
ST c32, -1 * SIZE(CO3)
daddiu I, I, -1
ST c41, -2 * SIZE(CO4)
MOV c21, c11
ST c42, -1 * SIZE(CO4)
MOV c31, c11
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -4
#endif
dsll L, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 2 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
#endif
bgtz I, .L31
MOV c41, c11
.align 3
.L40:
andi I, M, 1
blez I, .L49
MOV c61, c11
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 2 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MOV c71, c11
LD a2, 1 * SIZE(AO)
MOV c81, c11
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 4
#endif
dsra L, TEMP, 2
blez L, .L45
NOP
#else
LD a1, 0 * SIZE(AO)
MOV c71, c11
LD a2, 1 * SIZE(AO)
MOV c81, c11
LD b1, 0 * SIZE(B)
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
LD b4, 3 * SIZE(B)
LD b5, 4 * SIZE(B)
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
dsra L, K, 2
blez L, .L45
move BO, B
#endif
.align 3
.L42:
MADD c11, c11, a1, b1
LD b1, 16 * SIZE(BO)
MADD c21, c21, a1, b2
LD b2, 5 * SIZE(BO)
MADD c31, c31, a1, b3
LD b3, 6 * SIZE(BO)
MADD c41, c41, a1, b4
LD b4, 7 * SIZE(BO)
LD a1, 4 * SIZE(AO)
daddiu L, L, -1
MADD c11, c11, a2, b5
LD b5, 20 * SIZE(BO)
MADD c21, c21, a2, b2
LD b2, 9 * SIZE(BO)
MADD c31, c31, a2, b3
LD b3, 10 * SIZE(BO)
MADD c41, c41, a2, b4
LD b4, 11 * SIZE(BO)
LD a2, 2 * SIZE(AO)
daddiu AO, AO, 4 * SIZE
MADD c11, c11, a2, b6
LD b6, 24 * SIZE(BO)
MADD c21, c21, a2, b2
LD b2, 13 * SIZE(BO)
MADD c31, c31, a2, b3
LD b3, 14 * SIZE(BO)
MADD c41, c41, a2, b4
LD b4, 15 * SIZE(BO)
LD a2, -1 * SIZE(AO)
daddiu BO, BO, 16 * SIZE
MADD c11, c11, a2, b7
LD b7, 12 * SIZE(BO)
MADD c21, c21, a2, b2
LD b2, 1 * SIZE(BO)
MADD c31, c31, a2, b3
LD b3, 2 * SIZE(BO)
MADD c41, c41, a2, b4
LD b4, 3 * SIZE(BO)
bgtz L, .L42
LD a2, 1 * SIZE(AO)
.align 3
.L45:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L48
NOP
.align 3
.L46:
MADD c11, c11, a1, b1
LD b1, 4 * SIZE(BO)
MADD c21, c21, a1, b2
LD b2, 5 * SIZE(BO)
MADD c31, c31, a1, b3
LD b3, 6 * SIZE(BO)
MADD c41, c41, a1, b4
LD a1, 1 * SIZE(AO)
LD b4, 7 * SIZE(BO)
daddiu L, L, -1
daddiu AO, AO, 1 * SIZE
MOV a2, a2
bgtz L, .L46
daddiu BO, BO, 4 * SIZE
.L48:
#ifndef TRMMKERNEL
LD $f0, 0 * SIZE(CO1)
LD $f1, 0 * SIZE(CO2)
LD $f2, 0 * SIZE(CO3)
LD $f3, 0 * SIZE(CO4)
MADD c11, $f0, ALPHA, c11
MADD c21, $f1, ALPHA, c21
MADD c31, $f2, ALPHA, c31
MADD c41, $f3, ALPHA, c41
ST c11, 0 * SIZE(CO1)
ST c21, 0 * SIZE(CO2)
ST c31, 0 * SIZE(CO3)
ST c41, 0 * SIZE(CO4)
#else
MUL c11, ALPHA, c11
MUL c21, ALPHA, c21
MUL c31, ALPHA, c31
MUL c41, ALPHA, c41
ST c11, 0 * SIZE(CO1)
ST c21, 0 * SIZE(CO2)
ST c31, 0 * SIZE(CO3)
ST c41, 0 * SIZE(CO4)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -4
#endif
dsll L, TEMP, 0 + BASE_SHIFT
dsll TEMP, TEMP, 2 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 1
#endif
#endif
.align 3
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 4
#endif
move B, BO
.align 3
.L50:
andi J, N, 2
blez J, .L70
move AO, A
move CO1, C
daddu CO2, C, LDC
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
dsra I, M, 1
blez I, .L60
daddu C, CO2, LDC
.L51:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 1 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a5, 4 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MOV c12, c11
LD b2, 1 * SIZE(BO)
MOV c22, c11
LD b3, 2 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 2
#endif
dsra L, TEMP, 2
blez L, .L55
NOP
#else
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a5, 4 * SIZE(AO)
LD b1, 0 * SIZE(B)
MOV c12, c11
LD b2, 1 * SIZE(B)
MOV c22, c11
LD b3, 2 * SIZE(B)
LD b5, 4 * SIZE(B)
dsra L, K, 2
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
blez L, .L55
move BO, B
#endif
.align 3
.L52:
MADD c11, c11, a1, b1
LD a3, 2 * SIZE(AO)
MADD c21, c21, a1, b2
LD b4, 3 * SIZE(BO)
MADD c12, c12, a2, b1
LD a4, 3 * SIZE(AO)
MADD c22, c22, a2, b2
LD b1, 8 * SIZE(BO)
MADD c11, c11, a3, b3
LD a1, 8 * SIZE(AO)
MADD c21, c21, a3, b4
LD b2, 5 * SIZE(BO)
MADD c12, c12, a4, b3
LD a2, 5 * SIZE(AO)
MADD c22, c22, a4, b4
LD b3, 6 * SIZE(BO)
MADD c11, c11, a5, b5
LD a3, 6 * SIZE(AO)
MADD c21, c21, a5, b2
LD b4, 7 * SIZE(BO)
MADD c12, c12, a2, b5
LD a4, 7 * SIZE(AO)
MADD c22, c22, a2, b2
LD b5, 12 * SIZE(BO)
MADD c11, c11, a3, b3
LD a5, 12 * SIZE(AO)
MADD c21, c21, a3, b4
LD b2, 9 * SIZE(BO)
MADD c12, c12, a4, b3
LD a2, 9 * SIZE(AO)
MADD c22, c22, a4, b4
LD b3, 10 * SIZE(BO)
daddiu AO, AO, 8 * SIZE
daddiu L, L, -1
bgtz L, .L52
daddiu BO, BO, 8 * SIZE
.align 3
.L55:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L58
NOP
.align 3
.L56:
MADD c11, c11, a1, b1
LD a2, 1 * SIZE(AO)
MADD c21, c21, a1, b2
LD a1, 2 * SIZE(AO)
MADD c12, c12, a2, b1
LD b1, 2 * SIZE(BO)
MADD c22, c22, a2, b2
LD b2, 3 * SIZE(BO)
daddiu L, L, -1
daddiu AO, AO, 2 * SIZE
bgtz L, .L56
daddiu BO, BO, 2 * SIZE
.L58:
#ifndef TRMMKERNEL
LD $f0, 0 * SIZE(CO1)
daddiu I, I, -1
LD $f1, 1 * SIZE(CO1)
daddiu CO1,CO1, 2 * SIZE
LD $f2, 0 * SIZE(CO2)
NOP
LD $f3, 1 * SIZE(CO2)
daddiu CO2,CO2, 2 * SIZE
MADD c11, $f0, ALPHA, c11
MADD c12, $f1, ALPHA, c12
MADD c21, $f2, ALPHA, c21
MADD c22, $f3, ALPHA, c22
ST c11, -2 * SIZE(CO1)
ST c12, -1 * SIZE(CO1)
ST c21, -2 * SIZE(CO2)
NOP
bgtz I, .L51
ST c22, -1 * SIZE(CO2)
#else
daddiu I, I, -1
daddiu CO1,CO1, 2 * SIZE
daddiu CO2,CO2, 2 * SIZE
MUL c11, ALPHA, c11
MUL c12, ALPHA, c12
MUL c21, ALPHA, c21
MUL c22, ALPHA, c22
ST c11, -2 * SIZE(CO1)
ST c12, -1 * SIZE(CO1)
ST c21, -2 * SIZE(CO2)
ST c22, -1 * SIZE(CO2)
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
daddiu KK, KK, 2
#endif
bgtz I, .L51
NOP
#endif
.align 3
.L60:
andi I, M, 1
blez I, .L69
NOP
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
dsll L, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 1 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a3, 2 * SIZE(AO)
MOV c31, c11
LD a4, 3 * SIZE(AO)
MOV c41, c11
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 2
#endif
dsra L, TEMP, 2
blez L, .L65
NOP
#else
dsra L, K, 2
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a3, 2 * SIZE(AO)
MOV c31, c11
LD a4, 3 * SIZE(AO)
MOV c41, c11
LD b1, 0 * SIZE(B)
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
LD b4, 3 * SIZE(B)
LD b5, 4 * SIZE(B)
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
blez L, .L65
move BO, B
#endif
.align 3
.L62:
MADD c11, c11, a1, b1
LD b1, 4 * SIZE(BO)
MADD c21, c21, a1, b2
LD b2, 5 * SIZE(BO)
MADD c31, c31, a2, b3
LD b3, 6 * SIZE(BO)
MADD c41, c41, a2, b4
LD b4, 7 * SIZE(BO)
LD a1, 4 * SIZE(AO)
LD a2, 5 * SIZE(AO)
MADD c11, c11, a3, b1
LD b1, 8 * SIZE(BO)
MADD c21, c21, a3, b2
LD b2, 9 * SIZE(BO)
MADD c31, c31, a4, b3
LD b3, 10 * SIZE(BO)
MADD c41, c41, a4, b4
LD b4, 11 * SIZE(BO)
LD a3, 6 * SIZE(AO)
LD a4, 7 * SIZE(AO)
daddiu L, L, -1
daddiu AO, AO, 4 * SIZE
bgtz L, .L62
daddiu BO, BO, 8 * SIZE
.align 3
.L65:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L68
NOP
.align 3
.L66:
MADD c11, c11, a1, b1
LD b1, 2 * SIZE(BO)
MADD c21, c21, a1, b2
LD b2, 3 * SIZE(BO)
LD a1, 1 * SIZE(AO)
daddiu L, L, -1
daddiu AO, AO, 1 * SIZE
bgtz L, .L66
daddiu BO, BO, 2 * SIZE
/* .L68: write-back of the 1x2 tile (M & 1 tile of the N & 2 panel).   */
/* The unrolled loop .L62 accumulates into two register pairs:         */
/* c11/c21 take the a1 and a3 products, c31/c41 the a2 and a4          */
/* products. The pairs are summed here before ALPHA is applied.        */
.L68:
#ifndef TRMMKERNEL
/* GEMM build: read the old C values into $f0/$f1 (a1/a2 are dead at   */
/* this point), then combine them with ALPHA and the accumulators.     */
/* NOTE(review): presumably C = C_old + ALPHA * acc; this depends on   */
/* the MADD macro in common.h.                                         */
LD $f0, 0 * SIZE(CO1)
LD $f1, 0 * SIZE(CO2)
ADD c11, c11, c31
ADD c21, c21, c41
MADD c11, $f0, ALPHA, c11
MADD c21, $f1, ALPHA, c21
ST c11, 0 * SIZE(CO1)
ST c21, 0 * SIZE(CO2)
#else
/* TRMM build: C is not read; the scaled accumulators are stored. */
ADD c11, c11, c31
ADD c21, c21, c41
MUL c11, ALPHA, c11
MUL c21, ALPHA, c21
ST c11, 0 * SIZE(CO1)
ST c21, 0 * SIZE(CO2)
/* In these two configurations the K loop of this tile ran KK + 1      */
/* (LEFT) or KK + 2 (right side) iterations, starting from BO = B.     */
/* Skip AO and BO over the K - KK - 1 (or - 2) iterations that were    */
/* not executed: 1 value per iteration in AO, 2 values per iteration   */
/* in BO.                                                              */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -1
#else
daddiu TEMP, TEMP, -2
#endif
dsll L, TEMP, 0 + BASE_SHIFT
dsll TEMP, TEMP, 1 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
/* Left-side TRMM: advance KK by the tile height (1). */
daddiu KK, KK, 1
#endif
#endif
.align 3
.L69:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 2
#endif
move B, BO
.align 3
.L70:
andi J, N, 1
blez J, .L999
move AO, A
move CO1, C
#if defined(TRMMKERNEL) && defined(LEFT)
move KK, OFFSET
#endif
dsra I, M, 1
blez I, .L80
daddu C, CO1, LDC
/*
 * .L71 -- preamble of one two-row, one-column tile (results stored to
 * CO1[0] and CO1[1] at .L78). Clears the accumulators c11, c21, c12, c22,
 * selects the B pointer BO, and sets L to the number of passes of the
 * unrolled loop .L72 (trip count >> 2).
 *
 * NOTE(review): MTC $0, c11 presumably moves integer register $0 (zero)
 * into c11; the MOVs then copy that zero -- confirm the macro definitions.
 * NOTE(review): within the code visible in this chunk, a1, a2 and b1 are
 * reloaded at the top of .L72/.L76 before being used, and a5, b2, b3, b5,
 * b6 and b7 are never read afterwards, so the loads below look like
 * leftovers. They are kept unchanged here.
 */
.L71:
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip the first KK K steps: two elements per step in A, one in B. */
dsll L, KK, 1 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a5, 4 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MOV c12, c11
LD b2, 1 * SIZE(BO)
MOV c22, c11
LD b3, 2 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
/* TEMP = trip count for this tile: K - KK, or KK + 2 (LEFT), or KK + 1. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 2
#else
daddiu TEMP, KK, 1
#endif
dsra L, TEMP, 2
blez L, .L75
NOP
#else
/* GEMM path: the trip count is K and the B pointer starts at B. */
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a5, 4 * SIZE(AO)
LD b1, 0 * SIZE(B)
MOV c12, c11
LD b2, 1 * SIZE(B)
MOV c22, c11
LD b3, 2 * SIZE(B)
LD b5, 4 * SIZE(B)
dsra L, K, 2
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
blez L, .L75
/* Delay slot: BO = B is set whether or not the branch is taken. */
move BO, B
#endif
/*
 * .L72 -- unrolled K loop for the two-row, one-column tile. Each pass
 * performs four K steps; a step loads two values from AO and one from BO
 * and accumulates c11 += a1 * b1 and c12 += a2 * b1. Per pass AO advances
 * by 8 elements and BO by 4 elements. Unlike .L62, every operand is loaded
 * inside the loop immediately before it is used.
 */
.align 3
.L72:
/* K step 0 */
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MADD c11, c11, a1, b1
MADD c12, c12, a2, b1
/* K step 1 */
LD a1, 2 * SIZE(AO)
LD a2, 3 * SIZE(AO)
LD b1, 1 * SIZE(BO)
MADD c11, c11, a1, b1
MADD c12, c12, a2, b1
/* K step 2 */
LD a1, 4 * SIZE(AO)
LD a2, 5 * SIZE(AO)
LD b1, 2 * SIZE(BO)
MADD c11, c11, a1, b1
MADD c12, c12, a2, b1
/* K step 3 */
LD a1, 6 * SIZE(AO)
LD a2, 7 * SIZE(AO)
LD b1, 3 * SIZE(BO)
MADD c11, c11, a1, b1
MADD c12, c12, a2, b1
daddiu L, L, -1
daddiu AO, AO, 8 * SIZE
bgtz L, .L72
/* Delay slot: BO is advanced on every pass. */
daddiu BO, BO, 4 * SIZE
/*
 * .L75/.L76 -- K remainder loop for the two-row, one-column tile.
 * L is the low two bits of the trip count (K for GEMM, TEMP for TRMM).
 * Each pass of .L76 performs one K step: c11 += a1 * b1, c12 += a2 * b1,
 * with AO advancing by 2 elements and BO by 1 element.
 */
.align 3
.L75:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L78
NOP
.align 3
.L76:
LD a1, 0 * SIZE(AO)
LD a2, 1 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MADD c11, c11, a1, b1
MADD c12, c12, a2, b1
daddiu L, L, -1
daddiu AO, AO, 2 * SIZE
bgtz L, .L76
/* Delay slot: BO is advanced on every pass. */
daddiu BO, BO, 1 * SIZE
/*
 * .L78 -- write back the two-row, one-column tile to CO1[0] and CO1[1],
 * advance CO1 by two elements, decrement the tile counter I and loop back
 * to .L71 while tiles remain.
 * c21 and c22 are cleared in .L71 and not written by .L72/.L76, so the two
 * ADDs below add zero in the code visible here.
 * NOTE(review): MADD d, r, s, t presumably yields d = r + s * t (macro
 * defined outside this chunk) -- confirm.
 */
.L78:
#ifndef TRMMKERNEL
/* GEMM path: result is the old C value plus ALPHA times the sum. */
LD $f0, 0 * SIZE(CO1)
daddiu I, I, -1
LD $f1, 1 * SIZE(CO1)
daddiu CO1,CO1, 2 * SIZE
ADD c11, c11, c21
ADD c12, c12, c22
MADD c11, $f0, ALPHA, c11
MADD c12, $f1, ALPHA, c12
/* CO1 was already advanced, hence the negative store offsets. */
ST c11, -2 * SIZE(CO1)
bgtz I, .L71
/* Delay slot: the second store happens on both branch outcomes. */
ST c12, -1 * SIZE(CO1)
#else
/* TRMM path: C is not read; the stored value is ALPHA times the sum. */
ADD c11, c11, c21
daddiu I, I, -1
ADD c12, c12, c22
daddiu CO1,CO1, 2 * SIZE
MUL c11, ALPHA, c11
MUL c12, ALPHA, c12
ST c11, -2 * SIZE(CO1)
ST c12, -1 * SIZE(CO1)
/*
 * In the configurations selected below, step AO and BO forward by
 * TEMP = K - KK - (2 if LEFT, otherwise 1) K steps: two elements per step
 * for AO and one element per step for BO.
 */
#if ( defined(LEFT) && defined(TRANSA)) || \
(!defined(LEFT) && !defined(TRANSA))
dsubu TEMP, K, KK
#ifdef LEFT
daddiu TEMP, TEMP, -2
#else
daddiu TEMP, TEMP, -1
#endif
dsll L, TEMP, 1 + BASE_SHIFT
dsll TEMP, TEMP, 0 + BASE_SHIFT
daddu AO, AO, L
daddu BO, BO, TEMP
#endif
#ifdef LEFT
/* This tile covered two rows, so KK moves on by two. */
daddiu KK, KK, 2
#endif
bgtz I, .L71
NOP
#endif
/*
 * .L80 -- preamble of the final single-element tile, taken only when M is
 * odd (M & 1); otherwise control goes to .L89. Clears the accumulators c11
 * and c21, selects the B pointer BO, and sets L to the number of passes of
 * the unrolled loop .L82 (trip count >> 2).
 *
 * NOTE(review): within the code visible in this chunk, a1 and b1 are
 * reloaded at the top of .L82/.L86 before being used, and a2..a4, b2..b7
 * are never read afterwards, so the loads below look like leftovers. They
 * are kept unchanged here.
 */
.align 3
.L80:
andi I, M, 1
blez I, .L89
NOP
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
move BO, B
#else
/* Skip the first KK K steps: one element per step in both A and B. */
dsll L, KK, 0 + BASE_SHIFT
dsll TEMP, KK, 0 + BASE_SHIFT
daddu AO, AO, L
daddu BO, B, TEMP
#endif
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(BO)
LD b2, 1 * SIZE(BO)
LD b3, 2 * SIZE(BO)
LD b4, 3 * SIZE(BO)
LD b5, 4 * SIZE(BO)
LD b6, 8 * SIZE(BO)
LD b7, 12 * SIZE(BO)
/* TEMP = trip count for this tile: K - KK, or KK + 1 (both other cases). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
dsubu TEMP, K, KK
#elif defined(LEFT)
daddiu TEMP, KK, 1
#else
daddiu TEMP, KK, 1
#endif
dsra L, TEMP, 2
blez L, .L85
NOP
#else
/* GEMM path: the trip count is K and the B pointer starts at B. */
LD a1, 0 * SIZE(AO)
MTC $0, c11
LD a2, 1 * SIZE(AO)
MOV c21, c11
LD a3, 2 * SIZE(AO)
LD a4, 3 * SIZE(AO)
LD b1, 0 * SIZE(B)
LD b2, 1 * SIZE(B)
LD b3, 2 * SIZE(B)
LD b4, 3 * SIZE(B)
LD b5, 4 * SIZE(B)
LD b6, 8 * SIZE(B)
LD b7, 12 * SIZE(B)
dsra L, K, 2
blez L, .L85
/* Delay slot: BO = B is set whether or not the branch is taken. */
move BO, B
#endif
/*
 * .L82 -- unrolled K loop for the single-element tile. Each pass performs
 * four K steps, alternating between two accumulators: steps 0 and 2 add
 * into c11, steps 1 and 3 add into c21 (the two are summed at .L88).
 * Per pass AO and BO each advance by 4 elements.
 */
.align 3
.L82:
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MADD c11, c11, a1, b1
LD a1, 1 * SIZE(AO)
LD b1, 1 * SIZE(BO)
MADD c21, c21, a1, b1
LD a1, 2 * SIZE(AO)
LD b1, 2 * SIZE(BO)
MADD c11, c11, a1, b1
LD a1, 3 * SIZE(AO)
LD b1, 3 * SIZE(BO)
MADD c21, c21, a1, b1
daddiu L, L, -1
daddiu AO, AO, 4 * SIZE
bgtz L, .L82
/* Delay slot: BO is advanced on every pass. */
daddiu BO, BO, 4 * SIZE
/*
 * .L85/.L86 -- K remainder loop for the single-element tile.
 * L is the low two bits of the trip count (K for GEMM, TEMP for TRMM).
 * Each pass of .L86 performs one K step, c11 += a1 * b1, and advances AO
 * and BO by one element each.
 */
.align 3
.L85:
#ifndef TRMMKERNEL
andi L, K, 3
#else
andi L, TEMP, 3
#endif
NOP
blez L, .L88
NOP
.align 3
.L86:
LD a1, 0 * SIZE(AO)
LD b1, 0 * SIZE(BO)
MADD c11, c11, a1, b1
daddiu L, L, -1
daddiu AO, AO, 1 * SIZE
bgtz L, .L86
/* Delay slot: BO is advanced on every pass. */
daddiu BO, BO, 1 * SIZE
/*
 * .L88 -- write back the single-element tile to CO1[0]. The two partial
 * sums c11 and c21 from .L82/.L86 are added together first.
 * NOTE(review): MADD d, r, s, t presumably yields d = r + s * t (macro
 * defined outside this chunk) -- confirm.
 */
.L88:
#ifndef TRMMKERNEL
/* GEMM path: result is the old C value plus ALPHA times the sum. */
LD $f0, 0 * SIZE(CO1)
ADD c11, c11, c21
MADD c11, $f0, ALPHA, c11
ST c11, 0 * SIZE(CO1)
#else
/* TRMM path: C is not read; the stored value is ALPHA times the sum. */
ADD c11, c11, c21
MUL c11, ALPHA, c11
ST c11, 0 * SIZE(CO1)
#endif
/*
 * .L89 -- end of the final single column. For TRMM without LEFT, KK is
 * advanced by 1 (the column count). B is set to BO; no code visible in
 * this chunk reads B or KK after this point, since control falls through
 * to the epilogue at .L999.
 */
.align 3
.L89:
#if defined(TRMMKERNEL) && !defined(LEFT)
daddiu KK, KK, 1
#endif
move B, BO
.align 3
/*
 * .L999 -- common exit. Reloads the saved registers from the stack frame,
 * returns through $31 and releases the 160-byte frame in the delay slot of
 * the return jump.
 * NOTE(review): the matching stores and the frame allocation are
 * presumably in the prologue, which is outside this chunk -- confirm the
 * offsets against it. LDARG and EPILOGUE are macros defined elsewhere.
 */
.L999:
/* Integer registers $16..$22 at offsets 0..48. */
LDARG $16, 0($sp)
LDARG $17, 8($sp)
LDARG $18, 16($sp)
LDARG $19, 24($sp)
LDARG $20, 32($sp)
LDARG $21, 40($sp)
LDARG $22, 48($sp)
/* Floating-point registers $f24..$f28 at offsets 56..88. */
ldc1 $f24, 56($sp)
ldc1 $f25, 64($sp)
ldc1 $f26, 72($sp)
ldc1 $f27, 80($sp)
ldc1 $f28, 88($sp)
#if defined(TRMMKERNEL)
/* The TRMM build additionally restores $23..$25. */
LDARG $23, 96($sp)
LDARG $24, 104($sp)
LDARG $25, 112($sp)
#endif
#ifndef __64BIT__
/* Builds without __64BIT__ additionally restore $f20..$f23. */
ldc1 $f20,120($sp)
ldc1 $f21,128($sp)
ldc1 $f22,136($sp)
ldc1 $f23,144($sp)
#endif
j $31
/* Delay slot of the return: pop the 160-byte frame. */
daddiu $sp, $sp, 160
EPILOGUE