tahoma2d/thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/kernel/mips64/trsm_kernel_RN_loongson3a.S
2016-03-24 02:47:04 +09:00

1852 lines
34 KiB
ArmAsm

#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"
/* Integer arguments that arrive in registers: problem sizes. */
#define M $4
#define N $5
#define K $6
/* Pointer arguments: packed A, packed B, output C and its leading dimension. */
#define A $8
#define B $9
#define C $10
#define LDC $11
/* Running pointers into the packed A and packed B buffers. */
#define AO $12
#define BO $13
/* Loop counters: I walks row strips, J walks column panels, L walks the k loop. */
#define I $2
#define J $3
#define L $7
/* Per-column output pointers for the current tile of C. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17
/* OFFSET is loaded from the stack by the kernel; KK is derived from it. */
#define OFFSET $22
#define KK $23
/* Scratch used for pointer arithmetic at the end of each tile. */
#define TEMP $24
/* NOTE(review): AORIG is not referenced by the kernel code visible in this file. */
#define AORIG $25
/* Floating point: values loaded from the packed A strip (a5..a8 feed the unrolled steps). */
#define a1 $f0
#define a2 $f1
#define a3 $f26
#define a4 $f27
#define a5 $f28
#define a6 $f29
#define a7 $f30
#define a8 $f31
/* Floating point: values loaded from the packed B panel (also reused as temporaries in the solve step). */
#define b1 $f2
#define b2 $f3
#define b3 $f4
#define b4 $f5
#define b5 $f6
#define b6 $f7
#define b7 $f8
#define b8 $f9
/* Accumulators t<row><col> of the 4x4 tile: first digit is the row, second the column. */
#define t11 $f10
#define t21 $f11
#define t31 $f12
#define t41 $f13
#define t12 $f14
#define t22 $f15
#define t32 $f16
#define t42 $f17
#define t13 $f18
#define t23 $f19
#define t33 $f20
#define t43 $f21
#define t14 $f22
#define t24 $f23
#define t34 $f24
#define t44 $f25
# Triangular-solve micro kernel, RN variant. C is processed in panels of
# four, two and then one column; inside a panel the rows are processed in
# strips of four, two and then one.
# Register arguments are named by the defines at the top of this file; OFFSET
# is read from the first stack slot above the frame.
# Each tile goes through the same three steps:
#   (1) accumulate the product of the packed A strip and the packed B panel
#       over the first KK steps (loop unrolled by four, then a remainder loop);
#   (2) subtract the accumulator from the tile stored at AO and substitute
#       forward through the small triangular block stored at BO;
#   (3) store the tile to the packed A buffer and to C, then skip AO and BO
#       over the remaining K-KK steps.
# Branch delay slots are filled by hand throughout, so the instruction that
# follows every branch or jump is executed before control transfers.
PROLOGUE
	daddiu $sp, $sp, -176 # 176-byte frame, every saved register has a private slot
	SDARG $16, 0($sp)
	SDARG $17, 8($sp)
	SDARG $18, 16($sp)
	SDARG $19, 24($sp)
	SDARG $20, 32($sp)
	SDARG $21, 40($sp)
	sdc1 $f24, 48($sp)
	sdc1 $f25, 56($sp)
	sdc1 $f26, 64($sp)
	sdc1 $f27, 72($sp)
	sdc1 $f28, 80($sp)
	SDARG $22, 88($sp)
	SDARG $23, 96($sp)
	SDARG $24, 104($sp)
	SDARG $25, 112($sp)
	sdc1 $f29, 120($sp) # a6, a7, a8 overwrite $f29..$f31 in the unrolled loops,
	sdc1 $f30, 128($sp) # and the n64 ABI lists $f24..$f31 as callee-saved,
	sdc1 $f31, 136($sp) # so they are preserved here as well
#ifndef __64BIT__
	sdc1 $f20,144($sp) # kept clear of 112($sp), which holds $25
	sdc1 $f21,152($sp)
	sdc1 $f22,160($sp)
	sdc1 $f23,168($sp)
#endif
	# RN compute from top to bottom left to right
	.align 3
	LDARG OFFSET, 176($sp) # last parameter, first slot above the frame
	dsll LDC, LDC, BASE_SHIFT # LDC * data_Byte
	neg KK, OFFSET # for RN OFFSET always 0
	dsra J, N, 2 # J = NC/4
	blez J, .L30
	NOP

.L10: # ======== panel of four columns (nr=4) ========
	daddiu J, J, -1
	move CO1, C
	daddu CO2, C, LDC
	daddu CO3, CO2, LDC
	daddu CO4, CO3, LDC
	move AO, A # A is the rectangular matrix and B is the triangular matrix
	daddu C, CO4, LDC # C now points at the next panel
	dsra I, M, 2 # I=MC/4
	blez I, .L20
	NOP
	.align 3

.L11: # ---- nr=4, mr=4: set up one 4x4 tile ----
	MTC $0, t11 # clear results registers
	MOV t21, t11
	MOV t31, t11
	MOV t41, t11
	MOV t12, t11
	MOV t22, t11
	MOV t32, t11
	MOV t42, t11
	MOV t13, t11
	MOV t23, t11
	MOV t33, t11
	MOV t43, t11
	MOV t14, t11
	MOV t24, t11
	MOV t34, t11
	MOV t44, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD a2, 1 * SIZE(AO) # get 4 a
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	LD b2, 1 * SIZE(B) # get 4 b
	LD b3, 2 * SIZE(B)
	LD b4, 3 * SIZE(B)
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L15
	move BO, B # delay slot: reset BO to the panel start on both paths

.L12: # four k steps per pass; loads for the next step overlap the current MADDs
	LD a5, 4 * SIZE(AO)
	LD a6, 5 * SIZE(AO)
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t33, t33, a3, b3
	MADD t43, t43, a4, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	MADD t34, t34, a3, b4
	MADD t44, t44, a4, b4 # first
	LD a1, 8 * SIZE(AO)
	LD a2, 9 * SIZE(AO)
	LD a3, 10 * SIZE(AO)
	LD a4, 11 * SIZE(AO)
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t32, t32, a7, b6
	MADD t42, t42, a8, b6
	MADD t13, t13, a5, b7
	MADD t23, t23, a6, b7
	MADD t33, t33, a7, b7
	MADD t43, t43, a8, b7
	MADD t14, t14, a5, b8
	MADD t24, t24, a6, b8
	MADD t34, t34, a7, b8
	MADD t44, t44, a8, b8 # second
	LD a5, 12 * SIZE(AO)
	LD a6, 13 * SIZE(AO)
	LD a7, 14 * SIZE(AO)
	LD a8, 15 * SIZE(AO)
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t33, t33, a3, b3
	MADD t43, t43, a4, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	MADD t34, t34, a3, b4
	MADD t44, t44, a4, b4 # third
	daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
	daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t32, t32, a7, b6
	MADD t42, t42, a8, b6
	MADD t13, t13, a5, b7
	MADD t23, t23, a6, b7
	MADD t33, t33, a7, b7
	MADD t43, t43, a8, b7
	MADD t14, t14, a5, b8
	MADD t24, t24, a6, b8
	MADD t34, t34, a7, b8
	MADD t44, t44, a8, b8 # fourth
	daddiu L, L, -1
	bgtz L, .L12
	NOP

.L15:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L18
	NOP
	.align 3

.L16: # one k step per pass
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t33, t33, a3, b3
	MADD t43, t43, a4, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	MADD t34, t34, a3, b4
	MADD t44, t44, a4, b4
	daddiu AO, AO, 4 * SIZE # AO += 4mr
	daddiu BO, BO, 4 * SIZE # BO += 4nr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L16
	NOP
	.align 3

.L18: # .L18 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b2, 1 * SIZE(AO) # Fixed results
	LD b3, 2 * SIZE(AO)
	LD b4, 3 * SIZE(AO) # sa stored as col major
	SUB t11, b1, t11 # tile = stored A tile - accumulated product
	SUB t21, b2, t21
	SUB t31, b3, t31
	SUB t41, b4, t41
	LD b5, 4 * SIZE(AO)
	LD b6, 5 * SIZE(AO)
	LD b7, 6 * SIZE(AO)
	LD b8, 7 * SIZE(AO)
	SUB t12, b5, t12
	SUB t22, b6, t22
	SUB t32, b7, t32
	SUB t42, b8, t42
	LD b1, 8 * SIZE(AO)
	LD b2, 9 * SIZE(AO)
	LD b3, 10 * SIZE(AO)
	LD b4, 11 * SIZE(AO)
	SUB t13, b1, t13
	SUB t23, b2, t23
	SUB t33, b3, t33
	SUB t43, b4, t43
	LD b5, 12 * SIZE(AO)
	LD b6, 13 * SIZE(AO)
	LD b7, 14 * SIZE(AO)
	LD b8, 15 * SIZE(AO)
	SUB t14, b5, t14
	SUB t24, b6, t24
	SUB t34, b7, t34
	SUB t44, b8, t44
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MUL t11, b1, t11 # column 1 scaled by BO[0] (presumably a pre-inverted diagonal, confirm with the packing code)
	MUL t21, b1, t21
	MUL t31, b1, t31
	MUL t41, b1, t41
	NMSUB t12, t12, b2, t11 # remove the column 1 contribution from columns 2..4
	NMSUB t22, t22, b2, t21
	NMSUB t32, t32, b2, t31
	NMSUB t42, t42, b2, t41
	NMSUB t13, t13, b3, t11
	NMSUB t23, t23, b3, t21
	NMSUB t33, t33, b3, t31
	NMSUB t43, t43, b3, t41
	NMSUB t14, t14, b4, t11
	NMSUB t24, t24, b4, t21
	NMSUB t34, t34, b4, t31
	NMSUB t44, t44, b4, t41
	LD b5, 5 * SIZE(BO) # second row of the block, starting at its diagonal
	LD b6, 6 * SIZE(BO)
	LD b7, 7 * SIZE(BO)
	MUL t12, b5, t12
	MUL t22, b5, t22
	MUL t32, b5, t32
	MUL t42, b5, t42
	NMSUB t13, t13, b6, t12
	NMSUB t23, t23, b6, t22
	NMSUB t33, t33, b6, t32
	NMSUB t43, t43, b6, t42
	NMSUB t14, t14, b7, t12
	NMSUB t24, t24, b7, t22
	NMSUB t34, t34, b7, t32
	NMSUB t44, t44, b7, t42
	LD b8, 10 * SIZE(BO) # third row, starting at its diagonal
	LD b1, 11 * SIZE(BO)
	MUL t13, b8, t13
	MUL t23, b8, t23
	MUL t33, b8, t33
	MUL t43, b8, t43
	NMSUB t14, t14, b1, t13
	NMSUB t24, t24, b1, t23
	NMSUB t34, t34, b1, t33
	NMSUB t44, t44, b1, t43
	LD b2, 15 * SIZE(BO) # fourth diagonal entry
	MUL t14, b2, t14
	MUL t24, b2, t24
	MUL t34, b2, t34
	MUL t44, b2, t44
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t21, 1 * SIZE(AO)
	ST t31, 2 * SIZE(AO)
	ST t41, 3 * SIZE(AO)
	ST t12, 4 * SIZE(AO)
	ST t22, 5 * SIZE(AO)
	ST t32, 6 * SIZE(AO)
	ST t42, 7 * SIZE(AO)
	ST t13, 8 * SIZE(AO)
	ST t23, 9 * SIZE(AO)
	ST t33, 10 * SIZE(AO)
	ST t43, 11 * SIZE(AO)
	ST t14, 12 * SIZE(AO)
	ST t24, 13 * SIZE(AO)
	ST t34, 14 * SIZE(AO)
	ST t44, 15 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t21, 1 * SIZE(CO1)
	ST t31, 2 * SIZE(CO1)
	ST t41, 3 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	ST t32, 2 * SIZE(CO2)
	ST t42, 3 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t23, 1 * SIZE(CO3)
	ST t33, 2 * SIZE(CO3)
	ST t43, 3 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
	ST t24, 1 * SIZE(CO4)
	ST t34, 2 * SIZE(CO4)
	ST t44, 3 * SIZE(CO4)
	daddiu CO1, CO1, 4 * SIZE # fixed address
	daddiu CO2, CO2, 4 * SIZE
	daddiu CO3, CO3, 4 * SIZE
	daddiu CO4, CO4, 4 * SIZE
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, 2 + BASE_SHIFT # mr=4
	dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	daddiu I, I, -1
	bgtz I, .L11
	NOP
	.align 3

.L20: # ---- nr=4, mr=2 ----
	andi I, M, 2 # mr=2
	blez I, .L50
	nop
	MTC $0, t11 # clear the eight accumulators this tile uses
	MOV t21, t11
	MOV t12, t11
	MOV t22, t11
	MOV t13, t11
	MOV t23, t11
	MOV t14, t11
	MOV t24, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD a2, 1 * SIZE(AO) # get 2 a
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	LD b2, 1 * SIZE(B) # get 4 b
	LD b3, 2 * SIZE(B)
	LD b4, 3 * SIZE(B)
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L25
	move BO, B # delay slot: reset BO

.L22:
	LD a5, 2 * SIZE(AO)
	LD a6, 3 * SIZE(AO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	LD a3, 4 * SIZE(AO)
	LD a4, 5 * SIZE(AO)
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t13, t13, a5, b7
	MADD t23, t23, a6, b7
	MADD t14, t14, a5, b8
	MADD t24, t24, a6, b8
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	MADD t11, t11, a3, b1
	MADD t21, t21, a4, b1
	MADD t12, t12, a3, b2
	MADD t22, t22, a4, b2
	MADD t13, t13, a3, b3
	MADD t23, t23, a4, b3
	MADD t14, t14, a3, b4
	MADD t24, t24, a4, b4
	daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
	daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MADD t11, t11, a7, b5
	MADD t21, t21, a8, b5
	MADD t12, t12, a7, b6
	MADD t22, t22, a8, b6
	MADD t13, t13, a7, b7
	MADD t23, t23, a8, b7
	MADD t14, t14, a7, b8
	MADD t24, t24, a8, b8
	daddiu L, L, -1
	bgtz L, .L22
	NOP

.L25:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L28
	NOP
	.align 3

.L26:
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t13, t13, a1, b3
	MADD t23, t23, a2, b3
	MADD t14, t14, a1, b4
	MADD t24, t24, a2, b4
	daddiu AO, AO, 2 * SIZE # AO += 2mr
	daddiu BO, BO, 4 * SIZE # BO += 4nr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L26
	NOP
	.align 3

.L28: # .L28 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b2, 1 * SIZE(AO) # Fixed results
	SUB t11, b1, t11
	SUB t21, b2, t21
	LD b5, 2 * SIZE(AO)
	LD b6, 3 * SIZE(AO)
	SUB t12, b5, t12
	SUB t22, b6, t22
	LD b3, 4 * SIZE(AO)
	LD b4, 5 * SIZE(AO)
	SUB t13, b3, t13
	SUB t23, b4, t23
	LD b7, 6 * SIZE(AO)
	LD b8, 7 * SIZE(AO)
	SUB t14, b7, t14
	SUB t24, b8, t24
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MUL t11, b1, t11
	MUL t21, b1, t21
	NMSUB t12, t12, b2, t11
	NMSUB t22, t22, b2, t21
	NMSUB t13, t13, b3, t11
	NMSUB t23, t23, b3, t21
	NMSUB t14, t14, b4, t11
	NMSUB t24, t24, b4, t21
	LD b5, 5 * SIZE(BO)
	LD b6, 6 * SIZE(BO)
	LD b7, 7 * SIZE(BO)
	MUL t12, b5, t12
	MUL t22, b5, t22
	NMSUB t13, t13, b6, t12
	NMSUB t23, t23, b6, t22
	NMSUB t14, t14, b7, t12
	NMSUB t24, t24, b7, t22
	LD b8, 10 * SIZE(BO)
	LD b1, 11 * SIZE(BO)
	MUL t13, b8, t13
	MUL t23, b8, t23
	NMSUB t14, t14, b1, t13
	NMSUB t24, t24, b1, t23
	LD b2, 15 * SIZE(BO)
	MUL t14, b2, t14
	MUL t24, b2, t24
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t21, 1 * SIZE(AO)
	ST t12, 2 * SIZE(AO)
	ST t22, 3 * SIZE(AO)
	ST t13, 4 * SIZE(AO)
	ST t23, 5 * SIZE(AO)
	ST t14, 6 * SIZE(AO)
	ST t24, 7 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t21, 1 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t23, 1 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
	ST t24, 1 * SIZE(CO4)
	daddiu CO1, CO1, 2 * SIZE # fixed address
	daddiu CO2, CO2, 2 * SIZE # mr=2
	daddiu CO3, CO3, 2 * SIZE
	daddiu CO4, CO4, 2 * SIZE
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, 1 + BASE_SHIFT # mr=2
	dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	.align 3

.L50: # ---- nr=4, mr=1 ----
	andi I, M, 1 # mr=1
	blez I, .L29
	nop
	MTC $0, t11 # clear the four accumulators this tile uses
	MOV t12, t11
	MOV t13, t11
	MOV t14, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	LD b2, 1 * SIZE(B) # get 4 b
	LD b3, 2 * SIZE(B)
	LD b4, 3 * SIZE(B)
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L55
	move BO, B # delay slot: reset BO

.L52:
	LD a5, 1 * SIZE(AO)
	LD b5, 4 * SIZE(BO)
	LD b6, 5 * SIZE(BO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t12, t12, a1, b2
	MADD t13, t13, a1, b3
	MADD t14, t14, a1, b4
	LD a3, 2 * SIZE(AO)
	LD b1, 8 * SIZE(BO)
	LD b2, 9 * SIZE(BO)
	LD b3, 10 * SIZE(BO)
	LD b4, 11 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t12, t12, a5, b6
	MADD t13, t13, a5, b7
	MADD t14, t14, a5, b8
	LD a7, 3 * SIZE(AO)
	LD b5, 12 * SIZE(BO)
	LD b6, 13 * SIZE(BO)
	LD b7, 14 * SIZE(BO)
	LD b8, 15 * SIZE(BO)
	MADD t11, t11, a3, b1
	MADD t12, t12, a3, b2
	MADD t13, t13, a3, b3
	MADD t14, t14, a3, b4
	daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
	daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MADD t11, t11, a7, b5
	MADD t12, t12, a7, b6
	MADD t13, t13, a7, b7
	MADD t14, t14, a7, b8
	daddiu L, L, -1
	bgtz L, .L52
	NOP

.L55:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L58
	NOP
	.align 3

.L56:
	MADD t11, t11, a1, b1
	MADD t12, t12, a1, b2
	MADD t13, t13, a1, b3
	MADD t14, t14, a1, b4
	daddiu AO, AO, 1 * SIZE # AO += 1mr
	daddiu BO, BO, 4 * SIZE # BO += 4nr
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L56
	NOP
	.align 3

.L58: # .L58 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b5, 1 * SIZE(AO)
	LD b3, 2 * SIZE(AO)
	LD b7, 3 * SIZE(AO)
	SUB t11, b1, t11
	SUB t12, b5, t12
	SUB t13, b3, t13
	SUB t14, b7, t14
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	LD b2, 1 * SIZE(BO)
	LD b3, 2 * SIZE(BO)
	LD b4, 3 * SIZE(BO)
	MUL t11, b1, t11
	NMSUB t12, t12, b2, t11
	NMSUB t13, t13, b3, t11
	NMSUB t14, t14, b4, t11
	LD b5, 5 * SIZE(BO)
	LD b6, 6 * SIZE(BO)
	LD b7, 7 * SIZE(BO)
	MUL t12, b5, t12
	NMSUB t13, t13, b6, t12
	NMSUB t14, t14, b7, t12
	LD b8, 10 * SIZE(BO)
	LD b1, 11 * SIZE(BO)
	MUL t13, b8, t13
	NMSUB t14, t14, b1, t13
	LD b2, 15 * SIZE(BO)
	MUL t14, b2, t14
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t12, 1 * SIZE(AO)
	ST t13, 2 * SIZE(AO)
	ST t14, 3 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t12, 0 * SIZE(CO2)
	ST t13, 0 * SIZE(CO3)
	ST t14, 0 * SIZE(CO4)
	daddiu CO1, CO1, 1 * SIZE # fixed address
	daddiu CO2, CO2, 1 * SIZE # mr=1
	daddiu CO3, CO3, 1 * SIZE
	daddiu CO4, CO4, 1 * SIZE
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, BASE_SHIFT # mr=1
	dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	.align 3

.L29:
	move B, BO # change to next panel of Bj
	daddiu KK, KK, 4 # rectangular data length increase by 4
	bgtz J, .L10
	NOP
	.align 3

.L30: # ======== panel of two columns (nr=2) ========
	andi J, N, 2
	blez J, .L70
	nop
	move CO1, C
	daddu CO2, C, LDC
	move AO, A # A is the rectangular matrix and B is the triangular matrix
	daddu C, CO2, LDC # C now points at the next panel
	dsra I, M, 2 # I=MC/4
	blez I, .L40
	NOP
	.align 3

.L31: # ---- nr=2, mr=4 ----
	MTC $0, t11 # clear results registers
	MOV t21, t11
	MOV t31, t11
	MOV t41, t11
	MOV t12, t11
	MOV t22, t11
	MOV t32, t11
	MOV t42, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD a2, 1 * SIZE(AO) # get 4 a
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	LD b2, 1 * SIZE(B) # get 2 b
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L35
	move BO, B # delay slot: reset BO

.L32:
	LD a5, 4 * SIZE(AO)
	LD a6, 5 * SIZE(AO)
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 2 * SIZE(BO)
	LD b6, 3 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	LD a1, 8 * SIZE(AO)
	LD a2, 9 * SIZE(AO)
	LD a3, 10 * SIZE(AO)
	LD a4, 11 * SIZE(AO)
	LD b3, 4 * SIZE(BO)
	LD b4, 5 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	MADD t32, t32, a7, b6
	MADD t42, t42, a8, b6
	LD a5, 12 * SIZE(AO)
	LD a6, 13 * SIZE(AO)
	LD a7, 14 * SIZE(AO)
	LD a8, 15 * SIZE(AO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a1, b3
	MADD t21, t21, a2, b3
	MADD t31, t31, a3, b3
	MADD t41, t41, a4, b3
	MADD t12, t12, a1, b4
	MADD t22, t22, a2, b4
	MADD t32, t32, a3, b4
	MADD t42, t42, a4, b4
	daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
	daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	MADD t11, t11, a5, b7
	MADD t21, t21, a6, b7
	MADD t31, t31, a7, b7
	MADD t41, t41, a8, b7
	MADD t12, t12, a5, b8
	MADD t22, t22, a6, b8
	MADD t32, t32, a7, b8
	MADD t42, t42, a8, b8
	daddiu L, L, -1
	bgtz L, .L32
	NOP

.L35:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L38
	NOP
	.align 3

.L36:
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	MADD t32, t32, a3, b2
	MADD t42, t42, a4, b2
	daddiu AO, AO, 4 * SIZE # AO += 4mr
	daddiu BO, BO, 2 * SIZE # BO += 2nr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L36
	NOP
	.align 3

.L38: # .L38 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b2, 1 * SIZE(AO) # Fixed results
	LD b3, 2 * SIZE(AO)
	LD b4, 3 * SIZE(AO) # sa stored as col major
	SUB t11, b1, t11
	SUB t21, b2, t21
	SUB t31, b3, t31
	SUB t41, b4, t41
	LD b5, 4 * SIZE(AO)
	LD b6, 5 * SIZE(AO)
	LD b7, 6 * SIZE(AO)
	LD b8, 7 * SIZE(AO)
	SUB t12, b5, t12
	SUB t22, b6, t22
	SUB t32, b7, t32
	SUB t42, b8, t42
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	LD b2, 1 * SIZE(BO)
	MUL t11, b1, t11
	MUL t21, b1, t21
	MUL t31, b1, t31
	MUL t41, b1, t41
	NMSUB t12, t12, b2, t11
	NMSUB t22, t22, b2, t21
	NMSUB t32, t32, b2, t31
	NMSUB t42, t42, b2, t41
	LD b5, 3 * SIZE(BO) # second diagonal entry of the 2x2 block
	MUL t12, b5, t12
	MUL t22, b5, t22
	MUL t32, b5, t32
	MUL t42, b5, t42
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t21, 1 * SIZE(AO)
	ST t31, 2 * SIZE(AO)
	ST t41, 3 * SIZE(AO)
	ST t12, 4 * SIZE(AO)
	ST t22, 5 * SIZE(AO)
	ST t32, 6 * SIZE(AO)
	ST t42, 7 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t21, 1 * SIZE(CO1)
	ST t31, 2 * SIZE(CO1)
	ST t41, 3 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	ST t32, 2 * SIZE(CO2)
	ST t42, 3 * SIZE(CO2)
	daddiu CO1, CO1, 4 * SIZE # fixed address
	daddiu CO2, CO2, 4 * SIZE
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, 2 + BASE_SHIFT # mr=4
	dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	daddiu I, I, -1
	bgtz I, .L31
	NOP
	.align 3

.L40: # ---- nr=2, mr=2 ----
	andi I, M,2
	blez I,.L60
	nop
	MTC $0, t11 # clear results registers
	MOV t21, t11
	MOV t12, t11
	MOV t22, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD a2, 1 * SIZE(AO) # get 2 a
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	LD b2, 1 * SIZE(B) # get 2 b
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L45
	move BO, B # delay slot: reset BO

.L42:
	LD a5, 2 * SIZE(AO)
	LD a6, 3 * SIZE(AO)
	LD b5, 2 * SIZE(BO)
	LD b6, 3 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	LD a3, 4 * SIZE(AO)
	LD a4, 5 * SIZE(AO)
	LD b3, 4 * SIZE(BO)
	LD b4, 5 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	MADD t12, t12, a5, b6
	MADD t22, t22, a6, b6
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a3, b3
	MADD t21, t21, a4, b3
	MADD t12, t12, a3, b4
	MADD t22, t22, a4, b4
	daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
	daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	MADD t11, t11, a7, b7
	MADD t21, t21, a8, b7
	MADD t12, t12, a7, b8
	MADD t22, t22, a8, b8
	daddiu L, L, -1
	bgtz L, .L42
	NOP

.L45:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L48
	NOP
	.align 3

.L46:
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t12, t12, a1, b2
	MADD t22, t22, a2, b2
	daddiu AO, AO, 2 * SIZE # AO += 2mr
	daddiu BO, BO, 2 * SIZE # BO += 2nr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L46
	NOP
	.align 3

.L48: # .L48 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b2, 1 * SIZE(AO) # Fixed results
	SUB t11, b1, t11
	SUB t21, b2, t21
	LD b5, 2 * SIZE(AO)
	LD b6, 3 * SIZE(AO)
	SUB t12, b5, t12
	SUB t22, b6, t22
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	LD b2, 1 * SIZE(BO)
	MUL t11, b1, t11
	MUL t21, b1, t21
	NMSUB t12, t12, b2, t11
	NMSUB t22, t22, b2, t21
	LD b5, 3 * SIZE(BO)
	MUL t12, b5, t12
	MUL t22, b5, t22
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t21, 1 * SIZE(AO)
	ST t12, 2 * SIZE(AO)
	ST t22, 3 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t21, 1 * SIZE(CO1)
	ST t12, 0 * SIZE(CO2)
	ST t22, 1 * SIZE(CO2)
	daddiu CO1, CO1, 2 * SIZE # fixed address
	daddiu CO2, CO2, 2 * SIZE
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, 1 + BASE_SHIFT # mr=2
	dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	.align 3

.L60: # ---- nr=2, mr=1 ----
	andi I,M,1 # nr=2 mr=1
	blez I,.L39
	nop
	MTC $0, t11 # clear results registers
	MOV t12, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	LD b2, 1 * SIZE(B) # get 2 b
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L65
	move BO, B # delay slot: reset BO

.L62:
	LD a5, 1 * SIZE(AO)
	LD b5, 2 * SIZE(BO)
	LD b6, 3 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t12, t12, a1, b2
	LD a3, 2 * SIZE(AO)
	LD b3, 4 * SIZE(BO)
	LD b4, 5 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t12, t12, a5, b6
	LD a7, 3 * SIZE(AO)
	LD b7, 6 * SIZE(BO)
	LD b8, 7 * SIZE(BO)
	MADD t11, t11, a3, b3
	MADD t12, t12, a3, b4
	daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
	daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	MADD t11, t11, a7, b7
	MADD t12, t12, a7, b8
	daddiu L, L, -1
	bgtz L, .L62
	NOP

.L65:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L68
	NOP
	.align 3

.L66:
	MADD t11, t11, a1, b1
	MADD t12, t12, a1, b2
	daddiu AO, AO, 1 * SIZE # AO += mr
	daddiu BO, BO, 2 * SIZE # BO += 2nr
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	LD b2, 1 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L66
	NOP
	.align 3

.L68: # .L68 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b5, 1 * SIZE(AO) # Fixed results
	SUB t11, b1, t11
	SUB t12, b5, t12
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	LD b2, 1 * SIZE(BO)
	MUL t11, b1, t11
	NMSUB t12, t12, b2, t11
	LD b5, 3 * SIZE(BO)
	MUL t12, b5, t12
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t12, 1 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t12, 0 * SIZE(CO2)
	daddiu CO1, CO1, 1 * SIZE # fixed address
	daddiu CO2, CO2, 1 * SIZE
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, BASE_SHIFT # mr=1
	dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	.align 3

.L39:
	move B, BO # change to next panel of Bj
	daddiu KK, KK, 2 # rectangular data length increase by 2
	.align 3

.L70: # ======== panel of one column (nr=1) ========
	andi J, N, 1 # nr=1
	blez J, .L999
	NOP
	move CO1, C
	move AO, A
	daddu C, CO1, LDC
	dsra I, M, 2 # I=MC/4
	blez I, .L80
	NOP
	.align 3

.L71: # ---- nr=1, mr=4 ----
	MTC $0, t11 # clear results registers
	MOV t21, t11
	MOV t31, t11
	MOV t41, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD a2, 1 * SIZE(AO) # get 4 a
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L75
	move BO, B # delay slot: reset BO

.L72:
	LD a5, 4 * SIZE(AO)
	LD a6, 5 * SIZE(AO)
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b5, 1 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	LD a1, 8 * SIZE(AO)
	LD a2, 9 * SIZE(AO)
	LD a3, 10 * SIZE(AO)
	LD a4, 11 * SIZE(AO)
	LD b3, 2 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	MADD t31, t31, a7, b5
	MADD t41, t41, a8, b5
	LD a5, 12 * SIZE(AO)
	LD a6, 13 * SIZE(AO)
	LD a7, 14 * SIZE(AO)
	LD a8, 15 * SIZE(AO)
	LD b7, 3 * SIZE(BO)
	MADD t11, t11, a1, b3
	MADD t21, t21, a2, b3
	MADD t31, t31, a3, b3
	MADD t41, t41, a4, b3
	daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
	daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	MADD t11, t11, a5, b7
	MADD t21, t21, a6, b7
	MADD t31, t31, a7, b7
	MADD t41, t41, a8, b7
	daddiu L, L, -1
	bgtz L, .L72
	NOP

.L75:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L78
	NOP
	.align 3

.L76:
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	MADD t31, t31, a3, b1
	MADD t41, t41, a4, b1
	daddiu AO, AO, 4 * SIZE # AO += 4mr
	daddiu BO, BO, 1 * SIZE # BO += 1nr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD a3, 2 * SIZE(AO)
	LD a4, 3 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L76
	NOP
	.align 3

.L78: # .L78 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b2, 1 * SIZE(AO) # Fixed results
	LD b3, 2 * SIZE(AO)
	LD b4, 3 * SIZE(AO) # sa stored as col major
	SUB t11, b1, t11
	SUB t21, b2, t21
	SUB t31, b3, t31
	SUB t41, b4, t41
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	MUL t11, b1, t11
	MUL t21, b1, t21
	MUL t31, b1, t31
	MUL t41, b1, t41
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t21, 1 * SIZE(AO)
	ST t31, 2 * SIZE(AO)
	ST t41, 3 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t21, 1 * SIZE(CO1)
	ST t31, 2 * SIZE(CO1)
	ST t41, 3 * SIZE(CO1)
	daddiu CO1, CO1, 4 * SIZE # fixed address
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, 2 + BASE_SHIFT # mr=4
	dsll TEMP, TEMP, BASE_SHIFT # nr=1
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	daddiu I, I, -1
	bgtz I, .L71
	NOP
	.align 3

.L80: # ---- nr=1, mr=2 ----
	andi I, M, 2 # mr=2
	blez I, .L90
	nop
	MTC $0, t11 # clear results registers
	MOV t21, t11
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD a2, 1 * SIZE(AO) # get 2 a
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L85
	move BO, B # delay slot: reset BO

.L82:
	LD a5, 2 * SIZE(AO)
	LD a6, 3 * SIZE(AO)
	LD b5, 1 * SIZE(BO)
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	LD a3, 4 * SIZE(AO)
	LD a4, 5 * SIZE(AO)
	LD b3, 2 * SIZE(BO)
	MADD t11, t11, a5, b5
	MADD t21, t21, a6, b5
	LD a7, 6 * SIZE(AO)
	LD a8, 7 * SIZE(AO)
	LD b7, 3 * SIZE(BO)
	MADD t11, t11, a3, b3
	MADD t21, t21, a4, b3
	daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
	daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	MADD t11, t11, a7, b7
	MADD t21, t21, a8, b7
	daddiu L, L, -1
	bgtz L, .L82
	NOP

.L85:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L88
	NOP
	.align 3

.L86:
	MADD t11, t11, a1, b1
	MADD t21, t21, a2, b1
	daddiu AO, AO, 2 * SIZE # AO += 2mr
	daddiu BO, BO, 1 * SIZE # BO += 1nr
	LD a1, 0 * SIZE(AO)
	LD a2, 1 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L86
	NOP
	.align 3

.L88: # .L88 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	LD b2, 1 * SIZE(AO) # Fixed results
	SUB t11, b1, t11
	SUB t21, b2, t21
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	MUL t11, b1, t11
	MUL t21, b1, t21
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t21, 1 * SIZE(AO)
	ST t11, 0 * SIZE(CO1) # write back results
	ST t21, 1 * SIZE(CO1)
	daddiu CO1, CO1, 2 * SIZE # fixed address
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, 1 + BASE_SHIFT # mr=2
	dsll TEMP, TEMP, BASE_SHIFT # nr=1
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	.align 3

.L90: # ---- nr=1, mr=1 ----
	andi I, M, 1 # mr=1
	blez I, .L79
	nop
	MTC $0, t11 # clear results registers
	LD a1, 0 * SIZE(AO) # AO point to the beginning address of sa
	LD b1, 0 * SIZE(B) # B point to the beginning address of every panel Bj
	dsra L, KK, 2 # L=KK/4, KK is the length of the rectangular data part of Bj
	blez L, .L95
	move BO, B # delay slot: reset BO

.L92:
	LD a5, 1 * SIZE(AO)
	LD b5, 1 * SIZE(BO)
	MADD t11, t11, a1, b1
	LD a3, 2 * SIZE(AO)
	LD b3, 2 * SIZE(BO)
	MADD t11, t11, a5, b5
	LD a7, 3 * SIZE(AO)
	LD b7, 3 * SIZE(BO)
	MADD t11, t11, a3, b3
	daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
	daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	MADD t11, t11, a7, b7
	daddiu L, L, -1
	bgtz L, .L92
	NOP

.L95:
	andi L, KK, 3 # deal with kc remainder part
	blez L, .L98
	NOP
	.align 3

.L96:
	MADD t11, t11, a1, b1
	daddiu AO, AO, 1 * SIZE # AO += 1mr
	daddiu BO, BO, 1 * SIZE # BO += 1nr
	LD a1, 0 * SIZE(AO)
	LD b1, 0 * SIZE(BO)
	daddiu L, L, -1
	bgtz L, .L96
	NOP
	.align 3

.L98: # .L98 always deals with the triangular data part
	LD b1, 0 * SIZE(AO) # for RN & RT A is the result matrix
	SUB t11, b1, t11
	LD b1, 0 * SIZE(BO) # BO point to the beginning of the triangular data part of Bj
	MUL t11, b1, t11
	ST t11, 0 * SIZE(AO) # update packed blockA for follow-up compute
	ST t11, 0 * SIZE(CO1) # write back results
	daddiu CO1, CO1, 1 * SIZE # fixed address
	dsubu TEMP, K, KK # temp = kc - rectangular data length of every panel
	dsll L, TEMP, BASE_SHIFT # mr=1
	dsll TEMP, TEMP, BASE_SHIFT # nr=1
	daddu AO, AO, L # move AO to the end of this panel. also the beginning of next panel
	daddu BO, BO, TEMP # move BO to the end of this panel
	.align 3

.L79:
	move B, BO
	daddiu KK, KK, 1
	.align 3

.L999: # ======== restore saved registers and return ========
	LDARG $16, 0($sp)
	LDARG $17, 8($sp)
	LDARG $18, 16($sp)
	LDARG $19, 24($sp)
	LDARG $20, 32($sp)
	LDARG $21, 40($sp)
	ldc1 $f24, 48($sp)
	ldc1 $f25, 56($sp)
	ldc1 $f26, 64($sp)
	ldc1 $f27, 72($sp)
	ldc1 $f28, 80($sp)
	LDARG $22, 88($sp)
	LDARG $23, 96($sp)
	LDARG $24, 104($sp)
	LDARG $25, 112($sp)
	ldc1 $f29, 120($sp) # mirrors the three extra saves made on entry
	ldc1 $f30, 128($sp)
	ldc1 $f31, 136($sp)
#ifndef __64BIT__
	ldc1 $f20,144($sp)
	ldc1 $f21,152($sp)
	ldc1 $f22,160($sp)
	ldc1 $f23,168($sp)
#endif
	j $31
	daddiu $sp, $sp, 176 # delay slot: release the frame
EPILOGUE