/*
 * tahoma2d/thirdparty/openblas/xianyi-OpenBLAS-e6e87a2/symcopy.h
 * 2016-03-24 02:47:04 +09:00
 *
 * 1873 lines
 * 30 KiB
 * C
 */

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* This implementation is completely wrong. I'll rewrite this */
#ifndef SYMCOPY_H
#define SYMCOPY_H
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
/*
 * SYMCOPY_L: expand the lower triangle of a square matrix into a full one.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns lda elements apart;
 *         only entries on or below the diagonal are read
 *   lda - column stride of a, in elements
 *   b   - destination, column-major with column stride m; receives
 *         b(i,j) = b(j,i) = a(i,j) for every i >= j
 *
 * Columns are handled in pairs as 2x2 tiles; when m is odd the leftover
 * row and the final 1x1 diagonal entry are handled separately.
 */
static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT v00, v10;
  FLOAT v01, v11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * lda;
    src1 = src0 + lda;
    dst0 = b + col * m;
    dst1 = dst0 + m;

    /* 2x2 diagonal tile; the entry above the diagonal is never read */
    v00 = src0[col];
    v10 = src0[col + 1];
    v11 = src1[col + 1];

    dst0[col]     = v00;
    dst0[col + 1] = v10;
    dst1[col]     = v10;
    dst1[col + 1] = v11;

    /* 2x2 tiles below the diagonal, each also stored transposed above it */
    for (row = col + 2; row + 1 < m; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];
      v01 = src1[row];
      v11 = src1[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;
      dst1[row]     = v01;
      dst1[row + 1] = v11;

      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir0[1] = v01;
      mir1[0] = v10;
      mir1[1] = v11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      v00 = src0[row];
      v01 = src1[row];

      dst0[row] = v00;
      dst1[row] = v01;

      mir0 = b + row * m + col;
      mir0[0] = v00;
      mir0[1] = v01;
    }
  }

  /* odd m: the last column contributes only its diagonal entry */
  if (col < m) {
    b[col * m + col] = a[col * lda + col];
  }
}
/*
 * SYMCOPY_U: expand the upper triangle of a square matrix into a full one.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns lda elements apart;
 *         only entries on or above the diagonal are read
 *   lda - column stride of a, in elements
 *   b   - destination, column-major with column stride m; receives
 *         b(i,j) = b(j,i) = a(i,j) for every i <= j
 *
 * Columns are handled in pairs as 2x2 tiles; when m is odd the final
 * column is copied two rows at a time followed by its diagonal entry.
 */
static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT v00, v10;
  FLOAT v01, v11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * lda;
    src1 = src0 + lda;
    dst0 = b + col * m;
    dst1 = dst0 + m;

    /* 2x2 tiles above the diagonal, each also stored transposed below it */
    for (row = 0; row < col; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];
      v01 = src1[row];
      v11 = src1[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;
      dst1[row]     = v01;
      dst1[row + 1] = v11;

      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir0[1] = v01;
      mir1[0] = v10;
      mir1[1] = v11;
    }

    /* 2x2 diagonal tile; the entry below the diagonal is never read */
    v00 = src0[col];
    v01 = src1[col];
    v11 = src1[col + 1];

    dst0[col]     = v00;
    dst0[col + 1] = v01;
    dst1[col]     = v01;
    dst1[col + 1] = v11;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * lda;
    dst0 = b + col * m;

    for (row = 0; row < col; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;

      /* mirrored into the last row of columns row and row + 1 */
      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir1[0] = v10;
    }

    dst0[col] = src0[col];
  }
}
/*
 * ZSYMCOPY_L: expand the lower triangle of a square matrix whose elements
 * are pairs of consecutive FLOATs (presumably real and imaginary parts, as
 * the Z prefix suggests) into a full matrix, without changing any value.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or below the diagonal are read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart;
 *         receives b(i,j) = b(j,i) = a(i,j) for every i >= j
 *
 * Columns are handled in pairs as 2x2 tiles of elements, with separate
 * clean-up for the leftover row and diagonal entry when m is odd.
 */
static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 diagonal tile; the entry above the diagonal is never read */
    k = 2 * col;
    re00 = src0[k];     im00 = src0[k + 1];
    re10 = src0[k + 2]; im10 = src0[k + 3];
    re11 = src1[k + 2]; im11 = src1[k + 3];

    dst0[k]     = re00; dst0[k + 1] = im00;
    dst0[k + 2] = re10; dst0[k + 3] = im10;
    dst1[k]     = re10; dst1[k + 1] = im10;
    dst1[k + 2] = re11; dst1[k + 3] = im11;

    /* 2x2 tiles below the diagonal, each also stored transposed above it */
    for (row = col + 2; row + 1 < m; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      k = 2 * row;
      re00 = src0[k]; im00 = src0[k + 1];
      re01 = src1[k]; im01 = src1[k + 1];

      dst0[k] = re00; dst0[k + 1] = im00;
      dst1[k] = re01; dst1[k + 1] = im01;

      mir0 = b + row * b_stride + 2 * col;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
    }
  }

  /* odd m: the last column contributes only its diagonal entry */
  if (col < m) {
    src0 = a + col * a_stride + 2 * col;
    dst0 = b + col * b_stride + 2 * col;
    dst0[0] = src0[0];
    dst0[1] = src0[1];
  }
}
/*
 * ZSYMCOPY_U: expand the upper triangle of a square matrix whose elements
 * are pairs of consecutive FLOATs (presumably real and imaginary parts, as
 * the Z prefix suggests) into a full matrix, without changing any value.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or above the diagonal are read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart;
 *         receives b(i,j) = b(j,i) = a(i,j) for every i <= j
 *
 * Columns are handled in pairs as 2x2 tiles of elements; when m is odd the
 * final column is copied two rows at a time followed by its diagonal entry.
 */
static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 tiles above the diagonal, each also stored transposed below it */
    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* 2x2 diagonal tile; the entry below the diagonal is never read */
    k = 2 * col;
    re00 = src0[k];     im00 = src0[k + 1];
    re01 = src1[k];     im01 = src1[k + 1];
    re11 = src1[k + 2]; im11 = src1[k + 3];

    dst0[k]     = re00; dst0[k + 1] = im00;
    dst0[k + 2] = re01; dst0[k + 3] = im01;
    dst1[k]     = re01; dst1[k + 1] = im01;
    dst1[k + 2] = re11; dst1[k + 3] = im11;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * a_stride;
    dst0 = b + col * b_stride;

    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;

      /* mirrored into the last row of columns row and row + 1 */
      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir1[0] = re10; mir1[1] = im10;
    }

    k = 2 * col;
    dst0[k]     = src0[k];
    dst0[k + 1] = src0[k + 1];
  }
}
/*
 * ZHEMCOPY_L: expand the lower triangle of a Hermitian-style matrix into a
 * full matrix.  Each element is a (re, im) pair of consecutive FLOATs.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or below the diagonal are read, and the second
 *         component of a diagonal entry is never read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart
 *
 * Result: for i > j, b(i,j) = a(i,j) and b(j,i) is the same pair with its
 * second component negated (the conjugate); diagonal entries keep their
 * first component and get 0 as the second.
 */
static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 diagonal tile: diagonal forced real, off-diagonal conjugated above */
    k = 2 * col;
    re00 = src0[k];
    re10 = src0[k + 2]; im10 = src0[k + 3];
    re11 = src1[k + 2];

    dst0[k]     = re00; dst0[k + 1] = 0.;
    dst0[k + 2] = re10; dst0[k + 3] = im10;
    dst1[k]     = re10; dst1[k + 1] = -im10;
    dst1[k + 2] = re11; dst1[k + 3] = 0.;

    /* 2x2 tiles below the diagonal: copied as is, mirrored conjugated */
    for (row = col + 2; row + 1 < m; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = -im00;
      mir0[2] = re01; mir0[3] = -im01;
      mir1[0] = re10; mir1[1] = -im10;
      mir1[2] = re11; mir1[3] = -im11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      k = 2 * row;
      re00 = src0[k]; im00 = src0[k + 1];
      re01 = src1[k]; im01 = src1[k + 1];

      dst0[k] = re00; dst0[k + 1] = im00;
      dst1[k] = re01; dst1[k + 1] = im01;

      mir0 = b + row * b_stride + 2 * col;
      mir0[0] = re00; mir0[1] = -im00;
      mir0[2] = re01; mir0[3] = -im01;
    }
  }

  /* odd m: the last column contributes only its (real) diagonal entry */
  if (col < m) {
    src0 = a + col * a_stride + 2 * col;
    dst0 = b + col * b_stride + 2 * col;
    dst0[0] = src0[0];
    dst0[1] = 0.;
  }
}
/*
 * ZHEMCOPY_U: expand the upper triangle of a Hermitian-style matrix into a
 * full matrix.  Each element is a (re, im) pair of consecutive FLOATs.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or above the diagonal are read, and the second
 *         component of a diagonal entry is never read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart
 *
 * Result: for i < j, b(i,j) = a(i,j) and b(j,i) is the same pair with its
 * second component negated (the conjugate); diagonal entries keep their
 * first component and get 0 as the second.
 */
static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 tiles above the diagonal: copied as is, mirrored conjugated */
    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = -im00;
      mir0[2] = re01; mir0[3] = -im01;
      mir1[0] = re10; mir1[1] = -im10;
      mir1[2] = re11; mir1[3] = -im11;
    }

    /* 2x2 diagonal tile: diagonal forced real, off-diagonal conjugated below */
    k = 2 * col;
    re00 = src0[k];
    re01 = src1[k];     im01 = src1[k + 1];
    re11 = src1[k + 2];

    dst0[k]     = re00; dst0[k + 1] = 0.;
    dst0[k + 2] = re01; dst0[k + 3] = -im01;
    dst1[k]     = re01; dst1[k + 1] = im01;
    dst1[k + 2] = re11; dst1[k + 3] = 0.;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * a_stride;
    dst0 = b + col * b_stride;

    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;

      /* conjugates go into the last row of columns row and row + 1 */
      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = -im00;
      mir1[0] = re10; mir1[1] = -im10;
    }

    k = 2 * col;
    dst0[k]     = src0[k];
    dst0[k + 1] = 0.;
  }
}
/*
 * ZHEMCOPY_M: like ZHEMCOPY_L, but with the conjugation swapped between the
 * two triangles of the result.  Each element is a (re, im) pair of
 * consecutive FLOATs.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or below the diagonal are read, and the second
 *         component of a diagonal entry is never read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart
 *
 * Result: for i > j, b(j,i) = a(i,j) and b(i,j) is the same pair with its
 * second component negated (the conjugate); diagonal entries keep their
 * first component and get 0 as the second.
 */
static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 diagonal tile: diagonal forced real, off-diagonal conjugated below */
    k = 2 * col;
    re00 = src0[k];
    re10 = src0[k + 2]; im10 = src0[k + 3];
    re11 = src1[k + 2];

    dst0[k]     = re00; dst0[k + 1] = 0.;
    dst0[k + 2] = re10; dst0[k + 3] = -im10;
    dst1[k]     = re10; dst1[k + 1] = im10;
    dst1[k + 2] = re11; dst1[k + 3] = 0.;

    /* 2x2 tiles below the diagonal: stored conjugated, mirrored as is */
    for (row = col + 2; row + 1 < m; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = -im00;
      dst0[k + 2] = re10; dst0[k + 3] = -im10;
      dst1[k]     = re01; dst1[k + 1] = -im01;
      dst1[k + 2] = re11; dst1[k + 3] = -im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      k = 2 * row;
      re00 = src0[k]; im00 = src0[k + 1];
      re01 = src1[k]; im01 = src1[k + 1];

      dst0[k] = re00; dst0[k + 1] = -im00;
      dst1[k] = re01; dst1[k + 1] = -im01;

      mir0 = b + row * b_stride + 2 * col;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
    }
  }

  /* odd m: the last column contributes only its (real) diagonal entry */
  if (col < m) {
    src0 = a + col * a_stride + 2 * col;
    dst0 = b + col * b_stride + 2 * col;
    dst0[0] = src0[0];
    dst0[1] = 0.;
  }
}
/*
 * ZHEMCOPY_V: like ZHEMCOPY_U, but with the conjugation swapped between the
 * two triangles of the result.  Each element is a (re, im) pair of
 * consecutive FLOATs.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or above the diagonal are read, and the second
 *         component of a diagonal entry is never read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart
 *
 * Result: for i < j, b(j,i) = a(i,j) and b(i,j) is the same pair with its
 * second component negated (the conjugate); diagonal entries keep their
 * first component and get 0 as the second.
 */
static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 tiles above the diagonal: stored conjugated, mirrored as is */
    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = -im00;
      dst0[k + 2] = re10; dst0[k + 3] = -im10;
      dst1[k]     = re01; dst1[k + 1] = -im01;
      dst1[k + 2] = re11; dst1[k + 3] = -im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* 2x2 diagonal tile: diagonal forced real, off-diagonal conjugated above */
    k = 2 * col;
    re00 = src0[k];
    re01 = src1[k];     im01 = src1[k + 1];
    re11 = src1[k + 2];

    dst0[k]     = re00; dst0[k + 1] = 0.;
    dst0[k + 2] = re01; dst0[k + 3] = im01;
    dst1[k]     = re01; dst1[k + 1] = -im01;
    dst1[k + 2] = re11; dst1[k + 3] = 0.;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * a_stride;
    dst0 = b + col * b_stride;

    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];

      dst0[k]     = re00; dst0[k + 1] = -im00;
      dst0[k + 2] = re10; dst0[k + 3] = -im10;

      /* unconjugated values go into the last row of columns row, row + 1 */
      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir1[0] = re10; mir1[1] = im10;
    }

    k = 2 * col;
    dst0[k]     = src0[k];
    dst0[k + 1] = 0.;
  }
}
/*
 * TRMCOPY_NL: copy the lower triangle of a square matrix into a dense
 * buffer and mirror it across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns lda elements apart;
 *         only entries on or below the diagonal are read
 *   lda - column stride of a, in elements
 *   b   - destination, column-major with column stride m; receives
 *         b(i,j) = b(j,i) = a(i,j) for every i >= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * SYMCOPY_L; neither the "triangular" nor the "N" part of the name shows up
 * in the code -- confirm that callers expect a symmetric fill.
 */
static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT v00, v10;
  FLOAT v01, v11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * lda;
    src1 = src0 + lda;
    dst0 = b + col * m;
    dst1 = dst0 + m;

    /* 2x2 diagonal tile; the entry above the diagonal is never read */
    v00 = src0[col];
    v10 = src0[col + 1];
    v11 = src1[col + 1];

    dst0[col]     = v00;
    dst0[col + 1] = v10;
    dst1[col]     = v10;
    dst1[col + 1] = v11;

    /* 2x2 tiles below the diagonal, each also stored transposed above it */
    for (row = col + 2; row + 1 < m; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];
      v01 = src1[row];
      v11 = src1[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;
      dst1[row]     = v01;
      dst1[row + 1] = v11;

      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir0[1] = v01;
      mir1[0] = v10;
      mir1[1] = v11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      v00 = src0[row];
      v01 = src1[row];

      dst0[row] = v00;
      dst1[row] = v01;

      mir0 = b + row * m + col;
      mir0[0] = v00;
      mir0[1] = v01;
    }
  }

  /* odd m: the last column contributes only its diagonal entry */
  if (col < m) {
    b[col * m + col] = a[col * lda + col];
  }
}
/*
 * TRMCOPY_TL: copy the lower triangle of a square matrix into a dense
 * buffer and mirror it across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns lda elements apart;
 *         only entries on or below the diagonal are read
 *   lda - column stride of a, in elements
 *   b   - destination, column-major with column stride m; receives
 *         b(i,j) = b(j,i) = a(i,j) for every i >= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * SYMCOPY_L; neither the "triangular" nor the "T" part of the name shows up
 * in the code -- confirm that callers expect a symmetric fill.
 */
static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT v00, v10;
  FLOAT v01, v11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * lda;
    src1 = src0 + lda;
    dst0 = b + col * m;
    dst1 = dst0 + m;

    /* 2x2 diagonal tile; the entry above the diagonal is never read */
    v00 = src0[col];
    v10 = src0[col + 1];
    v11 = src1[col + 1];

    dst0[col]     = v00;
    dst0[col + 1] = v10;
    dst1[col]     = v10;
    dst1[col + 1] = v11;

    /* 2x2 tiles below the diagonal, each also stored transposed above it */
    for (row = col + 2; row + 1 < m; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];
      v01 = src1[row];
      v11 = src1[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;
      dst1[row]     = v01;
      dst1[row + 1] = v11;

      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir0[1] = v01;
      mir1[0] = v10;
      mir1[1] = v11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      v00 = src0[row];
      v01 = src1[row];

      dst0[row] = v00;
      dst1[row] = v01;

      mir0 = b + row * m + col;
      mir0[0] = v00;
      mir0[1] = v01;
    }
  }

  /* odd m: the last column contributes only its diagonal entry */
  if (col < m) {
    b[col * m + col] = a[col * lda + col];
  }
}
/*
 * TRMCOPY_NU: copy the upper triangle of a square matrix into a dense
 * buffer and mirror it across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns lda elements apart;
 *         only entries on or above the diagonal are read
 *   lda - column stride of a, in elements
 *   b   - destination, column-major with column stride m; receives
 *         b(i,j) = b(j,i) = a(i,j) for every i <= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * SYMCOPY_U; neither the "triangular" nor the "N" part of the name shows up
 * in the code -- confirm that callers expect a symmetric fill.
 */
static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT v00, v10;
  FLOAT v01, v11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * lda;
    src1 = src0 + lda;
    dst0 = b + col * m;
    dst1 = dst0 + m;

    /* 2x2 tiles above the diagonal, each also stored transposed below it */
    for (row = 0; row < col; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];
      v01 = src1[row];
      v11 = src1[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;
      dst1[row]     = v01;
      dst1[row + 1] = v11;

      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir0[1] = v01;
      mir1[0] = v10;
      mir1[1] = v11;
    }

    /* 2x2 diagonal tile; the entry below the diagonal is never read */
    v00 = src0[col];
    v01 = src1[col];
    v11 = src1[col + 1];

    dst0[col]     = v00;
    dst0[col + 1] = v01;
    dst1[col]     = v01;
    dst1[col + 1] = v11;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * lda;
    dst0 = b + col * m;

    for (row = 0; row < col; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;

      /* mirrored into the last row of columns row and row + 1 */
      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir1[0] = v10;
    }

    dst0[col] = src0[col];
  }
}
/*
 * TRMCOPY_TU: copy the upper triangle of a square matrix into a dense
 * buffer and mirror it across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns lda elements apart;
 *         only entries on or above the diagonal are read
 *   lda - column stride of a, in elements
 *   b   - destination, column-major with column stride m; receives
 *         b(i,j) = b(j,i) = a(i,j) for every i <= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * SYMCOPY_U; neither the "triangular" nor the "T" part of the name shows up
 * in the code -- confirm that callers expect a symmetric fill.
 */
static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  BLASLONG col, row;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT v00, v10;
  FLOAT v01, v11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * lda;
    src1 = src0 + lda;
    dst0 = b + col * m;
    dst1 = dst0 + m;

    /* 2x2 tiles above the diagonal, each also stored transposed below it */
    for (row = 0; row < col; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];
      v01 = src1[row];
      v11 = src1[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;
      dst1[row]     = v01;
      dst1[row + 1] = v11;

      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir0[1] = v01;
      mir1[0] = v10;
      mir1[1] = v11;
    }

    /* 2x2 diagonal tile; the entry below the diagonal is never read */
    v00 = src0[col];
    v01 = src1[col];
    v11 = src1[col + 1];

    dst0[col]     = v00;
    dst0[col + 1] = v01;
    dst1[col]     = v01;
    dst1[col + 1] = v11;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * lda;
    dst0 = b + col * m;

    for (row = 0; row < col; row += 2) {
      v00 = src0[row];
      v10 = src0[row + 1];

      dst0[row]     = v00;
      dst0[row + 1] = v10;

      /* mirrored into the last row of columns row and row + 1 */
      mir0 = b + row * m + col;
      mir1 = mir0 + m;
      mir0[0] = v00;
      mir1[0] = v10;
    }

    dst0[col] = src0[col];
  }
}
/*
 * ZTRMCOPY_NL: copy the lower triangle of a square matrix whose elements
 * are pairs of consecutive FLOATs (presumably real and imaginary parts)
 * into a dense buffer and mirror it, unchanged, across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or below the diagonal are read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart;
 *         receives b(i,j) = b(j,i) = a(i,j) for every i >= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * ZSYMCOPY_L; neither the "triangular" nor the "N" part of the name shows
 * up in the code -- confirm that callers expect a symmetric fill.
 */
static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 diagonal tile; the entry above the diagonal is never read */
    k = 2 * col;
    re00 = src0[k];     im00 = src0[k + 1];
    re10 = src0[k + 2]; im10 = src0[k + 3];
    re11 = src1[k + 2]; im11 = src1[k + 3];

    dst0[k]     = re00; dst0[k + 1] = im00;
    dst0[k + 2] = re10; dst0[k + 3] = im10;
    dst1[k]     = re10; dst1[k + 1] = im10;
    dst1[k + 2] = re11; dst1[k + 3] = im11;

    /* 2x2 tiles below the diagonal, each also stored transposed above it */
    for (row = col + 2; row + 1 < m; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      k = 2 * row;
      re00 = src0[k]; im00 = src0[k + 1];
      re01 = src1[k]; im01 = src1[k + 1];

      dst0[k] = re00; dst0[k + 1] = im00;
      dst1[k] = re01; dst1[k + 1] = im01;

      mir0 = b + row * b_stride + 2 * col;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
    }
  }

  /* odd m: the last column contributes only its diagonal entry */
  if (col < m) {
    src0 = a + col * a_stride + 2 * col;
    dst0 = b + col * b_stride + 2 * col;
    dst0[0] = src0[0];
    dst0[1] = src0[1];
  }
}
/*
 * ZTRMCOPY_TL: copy the lower triangle of a square matrix whose elements
 * are pairs of consecutive FLOATs (presumably real and imaginary parts)
 * into a dense buffer and mirror it, unchanged, across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or below the diagonal are read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart;
 *         receives b(i,j) = b(j,i) = a(i,j) for every i >= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * ZSYMCOPY_L; neither the "triangular" nor the "T" part of the name shows
 * up in the code -- confirm that callers expect a symmetric fill.
 */
static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 diagonal tile; the entry above the diagonal is never read */
    k = 2 * col;
    re00 = src0[k];     im00 = src0[k + 1];
    re10 = src0[k + 2]; im10 = src0[k + 3];
    re11 = src1[k + 2]; im11 = src1[k + 3];

    dst0[k]     = re00; dst0[k + 1] = im00;
    dst0[k + 2] = re10; dst0[k + 3] = im10;
    dst1[k]     = re10; dst1[k + 1] = im10;
    dst1[k + 2] = re11; dst1[k + 3] = im11;

    /* 2x2 tiles below the diagonal, each also stored transposed above it */
    for (row = col + 2; row + 1 < m; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* odd m: a single row remains beneath the tiles */
    if (row < m) {
      k = 2 * row;
      re00 = src0[k]; im00 = src0[k + 1];
      re01 = src1[k]; im01 = src1[k + 1];

      dst0[k] = re00; dst0[k + 1] = im00;
      dst1[k] = re01; dst1[k + 1] = im01;

      mir0 = b + row * b_stride + 2 * col;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
    }
  }

  /* odd m: the last column contributes only its diagonal entry */
  if (col < m) {
    src0 = a + col * a_stride + 2 * col;
    dst0 = b + col * b_stride + 2 * col;
    dst0[0] = src0[0];
    dst0[1] = src0[1];
  }
}
/*
 * ZTRMCOPY_NU: copy the upper triangle of a square matrix whose elements
 * are pairs of consecutive FLOATs (presumably real and imaginary parts)
 * into a dense buffer and mirror it, unchanged, across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or above the diagonal are read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart;
 *         receives b(i,j) = b(j,i) = a(i,j) for every i <= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * ZSYMCOPY_U; neither the "triangular" nor the "N" part of the name shows
 * up in the code -- confirm that callers expect a symmetric fill.
 */
static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 tiles above the diagonal, each also stored transposed below it */
    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* 2x2 diagonal tile; the entry below the diagonal is never read */
    k = 2 * col;
    re00 = src0[k];     im00 = src0[k + 1];
    re01 = src1[k];     im01 = src1[k + 1];
    re11 = src1[k + 2]; im11 = src1[k + 3];

    dst0[k]     = re00; dst0[k + 1] = im00;
    dst0[k + 2] = re01; dst0[k + 3] = im01;
    dst1[k]     = re01; dst1[k + 1] = im01;
    dst1[k + 2] = re11; dst1[k + 3] = im11;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * a_stride;
    dst0 = b + col * b_stride;

    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;

      /* mirrored into the last row of columns row and row + 1 */
      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir1[0] = re10; mir1[1] = im10;
    }

    k = 2 * col;
    dst0[k]     = src0[k];
    dst0[k + 1] = src0[k + 1];
  }
}
/*
 * ZTRMCOPY_TU: copy the upper triangle of a square matrix whose elements
 * are pairs of consecutive FLOATs (presumably real and imaginary parts)
 * into a dense buffer and mirror it, unchanged, across the diagonal.
 *
 *   m   - order of the matrix; nothing is read or written when m <= 0
 *   a   - source, column-major, consecutive columns 2 * lda FLOATs apart;
 *         only entries on or above the diagonal are read
 *   lda - column stride of a, counted in two-FLOAT elements
 *   b   - destination, column-major with columns 2 * m FLOATs apart;
 *         receives b(i,j) = b(j,i) = a(i,j) for every i <= j
 *
 * NOTE(review): this performs exactly the same mirrored expansion as
 * ZSYMCOPY_U; neither the "triangular" nor the "T" part of the name shows
 * up in the code -- confirm that callers expect a symmetric fill.
 */
static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  const BLASLONG a_stride = 2 * lda;
  const BLASLONG b_stride = 2 * m;
  BLASLONG col, row, k;
  const FLOAT *src0, *src1;
  FLOAT *dst0, *dst1;
  FLOAT *mir0, *mir1;
  FLOAT re00, im00, re10, im10;
  FLOAT re01, im01, re11, im11;

  for (col = 0; col + 1 < m; col += 2) {
    /* row 0 of columns col and col + 1, in the source and in the copy */
    src0 = a + col * a_stride;
    src1 = src0 + a_stride;
    dst0 = b + col * b_stride;
    dst1 = dst0 + b_stride;

    /* 2x2 tiles above the diagonal, each also stored transposed below it */
    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];
      re01 = src1[k];     im01 = src1[k + 1];
      re11 = src1[k + 2]; im11 = src1[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;
      dst1[k]     = re01; dst1[k + 1] = im01;
      dst1[k + 2] = re11; dst1[k + 3] = im11;

      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir0[2] = re01; mir0[3] = im01;
      mir1[0] = re10; mir1[1] = im10;
      mir1[2] = re11; mir1[3] = im11;
    }

    /* 2x2 diagonal tile; the entry below the diagonal is never read */
    k = 2 * col;
    re00 = src0[k];     im00 = src0[k + 1];
    re01 = src1[k];     im01 = src1[k + 1];
    re11 = src1[k + 2]; im11 = src1[k + 3];

    dst0[k]     = re00; dst0[k + 1] = im00;
    dst0[k + 2] = re01; dst0[k + 3] = im01;
    dst1[k]     = re01; dst1[k + 1] = im01;
    dst1[k + 2] = re11; dst1[k + 3] = im11;
  }

  /* odd m: final single column */
  if (col < m) {
    src0 = a + col * a_stride;
    dst0 = b + col * b_stride;

    for (row = 0; row < col; row += 2) {
      k = 2 * row;
      re00 = src0[k];     im00 = src0[k + 1];
      re10 = src0[k + 2]; im10 = src0[k + 3];

      dst0[k]     = re00; dst0[k + 1] = im00;
      dst0[k + 2] = re10; dst0[k + 3] = im10;

      /* mirrored into the last row of columns row and row + 1 */
      mir0 = b + row * b_stride + 2 * col;
      mir1 = mir0 + b_stride;
      mir0[0] = re00; mir0[1] = im00;
      mir1[0] = re10; mir1[1] = im10;
    }

    k = 2 * col;
    dst0[k]     = src0[k];
    dst0[k + 1] = src0[k + 1];
  }
}
#endif
#endif