?? atl_gemv_sse.c
字號:
/* * Automatically Tuned Linear Algebra Software v3.8.0 * (C) Copyright 2003 Camm Maguire * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions, and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the ATLAS group or the names of its contributers may * not be used to endorse or promote products derived from this * software without specific written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * */#include <stdio.h>#include <stdlib.h>#include "atlas_misc.h"#include "camm_util.h"#ifndef ATL_GAS_x8632 #error "This kernel requires gas x86-32 assembler!"#endif#define COPY_B#ifdef COPY_B#define plb(a_,b_,c_) pla(a_,b_,c_)#else#define plb(a_,b_,c_) pl(a_,b_,c_)#endif#undef p1_4_gemvT_1#define p1_4_gemvT_1(a_) \ pls(SS(a_,MM(0,RS4)),bx,4) \ pls(SS(a_,MM(0,RS4)),ax,0) \ pls(SS(a_,MM(0,RS4)),si,2) \ pmsr(4,0) \ pasr(0,6) \ pmsr(4,2) \ pasr(2,7)#undef p1_2_gemvT_1#define p1_2_gemvT_1(a_) \ px(4) \ pld(SS(a_,MM(0,RS4)),bx,4) \ px(0) \ pld(SS(a_,MM(0,RS4)),ax,0) \ px(2) \ pld(SS(a_,MM(0,RS4)),si,2) \ pm(4,0) \ pa(0,6) \ pm(4,2) \ pa(2,7)#undef p1_gemvT_1#define p1_gemvT_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(0,RS4)),si,2) \ pm(4,0) \ pa(0,6) \ pm(4,2) \ pa(2,7)#undef p2_gemvT_1#define p2_gemvT_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ plq(SS(a_,MM(0,RS4)),si,2) \ plq(SS(a_,MM(1,RS4)),si,3) \ pm(4,0) \ pa(0,6) \ pm(4,2) \ pa(2,7) \ pm(5,1) \ pa(1,6) \ pm(5,3) \ pa(3,7)#undef p4_gemvT_1#define p4_gemvT_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ plq(SS(a_,MM(0,RS4)),si,2) \ plq(SS(a_,MM(1,RS4)),si,3) \ pm(4,0) \ pa(0,6) \ pm(4,2) \ pa(2,7) \ pm(5,1) \ pa(1,6) \ pm(5,3) \ pa(3,7) \ plb(SS(a_,MM(2,RS4)),bx,4) \ plb(SS(a_,MM(3,RS4)),bx,5) \ plq(SS(a_,MM(2,RS4)),ax,0) \ plq(SS(a_,MM(3,RS4)),ax,1) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),si) \ plq(SS(a_,MM(2,RS4)),si,2) \ plq(SS(a_,MM(3,RS4)),si,3) \ pm(4,0) \ pa(0,6) \ pm(4,2) \ pa(2,7) \ pm(5,1) \ pa(1,6) \ pm(5,3) \ pa(3,7)#undef lpgemvT_1#define lpgemvT_1(a_)#undef dpgemvT_1#define dpgemvT_1(a_) p4_gemvT_1(a_)#undef plgemvT_1#define plgemvT_1 16#undef p1_4_gemvT_1_1#define p1_4_gemvT_1_1(a_) \ pls(SS(a_,MM(0,RS4)),bx,4) \ pls(SS(a_,MM(0,RS4)),ax,0) \ pmsr(4,0) \ pasr(0,6)#undef p1_2_gemvT_1_1#define p1_2_gemvT_1_1(a_) \ px(4) \ pld(SS(a_,MM(0,RS4)),bx,4) \ px(0) \ pld(SS(a_,MM(0,RS4)),ax,0) \ pm(4,0) \ pa(0,6)#undef p1_gemvT_1_1#define p1_gemvT_1_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plq(SS(a_,MM(0,RS4)),ax,0) \ pm(4,0) \ pa(0,6)#undef p2_gemvT_1_1#define p2_gemvT_1_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pm(4,0) \ pa(0,6) \ pm(5,1) \ pa(1,6)#undef p4_gemvT_1_1#define p4_gemvT_1_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plb(SS(a_,MM(2,RS4)),bx,3) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ plq(SS(a_,MM(2,RS4)),ax,2) \ pm(4,0) \ pa(0,6) \ plb(SS(a_,MM(3,RS4)),bx,4) \ plq(SS(a_,MM(3,RS4)),ax,0) \ pm(5,1) \ pa(1,6) \ pm(3,2) \ pa(2,6) \ pm(4,0) \ pa(0,6)#undef lpgemvT_1_1#define lpgemvT_1_1(a_)#undef dpgemvT_1_1#define dpgemvT_1_1(a_) p4_gemvT_1_1(a_)#undef plgemvT_1_1#define plgemvT_1_1 16#undef p1_4_gemvT_1_3#define p1_4_gemvT_1_3(a_) \ pls(SS(a_,MM(0,RS4)),bx,4) \ pls(SS(a_,MM(0,RS4)),ax,0) \ pmsr(4,0) \ pasr(0,6)#undef p1_2_gemvT_1_3#define p1_2_gemvT_1_3(a_) \ px(4) \ pld(SS(a_,MM(0,RS4)),bx,4) \ px(0) \ pld(SS(a_,MM(0,RS4)),ax,0) \ pm(4,0) \ pa(0,6)#undef p1_gemvT_1_3#define p1_gemvT_1_3(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plq(SS(a_,MM(0,RS4)),ax,0) \ pm(4,0) \ pa(0,6)#undef p2_gemvT_1_3#define p2_gemvT_1_3(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pm(4,0) \ pa(0,6) \ pm(5,1) \ pa(1,6)#undef p4_gemvT_1_3#define p4_gemvT_1_3(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plb(SS(a_,MM(2,RS4)),bx,3) \ plq(SS(a_,MM(1,RS4)),ax,1) \ plq(SS(a_,MM(2,RS4)),ax,2) \ f(nta,SS(a_,MM((SS(2,CL)),RS4)),ax) \ pm(4,0) \ pa(0,6) \ plb(SS(a_,MM(3,RS4)),bx,4) \ plq(SS(a_,MM(3,RS4)),ax,0) \ pm(5,1) \ pa(1,7) \ pm(3,2) \ pa(2,6) \ pm(4,0) \ pa(0,7)#undef p8_gemvT_1_3#define p8_gemvT_1_3(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plb(SS(a_,MM(2,RS4)),bx,3) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ plq(SS(a_,MM(2,RS4)),ax,2) \ pm(4,0) \ pa(0,6) \ plb(SS(a_,MM(3,RS4)),bx,4) \ plq(SS(a_,MM(3,RS4)),ax,0) \ pm(5,1) \ pa(1,7) \ pm(3,2) \ pa(2,6) \ pm(4,0) \ pa(0,7) \ plb(SS(a_,MM(4,RS4)),bx,4) \ plb(SS(a_,MM(5,RS4)),bx,5) \ plb(SS(a_,MM(6,RS4)),bx,3) \ plq(SS(a_,MM(4,RS4)),ax,0) \ plq(SS(a_,MM(5,RS4)),ax,1) \ plq(SS(a_,MM(6,RS4)),ax,2) \ pm(4,0) \ pa(0,6) \ plb(SS(a_,MM(7,RS4)),bx,4) \ plq(SS(a_,MM(7,RS4)),ax,0) \ pm(5,1) \ pa(1,7) \ pm(3,2) \ pa(2,6) \ pm(4,0) \ pa(0,7)#undef lpgemvT_1_3#define lpgemvT_1_3(a_)#undef dpgemvT_1_3#define dpgemvT_1_3(a_) p4_gemvT_1_3(a_)#undef plgemvT_1_3#define plgemvT_1_3 16#undef p1_4_gemvT_1_1c#define p1_4_gemvT_1_1c(a_)#undef p1_2_gemvT_1_1c#define p1_2_gemvT_1_1c(a_) \ px(4) \ pld(SS(a_,MM(0,RS4)),bx,4) \ px(0) \ pld(SS(a_,MM(0,RS4)),ax,0) \ pc(4,2) \ ps(CSHUF,4,4) \ pm(0,2) \ pa(2,6) \ pm(0,4) \ pa(4,7)#undef p1_gemvT_1_1c#define p1_gemvT_1_1c(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plq(SS(a_,MM(0,RS4)),ax,0) \ pc(4,2) \ ps(CSHUF,4,4) \ pm(0,2) \ pa(2,6) \ pm(0,4) \ pa(4,7)#undef p2_gemvT_1_1c#define p2_gemvT_1_1c(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pc(4,2) \ pc(5,3) \ ps(CSHUF,4,4) \ ps(CSHUF,5,5) \ pm(0,2) \ pa(2,6) \ pm(0,4) \ pa(4,7) \ pm(1,3) \ pa(3,6) \ pm(1,5) \ pa(5,7)#undef p4_gemvT_1_1c#define p4_gemvT_1_1c(a_) \ plb(SS(a_,MM(0,RS4)),bx,4) \ plb(SS(a_,MM(1,RS4)),bx,5) \ f(nta,SS(a_,MM((SS(0,CL)),RS4)),ax) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plq(SS(a_,MM(1,RS4)),ax,1) \ pc(4,2) \ pc(5,3) \ ps(CSHUF,4,4) \ ps(CSHUF,5,5) \ pm(0,2) \ pa(2,6) \ pm(0,4) \ pa(4,7) \ pm(1,3) \ pa(3,6) \ pm(1,5) \ pa(5,7) \ plb(SS(a_,MM(2,RS4)),bx,4) \ plb(SS(a_,MM(3,RS4)),bx,5) \ plq(SS(a_,MM(2,RS4)),ax,0) \ plq(SS(a_,MM(3,RS4)),ax,1) \ pc(4,2) \ pc(5,3) \ ps(CSHUF,4,4) \ ps(CSHUF,5,5) \ pm(0,2) \ pa(2,6) \ pm(0,4) \ pa(4,7) \ pm(1,3) \ pa(3,6) \ pm(1,5) \ pa(5,7)#undef lpgemvT_1_1c#define lpgemvT_1_1c(a_)#undef dpgemvT_1_1c#define dpgemvT_1_1c(a_) p4_gemvT_1_1c(a_)#undef plgemvT_1_1c#define plgemvT_1_1c 16#undef p1_4_gemvT_3_1#define p1_4_gemvT_3_1(a_) \ pls(SS(a_,MM(0,RS4)),bx,3) \ pls(SS(a_,MM(0,RS4)),ax,0) \ plsx(SS(a_,MM(0,RS4)),ax,bp,1,1) \ plsx(SS(a_,MM(0,RS4)),ax,bp,2,2) \ pmsr(3,0) \ pasr(0,4) \ pmsr(3,1) \ pasr(1,5) \ pmsr(3,2) \ pasr(2,6)#undef p1_2_gemvT_3_1#define p1_2_gemvT_3_1(a_) \ px(3) \ px(0) \ px(1) \ px(2) \ pld(SS(a_,MM(0,RS4)),bx,3) \ pld(SS(a_,MM(0,RS4)),ax,0) \ pldx(SS(a_,MM(0,RS4)),ax,bp,1,1) \ pldx(SS(a_,MM(0,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6)#undef p1_gemvT_3_1#define p1_gemvT_3_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,3) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plqx(SS(a_,MM(0,RS4)),ax,bp,1,1) \ plqx(SS(a_,MM(0,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \ pm(3,2) \ pa(2,6)#undef p2_gemvT_3_1#define p2_gemvT_3_1(a_) \ plb(SS(a_,MM(0,RS4)),bx,3) \ plq(SS(a_,MM(0,RS4)),ax,0) \ plqx(SS(a_,MM(0,RS4)),ax,bp,1,1) \ plqx(SS(a_,MM(0,RS4)),ax,bp,2,2) \ pm(3,0) \ pa(0,4) \ pm(3,1) \ pa(1,5) \
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -