t1sv_32.c
字號:
/* * Copyright (c) 2003, 2007-8 Matteo Frigo * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * *//* This file was automatically generated --- DO NOT EDIT *//* Generated on Sat Nov 15 20:53:18 EST 2008 */#include "codelet-dft.h"#ifdef HAVE_FMA/* Generated by: ../../../genfft/gen_twiddle -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 32 -name t1sv_32 -include ts.h *//* * This function contains 434 FP additions, 260 FP multiplications, * (or, 236 additions, 62 multiplications, 198 fused multiply/add), * 158 stack variables, 7 constants, and 128 memory accesses */#include "ts.h"static void t1sv_32(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms){ DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP668178637, +0.668178637919298919997757686523080761552472251); DVK(KP198912367, +0.198912367379658006911597622644676228597850501); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP414213562, +0.414213562373095048801688724209698078569671875); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + (mb * 62); m < me; m = m + (2 * VL), ri = ri + ((2 * VL) * ms), ii 
= ii + ((2 * VL) * ms), W = W + ((2 * VL) * 62), MAKE_VOLATILE_STRIDE(rs)) { V T8Z, T90; { V T87, T8x, T3w, T8, T3B, T83, Tl, T8y, T6F, Tz, T3J, T5T, T6G, TM, T3Q; V T5U, T46, T5Y, T7D, T6L, T5X, T3Z, T6M, T1f, T4l, T61, T7E, T6R, T60, T4e; V T6O, T1G, T5r, T6c, T78, T7N, T54, T6f, T32, T7b, T4S, T65, T6X, T7I, T4v; V T68, T29, T70, T4x, T2f, T5b, T5s, T7O, T7e, T5t, T5i, T79, T3t, T2h, T2k; V T2j, T2o, T2r, T4H, T2y, T2n, T2q, T4y, T2i; { V T3U, TU, TW, TZ, TY, T13, T16, T12, T15, T3V, TX, T44, T1d; { V T1, T86, T3, T6, T5, Ta, Td, Tg, Tj, Tf, T84, T4, Tc, Ti, T3x; V Tb, T2, T9; T1 = LD(&(ri[0]), ms, &(ri[0])); T86 = LD(&(ii[0]), ms, &(ii[0])); T3 = LD(&(ri[WS(rs, 16)]), ms, &(ri[0])); T6 = LD(&(ii[WS(rs, 16)]), ms, &(ii[0])); T2 = LDW(&(W[TWVL * 30])); T5 = LDW(&(W[TWVL * 31])); Ta = LD(&(ri[WS(rs, 8)]), ms, &(ri[0])); Td = LD(&(ii[WS(rs, 8)]), ms, &(ii[0])); T9 = LDW(&(W[TWVL * 14])); Tg = LD(&(ri[WS(rs, 24)]), ms, &(ri[0])); Tj = LD(&(ii[WS(rs, 24)]), ms, &(ii[0])); Tf = LDW(&(W[TWVL * 46])); T84 = VMUL(T2, T6); T4 = VMUL(T2, T3); Tc = LDW(&(W[TWVL * 15])); Ti = LDW(&(W[TWVL * 47])); T3x = VMUL(T9, Td); Tb = VMUL(T9, Ta); { V Tu, Tx, T3F, Ts, Tt, Tw; { V To, Tr, Tq, T3E, Tp; { V T3y, Te, Tn, T3A, Tk; { V T3z, Th, T85, T7; To = LD(&(ri[WS(rs, 4)]), ms, &(ri[0])); T3z = VMUL(Tf, Tj); Th = VMUL(Tf, Tg); T85 = VFNMS(T5, T3, T84); T7 = VFMA(T5, T6, T4); Tr = LD(&(ii[WS(rs, 4)]), ms, &(ii[0])); T3y = VFNMS(Tc, Ta, T3x); Te = VFMA(Tc, Td, Tb); Tn = LDW(&(W[TWVL * 6])); T3A = VFNMS(Ti, Tg, T3z); Tk = VFMA(Ti, Tj, Th); T87 = VADD(T85, T86); T8x = VSUB(T86, T85); T3w = VSUB(T1, T7); T8 = VADD(T1, T7); } Tq = LDW(&(W[TWVL * 7])); T3E = VMUL(Tn, Tr); Tp = VMUL(Tn, To); T3B = VSUB(T3y, T3A); T83 = VADD(T3y, T3A); Tl = VADD(Te, Tk); T8y = VSUB(Te, Tk); } Tu = LD(&(ri[WS(rs, 20)]), ms, &(ri[0])); Tx = LD(&(ii[WS(rs, 20)]), ms, &(ii[0])); T3F = VFNMS(Tq, To, T3E); Ts = VFMA(Tq, Tr, Tp); Tt = LDW(&(W[TWVL * 38])); Tw = LDW(&(W[TWVL * 39])); } { V TB, TE, TD, TH, TK, T3G, Tv, 
TG, TJ, T3L, TC, TA; TB = LD(&(ri[WS(rs, 28)]), ms, &(ri[0])); TE = LD(&(ii[WS(rs, 28)]), ms, &(ii[0])); TA = LDW(&(W[TWVL * 54])); TD = LDW(&(W[TWVL * 55])); TH = LD(&(ri[WS(rs, 12)]), ms, &(ri[0])); TK = LD(&(ii[WS(rs, 12)]), ms, &(ii[0])); T3G = VMUL(Tt, Tx); Tv = VMUL(Tt, Tu); TG = LDW(&(W[TWVL * 22])); TJ = LDW(&(W[TWVL * 23])); T3L = VMUL(TA, TE); TC = VMUL(TA, TB); { V T19, T1c, T3P, T3K, T18, T1b, TV, T43, T1a; { V TQ, TT, T3M, TF, TS, T3I, T3D, T3O, TL, T3T, TR; { V T3H, Ty, T3N, TI, TP; TQ = LD(&(ri[WS(rs, 2)]), ms, &(ri[0])); TT = LD(&(ii[WS(rs, 2)]), ms, &(ii[0])); T3H = VFNMS(Tw, Tu, T3G); Ty = VFMA(Tw, Tx, Tv); T3N = VMUL(TG, TK); TI = VMUL(TG, TH); T3M = VFNMS(TD, TB, T3L); TF = VFMA(TD, TE, TC); TP = LDW(&(W[TWVL * 2])); TS = LDW(&(W[TWVL * 3])); T6F = VADD(T3F, T3H); T3I = VSUB(T3F, T3H); Tz = VADD(Ts, Ty); T3D = VSUB(Ts, Ty); T3O = VFNMS(TJ, TH, T3N); TL = VFMA(TJ, TK, TI); T3T = VMUL(TP, TT); TR = VMUL(TP, TQ); } T19 = LD(&(ri[WS(rs, 26)]), ms, &(ri[0])); T1c = LD(&(ii[WS(rs, 26)]), ms, &(ii[0])); T3J = VADD(T3D, T3I); T5T = VSUB(T3I, T3D); T6G = VADD(T3M, T3O); T3P = VSUB(T3M, T3O); TM = VADD(TF, TL); T3K = VSUB(TF, TL); T3U = VFNMS(TS, TQ, T3T); TU = VFMA(TS, TT, TR); T18 = LDW(&(W[TWVL * 50])); T1b = LDW(&(W[TWVL * 51])); } TW = LD(&(ri[WS(rs, 18)]), ms, &(ri[0])); TZ = LD(&(ii[WS(rs, 18)]), ms, &(ii[0])); T3Q = VSUB(T3K, T3P); T5U = VADD(T3K, T3P); TV = LDW(&(W[TWVL * 34])); TY = LDW(&(W[TWVL * 35])); T43 = VMUL(T18, T1c); T1a = VMUL(T18, T19); T13 = LD(&(ri[WS(rs, 10)]), ms, &(ri[0])); T16 = LD(&(ii[WS(rs, 10)]), ms, &(ii[0])); T12 = LDW(&(W[TWVL * 18])); T15 = LDW(&(W[TWVL * 19])); T3V = VMUL(TV, TZ); TX = VMUL(TV, TW); T44 = VFNMS(T1b, T19, T43); T1d = VFMA(T1b, T1c, T1a); } } } } { V T4Z, T2H, T2J, T2M, T2L, T2Q, T2T, T2P, T2S, T5p, T30, T50, T2K; { V T49, T1l, T1n, T1q, T1p, T1u, T1x, T4j, T1E, T1t, T1w, T4a, T1o; { V T1A, T1D, T1C, T4i, T1B, T1m; { V T1h, T1k, T41, T14, T3W, T10, T1g, T1j; T1h = LD(&(ri[WS(rs, 30)]), ms, &(ri[0])); T1k 
= LD(&(ii[WS(rs, 30)]), ms, &(ii[0])); T41 = VMUL(T12, T16); T14 = VMUL(T12, T13); T3W = VFNMS(TY, TW, T3V); T10 = VFMA(TY, TZ, TX); T1g = LDW(&(W[TWVL * 58])); T1j = LDW(&(W[TWVL * 59])); { V T6J, T3X, T11, T40, T48, T1i, T6K, T45, T1e, T3Y, T1z, T42, T17; T1A = LD(&(ri[WS(rs, 22)]), ms, &(ri[0])); T1D = LD(&(ii[WS(rs, 22)]), ms, &(ii[0])); T42 = VFNMS(T15, T13, T41); T17 = VFMA(T15, T16, T14); T6J = VADD(T3U, T3W); T3X = VSUB(T3U, T3W); T11 = VADD(TU, T10); T40 = VSUB(TU, T10); T48 = VMUL(T1g, T1k); T1i = VMUL(T1g, T1h); T6K = VADD(T42, T44); T45 = VSUB(T42, T44); T1e = VADD(T17, T1d); T3Y = VSUB(T17, T1d); T1z = LDW(&(W[TWVL * 42])); T1C = LDW(&(W[TWVL * 43])); T49 = VFNMS(T1j, T1h, T48); T1l = VFMA(T1j, T1k, T1i); T46 = VADD(T40, T45); T5Y = VSUB(T40, T45); T7D = VADD(T6J, T6K); T6L = VSUB(T6J, T6K); T5X = VADD(T3X, T3Y); T3Z = VSUB(T3X, T3Y); T6M = VSUB(T11, T1e); T1f = VADD(T11, T1e); T4i = VMUL(T1z, T1D); T1B = VMUL(T1z, T1A); } } T1n = LD(&(ri[WS(rs, 14)]), ms, &(ri[0])); T1q = LD(&(ii[WS(rs, 14)]), ms, &(ii[0])); T1m = LDW(&(W[TWVL * 26])); T1p = LDW(&(W[TWVL * 27])); T1u = LD(&(ri[WS(rs, 6)]), ms, &(ri[0])); T1x = LD(&(ii[WS(rs, 6)]), ms, &(ii[0])); T4j = VFNMS(T1C, T1A, T4i); T1E = VFMA(T1C, T1D, T1B); T1t = LDW(&(W[TWVL * 10])); T1w = LDW(&(W[TWVL * 11])); T4a = VMUL(T1m, T1q); T1o = VMUL(T1m, T1n); } { V T2W, T2Z, T6P, T4c, T1s, T4f, T6Q, T4k, T1F, T4d, T2V, T2Y, T5o, T2X, T2I; { V T2D, T2G, T2C, T2F, T4g, T1v, T4b, T1r; T2D = LD(&(ri[WS(rs, 31)]), ms, &(ri[WS(rs, 1)])); T2G = LD(&(ii[WS(rs, 31)]), ms, &(ii[WS(rs, 1)])); T2C = LDW(&(W[TWVL * 60])); T2F = LDW(&(W[TWVL * 61])); T4g = VMUL(T1t, T1x); T1v = VMUL(T1t, T1u); T4b = VFNMS(T1p, T1n, T4a); T1r = VFMA(T1p, T1q, T1o); T2W = LD(&(ri[WS(rs, 23)]), ms, &(ri[WS(rs, 1)])); T2Z = LD(&(ii[WS(rs, 23)]), ms, &(ii[WS(rs, 1)])); { V T4Y, T2E, T4h, T1y; T4Y = VMUL(T2C, T2G); T2E = VMUL(T2C, T2D); T4h = VFNMS(T1w, T1u, T4g); T1y = VFMA(T1w, T1x, T1v); T6P = VADD(T49, T4b); T4c = VSUB(T49, T4b); T1s = VADD(T1l, 
T1r); T4f = VSUB(T1l, T1r); T4Z = VFNMS(T2F, T2D, T4Y); T2H = VFMA(T2F, T2G, T2E); T6Q = VADD(T4h, T4j); T4k = VSUB(T4h, T4j); T1F = VADD(T1y, T1E); T4d = VSUB(T1y, T1E); T2V = LDW(&(W[TWVL * 44])); } T2Y = LDW(&(W[TWVL * 45])); } T2J = LD(&(ri[WS(rs, 15)]), ms, &(ri[WS(rs, 1)])); T2M = LD(&(ii[WS(rs, 15)]), ms, &(ii[WS(rs, 1)])); T4l = VADD(T4f, T4k); T61 = VSUB(T4f, T4k); T7E = VADD(T6P, T6Q); T6R = VSUB(T6P, T6Q); T60 = VADD(T4c, T4d); T4e = VSUB(T4c, T4d); T6O = VSUB(T1s, T1F); T1G = VADD(T1s, T1F); T5o = VMUL(T2V, T2Z); T2X = VMUL(T2V, T2W); T2I = LDW(&(W[TWVL * 28])); T2L = LDW(&(W[TWVL * 29])); T2Q = LD(&(ri[WS(rs, 7)]), ms, &(ri[WS(rs, 1)])); T2T = LD(&(ii[WS(rs, 7)]), ms, &(ii[WS(rs, 1)])); T2P = LDW(&(W[TWVL * 12])); T2S = LDW(&(W[TWVL * 13])); T5p = VFNMS(T2Y, T2W, T5o); T30 = VFMA(T2Y, T2Z, T2X); T50 = VMUL(T2I, T2M); T2K = VMUL(T2I, T2J); } } { V T4q, T1O, T1Q, T1T, T1S, T1X, T20, T4Q, T27, T1W, T1Z, T4r, T1R; { V T23, T26, T25, T4P, T24, T1P; { V T1K, T1N, T5m, T2R, T1J, T1M, T51, T2N; T1K = LD(&(ri[WS(rs, 1)]), ms, &(ri[WS(rs, 1)])); T1N = LD(&(ii[WS(rs, 1)]), ms, &(ii[WS(rs, 1)])); T5m = VMUL(T2P, T2T); T2R = VMUL(T2P, T2Q); T1J = LDW(&(W[0])); T1M = LDW(&(W[TWVL * 1])); T51 = VFNMS(T2L, T2J, T50); T2N = VFMA(T2L, T2M, T2K); { V T76, T52, T2O, T5l, T77, T5q, T31, T53, T22; T23 = LD(&(ri[WS(rs, 25)]), ms, &(ri[WS(rs, 1)])); T26 = LD(&(ii[WS(rs, 25)]), ms, &(ii[WS(rs, 1)])); { V T5n, T2U, T4p, T1L; T5n = VFNMS(T2S, T2Q, T5m); T2U = VFMA(T2S, T2T, T2R); T4p = VMUL(T1J, T1N); T1L = VMUL(T1J, T1K); T76 = VADD(T4Z, T51); T52 = VSUB(T4Z, T51); T2O = VADD(T2H, T2N); T5l = VSUB(T2H, T2N); T77 = VADD(T5n, T5p); T5q = VSUB(T5n, T5p); T31 = VADD(T2U, T30); T53 = VSUB(T2U, T30); T4q = VFNMS(T1M, T1K, T4p); T1O = VFMA(T1M, T1N, T1L); T22 = LDW(&(W[TWVL * 48])); } T25 = LDW(&(W[TWVL * 49])); T5r = VADD(T5l, T5q); T6c = VSUB(T5l, T5q); T78 = VSUB(T76, T77); T7N = VADD(T76, T77); T54 = VSUB(T52, T53); T6f = VADD(T52, T53); T32 = VADD(T2O, T31); T7b = VSUB(T2O, T31); 
T4P = VMUL(T22, T26); T24 = VMUL(T22, T23); } }
快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -