?? s_erff.s
字號:
.file "erff.s"// Copyright (c) 2001 - 2005, Intel Corporation// All rights reserved.//// Contributed 2001 by the Intel Numerics Group, Intel Corporation//// Redistribution and use in source and binary forms, with or without// modification, are permitted provided that the following conditions are// met://// * Redistributions of source code must retain the above copyright// notice, this list of conditions and the following disclaimer.//// * Redistributions in binary form must reproduce the above copyright// notice, this list of conditions and the following disclaimer in the// documentation and/or other materials provided with the distribution.//// * The name of Intel Corporation may not be used to endorse or promote// products derived from this software without specific prior written// permission.// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 08/14/01 Initial version
// 05/20/02 Cleaned up namespace and sf0 syntax
// 02/06/03 Reordered header: .section, .global, .proc, .align
// 03/31/05 Reformatted delimiters between data tables
//
// API
//==============================================================
// float erff(float)
//
// Overview of operation
//==============================================================
// Background
//
//
// There are 8 paths:
// 1. x = +/-0.0
//    Return erff(x) = +/-0.0
//
// 2. 0.0 < |x| < 0.125
//    Return erff(x) = x *Pol3(x^2),
//    where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
//
// 3. 0.125 <= |x| < 4.0
//    Return erff(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
//    where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
//          PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
//          PolA(|x|) = A3*|x|^3 + A2*x^2 + A1*|x| + A0
//
//    The range 0.125 <= |x| < 4.0 is actually split into 5 subranges,
//    each with its own set of coefficients.
//    Below is the list of subranges:
//    3.1 0.125 <= |x| < 0.25
//    3.2 0.25 <= |x| < 0.5
//    3.3 0.5 <= |x| < 1.0
//    3.4 1.0 <= |x| < 2.0
//    3.5 2.0 <= |x| < 4.0
//
// 4. 4.0 <= |x| < +INF
//    Return erff(x) = sign(x)*(1.0d - 2^(-52))
//
// 5. |x| = INF
//    Return erff(x) = sign(x) * 1.0
//
// 6. x = [S,Q]NaN
//    Return erff(x) = QNaN
//
// 7. x is positive denormal
//    Return erff(x) = C0*x - x^2,
//    where C0 = 2.0/sqrt(Pi)
//
// 8. 
//    x is negative denormal
//    Return erff(x) = C0*x + x^2,
//    where C0 = 2.0/sqrt(Pi)
//
// Registers used
//==============================================================
// Floating Point registers used:
// f8, input
// f32 -> f59

// General registers used:
// r32 -> r45, r2, r3

// Predicate registers used:
// p0, p6 -> p12, p14, p15

// p6           to filter out case when x = [Q,S]NaN or +/-0
// p7           to filter out case when x = denormal
// p8           set if |x| >= 0.3125, used also to process denormal input
// p9           to filter out case when |x| = inf
// p10          to filter out case when |x| < 0.125
// p11          to filter out case when 0.125 <= |x| < 4.0
// p12          to filter out case when |x| >= 4.0
// p14          set to 1 for positive x
// p15          set to 1 for negative x

// Assembly macros
//==============================================================
// Symbolic names for the general registers used by erff.
rDataPtr           = r2
rDataPtr1          = r3

rBias              = r33
rCoeffAddr3        = r34
rCoeffAddr1        = r35
rCoeffAddr2        = r36
rOffset2           = r37
rBias2             = r38
rMask              = r39
rArg               = r40
rBound             = r41
rSignBit           = r42
rAbsArg            = r43
rDataPtr2          = r44
rSaturation        = r45

//==============================================================
// Symbolic names for the floating-point registers used by erff.
fA0                = f32
fA1                = f33
fA2                = f34
fA3                = f35
fC0                = f36
fC1                = f37
fC2                = f38
fC3                = f39
fD0                = f40
fD1                = f41
fD2                = f42
fB0                = f43
fArgSqr            = f44
fAbsArg            = f45
fSignumX           = f46
fArg4              = f47
fArg4Sgn           = f48
fArg3              = f49
fArg3Sgn           = f50
fArg7Sgn           = f51
fArg6Sgn           = f52
fPolC              = f53
fPolCTmp           = f54
fPolA              = f55
fPolATmp           = f56
fPolD              = f57
fPolDTmp           = f58
fArgSqrSgn         = f59

// Data tables
//==============================================================
// Layout of erff_data (every entry is one 8-byte data8 word; the byte
// offsets below follow from the entry counts in this table):
//   0   .. 319  5 groups of 8 entries {C0,C1,C2,C3,D0,D1,D2,B0}, one group
//               per subrange of 0.125 <= |x| < 4.0, in ascending order
//   320 .. 351  {C0,C1,C2,C3} for -0.125 < x < 0.125
//   352 .. 511  5 groups of 4 entries {A0,A1,A2,A3}, same subrange order
//   512         1.0 - epsilon
//   520         C0 = 2.0/sqrt(Pi)

RODATA

.align 16

LOCAL_OBJECT_START(erff_data)
// Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25
data8 0xBE4218BB56B49E66 // C0
data8 0x3F7AFB8315DA322B // C1
data8 0x3F615D6EBEE0CA32 // C2
data8 0xBF468D71CF4F0918 // C3
data8 0x40312115B0932F24 // D0
data8 0xC0160D6CD0991EA3 // D1
data8 0xBFE04A567A6DBE4A // D2
data8 0xBF4207BC640D1509 // B0
// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5
data8 0x3F90849356383F58 // C0
data8 0x3F830BD5BA240F09 // C1
data8 0xBF3FA4970E2BCE23 // C2
data8 0xBF6061798E58D0FD // C3
data8 0xBF68C0D83DD22E02 // D0
data8 0x401C0A9EE4108F94 // D1
data8 0xC01056F9B5E387F5 // D2
data8 0x3F1C9744E36A5706 // B0
// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
data8 0x3F85F7D419A13DE3 // C0
data8 0x3F791A13FF66D45A // C1
data8 0x3F46B17B16B5929F // C2
data8 0xBF5124947A8BF45E // C3
data8 0x3FA1B3FD95EA9564 // D0
data8 0x40250CECD79A020A // D1
data8 0xC0190DC96FF66CCD // D2
data8 0x3F4401AE28BA4DD5 // B0
// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
data8 0xBF49E07E3584C3AE // C0
data8 0x3F3166621131445C // C1
data8 0xBF65B7FC1EAC2099 // C2
data8 0x3F508C6BD211D736 // C3
data8 0xC053FABD70601067 // D0
data8 0x404A06640EE87808 // D1
data8 0xC0283F30817A3F08 // D2
data8 0xBF2F6DBBF4D6257F // B0
// Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0
data8 0xBF849855D67E9407 // C0
data8 0x3F5ECA5FEC01C70C // C1
data8 0xBF483110C30FABA4 // C2
data8 0x3F1618DA72860403 // C3
data8 0xC08A5C9D5FE8B9F6 // D0
data8 0x406EFF5F088CEC4B // D1
data8 0xC03A5743DF38FDE0 // D2
data8 0xBEE397A9FA5686A2 // B0
// Polynomial coefficients for the erf(x), -0.125 < x < 0.125
data8 0x3FF20DD7504270CB // C0
data8 0xBFD8127465AFE719 // C1
data8 0x3FBCE2D77791DD77 // C2
data8 0xBF9B582755CDF345 // C3
// Polynomial coefficients for the erf(x), 0.125 <= |x| < 0.25
data8 0xBD54E7E451AF0E36 // A0
data8 0x3FF20DD75043FE20 // A1
data8 0xBE05680ACF8280E4 // A2
data8 0xBFD812745E92C3D3 // A3
// Polynomial coefficients for the erf(x), 0.25 <= |x| < 0.5
data8 0xBE1ACEC2859CB55F // A0
data8 0x3FF20DD75E8D2B64 // A1
data8 0xBEABC6A83208FCFC // A2
data8 0xBFD81253E42E7B99 // A3
// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
data8 0x3EABD5A2482B4979 // A0
data8 0x3FF20DCAA52085D5 // A1
data8 0x3F13A994A348795B // A2
data8 0xBFD8167B2DFCDE44 // A3
// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
data8 0xBF5BA377DDAB4E17 // A0
data8 0x3FF2397F1D8FC0ED // A1
data8 0xBF9945BFC1915C21 // A2
data8 0xBFD747AAABB690D8 // A3
// Polynomial coefficients for the erf(x), 2.0 <= |x| < 4.0
data8 0x3FF0E2920E0391AF // A0
data8 0xC00D249D1A95A5AE // A1
data8 0x40233905061C3803 // A2
data8 0xC027560B851F7690 // A3
//
data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon
data8 0x3FF20DD750429B6D // C0 = 2.0/sqrt(Pi)
LOCAL_OBJECT_END(erff_data)


.section .text
GLOBAL_LIBM_ENTRY(erff)

{ .mfi
      alloc          r32 = ar.pfs, 0, 14, 0, 0
      fmerge.s       fAbsArg = f1, f8           // |x|
      addl           rMask = 0x806, r0
}
{ .mfi
      addl           rDataPtr = @ltoff(erff_data), gp
      fma.s1         fArgSqr = f8, f8, f0       // x^2
      adds           rSignBit = 0x1, r0
}
;;

{ .mfi
      getf.s         rArg = f8                  // x in GR
      fclass.m       p7,p0 = f8, 0x0b           // is x denormal ?
      // sign bit and 2 most significant bits of the significand
      // (0x806 << 20 = 0x80600000)
      shl            rMask = rMask, 20
}
{ .mfi
      ld8            rDataPtr = [rDataPtr]      // load address of erff_data
      nop.f          0
      adds           rBias2 = 0x1F0, r0
// NOTE(review): the listing is truncated at this point; the rest of this
// bundle and the remainder of erff are not visible in this chunk.
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -