?? e_asin.s
字號:
.file "asin.s"// Copyright (c) 2000 - 2003 Intel Corporation// All rights reserved.//// Contributed 2000 by the Intel Numerics Group, Intel Corporation//// Redistribution and use in source and binary forms, with or without// modification, are permitted provided that the following conditions are// met://// * Redistributions of source code must retain the above copyright// notice, this list of conditions and the following disclaimer.//// * Redistributions in binary form must reproduce the above copyright// notice, this list of conditions and the following disclaimer in the// documentation and/or other materials provided with the distribution.//// * The name of Intel Corporation may not be used to endorse or promote// products derived from this software without specific prior written// permission.// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.//// Intel Corporation is the author of this code, and requests that all// problem reports or change requests be submitted to it directly at// http://www.intel.com/software/products/opensource/libraries/num.htm.// History//==============================================================// 02/02/00 Initial version// 08/17/00 New and much faster algorithm.// 08/31/00 Avoided bank conflicts on loads, shortened |x|=1 path,// fixed mfb split issue stalls.// 12/19/00 Fixed small arg cases to force inexact, or inexact and underflow.// 08/02/02 New and much faster algorithm II// 02/06/03 Reordered header: .section, .global, .proc, .align// Description//=========================================// The asin function computes the principal value of the arc sine of x.// asin(0) returns 0, asin(1) returns pi/2, asin(-1) returns -pi/2.// A doman error occurs for arguments not in the range [-1,+1].//// The asin function returns the arc sine in the range [-pi/2, +pi/2] radians.//// There are 8 paths:// 1. x = +/-0.0// Return asin(x) = +/-0.0//// 2. 0.0 < |x| < 0.625// Return asin(x) = x + x^3 *PolA(x^2)// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32//// 3. 0.625 <=|x| < 1.0// Return asin(x) = sign(x) * ( Pi/2 - sqrt(R) * PolB(R))// Where R = 1 - |x|,// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12//// sqrt(R) is approximated using the following sequence:// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,// |eps| < 2^(-8)// Then 3 iterations are used to refine the result:// H0 = 0.5*y0// S0 = R*y0//// d0 = 0.5 - H0*S0// H1 = H0 + d0*H0// S1 = S0 + d0*S0//// d1 = 0.5 - H1*S1// H2 = H1 + d0*H1// S2 = S1 + d0*S1//// d2 = 0.5 - H2*S2// S3 = S3 + d2*S3//// S3 approximates sqrt(R) with enough accuracy for this algorithm//// So, the result should be reconstracted as follows:// asin(x) = sign(x) * (Pi/2 - S3*PolB(R))//// But for optimization perposes the reconstruction step is slightly// changed:// asin(x) = sign(x)*(Pi/2 - PolB(R)*S2) + sign(x)*d2*S2*PolB(R)//// 4. |x| = 1.0// Return asin(x) = sign(x)*Pi/2//// 5. 1.0 < |x| <= +INF// A doman error occurs for arguments not in the range [-1,+1]//// 6. x = [S,Q]NaN// Return asin(x) = QNaN//// 7. x is denormal// Return asin(x) = x + x^3,//// 8. x is unnormal// Normalize input in f8 and return to the very beginning of the function//// Registers used//==============================================================// Floating Point registers used:// f8, input, output// f6, f7, f9 -> f15, f32 -> f63// General registers used:// r3, r21 -> r31, r32 -> r38// Predicate registers used:// p0, p6 -> p14//// Assembly macros//=========================================// integer registers used// scratchrTblAddr = r3rPiBy2Ptr = r21rTmpPtr3 = r22rDenoBound = r23rOne = r24rAbsXBits = r25rHalf = r26r0625 = r27rSign = r28rXBits = r29rTmpPtr2 = r30rTmpPtr1 = r31// stackedGR_SAVE_PFS = r32GR_SAVE_B0 = r33GR_SAVE_GP = r34GR_Parameter_X = r35GR_Parameter_Y = r36GR_Parameter_RESULT = r37GR_Parameter_TAG = r38// floating point registers usedFR_X = f10FR_Y = f1FR_RESULT = f8// scratchfXSqr = f6fXCube = f7fXQuadr = f9f1pX = f10f1mX = f11f1pXRcp = f12f1mXRcp = f13fH = f14fS = f15// stackedfA3 = f32fB1 = f32fA5 = f33fB2 = f33fA7 = f34fPiBy2 = f34fA9 = f35fA11 = f36fB10 = f35fB11 = f36fA13 = f37fA15 = f38fB4 = f37fB5 = f38fA17 = f39fA19 = f40fB6 = f39fB7 = f40fA21 = f41fA23 = f42fB3 = f41fB8 = f42fA25 = f43fA27 = f44fB9 = f43fB12 = f44fA29 = f45fA31 = f46fA33 = f47fA35 = f48fBaseP = f49fB0 = f50fSignedS = f51fD = f52fHalf = f53fR = f54fCloseTo1Pol = f55fSignX = f56fDenoBound = f57fNormX = f58fX8 = f59fRSqr = f60fRQuadr = f61fR8 = f62fX16 = f63// Data tables//==============================================================RODATA.align 16LOCAL_OBJECT_START(asin_base_range_table)// Ai: Polynomial coefficients for the asin(x), |x| < .625000// Bi: Polynomial coefficients for the asin(x), |x| > .625000data8 0xBFDAAB56C01AE468 //A29data8 0x3FE1C470B76A5B2B //A31data8 0xBFDC5FF82A0C4205 //A33data8 0x3FC71FD88BFE93F0 //A35data8 0xB504F333F9DE6487, 0x00003FFF //B0data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3data8 0x3F9F1C71BC4A7823 //A9data8 0x3F96E8BBAAB216B2 //A11data8 0x3F91C4CA1F9F8A98 //A13data8 0x3F8C9DDCEDEBE7A6 //A15data8 0x3F877784442B1516 //A17data8 0x3F859C0491802BA2 //A19data8 0x9999999998C88B8F, 0x00003FFB //A5data8 0x3F6BD7A9A660BF5E //A21data8 0x3F9FC1659340419D //A23data8 0xB6DB6DB798149BDF, 0x00003FFA //A7data8 0xBFB3EF18964D3ED3 //A25data8 0x3FCD285315542CF2 //A27data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1data8 0x3EF0DDA376D10FB3 //B10data8 0xBEB83CAFE05EBAC9 //B11data8 0x3F65FFB67B513644 //B4data8 0x3F5032FBB86A4501 //B5data8 0x3F392162276C7CBA //B6data8 0x3F2435949FD98BDF //B7data8 0xD93923D7FA08341C, 0x00003FF9 //B2data8 0x3F802995B6D90BDB //B3data8 0x3F10DF86B341A63F //B8data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2data8 0x3EFA3EBD6B0ECB9D //B9data8 0x3EDE18BA080E9098 //B12LOCAL_OBJECT_END(asin_base_range_table).section .textGLOBAL_LIBM_ENTRY(asin)asin_unnormal_back:{ .mfi getf.d rXBits = f8 // grab bits of input value // set p12 = 1 if x is a NaN, denormal, or zero fclass.m p12, p0 = f8, 0xcf adds rSign = 1, r0}{ .mfi addl rTblAddr = @ltoff(asin_base_range_table),gp // 1 - x = 1 - |x| for positive x fms.s1 f1mX = f1, f1, f8 addl rHalf = 0xFFFE, r0 // exponent of 1/2};;{ .mfi addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625 // set p8 = 1 if x < 0 fcmp.lt.s1 p8, p9 = f8, f0 shl rSign = rSign, 63 // sign bit}{ .mfi // point to the beginning of the table ld8 rTblAddr = [rTblAddr] // 1 + x = 1 - |x| for negative x fma.s1 f1pX = f1, f1, f8 adds rOne = 0x3FF, r0};;{ .mfi andcm rAbsXBits = rXBits, rSign // bits of |x| fmerge.s fSignX = f8, f1 // signum(x) shl r0625 = r0625, 48 // bits of DP representation of 0.625}{ .mfb setf.exp fHalf = rHalf // load A2 to FP reg fma.s1 fXSqr = f8, f8, f0 // x^2 // branch on special path if x is a NaN, denormal, or zero(p12) br.cond.spnt asin_special};;{ .mfi adds rPiBy2Ptr = 272, rTblAddr nop.f 0 shl rOne = rOne, 52 // bits of 1.0}{ .mfi adds rTmpPtr1 = 16, rTblAddr nop.f 0 // set p6 = 1 if |x| < 0.625 cmp.lt p6, p7 = rAbsXBits, r0625};;{ .mfi ldfpd fA29, fA31 = [rTblAddr] // A29, fA31 // 1 - x = 1 - |x| for positive x(p9) fms.s1 fR = f1, f1, f8 // point to coefficient of "near 1" polynomial(p7) adds rTmpPtr2 = 176, rTblAddr}{ .mfi ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35 // 1 + x = 1 - |x| for negative x(p8) fma.s1 fR = f1, f1, f8(p6) adds rTmpPtr2 = 48, rTblAddr};;{ .mfi ldfe fB0 = [rTmpPtr1], 16 // B0 nop.f 0 nop.i 0}{ .mib adds rTmpPtr3 = 16, rTmpPtr2 // set p10 = 1 if |x| = 1.0 cmp.eq p10, p0 = rAbsXBits, rOne // branch on special path for |x| = 1.0(p10) br.cond.spnt asin_abs_1};;{ .mfi ldfe fA3 = [rTmpPtr2], 48 // A3 or B1 nop.f 0 adds rTmpPtr1 = 64, rTmpPtr3}{ .mib ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11 // set p11 = 1 if |x| > 1.0 cmp.gt p11, p0 = rAbsXBits, rOne // branch on special path for |x| > 1.0(p11) br.cond.spnt asin_abs_gt_1};;{ .mfi ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7 // initial approximation of 1 / sqrt(1 - x) frsqrta.s1 f1mXRcp, p0 = f1mX nop.i 0}{ .mfi ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5 fma.s1 fXCube = fXSqr, f8, f0 // x^3 nop.i 0};;{ .mfi ldfe fA5 = [rTmpPtr2], 48 // A5 or B2 // initial approximation of 1 / sqrt(1 + x) frsqrta.s1 f1pXRcp, p0 = f1pX nop.i 0}{ .mfi ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8 fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4 nop.i 0};;{ .mfi ldfe fA7 = [rTmpPtr1] // A7 or Pi/2 fma.s1 fRSqr = fR, fR, f0 // R^2 nop.i 0}{ .mfb ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12 nop.f 0(p6) br.cond.spnt asin_base_range;};;{ .mfi nop.m 0(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0 nop.i 0}{ .mfi nop.m 0(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0 nop.i 0};;{ .mfi nop.m 0(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0 nop.i 0}{ .mfi nop.m 0(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x > 0 nop.i 0};;{ .mfi nop.m 0 fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4 nop.i 0};;{ .mfi nop.m 0 fma.s1 fB11 = fB11, fR, fB10 nop.i 0}{ .mfi nop.m 0 fma.s1 fB1 = fB1, fR, fB0 nop.i 0};;{ .mfi nop.m 0 fma.s1 fB5 = fB5, fR, fB4 nop.i 0
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -