?? s_tanhf.s
字號:
//----------------------------------------------------------------------
// tanhf -- continuation.  The function entry and the opening of the
// first bundle below lie before this chunk; the register aliases
// (rArg, rMask, rDataPtr, fAbsArg, fArgSqr, ...) and predicate p7 are
// set up there.
//
// Layout restored to one statement per line.  IA-64 "//" comments run
// to end of line, so the collapsed form hid every instruction, brace,
// stop and label that followed a comment on the same physical line.
//
// Dispatch performed by the code below (f8 = x on input, result in f8):
//   p6  : x is NaN or +/-0            -> f8 = x*1 + x, return
//   p7  : x is denormal               -> tanhf_denormal
//   p9  : x is +/-inf                 -> f8 = sign(x) merged with 1.0, return
//   p10 : |x| < 0.3125                -> tanhf_near_zero
//   p12 : |x| >= 9.125                -> tanhf_saturation
//   p13 : rBias == rNearSaturation    -> tanhf_close_to_saturation
//   else: main polynomial path, f8 = PolA +/- PolC*PolD
//----------------------------------------------------------------------
      fmerge.s       fSignumX = f8, f1            // signum(x)
      shl            rSignBit = rSignBit, 31      // mask for sign bit
}
{ .mfi
      adds           rBound = 0x3EA, r0
      nop.f          0
      addl           rSaturation = 0x4112, r0
};;

{ .mfi
      andcm          rOffset2 = rArg, rMask
      fclass.m       p6,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
      // 0x3EA << 20 = 0x3EA00000, i.e. 0.3125f in GR
      shl            rBound = rBound, 20
}
{ .mfb
      andcm          rAbsArg = rArg, rSignBit     // |x| in GR
      nop.f          0
(p7)  br.cond.spnt   tanhf_denormal               // branch out if x is denormal
};;

{ .mfi
      adds           rCoeffAddr2 = 352, rDataPtr
      fclass.m       p9,p0 = f8, 0x23             // is x +/- inf?
      shr            rOffset2 = rOffset2, 21
}
{ .mfi
      cmp.lt         p10, p8 = rAbsArg, rBound    // |x| < 0.3125?
      nop.f          0
      adds           rCoeffAddr3 = 16, rDataPtr
};;

{ .mfi
(p8)  sub            rBias = rOffset2, rBias2
      fma.s1         fArg4 = fArgSqr, fArgSqr, f0 // x^4
      // 0x4112 << 16 = 0x41120000, i.e. 9.125f in GR
      shl            rSaturation = rSaturation, 16
}
{ .mfb
(p10) adds           rBias = 0x14, r0
(p6)  fma.s.s0       f8 = f8,f1,f8                // NaN or +/-0
(p6)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
};;

{ .mfi
      shladd         rCoeffAddr1 = rBias, 4, rDataPtr
      fma.s1         fArg3Sgn = fArgSqr, f8, f0   // sign(x)*|x|^3
      // is |x| < 9.125?
      cmp.lt         p11, p12 = rAbsArg, rSaturation
}
{ .mfi
      shladd         rCoeffAddr3 = rBias, 4, rCoeffAddr3
      fma.s1         fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
      shladd         rCoeffAddr2 = rBias, 3, rCoeffAddr2
};;

{ .mfi
(p11) ldfpd          fC0, fC1 = [rCoeffAddr1]
(p9)  fmerge.s       f8 = f8,f1                   // +/- inf
(p12) adds           rDataPtr = 544, rDataPtr
}
{ .mfb
(p11) ldfpd          fC2, fC3 = [rCoeffAddr3], 16
      nop.f          0
(p9)  br.ret.spnt    b0                           // exit for x = +/- inf
};;

{ .mfi
(p11) ldfpd          fA0, fA1 = [rCoeffAddr2], 16
      nop.f          0
(p8)  cmp.eq.unc     p13, p0 = rBias, rNearSaturation
}
{ .mfi
      add            rCoeffAddr1 = 48, rCoeffAddr1
      nop.f          0
      nop.i          0
};;

{ .mfi
(p11) ldfpd          fD0, fD1 = [rCoeffAddr3]
      nop.f          0
      nop.i          0
}
{ .mfb
(p11) ldfpd          fD2, fB0 = [rCoeffAddr1]
      // sign(x)*|x|^2
      fma.s1         fArgSqrSgn = fArgSqr, fSignumX, f0
(p10) br.cond.spnt   tanhf_near_zero
};;

{ .mfi
(p11) ldfpd          fA2, fA3 = [rCoeffAddr2], 16
      // p15 = (x < 0), p14 = !(x < 0); selects fma/fms on the exit paths
      fcmp.lt.s1     p15, p14 = f8,f0
      nop.i          0
}
{ .mfb
(p12) ldfd           fA0 = [rDataPtr]
      fma.s1         fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
(p12) br.cond.spnt   tanhf_saturation
};;

{ .mfi
      nop.m          0
      fma.s1         fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
      nop.i          0
}
{ .mfb
      nop.m          0
      fma.s1         fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
(p13) br.cond.spnt   tanhf_close_to_saturation
};;

{ .mfi
      nop.m          0
      fma.s1         fPolC = fC3, fAbsArg, fC2    // C3*|x| + C2
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fPolA = fA1, fAbsArg, fA0    // A1*|x| + A0
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fPolD = fD1, fAbsArg, fD0    // D1*|x| + D0
      nop.i          0
}
{ .mfi
      nop.m          0
      // sign(x)*(|x|^7 + D2*x^6)
      fma.s1         fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
      nop.i          0
};;

{ .mfi
      nop.m          0
      fma.s1         fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fB0 = fB0, fArg4, f0         // B0*x^4
      nop.i          0
};;

{ .mfi
      nop.m          0
      // C3*|x|^3 + C2*x^2 + C1*|x| + C0
      fma.s1         fPolC = fPolC, fArgSqr, fPolCTmp
      nop.i          0
};;

{ .mfi
      nop.m          0
      // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
      fma.d.s1       fPolD = fPolD, fArg4Sgn, fPolDTmp
      nop.i          0
};;

{ .mfi
      nop.m          0
      // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
      fma.d.s1       fPolA = fPolATmp, fArgSqr, fPolA
      nop.i          0
};;

{ .mfi
      nop.m          0
      // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
      fma.d.s1       fPolC = fPolC, f1, fB0
      nop.i          0
};;

{ .mfi
      nop.m          0
(p14) fma.s.s0       f8 = fPolC, fPolD, fPolA     // for positive x
      nop.i          0
}
{ .mfb
      nop.m          0
(p15) fms.s.s0       f8 = fPolC, fPolD, fPolA     // for negative x
      br.ret.sptk    b0                           // Exit for 0.3125 <=|x|< 8.0
};;

// Here if |x| < 0.3125
tanhf_near_zero:
{ .mfi
      nop.m          0
      fma.s1         fPolC = fC3, fArgSqr, fC2    // C3*x^2 + C2
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
      nop.i          0
};;

{ .mfi
      nop.m          0
      // C3*x^6 + C2*x^4 + C1*x^2 + C0
      fma.s1         fPolC = fPolC, fArg4, fPolCTmp
      nop.i          0
};;

{ .mfb
      nop.m          0
      // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
      fma.s.s0       f8 = fPolC, fArg3Sgn, f8
      br.ret.sptk    b0                           // Exit for |x| < 0.3125
};;

// Here if 9.125 <= |x| < +inf
tanhf_saturation:
{ .mfb
      nop.m          0
      fma.s.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0d - 2^(-52))
      br.ret.sptk    b0                           // Exit for 9.125 <=|x|< +inf
};;

// Here if 8.0 <= |x| < 9.125
tanhf_close_to_saturation:
{ .mfi
      nop.m          0
      fma.s1         fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
      nop.i          0
}
{ .mfi
      nop.m          0
      fma.s1         fPolA = fA3, fAbsArg, fA2    // A3*|x| + A2
      nop.i          0
} ;;

// p14/p15 come from the same fcmp above, so at most one of them is set
.pred.rel "mutex", p14, p15
{ .mfi
      nop.m          0
      // for positive x
(p14) fma.s.s0       f8 = fPolA, fArgSqr, fPolATmp
      nop.i          0
}
{ .mfb
      nop.m          0
      // for negative x
(p15) fms.s.s0       f8 = fPolA, fArgSqrSgn, fPolATmp
      br.ret.sptk    b0                           // Exit for 8.0 <=|x|< 9.125
};;

// Here if x is single precision denormal
tanhf_denormal:
{ .mfi
      nop.m          0
      fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?
      nop.i          0
};;

{ .mfi
      nop.m          0
(p7)  fma.s.s0       f8 = f8,f8,f8                // -denormal:  x + x^2
      nop.i          0
}
{ .mfb
      nop.m          0
(p8)  fnma.s.s0      f8 = f8,f8,f8                // +denormal:  x - x^2
      br.ret.sptk    b0                           // Exit for denormal
};;

GLOBAL_LIBM_END(tanhf)
?? 快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -