s_cosl.s
字號:
};;{ .mmi(p9) ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0(p10) ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1 nop.i 999};;SINCOSL_SMALL_R_0:// Entry point for 2^-3 < |x| < pi/4.pred.rel "mutex",p9,p10SINCOSL_SMALL_R_1:// Entry point for pi/4 < |x| < 2^24 and |r| < 2^-3.pred.rel "mutex",p9,p10{ .mfi(p9) ldfe FR_S_3 = [GR_ad_se], -16 // Load S_3 if i_1=0 fma.s1 FR_Z = FR_rsq, FR_rsq, f0 // Z = rsq * rsq nop.i 999}{ .mfi(p10) ldfe FR_C_3 = [GR_ad_ce], -16 // Load C_3 if i_1=1(p10) fnma.s1 FR_c = FR_c, FR_r, f0 // c = -c * r if i_1=0 nop.i 999};;{ .mmf(p9) ldfe FR_S_2 = [GR_ad_se], -16 // Load S_2 if i_1=0(p10) ldfe FR_C_2 = [GR_ad_ce], -16 // Load C_2 if i_1=1(p10) fmerge.s FR_r = f1, f1};;{ .mmi(p9) ldfe FR_S_1 = [GR_ad_se], -16 // Load S_1 if i_1=0(p10) ldfe FR_C_1 = [GR_ad_ce], -16 // Load C_1 if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 // Z = Z * r if i_1=0 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 // poly_lo=rsq*S_5+S_4 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 // poly_lo=rsq*C_5+C_4 if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 // poly_hi=rsq*S_2+S_1 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 // poly_hi=rsq*C_2+C_1 if i_1=1 nop.i 999};;{ .mfi nop.m 999 fma.s1 FR_Z = FR_Z, FR_rsq, f0 // Z = Z * rsq nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 // p_lo=p_lo*rsq+S_3, i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 // p_lo=p_lo*rsq+C_3, i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s0 FR_inexact = FR_S_4, FR_S_4, f0 // Dummy op to set inexact tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,2 // p12 if i_0=1, N mod 4 = 1,3}{ .mfi nop.m 999(p10) fma.s0 FR_inexact = FR_C_1, FR_C_1, f0 // Dummy op to set inexact nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 // p_hi=p_hi*rsq 
if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 // p_hi=p_hi*rsq if i_1=1 nop.i 999};;{ .mfi nop.m 999 fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c // poly=Z*poly_lo+c nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 // p_hi=r*p_hi if i_1=0 nop.i 999};;{ .mfi nop.m 999(p12) fms.s1 FR_r = f0, f1, FR_r // r = -r if i_0=1 nop.i 999};;{ .mfi nop.m 999 fma.s1 FR_poly = FR_poly, f1, FR_poly_hi // poly=poly+poly_hi nop.i 999};;//// if (i_0 == 0) Result = r + poly// if (i_0 != 0) Result = r - poly//{ .mfi nop.m 999(p11) fma.s0 FR_Result = FR_r, f1, FR_poly nop.i 999}{ .mfb nop.m 999(p12) fms.s0 FR_Result = FR_r, f1, FR_poly br.ret.sptk b0 // Exit for |r| < 2^-3};;SINCOSL_NORMAL_R://// Here if 2^-3 <= |r| < pi/4// THIS IS THE MAIN PATH//// Enter with r, c, and N_Inc having been computed//{ .mfi ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1 // p10 if i_1=1, N mod 4 = 2,3}{ .mfi ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 nop.f 999 nop.i 999};;{ .mmi(p9) ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 if i_1=0(p10) ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 if i_1=1 nop.i 999};;SINCOSL_NORMAL_R_0:// Entry for 2^-3 < |x| < pi/4.pred.rel "mutex",p9,p10{ .mmf(p9) ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 if i_1=0(p10) ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 if i_1=1 frcpa.s1 FR_r_hi, p6 = f1, FR_r // r_hi = frcpa(r)};;{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7 if i_1=1 nop.i 999};;{ .mfi nop.m 999 fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq nop.i 999};;SINCOSL_NORMAL_R_1:// Entry for pi/4 <= |x| < 2^24.pred.rel "mutex",p9,p10{ .mmf(p9) ldfe FR_PP_1 = [GR_ad_pp], 16 // Load PP_1_hi if i_1=0(p10) ldfe FR_QQ_1 = [GR_ad_qq], 16 // Load QQ_1 if i_1=1 frcpa.s1 FR_r_hi, p6 = 
f1, FR_r_hi // r_hi = frpca(frcpa(r))};;{ .mfi(p9) ldfe FR_PP_4 = [GR_ad_pp], 16 // Load PP_4 if i_1=0(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 // poly = rsq*poly+PP_6 if i_1=0 nop.i 999}{ .mfi(p10) ldfe FR_QQ_4 = [GR_ad_qq], 16 // Load QQ_4 if i_1=1(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 // poly = rsq*poly+QQ_6 if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 // corr = C_1 * rsq if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r // corr = S_1 * r^3 + r if i_1=1 nop.i 999};;{ .mfi(p9) ldfe FR_PP_3 = [GR_ad_pp], 16 // Load PP_3 if i_1=0 fma.s1 FR_r_hi_sq = FR_r_hi, FR_r_hi, f0 // r_hi_sq = r_hi * r_hi nop.i 999}{ .mfi(p10) ldfe FR_QQ_3 = [GR_ad_qq], 16 // Load QQ_3 if i_1=1 fms.s1 FR_r_lo = FR_r, f1, FR_r_hi // r_lo = r - r_hi nop.i 999};;{ .mfi(p9) ldfe FR_PP_2 = [GR_ad_pp], 16 // Load PP_2 if i_1=0(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 // poly = rsq*poly+PP_5 if i_1=0 nop.i 999}{ .mfi(p10) ldfe FR_QQ_2 = [GR_ad_qq], 16 // Load QQ_2 if i_1=1(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 // poly = rsq*poly+QQ_5 if i_1=1 nop.i 999};;{ .mfi(p9) ldfe FR_PP_1_lo = [GR_ad_pp], 16 // Load PP_1_lo if i_1=0(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c // corr = corr * c + c if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 // corr = -corr * c if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_r_hi_sq // U_lo = r*r_hi+r_hi_sq, i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r // U_lo = r_hi + r if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi_sq, f0 // U_hi = r_hi*r_hi_sq if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_r_hi_sq, f1 // U_hi = QQ_1*r_hi_sq+1, i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 // poly = poly*rsq+PP_4 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 // poly = poly*rsq+QQ_4 if i_1=1 
nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo // U_lo = r * r + U_lo if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 // U_lo = r_lo * U_lo if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 // U_hi = PP_1 * U_hi if i_1=0 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 // poly = poly*rsq+PP_3 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 // poly = poly*rsq+QQ_3 if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 // U_lo = r_lo * U_lo if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 // U_lo = QQ_1 * U_lo if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi // U_hi = r + U_hi if i_1=0 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 // poly = poly*rsq+PP_2 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 // poly = poly*rsq+QQ_2 if i_1=1 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 // U_lo = PP_1 * U_lo if i_1=0 nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo // poly =poly*rsq+PP1lo i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 // poly = poly*rsq if i_1=1 nop.i 999};;{ .mfi nop.m 999 fma.s1 FR_V = FR_U_lo, f1, FR_corr // V = U_lo + corr tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,2 // p12 if i_0=1, N mod 4 = 1,3};;{ .mfi nop.m 999(p9) fma.s0 FR_inexact = FR_PP_5, FR_PP_4, f0 // Dummy op to set inexact nop.i 999}{ .mfi nop.m 999(p10) fma.s0 FR_inexact = FR_QQ_5, FR_QQ_5, f0 // Dummy op to set inexact nop.i 999};;{ .mfi nop.m 999(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 // poly = poly*r^3 if i_1=0 nop.i 999}{ .mfi nop.m 999(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 // poly = poly*rsq if i_1=1 nop.i 999};;{ .mfi nop.m 999(p11) fma.s1 FR_tmp_result = f0, f1, f1// tmp_result=+1.0 if i_0=0 
nop.i 999}{ .mfi nop.m 999(p12) fms.s1 FR_tmp_result = f0, f1, f1// tmp_result=-1.0 if i_0=1 nop.i 999};;{ .mfi nop.m 999 fma.s1 FR_V = FR_poly, f1, FR_V // V = poly + V nop.i 999};;// If i_0 = 0 Result = U_hi + V// If i_0 = 1 Result = -U_hi - V{ .mfi nop.m 999(p11) fma.s0 FR_Result = FR_tmp_result, FR_U_hi, FR_V nop.i 999}{ .mfb nop.m 999(p12) fms.s0 FR_Result = FR_tmp_result, FR_U_hi, FR_V br.ret.sptk b0 // Exit for 2^-3 <= |r| < pi/4};;SINCOSL_ZERO:// Here if x = 0{ .mfi cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos nop.f 999 nop.i 999};;{ .mfi nop.m 999(p7) fmerge.s FR_Result = FR_Input_X, FR_Input_X // If sin, result = input nop.i 999}{ .mfb nop.m 999(p6) fma.s0 FR_Result = f1, f1, f0 // If cos, result=1.0 br.ret.sptk b0 // Exit for x=0};;SINCOSL_DENORMAL:{ .mmb getf.exp GR_signexp_x = FR_norm_x // Get sign and exponent of x nop.m 999 br.cond.sptk SINCOSL_COMMON // Return to common code};;SINCOSL_SPECIAL:{ .mfb nop.m 999//// Path for Arg = +/- QNaN, SNaN, Inf// Invalid can be raised. SNaNs// become QNaNs// fmpy.s0 FR_Result = FR_Input_X, f0 br.ret.sptk b0 ;;}GLOBAL_IEEE754_END(cosl)// *******************************************************************// *******************************************************************// *******************************************************************//// Special Code to handle very large argument case.// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63// The interface is custom:// On input:// (Arg or x) is in f8// On output:// r is in f8// c is in f9// N is in r8// Be sure to allocate at least 2 GP registers as output registers for// __libm_pi_by_2_reduce. This routine uses r59-60. These are used as// scratch registers within the __libm_pi_by_2_reduce routine (for speed).//// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. 
// We use this to eliminate save/restore of key fp registers in this calling
// function.
//
// *******************************************************************
// *******************************************************************
// *******************************************************************

//
// __libm_callout (entered at SINCOSL_ARG_TOO_LARGE)
//
//   1. Save ar.pfs, gp and b0, and form the constants 2^-3 and -(2^-3).
//   2. Call __libm_pi_by_2_reduce with x in f8; it returns r in f8,
//      c in f9 and N in r8.
//   3. Compute N_Inc = Sin_or_Cos + N.
//   4. Restore b0, gp and ar.pfs, then branch to SINCOSL_SMALL_R when
//      -(2^-3) < r < 2^-3 and to SINCOSL_NORMAL_R otherwise.
//
LOCAL_LIBM_ENTRY(__libm_callout)
SINCOSL_ARG_TOO_LARGE:
.prologue
{ .mfi
      nop.f 0
.save   ar.pfs,GR_SAVE_PFS
      mov  GR_SAVE_PFS=ar.pfs                      // Save ar.pfs
};;

{ .mmi
      setf.exp FR_Two_to_M3 = GR_exp_2_to_m3       // Form 2^-3
      mov GR_SAVE_GP=gp                            // Save gp
.save   b0, GR_SAVE_B0
      mov GR_SAVE_B0=b0                            // Save b0
};;

.body
//
// Call argument reduction with x in f8
// Returns with N in r8, r in f8, c in f9
// Assumes f71-127 are preserved across the call
//
{ .mib
      setf.exp FR_Neg_Two_to_M3 = GR_exp_m2_to_m3  // Form -(2^-3)
      nop.i 0
      br.call.sptk b0=__libm_pi_by_2_reduce#
};;

//
// First half of the range test: p6 = (r < 2^-3).
//
{ .mfi
      add   GR_N_Inc = GR_Sin_or_Cos,r8
      fcmp.lt.unc.s1        p6, p0 = FR_r, FR_Two_to_M3
      mov   b0 = GR_SAVE_B0                        // Restore return address
};;

//
// Second half of the range test, qualified by the first.  Because of the
// .unc completer p6 is cleared when the qualifying predicate is false, so
// after this bundle p6 is set only if -(2^-3) < r < 2^-3.
//
{ .mfi
      mov   gp = GR_SAVE_GP                        // Restore gp
(p6)  fcmp.gt.unc.s1        p6, p0 = FR_r, FR_Neg_Two_to_M3
      mov   ar.pfs = GR_SAVE_PFS                   // Restore ar.pfs
};;

{ .mbb
      nop.m 999
(p6)  br.cond.spnt SINCOSL_SMALL_R   // Branch if |r|< 2^-3 for |x| >= 2^63
      br.cond.sptk SINCOSL_NORMAL_R  // Branch if |r|>=2^-3 for |x| >= 2^63
};;

LOCAL_LIBM_END(__libm_callout)

.type   __libm_pi_by_2_reduce#,@function
.global __libm_pi_by_2_reduce#
快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -