?? s_cos.s
字號:
.file "sincos.s"// Copyright (c) 2000 - 2005, Intel Corporation// All rights reserved.//// Contributed 2000 by the Intel Numerics Group, Intel Corporation//// Redistribution and use in source and binary forms, with or without// modification, are permitted provided that the following conditions are// met://// * Redistributions of source code must retain the above copyright// notice, this list of conditions and the following disclaimer.//// * Redistributions in binary form must reproduce the above copyright// notice, this list of conditions and the following disclaimer in the// documentation and/or other materials provided with the distribution.//// * The name of Intel Corporation may not be used to endorse or promote// products derived from this software without specific prior written// permission.// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.//// Intel Corporation is the author of this code, and requests that all// problem reports or change requests be submitted to it directly at// http://www.intel.com/software/products/opensource/libraries/num.htm.//// History//==============================================================// 02/02/00 Initial version// 04/02/00 Unwind support added.// 06/16/00 Updated tables to enforce symmetry// 08/31/00 Saved 2 cycles in main path, and 9 in other paths.// 09/20/00 The updated tables regressed to an old version, so reinstated them// 10/18/00 Changed one table entry to ensure symmetry// 01/03/01 Improved speed, fixed flag settings for small arguments.// 02/18/02 Large arguments processing routine excluded// 05/20/02 Cleaned up namespace and sf0 syntax// 06/03/02 Insure inexact flag set for large arg result// 09/05/02 Work range is widened by reduction strengthen (3 parts of Pi/16)// 02/10/03 Reordered header: .section, .global, .proc, .align// 08/08/03 Improved performance// 10/28/04 Saved sincos_r_sincos to avoid clobber by dynamic loader // 03/31/05 Reformatted delimiters between data tables// API//==============================================================// double sin( double x);// double cos( double x);//// Overview of operation//==============================================================//// Step 1// ======// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4// divide x by pi/2^k.// Multiply by 2^k/pi.// nfloat = Round result to 
integer (round-to-nearest)//// r = x - nfloat * pi/2^k// Do this as ((x - nfloat * HIGH(pi/2^k)) - // nfloat * LOW(pi/2^k)) - // nfloat * LOWEST(pi/2^k) for increased accuracy.// pi/2^k is stored as three numbers that when added make pi/2^k.// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) + LOWEST(pi/2^k)// HIGH and LOW parts are rounded toward zero, // and LOWEST is rounded to nearest.//// x = (nfloat * pi/2^k) + r// r is small enough that we can use a polynomial approximation// and is referred to as the reduced argument.//// Step 3// ======// Take the unreduced part and remove the multiples of 2pi.// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits//// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1)// N * 2^(k+1)// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k// nfloat * pi/2^k = N2pi + M * pi/2^k////// Sin(x) = Sin((nfloat * pi/2^k) + r)// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r)//// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k)// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k)// = Sin(Mpi/2^k)//// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k)// = Cos(N2pi)Cos(Mpi/2^k) - Sin(N2pi)Sin(Mpi/2^k)// = Cos(Mpi/2^k)//// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)////// Step 4// ======// 0 <= M < 2^(k+1)// There are 2^(k+1) Sin entries in a table.// There are 2^(k+1) Cos entries in a table.//// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup.////// Step 5// ======// Calculate Cos(r) and Sin(r) by polynomial approximation.//// Cos(r) = 1 + r^2 q1 + r^4 q2 + r^6 q3 + ... = Series for Cos// Sin(r) = r + r^3 p1 + r^5 p2 + r^7 p3 + ... = Series for Sin//// and the coefficients q1, q2, ... and p1, p2, ... 
are stored in a table////// Calculate// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)//// as follows//// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k)// rsq = r*r////// P = p1 + r^2p2 + r^4p3 + r^6p4// Q = q1 + r^2q2 + r^4q3 + r^6q4//// rcub = r * rsq// Sin(r) = r + rcub * P// = r + r^3p1 + r^5p2 + r^7p3 + r^9p4 + ... = Sin(r)//// The coefficients are not exactly these values, but almost.//// p1 = -1/6 = -1/3!// p2 = 1/120 = 1/5!// p3 = -1/5040 = -1/7!// p4 = 1/362880 = 1/9!//// P = r + rcub * P//// Answer = S[m] Cos(r) + C[m] P//// Cos(r) = 1 + rsq Q// Cos(r) = 1 + r^2 Q// Cos(r) = 1 + r^2 (q1 + r^2q2 + r^4q3 + r^6q4)// Cos(r) = 1 + r^2q1 + r^4q2 + r^6q3 + r^8q4 + ...//// S[m] Cos(r) = S[m](1 + rsq Q)// S[m] Cos(r) = S[m] + S[m] rsq Q// S[m] Cos(r) = S[m] + s_rsq Q// Q = S[m] + s_rsq Q//// Then,//// Answer = Q + C[m] P// Registers used//==============================================================// general input registers:// r14 -> r26// r32 -> r36// predicate registers used:// p6 -> p11// floating-point registers used// f9 -> f15// f32 -> f61// Assembly macros//==============================================================sincos_NORM_f8 = f9sincos_W = f10sincos_int_Nfloat = f11sincos_Nfloat = f12sincos_r = f13sincos_rsq = f14sincos_rcub = f15sincos_save_tmp = f15sincos_Inv_Pi_by_16 = f32sincos_Pi_by_16_1 = f33sincos_Pi_by_16_2 = f34sincos_Inv_Pi_by_64 = f35sincos_Pi_by_16_3 = f36sincos_r_exact = f37sincos_Sm = f38sincos_Cm = f39sincos_P1 = f40sincos_Q1 = f41sincos_P2 = f42sincos_Q2 = f43sincos_P3 = f44sincos_Q3 = f45sincos_P4 = f46sincos_Q4 = f47sincos_P_temp1 = f48sincos_P_temp2 = f49sincos_Q_temp1 = f50sincos_Q_temp2 = f51sincos_P = f52sincos_Q = f53sincos_srsq = f54sincos_SIG_INV_PI_BY_16_2TO61 = f55sincos_RSHF_2TO61 = f56sincos_RSHF = f57sincos_2TOM61 = f58sincos_NFLOAT = f59sincos_W_2TO61_RSH = f60fp_tmp = f61/////////////////////////////////////////////////////////////sincos_GR_sig_inv_pi_by_16 = r14sincos_GR_rshf_2to61 = r15sincos_GR_rshf = 
r16sincos_GR_exp_2tom61 = r17sincos_GR_n = r18sincos_GR_m = r19sincos_GR_32m = r19sincos_GR_all_ones = r19sincos_AD_1 = r20sincos_AD_2 = r21sincos_exp_limit = r22sincos_r_signexp = r23sincos_r_17_ones = r24sincos_r_sincos = r25sincos_r_exp = r26GR_SAVE_PFS = r33GR_SAVE_B0 = r34GR_SAVE_GP = r35GR_SAVE_r_sincos = r36RODATA// Pi/16 parts.align 16LOCAL_OBJECT_START(double_sincos_pi) data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part data8 0xA4093822299F31D0, 0x00003F7A // pi/16 3rd partLOCAL_OBJECT_END(double_sincos_pi)// Coefficients for polynomialsLOCAL_OBJECT_START(double_sincos_pq_k4) data8 0x3EC71C963717C63A // P4 data8 0x3EF9FFBA8F191AE6 // Q4 data8 0xBF2A01A00F4E11A8 // P3 data8 0xBF56C16C05AC77BF // Q3 data8 0x3F8111111110F167 // P2 data8 0x3FA555555554DD45 // Q2 data8 0xBFC5555555555555 // P1 data8 0xBFDFFFFFFFFFFFFC // Q1LOCAL_OBJECT_END(double_sincos_pq_k4)// Sincos table (S[m], C[m])LOCAL_OBJECT_START(double_sin_cos_beta_k4)data8 0x0000000000000000 , 0x00000000 // sin( 0 pi/16) S0data8 0x8000000000000000 , 0x00003fff // cos( 0 pi/16) C0//data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin( 1 pi/16) S1data8 0xfb14be7fbae58157 , 0x00003ffe // cos( 1 pi/16) C1//data8 0xc3ef1535754b168e , 0x00003ffd // sin( 2 pi/16) S2data8 0xec835e79946a3146 , 0x00003ffe // cos( 2 pi/16) C2//data8 0x8e39d9cd73464364 , 0x00003ffe // sin( 3 pi/16) S3data8 0xd4db3148750d181a , 0x00003ffe // cos( 3 pi/16) C3//data8 0xb504f333f9de6484 , 0x00003ffe // sin( 4 pi/16) S4data8 0xb504f333f9de6484 , 0x00003ffe // cos( 4 pi/16) C4//data8 0xd4db3148750d181a , 0x00003ffe // sin( 5 pi/16) C3data8 0x8e39d9cd73464364 , 0x00003ffe // cos( 5 pi/16) S3//data8 0xec835e79946a3146 , 0x00003ffe // sin( 6 pi/16) C2data8 0xc3ef1535754b168e , 0x00003ffd // cos( 6 pi/16) S2//data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 7 pi/16) C1data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos( 7 pi/16) S1//data8 0x8000000000000000 , 0x00003fff // sin( 8 pi/16) C0data8 
0x0000000000000000 , 0x00000000 // cos( 8 pi/16) S0//data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 9 pi/16) C1data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos( 9 pi/16) -S1//data8 0xec835e79946a3146 , 0x00003ffe // sin(10 pi/16) C2data8 0xc3ef1535754b168e , 0x0000bffd // cos(10 pi/16) -S2//data8 0xd4db3148750d181a , 0x00003ffe // sin(11 pi/16) C3data8 0x8e39d9cd73464364 , 0x0000bffe // cos(11 pi/16) -S3//data8 0xb504f333f9de6484 , 0x00003ffe // sin(12 pi/16) S4data8 0xb504f333f9de6484 , 0x0000bffe // cos(12 pi/16) -S4//data8 0x8e39d9cd73464364 , 0x00003ffe // sin(13 pi/16) S3data8 0xd4db3148750d181a , 0x0000bffe // cos(13 pi/16) -C3//data8 0xc3ef1535754b168e , 0x00003ffd // sin(14 pi/16) S2data8 0xec835e79946a3146 , 0x0000bffe // cos(14 pi/16) -C2//data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin(15 pi/16) S1data8 0xfb14be7fbae58157 , 0x0000bffe // cos(15 pi/16) -C1//data8 0x0000000000000000 , 0x00000000 // sin(16 pi/16) S0data8 0x8000000000000000 , 0x0000bfff // cos(16 pi/16) -C0//data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(17 pi/16) -S1data8 0xfb14be7fbae58157 , 0x0000bffe // cos(17 pi/16) -C1//data8 0xc3ef1535754b168e , 0x0000bffd // sin(18 pi/16) -S2data8 0xec835e79946a3146 , 0x0000bffe // cos(18 pi/16) -C2//data8 0x8e39d9cd73464364 , 0x0000bffe // sin(19 pi/16) -S3data8 0xd4db3148750d181a , 0x0000bffe // cos(19 pi/16) -C3//data8 0xb504f333f9de6484 , 0x0000bffe // sin(20 pi/16) -S4data8 0xb504f333f9de6484 , 0x0000bffe // cos(20 pi/16) -S4//data8 0xd4db3148750d181a , 0x0000bffe // sin(21 pi/16) -C3data8 0x8e39d9cd73464364 , 0x0000bffe // cos(21 pi/16) -S3//data8 0xec835e79946a3146 , 0x0000bffe // sin(22 pi/16) -C2data8 0xc3ef1535754b168e , 0x0000bffd // cos(22 pi/16) -S2//data8 0xfb14be7fbae58157 , 0x0000bffe // sin(23 pi/16) -C1data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos(23 pi/16) -S1//data8 0x8000000000000000 , 0x0000bfff // sin(24 pi/16) -C0data8 0x0000000000000000 , 0x00000000 // cos(24 pi/16) S0//data8 0xfb14be7fbae58157 , 0x0000bffe // sin(25 pi/16) -C1data8 
0xc7c5c1e34d3055b3 , 0x00003ffc // cos(25 pi/16) S1//data8 0xec835e79946a3146 , 0x0000bffe // sin(26 pi/16) -C2data8 0xc3ef1535754b168e , 0x00003ffd // cos(26 pi/16) S2//data8 0xd4db3148750d181a , 0x0000bffe // sin(27 pi/16) -C3data8 0x8e39d9cd73464364 , 0x00003ffe // cos(27 pi/16) S3//data8 0xb504f333f9de6484 , 0x0000bffe // sin(28 pi/16) -S4data8 0xb504f333f9de6484 , 0x00003ffe // cos(28 pi/16) S4//data8 0x8e39d9cd73464364 , 0x0000bffe // sin(29 pi/16) -S3data8 0xd4db3148750d181a , 0x00003ffe // cos(29 pi/16) C3//data8 0xc3ef1535754b168e , 0x0000bffd // sin(30 pi/16) -S2data8 0xec835e79946a3146 , 0x00003ffe // cos(30 pi/16) C2//data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16) -S1data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16) C1//data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16) S0
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -