?? eval.cc
字號:
#ifndef BZ_ARRAYEVAL_CC#define BZ_ARRAYEVAL_CC#ifndef BZ_ARRAY_H #error <blitz/array/eval.cc> must be included via <blitz/array.h>#endifBZ_NAMESPACE(blitz)/* * Assign an expression to an array. For performance reasons, there are * several traversal mechanisms: * * - Index traversal scans through the destination array in storage order. * The expression is evaluated using a TinyVector<int,N> operand. This * version is used only when there are index placeholders in the expression * (see <blitz/indexexpr.h>) * - Stack traversal also scans through the destination array in storage * order. However, push/pop stack iterators are used. * - Fast traversal follows a Hilbert (or other) space-filling curve to * improve cache reuse for stencilling operations. Currently, the * space filling curves must be generated by calling * generateFastTraversalOrder(TinyVector<int,N_dimensions>). * - 2D tiled traversal follows a tiled traversal, to improve cache reuse * for 2D stencils. Space filling curves have too much overhead to use * in two-dimensions. * * _bz_tryFastTraversal is a helper class. Fast traversals are only * attempted if the expression looks like a stencil -- it's at least * three-dimensional, has at least six array operands, and there are * no index placeholders in the expression. These are all things which * can be checked at compile time, so the if()/else() syntax has been * replaced with this class template. */// Fast traversals require <set> from the ISO/ANSI C++ standard library#ifdef BZ_HAVE_STD#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSALtemplate<bool canTryFastTraversal>struct _bz_tryFastTraversal { template<typename T_numtype, int N_rank, typename T_expr, typename T_update> static bool tryFast(Array<T_numtype,N_rank>& array, BZ_ETPARM(T_expr) expr, T_update) { return false; }};template<>struct _bz_tryFastTraversal<true> { template<typename T_numtype, int N_rank, typename T_expr, typename T_update> static bool tryFast(Array<T_numtype,N_rank>& array, BZ_ETPARM(T_expr) expr, T_update) { // See if there's an appropriate space filling curve available. // Currently fast traversals use an N-1 dimensional curve. The // Nth dimension column corresponding to each point on the curve // is traversed in the normal fashion. TraversalOrderCollection<N_rank-1> traversals; TinyVector<int, N_rank - 1> traversalGridSize; for (int i=0; i < N_rank - 1; ++i) traversalGridSize[i] = array.length(array.ordering(i+1));#ifdef BZ_DEBUG_TRAVERSEcout << "traversalGridSize = " << traversalGridSize << endl;cout.flush();#endif const TraversalOrder<N_rank-1>* order = traversals.find(traversalGridSize); if (order) {#ifdef BZ_DEBUG_TRAVERSE cerr << "Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype) << ", " << N_rank << ">: Using stack traversal" << endl;#endif // A curve was available -- use fast traversal. array.evaluateWithFastTraversal(*order, expr, T_update()); return true; } return false; }};#endif // BZ_ARRAY_SPACE_FILLING_TRAVERSAL#endif // BZ_HAVE_STDtemplate<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>& Array<T_numtype, N_rank>::evaluate(T_expr expr, T_update){ // Check that all arrays have the same shape#ifdef BZ_DEBUG if (!expr.shapeCheck(shape())) { if (assertFailMode == false) { cerr << "[Blitz++] Shape check failed: Module " << __FILE__ << " line " << __LINE__ << endl << " Expression: "; prettyPrintFormat format(true); // Use terse formatting BZ_STD_SCOPE(string) str; expr.prettyPrint(str, format); cerr << str << endl ; }#if 0// Shape dumping is broken by change to using string for prettyPrint << " Shapes: " << shape() << " = "; prettyPrintFormat format2; format2.setDumpArrayShapesMode(); expr.prettyPrint(cerr, format2); cerr << endl;#endif BZ_PRE_FAIL; }#endif BZPRECHECK(expr.shapeCheck(shape()), "Shape check failed." << endl << "Expression:"); BZPRECHECK((T_expr::rank == N_rank) || (T_expr::numArrayOperands == 0), "Assigned rank " << T_expr::rank << " expression to rank " << N_rank << " array."); /* * Check that the arrays are not empty (e.g. length 0 arrays) * This fixes a bug found by Peter Bienstman, 6/16/99, where * Array<double,2> A(0,0),B(0,0); B=A(tensor::j,tensor::i); * went into an infinite loop. */ if (numElements() == 0) return *this;#ifdef BZ_DEBUG_TRAVERSE cout << "T_expr::numIndexPlaceholders = " << T_expr::numIndexPlaceholders << endl; cout.flush();#endif // Tau profiling code. Provide Tau with a pretty-printed version of // the expression. // NEEDS_WORK-- use a static initializer somehow.#ifdef BZ_TAU_PROFILING static BZ_STD_SCOPE(string) exprDescription; if (!exprDescription.length()) // faked static initializer { exprDescription = "A"; prettyPrintFormat format(true); // Terse mode on format.nextArrayOperandSymbol(); T_update::prettyPrint(exprDescription); expr.prettyPrint(exprDescription, format); } TAU_PROFILE(" ", exprDescription, TAU_BLITZ);#endif // Determine which evaluation mechanism to use if (T_expr::numIndexPlaceholders > 0) { // The expression involves index placeholders, so have to // use index traversal rather than stack traversal. if (N_rank == 1) return evaluateWithIndexTraversal1(expr, T_update()); else return evaluateWithIndexTraversalN(expr, T_update()); } else { // If this expression looks like an array stencil, then attempt to // use a fast traversal order. // Fast traversals require <set> from the ISO/ANSI C++ standard // library.#ifdef BZ_HAVE_STD#ifdef BZ_ARRAY_SPACE_FILLING_TRAVERSAL enum { isStencil = (N_rank >= 3) && (T_expr::numArrayOperands > 6) && (T_expr::numIndexPlaceholders == 0) }; if (_bz_tryFastTraversal<isStencil>::tryFast(*this, expr, T_update())) return *this;#endif#endif#ifdef BZ_ARRAY_2D_STENCIL_TILING // Does this look like a 2-dimensional stencil on a largeish // array? if ((N_rank == 2) && (T_expr::numArrayOperands >= 5)) { // Use a heuristic to determine whether a tiled traversal // is desirable. First, estimate how much L1 cache is needed // to achieve a high hit rate using the stack traversal. // Try to err on the side of using tiled traversal even when // it isn't strictly needed. // Assumptions: // Stencil width 3 // 3 arrays involved in stencil // Uniform data type in arrays (all T_numtype) int cacheNeeded = 3 * 3 * sizeof(T_numtype) * length(ordering(0)); if (cacheNeeded > BZ_L1_CACHE_ESTIMATED_SIZE) return evaluateWithTiled2DTraversal(expr, T_update()); }#endif // If fast traversal isn't available or appropriate, then just // do a stack traversal. if (N_rank == 1) return evaluateWithStackTraversal1(expr, T_update()); else return evaluateWithStackTraversalN(expr, T_update()); }}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithStackTraversal1( T_expr expr, T_update){#ifdef BZ_DEBUG_TRAVERSE BZ_DEBUG_MESSAGE("Array<" << BZ_DEBUG_TEMPLATE_AS_STRING_LITERAL(T_numtype) << ", " << N_rank << ">: Using stack traversal");#endif FastArrayIterator<T_numtype, N_rank> iter(*this); iter.loadStride(firstRank); expr.loadStride(firstRank); bool useUnitStride = iter.isUnitStride(firstRank) && expr.isUnitStride(firstRank);#ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE int commonStride = expr.suggestStride(firstRank); if (iter.suggestStride(firstRank) > commonStride) commonStride = iter.suggestStride(firstRank); bool useCommonStride = iter.isStride(firstRank,commonStride) && expr.isStride(firstRank,commonStride); #ifdef BZ_DEBUG_TRAVERSE BZ_DEBUG_MESSAGE("BZ_ARRAY_EXPR_USE_COMMON_STRIDE:" << endl << " commonStride = " << commonStride << " useCommonStride = " << useCommonStride); #endif#else int commonStride = 1; bool useCommonStride = false;#endif const T_numtype * last = iter.data() + length(firstRank) * stride(firstRank); if (useUnitStride || useCommonStride) {#ifdef BZ_USE_FAST_READ_ARRAY_EXPR#ifdef BZ_DEBUG_TRAVERSE BZ_DEBUG_MESSAGE("BZ_USE_FAST_READ_ARRAY_EXPR with commonStride");#endif int ubound = length(firstRank) * commonStride; T_numtype* restrict data = const_cast<T_numtype*>(iter.data()); if (commonStride == 1) { #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL for (int i=0; i < ubound; ++i) T_update::update(*data++, expr.fastRead(i)); #else int n1 = ubound & 3; int i = 0; for (; i < n1; ++i) T_update::update(*data++, expr.fastRead(i)); for (; i < ubound; i += 4) {#ifndef BZ_ARRAY_STACK_TRAVERSAL_CSE_AND_ANTIALIAS T_update::update(*data++, expr.fastRead(i)); T_update::update(*data++, expr.fastRead(i+1)); T_update::update(*data++, expr.fastRead(i+2)); T_update::update(*data++, expr.fastRead(i+3));#else const int t1 = i+1; const int t2 = i+2; const int t3 = i+3; _bz_typename T_expr::T_numtype tmp1, tmp2, tmp3, tmp4; tmp1 = expr.fastRead(i); tmp2 = expr.fastRead(BZ_NO_PROPAGATE(t1)); tmp3 = expr.fastRead(BZ_NO_PROPAGATE(t2)); tmp4 = expr.fastRead(BZ_NO_PROPAGATE(t3)); T_update::update(*data++, tmp1); T_update::update(*data++, tmp2); T_update::update(*data++, tmp3); T_update::update(*data++, tmp4);#endif } #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL } #ifdef BZ_ARRAY_EXPR_USE_COMMON_STRIDE else { #ifndef BZ_ARRAY_STACK_TRAVERSAL_UNROLL for (int i=0; i != ubound; i += commonStride) T_update::update(data[i], expr.fastRead(i)); #else int n1 = (length(firstRank) & 3) * commonStride; int i = 0; for (; i != n1; i += commonStride) T_update::update(data[i], expr.fastRead(i)); int strideInc = 4 * commonStride; for (; i != ubound; i += strideInc) { T_update::update(data[i], expr.fastRead(i)); int i2 = i + commonStride; T_update::update(data[i2], expr.fastRead(i2)); int i3 = i + 2 * commonStride; T_update::update(data[i3], expr.fastRead(i3)); int i4 = i + 3 * commonStride; T_update::update(data[i4], expr.fastRead(i4)); } #endif // BZ_ARRAY_STACK_TRAVERSAL_UNROLL } #endif // BZ_ARRAY_EXPR_USE_COMMON_STRIDE#else // ! BZ_USE_FAST_READ_ARRAY_EXPR#ifdef BZ_DEBUG_TRAVERSE BZ_DEBUG_MESSAGE("Common stride, no fast read");#endif while (iter.data() != last) { T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); iter.advance(commonStride); expr.advance(commonStride); }#endif } else { while (iter.data() != last) { T_update::update(*const_cast<T_numtype*>(iter.data()), *expr); iter.advance(); expr.advance(); } } return *this;}template<typename T_numtype, int N_rank> template<typename T_expr, typename T_update>inline Array<T_numtype, N_rank>&Array<T_numtype, N_rank>::evaluateWithStackTraversalN( T_expr expr, T_update){ /* * A stack traversal replaces the usual nested loops: * * for (int i=A.lbound(firstDim); i <= A.ubound(firstDim); ++i) * for (int j=A.lbound(secondDim); j <= A.ubound(secondDim); ++j) * for (int k=A.lbound(thirdDim); k <= A.ubound(thirdDim); ++k) * A(i,j,k) = 0; * * with a stack data structure. The stack allows this single * routine to replace any number of nested loops. * * For each dimension (loop), these quantities are needed: * - a pointer to the first element encountered in the loop * - the stride associated with the dimension/loop * - a pointer to the last element encountered in the loop * * The basic idea is that entering each loop is a "push" onto the * stack, and exiting each loop is a "pop". In practice, this * routine treats accesses the stack in a random-access way, * which confuses the picture a bit. But conceptually, that's * what is going on. */ /* * ordering(0) gives the dimension associated with the smallest * stride (usually; the exceptions have to do with subarrays and * are uninteresting). We call this dimension maxRank; it will * become the innermost "loop". * * Ordering the loops from ordering(N_rank-1) down to * ordering(0) ensures that the largest stride is associated * with the outermost loop, and the smallest stride with the * innermost. This is critical for good performance on * cached machines. */ const int maxRank = ordering(0); // const int secondLastRank = ordering(1); // Create an iterator for the array receiving the result
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -