?? dispatch.cc
字號:
/* * Copyright (c) 2000, 2001, 2002, 2003, 2004, 2005 * The Regents of The University of Michigan * All Rights Reserved * * This code is part of the M5 simulator, developed by Nathan Binkert, * Erik Hallnor, Steve Raasch, and Steve Reinhardt, with contributions * from Ron Dreslinski, Dave Greene, Lisa Hsu, Kevin Lim, Ali Saidi, * and Andrew Schultz. * * Permission is granted to use, copy, create derivative works and * redistribute this software and such derivative works for any * purpose, so long as the copyright notice above, this grant of * permission, and the disclaimer below appear in all copies made; and * so long as the name of The University of Michigan is not used in * any advertising or publicity pertaining to the use or distribution * of this software without specific, written prior authorization. * * THIS SOFTWARE IS PROVIDED AS IS, WITHOUT REPRESENTATION FROM THE * UNIVERSITY OF MICHIGAN AS TO ITS FITNESS FOR ANY PURPOSE, AND * WITHOUT WARRANTY BY THE UNIVERSITY OF MICHIGAN OF ANY KIND, EITHER * EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE. THE REGENTS OF THE UNIVERSITY OF MICHIGAN SHALL NOT BE * LIABLE FOR ANY DAMAGES, INCLUDING DIRECT, SPECIAL, INDIRECT, * INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WITH RESPECT TO ANY CLAIM * ARISING OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE, EVEN * IF IT HAS BEEN OR IS HEREAFTER ADVISED OF THE POSSIBILITY OF SUCH * DAMAGES. */#include <algorithm>#include "base/cprintf.hh"#include "base/predictor.hh"#include "base/statistics.hh"#include "encumbered/cpu/full/cpu.hh"#include "encumbered/cpu/full/create_vector.hh"#include "encumbered/cpu/full/dd_queue.hh"#include "encumbered/cpu/full/dep_link.hh"#include "encumbered/cpu/full/dyn_inst.hh"#include "encumbered/cpu/full/fetch.hh"#include "encumbered/cpu/full/floss_reasons.hh"#include "encumbered/cpu/full/iq/iqueue.hh"#include "encumbered/cpu/full/iq/segmented/chain_info.hh"#include "encumbered/cpu/full/iq/segmented/chain_wire.hh"#include "encumbered/cpu/full/readyq.hh"#include "encumbered/cpu/full/reg_info.hh"#include "encumbered/cpu/full/rob_station.hh"#include "encumbered/cpu/full/spec_state.hh"#include "sim/stats.hh"using namespace std;extern const char *dispatch_policy_strings[];//// Debugging break point//InstSeqNum dispatch_break = 0;voiddispatch_breakpoint(){ cprintf("got to the DISP break_point!\n");}#define MODULO_VAL 4// Setting this to 1 will cause instructions w/ no pending ideps// to become chain heads (only if it produces a result)#define CHAIN_HEAD_IND_INSTS 0#define DUMP_CHAIN_INFO 0// Setting this will cause the contents of IQ[0] to be dumped every dispatch cycle#define DUMP_IQ 0//// For statistics...//const char *chain_cr_class_desc[] = { " Inst has no outstanding IDEPS", " IDEP chain reached max depth", " Inst is a load", " Chain has multiple chained IDEPS"};enum DispatchInstClass{ INSN_CLASS_ALL_RDY=0, // All operands ready INSN_CLASS_ONE_NOT_RDY, // All but one operand ready INSN_CLASS_MULT_CHAINS, // None ready INSN_CLASS_ONE_CHAINED, // None ready INSN_CLASS_ALL_SELF_TIMED, // None ready NUM_INSN_CLASSES};const char *dispatchInstClassDesc[] = { " All ops ready", " One op not ready", " None rdy, mult chains", " None rdy, one chained", " None rdy, all self-timed"};//// THE BIG PICTURE://// Move instructions from the fetch queue into the decode/dispatch queue.// Operation of the decodeQueue is "backwards" ("dispatch" end of pipe first)// so that we can grab instructions out of the tail end (dispatch end) of the// pipe during the same cycle we place insts into the head (decode end) of// the pipe.///*Notes regarding dispatch stage operation:---For the MULTI-THREADED Front-End: (1) Pick a destination IQ for each thread. If a thread can not dispatch to that IQ, then it can't dispatch. - Static allocation: dest_iq = thread_number % numIQueues - Modulo-N allocation: dest_iq = ++dest_iq % numIQueues (Only if dispatch_count % N is zero) ==> Switch to the next IQ every N instructions, but stop dispatch if select IQ is full. (2) Develop a "score" for each thread. The thread with the *largest* score will be selected for dispatch. The following situations can block dispatch for a thread. This is indicated to the final choice code by setting the score to INT_MIN (from <limits.h>): - IQ has met it's cap - ROB has met it's cap - Insufficient IQ slots - Insufficient LSQ slots - Insufficient ROB slots - Insufficient physical registers (3) Depending upon fetch policy, the following modifications are made: ICOUNT: the sum of the count of instructions in the IQ plus any bias value is *subtracted* from the score All Others: the number of cycles since a thread's last dispatch is *added* to the score (4) Select the thread with the smallest score to dispatch.For the SINGLE-THREADED Front-End: This is the same as for the MT Front-End, except we skip step (1).---Once a thread is chosen for dispatch, we know it is _possible_ for allinstructions in the packet to dispatch. But we also know that individualinstructions may not dispatch due to "objections" from the IQ, LSQ, or ROB.==> If any of the instructions in the packet do not dispatch, then that pipe- stage DOES NOT advance (since there is still an inst in the last stage).NOP's and squashed instructions are dropped, but do count against the dispatchbandwidth.Since it is possible that an instruction will not dispatch, we need to peekinto the dec_disp_queue, do the dispatch, and if the dispatch succeeds, onlythen remove the instruction from the queue.*/////////////////////////////////////////////////////////////////// Determine which IQ this instruction should use////////////////////////////////////////////////////////////////intFullCPU::choose_iqueue(unsigned thread){ int iq_idx = 0; if (numIQueues > 1) { // // Initial choice of IQ for this instruction // switch (dispatch_policy) { case DEPENDENCE: // // Access the register-info file and choose the destination // cluster based on dependence information for each instruction // // that means that doing it here (by thread) makes no sense // break; case MODULO_N: // MODULO-N algorithm puts 'n' inst blocks into the same // queue. // // if the "correct" queue doesn't have enough spots, stall // iq_idx = mod_n_queue_idx; break; case THREAD_PER_QUEUE: // Allocate one thread per queue (wrap if there aren't // enough queues) --> static partitioning iq_idx = thread % numIQueues; break; default: panic("dispatch stage misconfigured"); break; } } return iq_idx;}//// This structure holds the information we need for each thread to// rank them for dispatching//struct ThreadList{ Tick score; Tick lastDispatchTime; unsigned thread_number; unsigned iq_idx; unsigned disp_insts; bool eligable; ThreadList() { score = 0; eligable = false; }};class ThreadListSortComp{ public: bool operator()(const ThreadList &l1, const ThreadList &l2) const { if (l1.eligable && !l2.eligable) return true; // we want l1 to go before (lower) than l2 if (!l1.eligable && l2.eligable) return false; // we want l1 to go after (highter) than l2 if (l1.score == l2.score) { // RR operation for these two threads... return l1.lastDispatchTime < l2.lastDispatchTime; } return l1.score > l2.score; // hopefully an ASCENDING sort }};//// Returns true if this cluster is good for dispatch//boolFullCPU::checkClusterForDispatch(unsigned clust, bool chainHead) { bool rv = (IQ[clust]->free_slots() != 0); if (rv && chainHead) rv = (chainWires->freeWires(clust) != 0); return rv;}DispatchEndCauseFullCPU::checkThreadForDispatch(unsigned t, unsigned idx, unsigned needSlots){ // Check for free slots in the IQ if (IQ[idx]->free_slots() < needSlots) { IQ[idx]->markFull(); return FLOSS_DIS_IQ_FULL; } // Check for IQ cap if (IQ[idx]->cap_met(t)) { iq_cap_events[t]++; iq_cap_inst_count[t] += ifq[t].num_total(); // Indicate to fetch stage that the cap is active iq_cap_active[t] = true; return FLOSS_DIS_IQ_CAP; } // Check for ROB Caps if ((rob_cap[t] < ROB_size) && (ROB.num_thread(t) >= rob_cap[t])) { rob_cap_events[t]++; rob_cap_inst_count[t] += ifq[t].num_total(); // Indicate to fetch stage that the cap is active rob_cap_active[t] = true; return FLOSS_DIS_ROB_CAP; } // // Check for available IQ bandwidth // if (IQ[idx]->add_bw() < needSlots) return FLOSS_DIS_BW; DispatchEndCause c = checkGlobalResourcesForDispatch(needSlots); if (c != FLOSS_DIS_CAUSE_NOT_SET) return c; return FLOSS_DIS_CAUSE_NOT_SET;}DispatchEndCauseFullCPU::checkGlobalResourcesForDispatch(unsigned needSlots){ // Check for free slots in the LSQ if (LSQ->free_slots() < needSlots) { ++lsq_fcount; return FLOSS_DIS_LSQ_FULL; } // Check for free slots in the ROB if (ROB.num_free() < needSlots) { ++ROB_fcount; return FLOSS_DIS_ROB_FULL; } // Check for sufficient INT physical registers if (free_int_physical_regs < needSlots) { ++reg_int_full; return FLOSS_DIS_IREG_FULL; } // Check for sufficient FP physical registers if (free_fp_physical_regs < needSlots) { ++reg_fp_full; return FLOSS_DIS_FPREG_FULL; } return FLOSS_DIS_CAUSE_NOT_SET;}//============================================================================//// This dispatch stage itself:// - Determine the (first) IQ that each thread should dispatch to// - Calculate a score for each thread// - Sort by score -- high score gets a chance to dispatch first// - Dispatch thread(s) to IQ(s) as appropriate for IQ configuration and// dispatch policy//voidFullCPU::dispatch(){ DispatchEndCause &endCause = floss_state.dispatch_end_cause; endCause = FLOSS_DIS_CAUSE_NOT_SET; // We will sort this vector to determine which thread to attempt // to dispatch first, second, etc... vector<ThreadList> tlist(SMT_MAX_THREADS); // // bail early if no instructions to dispatch // if (decodeQueue->instsAvailable() == 0) { endCause = FLOSS_DIS_NO_INSN; return; } endCause = checkGlobalResourcesForDispatch(1); if (endCause != FLOSS_DIS_CAUSE_NOT_SET) return; m5_assert(chainWires == 0 || chainWires->sanityCheckOK()); m5_assert(clusterSharedInfo->ci_table == 0 || clusterSharedInfo->ci_table->sanityCheckOK()); // // For each thread: // - Populate ThreadList entry // - Check for dispatch-able instrucitons // - Calculate score // for (int t = 0; t < SMT_MAX_THREADS; ++t) { int idx = choose_iqueue(t); if (idx < 0) { // no available clusters tlist[t].eligable = false; SET_FIRST_FLOSS_CAUSE(endCause, FLOSS_DIS_POLICY); continue; // try next thread } tlist[t].iq_idx = idx; tlist[t].thread_number = t; tlist[t].lastDispatchTime = lastDispatchTime[t]; // how many insturctions can we possibly dispatch? tlist[t].disp_insts = decodeQueue->instsAvailable(t); if (tlist[t].disp_insts > dispatch_width) tlist[t].disp_insts = dispatch_width; if (tlist[t].disp_insts == 0) { tlist[t].eligable = false; SET_FIRST_FLOSS_CAUSE(endCause, FLOSS_DIS_NO_INSN); continue; // try next thread } // // modify the score based on the fetch policy // unsigned adj = 0; switch (fetch_policy) { case IC: // The number of slots available to this thread without regard // to the cap... tlist[t].score += IQNumSlots; // // adjust the score by the number of instructions in the IQ // ==> Be careful not to underflow the unsigned score value // adj = IQNumInstructions(t) + static_icount_bias[t]; if (adj > tlist[t].score) tlist[t].score = 0; else tlist[t].score -= adj; break; default: // The number of cycles since this thread last dispatched tlist[t].score += (curTick - lastDispatchTime[t]); break; } tlist[t].eligable = true; } // // Now that the scores have been calculated... sort threads... // sort(tlist.begin(), tlist.end(), ThreadListSortComp()); // // If the first element isn't going to dispatch, none are... // --> bail out early // if (tlist[0].eligable) { // reset the end cause... we've got something to dispatch... endCause = FLOSS_DIS_CAUSE_NOT_SET; } else { // The only possible floss cause here is NO_INSN return; } //--------------------------------------------------------------------- // // We're finally ready to start dispatching. If there is only one // IQ, then we simply dispatch the thread with the highest score to // the sole IQ. // // The clustered architecture is slightly more interesting... // unsigned dispatched_this_cycle = 0; if (numIQueues == 1) { // // Non-clustered architecture: // for (int i = 0; i < number_of_threads; ++i) { // early exit... if (!tlist[i].eligable) break; unsigned thread = tlist[i].thread_number; unsigned count = dispatch_thread(thread, 0, 0, endCause); if (endCause == FLOSS_DIS_CAUSE_NOT_SET && count == dispatch_width) endCause = FLOSS_DIS_BW; if (count) { lastDispatchTime[thread] = curTick; // exit after we dispatch from a thread break; } }#if DUMP_IQ IQ[0]->dump();#endif return; } //------------------------------------------------------------------------ // // Clustered machine... // // Dispatch behavior depends on dispatch policy // unsigned iq_idx = tlist[0].iq_idx; unsigned thread = tlist[0].thread_number; bool done = false; switch (dispatch_policy) { case DEPENDENCE: // // We dispatch A SINGLE thread to as many IQ's as necessary. // // rotate through all the IQ's until: // (1) We run out of instructions to dispatch // (2) We try to dispatch to an IQ, and fail // // ==> This means that we have to check each IQ for caps, etc // as we rotate through... // lastDispatchTime[thread] = curTick; do { DispatchEndCause queue_endCause = FLOSS_DIS_CAUSE_NOT_SET; // // Logic internal to dispatch_thread() will direct instructions // to the appropriate instruction queues // unsigned dispatched_this_queue = dispatch_thread(thread, iq_idx, dispatch_width, queue_endCause); dispatched_this_cycle += dispatched_this_queue; switch( queue_endCause ) { // // The following end-causes indicate that we can't dispatch // any more instructions this cycle // case FLOSS_DIS_ROB_FULL: case FLOSS_DIS_LSQ_FULL: case FLOSS_DIS_IREG_FULL: case FLOSS_DIS_FPREG_FULL: done = true; endCause = queue_endCause; break; // // The following end-causes indicate that we can't continue // dispatching this thread this cycle // case FLOSS_DIS_ROB_CAP: case FLOSS_DIS_NO_INSN: done = true; endCause = queue_endCause; break; // // The following end-causes indicate that we can't dispatch // the next instruction, so we should give up now... // case FLOSS_DIS_IQ_FULL: case FLOSS_DIS_IQ_CAP: case FLOSS_DIS_BW: done = true; break; //
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -