// psmo_solver.cpp
// Randomly select an initial working set of size n.
// (The signature below is inferred from the call site in Solve();
// the listing begins at the function body.)
void Solver_Parallel_SMO::init_working_set(int *work_set, int *not_work_set)
{
    int j;
    NEXT_RAND = 1;
    for(int i=0; i<n; ++i)
    {
        do {
            j = next_rand_pos() % l;
        } while(work_status[j] != WORK_N);
        work_status[j] = WORK_B;
    }
    int k=0;
    j=0;
    for(int i=0; i<l; ++i)
    {
        if(work_status[i] == WORK_B)
        {
            work_set[j] = i;
            ++j;
            work_count[i] = 0;
        }
        else
        {
            not_work_set[k] = i;
            ++k;
            work_count[i] = -1;
        }
    }
}

void Solver_Parallel_SMO::Solve(int l, const QMatrix& Q, const double *b_,
                                const schar *y_, double *alpha_, double Cp,
                                double Cn, double eps, SolutionInfo* si,
                                int shrinking)
{
    double total_time = MPI_Wtime();
    double problem_setup_time = 0;
    double inner_solver_time = 0;
    double gradient_updating_time = 0;
    double time = 0;

    // Initialization
    this->l = l;
    this->active_size = l;
    this->Q = &Q;
    this->si = si;
    QD = Q.get_QD();
    clone(b,b_,l);
    clone(y,y_,l);
    clone(alpha,alpha_,l);
    this->Cp = Cp;
    this->Cn = Cn;
    this->eps = eps;
    this->lmn = l - n;
    this->G_n = new double[lmn];
    int *work_set = new int[n];
    int *not_work_set = new int[l];
    old_idx = new int[n];
    memset(old_idx, -1, sizeof(int)*n);
    double *delta_alpha = new double[n];

    // Setup alpha, work and parallel cache status
    {
        alpha_status = new char[l];
        work_status = new char[l];
        work_count = new int[l];
        p_cache_status = new char[size*l];
        idx_cached = new int[n];
        idx_not_cached = new int[n];
        nz = new int[n];
        for(int i=0; i<l; ++i)
        {
            update_alpha_status(i);
            work_status[i] = WORK_N;
            work_count[i] = -1;
            // Index as k*l+i: one row of length l per processor,
            // matching the layout used in determine_cached().
            for(int k=0; k<size; ++k)
                p_cache_status[k*l+i] = NOT_CACHED;
        }
    }

    // Setup local index ranges
    {
        this->l_low = new int[size];
        this->l_up = new int[size];
        this->n_low = new int[size];
        this->n_up = new int[size];
        this->lmn_low = new int[size];
        this->lmn_up = new int[size];
        setup_range(l_low, l_up, l);
        setup_range(n_low, n_up, n);
        setup_range(lmn_low, lmn_up, lmn);
        this->l_low_loc = l_low[rank];
        this->l_up_loc = l_up[rank];
        this->n_low_loc = n_low[rank];
        this->n_up_loc = n_up[rank];
        this->lmn_up_loc = lmn_up[rank];
        this->lmn_low_loc = lmn_low[rank];
//      this->local_l = l_up_loc - l_low_loc;
//      this->local_n = n_up_loc - n_low_loc;
//      this->local_lmn = lmn_up_loc - lmn_low_loc;
    }

    // Setup gradient
    {
        info("Initializing gradient..."); info_flush();
        G = new double[l];
        double *G_send = new double[l];
        double *G_recv = new double[l];
        for(int i=0; i<l; ++i)
        {
            G[i] = b[i];
            G_send[i] = 0;
        }
        // Determine even distribution of work w.r.t.
        // variables at lower bound
        int k=0;
        int count_not_lower=0;
        int *idx_not_lower = new int[l];
        for(int i=0; i<l; ++i)
        {
            if(!is_lower_bound(i))
            {
                if(rank == k)
                {
                    // We have to compute it
                    idx_not_lower[count_not_lower]=i;
                    ++count_not_lower;
                }
                k = k == size-1 ? 0 : k+1;
            }
        }
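        // Each rank now owns an interleaved subset of the variables that are
        // not at the lower bound (alpha_i > 0) and computes its share of the
        // initial dual gradient
        //     G_j = b_j + sum_i alpha_i * Q_ij,
        // where only variables with alpha_i > 0 contribute to the sum.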
        // Compute local portion of gradient
        for(int i=0; i<count_not_lower; ++i)
        {
            const Qfloat *Q_i = Q.get_Q(idx_not_lower[i],l);
            double alpha_i = alpha[idx_not_lower[i]];
            for(int j=0; j<l; ++j)
                G_send[j] += alpha_i * Q_i[j];
        }
        delete[] idx_not_lower;
        // Get contributions from other processors.
        // G_send/G_recv hold doubles, so broadcast as MPI_DOUBLE.
        for(int k=0; k<size; ++k)
        {
            if(rank == k)
            {
                ierr = MPI_Bcast(G_send, l, MPI_DOUBLE, k, comm);
                CheckError(ierr);
                for(int i=0; i<l; ++i)
                    G[i] += G_send[i];
            }
            else
            {
                ierr = MPI_Bcast(G_recv, l, MPI_DOUBLE, k, comm);
                CheckError(ierr);
                for(int i=0; i<l; ++i)
                    G[i] += G_recv[i];
            }
        }
        delete[] G_recv;
        delete[] G_send;
        info("done.\n"); info_flush();
        ierr = MPI_Barrier(comm);
        CheckError(ierr);
    }
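    // Note (illustrative, not original code): the rotating broadcasts above
    // sum all ranks' local contributions, i.e. an all-reduce. Assuming
    // G_send still holds this rank's portion, a single collective would
    // produce the same result:
    //
    //     ierr = MPI_Allreduce(MPI_IN_PLACE, G_send, l, MPI_DOUBLE,
    //                          MPI_SUM, comm);
    //     CheckError(ierr);
    //     for(int i=0; i<l; ++i) G[i] += G_send[i];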
    // Allocate space for local subproblem
    Q_bb = new Qfloat[n*n];
    QD_b = new Qfloat[n];
    alpha_b = new double[n];
    c = new double[n];
    a = new schar[n];
//  time = clock();
//  double tmp = 0;
//  for(int i=0; i<100; ++i)
//  {
//      for(int j=0; j<l; ++j)
//      {
//          tmp = Q.get_non_cached(i,j);
//      }
//  }
//  time = clock() - time;
//  printf("Computation time for 100 kernel rows = %.2lf\n",
//         (double)time/CLOCKS_PER_SEC);
    if(rank == 0)
    {
        info(" it | setup time | solver it | solver time | gradient time ");
        info("| kkt violation\n");
    }
    iter=0;
    // Optimization loop
    while(1)
    {
        // Only one processor does the working set selection.
        int status = 0;
        if(rank == 0)
        {
            if(iter > 0)
            {
                time = MPI_Wtime();
                // info("Starting working set selection\n"); info_flush();
                status = select_working_set(work_set, not_work_set);
                // info("select ws time = %.2f\n", MPI_Wtime() - time);
            }
            else
            {
                // info("Starting init ws\n"); info_flush();
                init_working_set(work_set, not_work_set);
            }
        }
        // Send status to other processors.
        ierr = MPI_Bcast(&status, 1, MPI_INT, 0, comm);
        // Now check for optimality
        if(status != 0) break;
        // Send new eps, as select_working_set might have changed it.
        ierr = MPI_Bcast(&eps, 1, MPI_DOUBLE, 0, comm);
        // Send new working set size and working set to other processors.
        ierr = MPI_Bcast(&n, 1, MPI_INT, 0, comm);
        ierr = MPI_Bcast(work_set, n, MPI_INT, 0, comm);
        lmn = l - n;
        ierr = MPI_Bcast(not_work_set, lmn, MPI_INT, 0, comm);
        // Recompute ranges, as n and lmn might have changed
        setup_range(n_low, n_up, n);
        this->n_low_loc = n_low[rank];
        this->n_up_loc = n_up[rank];
        setup_range(lmn_low, lmn_up, lmn);
        this->lmn_low_loc = lmn_low[rank];
        this->lmn_up_loc = lmn_up[rank];
//      this->local_n = n_up_loc - n_low_loc;
//      this->local_lmn = lmn_up_loc - lmn_low_loc;
        ++iter;
        // Setup subproblem
        time = MPI_Wtime();
        for(int i=0; i<n; ++i)
        {
            c[i] = G[work_set[i]];
            alpha_b[i] = alpha[work_set[i]];
            a[i] = y[work_set[i]];
        }
        // info("Setting up Q_bb..."); info_flush();
        for(int i=n_low_loc; i<n_up_loc; ++i)
        {
//          const Qfloat *Q_i = Q.get_Q_subset(work_set[i],work_set,n);
            if(Q.is_cached(work_set[i]))
            {
                const Qfloat *Q_i = Q.get_Q_subset(work_set[i], work_set, n);
                for(int j=0; j<=i; ++j)
                    Q_bb[i*n+j] = Q_i[work_set[j]];
            }
            else if(old_idx[i] == -1)
            {
                for(int j=0; j<=i; ++j)
                {
//                  Q_bb[i*n+j] = Q_i[work_set[j]];
                    Q_bb[i*n+j] = Q.get_non_cached(work_set[i], work_set[j]);
                }
            }
            else
            {
                for(int j=0; j<i; ++j)
                {
                    if(old_idx[j] == -1)
                        Q_bb[i*n+j] = Q.get_non_cached(work_set[i], work_set[j]);
                    else
                        Q_bb[i*n+j] = Q_bb[old_idx[j]*n+old_idx[i]];
                }
                Q_bb[i*n+i] = Q.get_non_cached(work_set[i], work_set[i]);
            }
        }
        // Synchronize Q_bb
        ierr = MPI_Barrier(comm);
        CheckError(ierr);
        int num_elements = 0;
        for(int k=0; k<size; ++k)
        {
            ierr = MPI_Bcast(&Q_bb[num_elements], (n_up[k]-n_low[k])*n,
                             MPI_FLOAT, k, comm);
            CheckError(ierr);
            num_elements += (n_up[k]-n_low[k])*n;
        }
        // Complete symmetric Q
        for(int i=0; i<n; ++i)
        {
            for(int j=0; j<i; ++j)
                Q_bb[j*n+i] = Q_bb[i*n+j];
        }
        for(int i=0; i<n; ++i)
        {
            for(int j=0; j<n; ++j)
            {
                if(alpha[work_set[j]] > TOL_ZERO)
                    c[i] -= Q_bb[i*n+j]*alpha[work_set[j]];
            }
            QD_b[i] = Q_bb[i*n+i];
        }
        // info("done.\n"); info_flush();
        time = MPI_Wtime() - time;
        problem_setup_time += time;
        if(rank == 0)
        {
            info("%5d | %10.2f |", iter, time); info_flush();
            time = MPI_Wtime();
            // Call SMO inner solver
            solve_inner();
            time = MPI_Wtime() - time;
            info(" %11.2f |", time); info_flush();
            inner_solver_time += time;
        }
        // Send alpha_b to other processors
        ierr = MPI_Bcast(alpha_b, n, MPI_DOUBLE, 0, comm);
        CheckError(ierr);
        // Update gradient.
        time = MPI_Wtime();
        for(int i=0; i<n; ++i)
        {
            delta_alpha[i] = alpha_b[i] - alpha[work_set[i]];
            if(fabs(delta_alpha[i]) > TOL_ZERO)
                nz[i] = 1;
            else
                nz[i] = 0;
        }
        // info("Updating G_b..."); info_flush();
        // Compute G_b
        if(rank == 0)
        {
            // Only first processor does updating, since
            // it has the whole Q_bb
            for(int i=0; i<n; ++i)
                for(int j=0; j<n; ++j)
                {
                    if(nz[j])
                        G[work_set[i]] += Q_bb[n*i+j]*delta_alpha[j];
                }
        }
        // info("done.\n"); info_flush();
        determine_cached(work_set);
        // info("count_cached[%d] = %d, count_not_cached[%d] = %d\n", rank,
        //      count_cached, rank, count_not_cached); info_flush();
        // MPI_Barrier(comm);
        // printf("count_cached = %d\n", count_cached);
        // info("Updating G_n..."); info_flush();
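        // For variables outside the working set B the gradient changes by
        //     G_j += sum_{i in B} Q_ij * delta_alpha_i   (j not in B),
        // and only entries with delta_alpha_i != 0 (nz[i] set) contribute.
        // determine_cached() has just dealt those kernel rows out across the
        // ranks, so each rank accumulates only part of the sum into its
        // local G_n; sync_gradient() then adds the parts together.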
        // Compute G_n
        for(int j=0; j<lmn; ++j)
            G_n[j] = 0;
        // First update the cached part...
        for(int i=0; i<count_cached; ++i)
        {
            const Qfloat *Q_i = Q.get_Q_subset(work_set[idx_cached[i]],
                                               not_work_set, lmn);
            for(int j=0; j<lmn; ++j)
                G_n[j] += Q_i[not_work_set[j]] * delta_alpha[idx_cached[i]];
        }
        // ...now update the non-cached part
        for(int i=0; i<count_not_cached; ++i)
        {
            const Qfloat *Q_i = Q.get_Q_subset(work_set[idx_not_cached[i]],
                                               not_work_set, lmn);
            for(int j=0; j<lmn; ++j)
                G_n[j] += Q_i[not_work_set[j]] * delta_alpha[idx_not_cached[i]];
        }
        // info("done.\n"); info_flush();
        // Synchronize gradient with other processors
        // info("Synchronizing gradient..."); info_flush();
        sync_gradient(work_set, not_work_set);
        // info("done.\n"); info_flush();
        time = MPI_Wtime() - time;
        gradient_updating_time += time;
        if(rank == 0)
            info(" %13.2f |", time);
        info_flush();
        // Update alpha
        for(int i=0; i<n; ++i)
        {
            alpha[work_set[i]] = alpha_b[i];
            update_alpha_status(work_set[i]);
        }
    } // while(1)

    // Calculate rho
    si->rho = calculate_rho();
    // Calculate objective value
    {
        double v = 0;
        int i;
        for(i=0; i<l; i++)
            v += alpha[i] * (G[i] + b[i]);
        si->obj = v/2;
    }
    // Put back the solution
    {
        for(int i=0; i<l; i++)
            alpha_[i] = alpha[i];
    }
    si->upper_bound_p = Cp;
    si->upper_bound_n = Cn;
    total_time = MPI_Wtime() - total_time;
    // print timing statistics
    if(rank == 0)
    {
        info("\n");
        info("Total opt. time = %.2lf\n", total_time); info_flush();
        info("Problem setup time = %.2lf (%.2lf%%)\n", problem_setup_time,
             problem_setup_time/total_time*100); info_flush();
        info("Inner solver time = %.2lf (%.2lf%%)\n", inner_solver_time,
             inner_solver_time/total_time*100); info_flush();
        info("Gradient updating time = %.2lf (%.2lf%%)\n", gradient_updating_time,
             gradient_updating_time/total_time*100); info_flush();
    }
    MPI_Barrier(comm);
    info("\noptimization finished, #iter = %d\n", iter);
    // Clean up
    delete[] b;
    delete[] y;
    delete[] alpha;
    delete[] alpha_status;
    delete[] delta_alpha;
    delete[] old_idx;   // also allocated above
    delete[] work_status;
    delete[] work_count;
    delete[] work_set;
    delete[] not_work_set;
    delete[] G;
    delete[] Q_bb;
    delete[] QD_b;
    delete[] alpha_b;
    delete[] c;
    delete[] a;
    delete[] l_low;
    delete[] l_up;
    delete[] n_low;
    delete[] n_up;
    delete[] lmn_low;
    delete[] lmn_up;
    delete[] G_n;
    delete[] p_cache_status;
    delete[] idx_cached;
    delete[] idx_not_cached;
    delete[] nz;
}
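// determine_cached: distribute the kernel rows of the current working set
// across the processors. A row that already sits in some rank's kernel
// cache is claimed by the first such rank (the next_k cursor rotates the
// starting rank so that multiply-cached rows spread out); rows cached
// nowhere are dealt out round-robin via next_not_cached.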
void Solver_Parallel_SMO::determine_cached(int *work_set)
{
    count_cached = 0;
    count_not_cached = 0;
    // Update local part of the parallel cache status
    for(int i=0; i<l; ++i)
    {
        if(Q->is_cached(i))
            p_cache_status[rank*l + i] = CACHED;
        else
            p_cache_status[rank*l + i] = NOT_CACHED;
    }
    // Synchronize parallel cache status
    for(int k=0; k<size; ++k)
    {
        ierr = MPI_Bcast(&p_cache_status[k*l], l, MPI_CHAR, k, comm);
        CheckError(ierr);
    }
    // Smart parallel cache handling
    int next_k = 0;
    bool found = false;
    int next_not_cached = 0;
    for(int i=0; i<n; ++i)
    {
        if(nz[i])
        {
            for(int k=next_k; !found && k<size; ++k)
            {
                if(p_cache_status[k*l + work_set[i]] == CACHED)
                {
                    if(k == rank) // Do we have it?
                    {
                        idx_cached[count_cached] = i;
                        ++count_cached;
                    }
                    found = true;
                    next_k = k == size-1 ? 0 : k+1;
                }
            }
            for(int k=0; !found && k<next_k; ++k)
            {
                if(p_cache_status[k*l + work_set[i]] == CACHED)
                {
                    if(k == rank) // Do we have it?
                    {
                        idx_cached[count_cached] = i;
                        ++count_cached;
                    }
                    found = true;
                    next_k = k == size-1 ? 0 : k+1;
                }
            }
            if(!found) // Not in any cache
            {
                if(rank == next_not_cached) // Do we have to compute it?
                {
                    idx_not_cached[count_not_cached] = i;
                    ++count_not_cached;
                }
                next_not_cached = next_not_cached == size-1 ? 0 : next_not_cached + 1;
            }
            found = false;
        } // if(nz[i])
    }
}

void Solver_Parallel_SMO::setup_range(int *range_low, int *range_up, int total_sz)
{
    int local_sz = total_sz/size;
    int idx_up = local_sz;
    int idx_low = 0;
    if(total_sz != 0)
    {
        for(int i=0; i<size-1; ++i)
        {
            range_low[i] = idx_low;
            range_up[i] = idx_up;
            idx_low = idx_up;
            idx_up = idx_low + local_sz + 1;
        }
        range_low[size-1] = idx_low;
        range_up[size-1] = total_sz;
    }
    else
    {
        for(int i=0; i<size; ++i)
        {
            range_low[i] = 0;
            range_up[i] = 0;
        }
    }
}

void Solver_Parallel_SMO::sync_gradient(int *work_set, int *not_work_set)
{
    // Synchronize G_b
    double *G_buf = new double[l];
    if(rank == 0)
    {
        for(int i=0; i<n; ++i)
            G_buf[i] = G[work_set[i]];
    }
    ierr = MPI_Bcast(G_buf, n, MPI_DOUBLE, 0, comm);
    CheckError(ierr);
    if(rank != 0)
    {
        for(int i=0; i<n; ++i)
            G[work_set[i]] = G_buf[i];
    }
    // Synchronize G_n
    for(int i=0; i<size; ++i)
    {
        if(rank == i)
        {
            for(int j=0; j<lmn; ++j)
                G_buf[j] = G_n[j];
        }
        ierr = MPI_Bcast(G_buf, lmn, MPI_DOUBLE, i, comm);
        CheckError(ierr);
        // Accumulate contributions
        for(int j=0; j<lmn; ++j)
            G[not_work_set[j]] += G_buf[j];
    }
    delete[] G_buf;
}
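// ---------------------------------------------------------------------------
// Illustration (not part of the original solver): setup_range() splits
// total_sz items into [low, up) blocks, one per processor. The first rank
// gets total_sz/size items, middle ranks get total_sz/size + 1, and the last
// rank takes whatever remains, which can even be empty (e.g. total_sz = 8
// with size = 4 yields [8,8)). A standalone sketch of the same scheme, with
// hypothetical names (demo_setup_range, PSMO_RANGE_DEMO); the total_sz == 0
// edge case handled by the member function above is omitted here.
#ifdef PSMO_RANGE_DEMO
#include <cstdio>

// Same partition logic as Solver_Parallel_SMO::setup_range above.
static void demo_setup_range(int *range_low, int *range_up,
                             int total_sz, int size)
{
    int local_sz = total_sz/size;
    int idx_low = 0;
    int idx_up = local_sz;
    for(int i=0; i<size-1; ++i)
    {
        range_low[i] = idx_low;
        range_up[i] = idx_up;
        idx_low = idx_up;
        idx_up = idx_low + local_sz + 1;
    }
    range_low[size-1] = idx_low;
    range_up[size-1] = total_sz;
}

int main()
{
    int low[4], up[4];
    demo_setup_range(low, up, 10, 4);
    // Prints: rank 0: [0,2)  rank 1: [2,5)  rank 2: [5,8)  rank 3: [8,10)
    for(int i=0; i<4; ++i)
        printf("rank %d: [%d,%d)\n", i, low[i], up[i]);
    return 0;
}
#endif // PSMO_RANGE_DEMO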