loqo_solver.cpp
        ierr = PLA_Obj_local_buffer(x_loc_view, (void **) &addr_buf);
        CheckError(ierr);
        alpha_new = (LOQOfloat *) addr_buf;
        ierr = PLA_Obj_local_buffer(yy_loc, (void **) &addr_buf);
        CheckError(ierr);
        dual = (LOQOfloat *) addr_buf;
        ierr = PLA_Obj_local_buffer(dist_loc_view, (void **) &addr_buf);
        dist_ = (LOQOfloat *) addr_buf;
        ++iter;

        // Setup problem for parallel LOQO solver
        time = MPI_Wtime();
        info("setup_problem...");
        setup_problem(work_set, not_work_set);
        info("done.\n"); info_flush();
        info("setup_up...");
        setup_up(work_set);
        info("done.\n"); info_flush();
        info("setup_low...");
        setup_low();
        info("done.\n"); info_flush();
        //print_problem();
        time = MPI_Wtime() - time;
        problem_setup_time += time;

        // Compute obj before optimization
        info("Computing obj before..."); info_flush();
        obj_before = 0.0;
        int start_i = 0;
        for(int i=0; i<n; ++i, ++start_i)
        {
            if(work_set[i] >= l_low_loc)
                break;
        }
        // for(int i=n_low_loc; i<n_up_loc; ++i)
        for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i)
        {
            //obj_before += alpha[work_set[i]]*c[i-n_low_loc];
            obj_before += alpha[work_set[i]]*c[i-start_i];
            for(int j=0; j<n; ++j)
                // obj_before += 0.5*alpha[work_set[i]]*Q_bb[local_n*j+(i-n_low_loc)]*
                //               alpha[work_set[j]];
                obj_before += 0.5*alpha[work_set[i]]*Q_bb[num_rows*j+(i-start_i)]*
                    alpha[work_set[j]];
        }
        // Get local changes for obj_before from other processors
        double obj_before_loc=0.0;
        double obj_before_other=0.0;
        for(int i=0; i<size; ++i)
        {
            if(i == rank)
                ierr = MPI_Bcast(&obj_before, 1, MPI_DOUBLE, i, comm);
            else
            {
                ierr = MPI_Bcast(&obj_before_loc, 1, MPI_DOUBLE, i, comm);
                obj_before_other += obj_before_loc;
            }
        }
        obj_before += obj_before_other;
        printf("obj before = %g\n", obj_before);
        fflush(stdout);
        info("done.\n"); info_flush();

        // Run solver
        time = MPI_Wtime();
        status = solve_inner(work_set);
        time = MPI_Wtime() - time;
        inner_solver_time += time;

        // Update gradient.
        time = MPI_Wtime();
        int *nz = new int[n];
        for(int i=0; i<n; ++i)
        {
            delta_alpha[i] = alpha_new[i] - alpha[work_set[i]];
            if(fabs(delta_alpha[i]) > TOL_ZERO)
                nz[i] = 1;
            else
                nz[i] = 0;
        }
        info("Updating G_b..."); info_flush();
        // Compute G_b
        for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i)
        // for(int i=n_low_loc; i<n_up_loc; ++i)
        {
            for(int j=0; j<n; ++j)
            {
                if(nz[j])
                    // G[work_set[i]] += Q_bb[j*local_n+(i-n_low_loc)] *
                    //                   delta_alpha[j];
                    G[work_set[i]] += Q_bb[j*num_rows+(i-start_i)] *
                        delta_alpha[j];
            }
        }
        info("done.\n"); info_flush();

        // Update local part of the parallel cache status
        for(int i=0; i<l; ++i)
        {
            if(Q.is_cached(i))
                p_cache_status[rank*size + i] = CACHED;
            else
                p_cache_status[rank*size + i] = NOT_CACHED;
        }
        // Synchronize parallel cache status
        for(int k=0; k<size; ++k)
        {
            ierr = MPI_Bcast(&p_cache_status[k*size], l, MPI_CHAR, k, comm);
            CheckError(ierr);
        }
        // printf("p_cache_status %d =",rank);
        // for(int k=0; k<size; ++k)
        // {
        //     for(int i=0; i<l; ++i)
        //     {
        //         printf(" %d", p_cache_status[k*size+i]);
        //     }
        //     printf("\n");
        // }
        // printf("\n");
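        // Illustrative sketch (not part of the original code): the loop above
        // publishes each rank's block of l cache-status flags with one MPI_Bcast
        // per rank. Assuming each rank's flags form one contiguous block of l
        // chars, the same exchange can be written as a single collective gather;
        // the buffer names below are hypothetical.
        //
        //     std::vector<char> my_flags(l);          // this rank's flags
        //     std::vector<char> all_flags(size * l);  // block k = flags of rank k
        //     ierr = MPI_Allgather(my_flags.data(), l, MPI_CHAR,
        //                          all_flags.data(), l, MPI_CHAR, comm);
        //     CheckError(ierr);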
        // Smart parallel cache handling
        int *idx_cached = new int[n];
        int *idx_not_cached = new int[n];
        int count_cached = 0;
        int count_not_cached = 0;
        int next_k = 0;
        bool found = false;
        int next_not_cached = 0;
        for(int i=0; i<n; ++i)
        {
            if(nz[i])
            {
                for(int k=next_k; !found && k<size; ++k)
                {
                    if(p_cache_status[k*size + work_set[i]] == CACHED)
                    {
                        if(k == rank) // Do we have it?
                        {
                            idx_cached[count_cached] = i;
                            ++count_cached;
                        }
                        found = true;
                        next_k = k == size-1 ? 0 : k+1;
                    }
                }
                for(int k=0; !found && k<next_k; ++k)
                {
                    if(p_cache_status[k*size + work_set[i]] == CACHED)
                    {
                        if(k == rank) // Do we have it?
                        {
                            idx_cached[count_cached] = i;
                            ++count_cached;
                        }
                        found = true;
                        next_k = k == size-1 ? 0 : k+1;
                    }
                }
                if(!found) // not in any cache
                {
                    if(rank == next_not_cached) // Do we have to compute it?
                    {
                        idx_not_cached[count_not_cached] = i;
                        ++count_not_cached;
                    }
                    next_not_cached = next_not_cached == size-1 ? 0 : next_not_cached + 1;
                }
                found = false;
            } // if(nz[i])
        }
        info("Updating G_n..."); info_flush();
        // Compute G_n
        for(int j=0; j<lmn; ++j)
            G_n[j] = 0;
        // printf("idx_cached %d =", rank);
        // for(int i=0; i<count_cached; ++i)
        //     printf(" %d", idx_cached[i]);
        // printf("\n");
        // printf("idx_not_cached %d =", rank);
        // for(int i=0; i<count_not_cached; ++i)
        //     printf(" %d", idx_not_cached[i]);
        // printf("\n");
        // First update the cached part...
        for(int i=0; i<count_cached; ++i)
        {
            const Qfloat *Q_i = Q.get_Q_subset(work_set[idx_cached[i]],
                                               not_work_set, lmn);
            for(int j=0; j<lmn; ++j)
                G_n[j] += Q_i[not_work_set[j]] * delta_alpha[idx_cached[i]];
        }
        // now update the non-cached part
        for(int i=0; i<count_not_cached; ++i)
        {
            const Qfloat *Q_i = Q.get_Q_subset(work_set[idx_not_cached[i]],
                                               not_work_set, lmn);
            for(int j=0; j<lmn; ++j)
                G_n[j] += Q_i[not_work_set[j]] * delta_alpha[idx_not_cached[i]];
        }
        // for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i)
        // // for(int i=n_low_loc; i<n_up_loc; ++i)
        // {
        //     if(nz[i]){
        //         const Qfloat *Q_i = Q.get_Q_subset(work_set[i],not_work_set,lmn);
        //         for(int j=0; j<lmn; ++j)
        //             G_n[j] += Q_i[not_work_set[j]] * delta_alpha[i];
        //     }
        // }
        info("done.\n"); info_flush();
        delete[] idx_cached;
        delete[] idx_not_cached;
        delete[] nz;
        time = MPI_Wtime() - time;
        gradient_updating_time += time;

        // Synchronize gradient with other processors
        info("Synchronizing gradient..."); info_flush();
        sync_gradient(work_set, not_work_set);
        info("done.\n"); info_flush();
        // info("synced G proc[%d]=",rank);
        // for(int i=0; i<l; ++i)
        //     info(" %g", G[i]);
        // info("\n");
        // info_flush();

        // Update alpha
        for(int i=0; i<n; ++i)
        {
            alpha[work_set[i]] = alpha_new[i];
            update_alpha_status(work_set[i]);
        }
    } // while(1)

    // Calculate rho
    si->rho = calculate_rho();

    // Calculate objective value
    {
        double v = 0;
        int i;
        for(i=0;i<l;i++)
            v += alpha[i] * (G[i] + b[i]);
        si->obj = v/2;
    }
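    // Illustrative note (not from the original code): for the usual SVM dual
    // objective f(alpha) = 0.5*alpha'*Q*alpha + b'*alpha with gradient
    // G = Q*alpha + b, one has alpha'*(G + b) = alpha'*Q*alpha + 2*b'*alpha
    // = 2*f(alpha), so v/2 above is the objective value.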
    // Put back the solution
    {
        // printf("alpha = ");
        for(int i=0;i<l;i++)
        {
            alpha_[i] = alpha[i];
            // printf(" %g\n",alpha[i]);
        }
        // printf("\n");
    }

    si->upper_bound_p = Cp;
    si->upper_bound_n = Cn;

    info("\noptimization finished, #iter = %d\n",iter);
    total_time = MPI_Wtime() - total_time;

    // print timing statistics
    if(rank == 0)
    {
        info("Total opt. time = %lf\n", total_time); info_flush();
        info("Problem setup time = %lf (%lf%%)\n", problem_setup_time,
             problem_setup_time/total_time*100); info_flush();
        info("Inner solver time = %lf (%lf%%)\n", inner_solver_time,
             inner_solver_time/total_time*100); info_flush();
        info("Gradient updating time = %lf (%lf%%)\n", gradient_updating_time,
             gradient_updating_time/total_time*100); info_flush();
    }

    // Clean up
    delete[] b;
    delete[] y;
    delete[] alpha;
    delete[] alpha_status;
    delete[] delta_alpha;
    delete[] work_status;
    delete[] work_count;
    delete[] work_set;
    delete[] not_work_set;
    delete[] G;
    delete[] Q_bb;
    delete[] c;
    delete[] up;
    delete[] low;
    delete[] a;
    delete[] d;
    delete[] l_low;
    delete[] l_up;
    delete[] n_low;
    delete[] n_up;
    delete[] lmn_low;
    delete[] lmn_up;
    delete[] G_n;
    delete[] p_cache_status;

    ierr = PLA_Obj_free(&Q_bb_global); CheckError(ierr);
    ierr = PLA_Obj_free(&c_global); CheckError(ierr);
    ierr = PLA_Obj_free(&up_global); CheckError(ierr);
    ierr = PLA_Obj_free(&low_global); CheckError(ierr);
    ierr = PLA_Obj_free(&a_global); CheckError(ierr);
    ierr = PLA_Obj_free(&d_global); CheckError(ierr);
    ierr = PLA_Obj_free(&x_loc); CheckError(ierr);
    ierr = PLA_Obj_free(&x); CheckError(ierr);
    ierr = PLA_Obj_free(&g); CheckError(ierr);
    ierr = PLA_Obj_free(&t); CheckError(ierr);
    ierr = PLA_Obj_free(&yy); CheckError(ierr);
    ierr = PLA_Obj_free(&yy_loc); CheckError(ierr);
    ierr = PLA_Obj_free(&z); CheckError(ierr);
    ierr = PLA_Obj_free(&s); CheckError(ierr);
}

void Solver_Parallel_LOQO::sync_gradient(int *work_set, int *not_work_set)
{
    int start_i = 0;
    for(int i=0; i<n; ++i, ++start_i)
    {
        if(work_set[i] >= l_low_loc)
            break;
    }
    // Synchronize G_b
    // double *G_send = new double[local_n];
    double *G_send = new double[num_rows];
    double *G_recv = new double[n];
    int count=0;
    for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i)
    //for(int i=n_low_loc; i<n_up_loc; ++i)
    {
        G_send[count] = G[work_set[i]];
        ++count;
    }
    // printf("G_send = ");
    // for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i)
    //     printf(" %g", G_send[i]);
    // printf("\n");
    // printf("count = %d\n", count);
    // printf("num_rows = %d\n", num_rows);
    int *sendcounts = new int[size];
    int *sdispls = new int[size];
    for(int i=0; i<size; ++i)
    {
        // sendcounts[i] = local_n;
        sendcounts[i] = num_rows;
        sdispls[i] = 0;
    }
    int *recvcounts = new int[size];
    int *rdispls = new int[size];
    for(int k=0; k<size; ++k)
    {
        int start_i_other = 0;
        for(int i=0; i<n; ++i, ++start_i_other)
            if(work_set[i] >= l_low[k])
                break;
        int num_rows_other = 0;
        for(int i=start_i_other; work_set[i] < l_up[k] && i<n; ++i)
            ++num_rows_other;
        recvcounts[k] = num_rows_other;
        if(k == 0)
            rdispls[k] = 0;
        else
            rdispls[k] = rdispls[k-1] + recvcounts[k-1];
    }
    // recvcounts[0] = n_up[0] - n_low[0];
    // rdispls[0] = 0;
    // for(int i=1; i<size; ++i)
    // {
    //     recvcounts[i] = n_up[i] - n_low[i];
    //     rdispls[i] = rdispls[i-1] + recvcounts[i-1];
    // }
    ierr = MPI_Alltoallv(G_send, sendcounts, sdispls, MPI_DOUBLE,
                         G_recv, recvcounts, rdispls, MPI_DOUBLE, comm);
    CheckError(ierr);
    // Update local G_b
    for(int k=0; k<size; ++k)
    {
        if(k != rank)
        {
            int start_i_other = 0;
            for(int i=0; i<n; ++i, ++start_i_other)
                if(work_set[i] >= l_low[k])
                    break;
            // G_b
            for(int i=start_i_other; work_set[i] < l_up[k] && i<n; ++i)
                G[work_set[i]] = G_recv[rdispls[k]+(i-start_i_other)];
            // for(int j=n_low[i]; j<n_up[i]; ++j)
            //     G[work_set[j]] = G_recv[rdispls[i]+(j-n_low[i])];
        }
    }
    delete[] G_send;
    delete[] G_recv;
    delete[] sendcounts;
    delete[] sdispls;
    delete[] recvcounts;
    delete[] rdispls;

    double *G_buf = new double[lmn];
    // Synchronize G_n
    for(int i=0; i<size; ++i)
    {
        if(rank == i)
        {
            for(int j=0; j<lmn; ++j)
                G_buf[j] = G_n[j];
        }
        ierr = MPI_Bcast(G_buf, lmn, MPI_DOUBLE, i, comm);
        CheckError(ierr);
        // Accumulate contributions
        for(int j=0; j<lmn; ++j)
            G[not_work_set[j]] += G_buf[j];
    }
    delete[] G_buf;
}
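// Illustrative sketch (not part of the original solver): the G_n loop at the end
// of sync_gradient() broadcasts every rank's partial gradient and adds it into G,
// which amounts to an element-wise sum over ranks. Assuming each rank holds lmn
// partial contributions, that reduction can also be expressed with a single
// MPI_Allreduce; the function and buffer names below are hypothetical.
#include <mpi.h>

static void sum_partial_gradients(const double *g_n_local, double *g_n_sum,
                                  int lmn, MPI_Comm comm)
{
    // Start from this rank's contributions...
    for(int j=0; j<lmn; ++j)
        g_n_sum[j] = g_n_local[j];
    // ...then sum element-wise across all ranks in place. Afterwards every rank
    // holds the total, which the caller would add into G[not_work_set[j]] just
    // as the broadcast loop above does.
    MPI_Allreduce(MPI_IN_PLACE, g_n_sum, lmn, MPI_DOUBLE, MPI_SUM, comm);
}
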
int Solver_LOQO::select_working_set(int *work_set, int *not_work_set)
{
    printf("selecting working set...");
    // reset work status
    n = n_old;
    for(int i=0; i<l; ++i)
        work_status[i] = WORK_N;
    double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) }
    double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) }
    for(int i=0; i<l; ++i)
    {
        if(!is_upper_bound(i))
        {
            if(y[i] == +1)
            {
                if(-G[i] > Gmax1)
                    Gmax1 = -G[i];
            }
            else
            {
                if(-G[i] > Gmax2)
                    Gmax2 = -G[i];
            }
        }
        if(!is_lower_bound(i))
        {
            if(y[i] == +1)
            {
                if(G[i] > Gmax2)
                    Gmax2 = G[i];
            }
            else
            {
                if(G[i] > Gmax1)
                    Gmax1 = G[i];
            }
        }
    }
    // check for optimality
    printf("Gmax1 + Gmax2 = %g < %g\n", Gmax1+Gmax2, eps);
    if(Gmax1 + Gmax2 < eps)
        return 1;
    // Compute yG
    double *yG = new double[l];
    int *pidx = new int[l];
    for(int i=0; i<l; ++i)
    {
        if(y[i] == +1)
            yG[i] = G[i];
        else
            yG[i] = -G[i];
        pidx[i] = i;
    }
    quick_sort(yG, pidx, 0, l-1);
    // printf("yG = ");
    // for(int i=0; i<l; ++i)
    //     printf(" %g",yG[i]);
    // printf("\n");
    // printf("pidx = ");
    // for(int i=0; i<l; ++i)
    //     printf(" %d",pidx[i]);
    // printf("\n");
    int top=l-1;
    int bot=0;
    int count=0;
    // Select a full set initially
    int nselect = iter == 0 ? n : q;
    while(top > bot && count < nselect)
    {
        while(!(is_free(pidx[top]) ||
                (is_upper_bound(pidx[top]) && y[pidx[top]] == +1) ||
                (is_lower_bound(pidx[top]) && y[pidx[top]] == -1)))
            --top;
        while(!(is_free(pidx[bot]) ||
                (is_upper_bound(pidx[bot]) && y[pidx[bot]] == -1) ||
                (is_lower_bound(pidx[bot]) && y[pidx[bot]] == +1)))
            ++bot;
        if(top > bot)