/* lrn_pifa.c */
/* (code-viewer page residue, not part of the program: "Font size:" control) */
#ifdef PGRL_FA
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include "lrn_PIFA.h"
#include "gaussian.h"
#include "misc.h"
// Not referenced in the visible part of this file.
#define MAX_SV_RATIO 1e22
// Floor applied in PGRL_PIFA() to the selected mode's probability
// p_pi[i][mode[i]] after it is computed as g[i][mode[i]]/g_tot.
#define MIN_PI 0.0001
// First integer read from "PGRL_DirectQ.ini" by Initialize_PGRL_PIFA().
// PGRL_PIFA() later overwrites it with (steps - 1) on its initial pass and
// compares it against Update_Policy_Parameters.
int Episodes_Per_Parameter_Update;
// Second integer read from "PGRL_DirectQ.ini" by Initialize_PGRL_PIFA().
int Max_Num_Grad_Calc;
// Progress counter used by PGRL_PIFA(): 0 selects the initial pass (which
// allocates the per-step tables); it is then incremented, and once it reaches
// Episodes_Per_Parameter_Update the gradient section of PGRL_PIFA() runs.
int Update_Policy_Parameters = 0;
// Defined elsewhere. PGRL_PIFA() allocates it with
// Episodes_Per_Parameter_Update row pointers; each row it fills holds
// num_of_gaussians 0/1 flags, with 1 marking a mode already tried at that step.
extern int **modes_visited;
// Defined elsewhere. PGRL_PIFA() sets it to the first mode index whose flag in
// modes_visited[Step_To_Execute_Mode] is still 0.
extern int Mode_Execute;
// Defined elsewhere. PGRL_PIFA() sets it to 1 on the initial pass and
// increments it once no unvisited mode remains for the current step.
extern int Step_To_Execute_Mode;
// Scratch buffers allocated in Initialize_PGRL_PIFA(): dpdc_t and dpdv_t are
// [dim][num_of_gaussians], wrk is [dim]. All three are passed to
// avaluate_total_gradient(), and dpdc_t/dpdv_t are then copied into dpdc/dpdv.
double **dpdc_t, **dpdv_t, *wrk;
// [num_of_gaussians][dim] arrays, allocated and zeroed in
// Initialize_PGRL_PIFA() and zeroed again at the start of the gradient
// section of PGRL_PIFA().
double **drhodc, **drhodv;
// Per-step tables allocated on the initial pass of PGRL_PIFA():
//   Q    is [steps][num_of_gaussians]; Q[i][mode[i]] receives the sum of
//        r[j] * gam^(j-i) for j = i .. steps-1.
//   dpdc, dpdv are [steps][num_of_gaussians][dim][num_of_gaussians];
//        entry [i][mode[i]] receives a copy of dpdc_t / dpdv_t.
double **Q,****dpdc, ****dpdv;
// Set to Q[1][mode[1]] on the initial pass of PGRL_PIFA(), then appended to
// "rf.txt" and printed.
double V_for_Policy = 0.0;
// Defined elsewhere. Used throughout this file as the state-vector length
// (inner size of dpdc_t, drhodc, ...).
extern int dim;
// Defined elsewhere. Used throughout this file as the number of modes
// (size of each g[i], Q[i], p_pi[i] row).
extern int num_of_gaussians;
#ifdef GRAPHICS
// Defined elsewhere; not referenced in the visible part of this file.
extern int Update_Boundaries;
#endif
// Printed alongside V_for_Policy in PGRL_PIFA(); it is not modified in the
// visible part of this file.
int Num_of_Grad_Calculations = 0;
// [num_of_gaussians][dim][num_of_gaussians] arrays, allocated and zeroed in
// Initialize_PGRL_PIFA().
double ***drhodc_coeff, ***drhodv_coeff;
// [steps][num_of_gaussians], allocated and zeroed on the initial pass of
// PGRL_PIFA(); p_pi[i][mode[i]] holds g[i][mode[i]]/g_tot, floored at MIN_PI.
double **p_pi;
// Allocated on the initial pass of PGRL_PIFA() with total_states_visited
// rows; row i-1 receives the first dim entries of s[i].
// NOTE(review): each row is allocated with num_of_gaussians doubles but
// written with dim entries -- confirm dim <= num_of_gaussians.
double **states_visited;
// Number of rows in states_visited; set to (steps - 1) in PGRL_PIFA().
int total_states_visited;
/*
 * Open `name` in "w" mode and close it again, so that the file exists and is
 * empty afterwards.  A failed open is reported through My_Error().
 */
static void pifa_reset_log_file(const char *name)
{
    char msg[256];
    FILE *log = fopen(name, "w");

    if (log == NULL)
    {
        sprintf(msg, "Couldn't open \"%s\"\n", name);
        My_Error(msg);
    }
    fclose(log);
}

/*
 * Initialize_PGRL_PIFA
 *
 * One-time setup for the PIFA learner:
 *   1. Reads two integers from "PGRL_DirectQ.ini", in this order:
 *      Episodes_Per_Parameter_Update, then Max_Num_Grad_Calc.
 *      skiptoend() is called on the stream after each value
 *      (presumably it discards the rest of the line -- confirm in misc).
 *   2. Creates/empties the output files "rf.txt" and "gf.txt".
 *   3. Allocates the gradient work buffers through My_Malloc():
 *        dpdc_t, dpdv_t              [dim][num_of_gaussians]
 *        wrk                         [dim]
 *        drhodc, drhodv              [num_of_gaussians][dim]            (zeroed)
 *        drhodc_coeff, drhodv_coeff  [num_of_gaussians][dim][num_of_gaussians] (zeroed)
 *
 * Takes no arguments and returns nothing; every failure is routed to
 * My_Error() with a descriptive message.
 */
void Initialize_PGRL_PIFA(void)
{
    const char *ini_name = "PGRL_DirectQ.ini";
    char msg[256];
    FILE *ini;
    int value;
    int d, g, g2;

    /* ---- settings file -------------------------------------------------- */
    ini = fopen(ini_name, "r");
    if (ini == NULL)
    {
        sprintf(msg, "Couldn't open \"%s\"\n", ini_name);
        My_Error(msg);
    }

    if (fscanf(ini, "%d", &value) != 1)
    {
        sprintf(msg,
                "Initialize_Learning_parameters: cannot read Episodes_Per_Parameter_Update\n");
        My_Error(msg);
    }
    skiptoend(ini);
    Episodes_Per_Parameter_Update = value;

    if (fscanf(ini, "%d", &value) != 1)
    {
        sprintf(msg,
                "Initialize_Learning_parameters: cannot read Max_Num_Grad_Calc\n");
        My_Error(msg);
    }
    skiptoend(ini);
    Max_Num_Grad_Calc = value;

    fclose(ini);

    /* ---- output files: start each run with empty logs ------------------- */
    pifa_reset_log_file("rf.txt");   /* the reward file */
    pifa_reset_log_file("gf.txt");

    /* ---- per-call scratch for the gradient evaluation ------------------- */
    dpdc_t = (double **)My_Malloc((long)dim * sizeof *dpdc_t);
    dpdv_t = (double **)My_Malloc((long)dim * sizeof *dpdv_t);
    for (d = 0; d < dim; ++d)
    {
        dpdc_t[d] = (double *)My_Malloc((long)num_of_gaussians * sizeof **dpdc_t);
        dpdv_t[d] = (double *)My_Malloc((long)num_of_gaussians * sizeof **dpdv_t);
    }
    wrk = (double *)My_Malloc((long)dim * sizeof *wrk);

    /* ---- gradient accumulators and their coefficient tables ------------- */
    drhodc = (double **)My_Malloc((long)num_of_gaussians * sizeof *drhodc);
    drhodv = (double **)My_Malloc((long)num_of_gaussians * sizeof *drhodv);
    drhodc_coeff = (double ***)My_Malloc((long)num_of_gaussians * sizeof *drhodc_coeff);
    drhodv_coeff = (double ***)My_Malloc((long)num_of_gaussians * sizeof *drhodv_coeff);

    for (g = 0; g < num_of_gaussians; ++g)
    {
        drhodc[g] = (double *)My_Malloc((long)dim * sizeof **drhodc);
        drhodv[g] = (double *)My_Malloc((long)dim * sizeof **drhodv);
        drhodc_coeff[g] = (double **)My_Malloc((long)dim * sizeof **drhodc_coeff);
        drhodv_coeff[g] = (double **)My_Malloc((long)dim * sizeof **drhodv_coeff);

        for (d = 0; d < dim; ++d)
        {
            double *c_row;
            double *v_row;

            drhodc[g][d] = 0.0;
            drhodv[g][d] = 0.0;

            c_row = (double *)My_Malloc((long)num_of_gaussians * sizeof *c_row);
            drhodc_coeff[g][d] = c_row;
            v_row = (double *)My_Malloc((long)num_of_gaussians * sizeof *v_row);
            drhodv_coeff[g][d] = v_row;

            /* My_Malloc is not assumed to zero memory; clear both rows. */
            for (g2 = 0; g2 < num_of_gaussians; ++g2)
            {
                c_row[g2] = 0.0;
                v_row[g2] = 0.0;
            }
        }
    }
}
void PGRL_PIFA(int steps,
double **s, double **g,
double **cen, double **var,
int *mode, double alpha, double gam,
double *r)
{
double g_tot;
int i,j,k,q;
if ( Update_Policy_Parameters == 0 )
{
total_states_visited = steps - 1;
states_visited = (double **)My_Malloc((long)total_states_visited * sizeof(double*));
for ( k = 0; k < total_states_visited; k++ )
{
states_visited[k] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
}
Q = (double **)My_Malloc((long)steps * sizeof(double*));
p_pi = (double **)My_Malloc((long)steps * sizeof(double*));
dpdc = (double ****)My_Malloc((long)steps * sizeof(double***));
dpdv = (double ****)My_Malloc((long)steps * sizeof(double***));
for ( k = 0; k < steps; k++ )
{
Q[k] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
p_pi[k] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
dpdc[k] = (double ***)My_Malloc((long)num_of_gaussians * sizeof(double**));
dpdv[k] = (double ***)My_Malloc((long)num_of_gaussians * sizeof(double**));
for ( i = 0; i < num_of_gaussians; i++ )
{
Q[k][i] = 0.0;
p_pi[k][i] = 0.0;
dpdc[k][i] = (double **)My_Malloc((long)dim * sizeof(double*));
dpdv[k][i] = (double **)My_Malloc((long)dim * sizeof(double*));
for ( j = 0; j < dim; j++ )
{
dpdc[k][i][j] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
dpdv[k][i][j] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
for ( q = 0; q < num_of_gaussians; q++ )
{
dpdc[k][i][j][q] = 0.0;
dpdv[k][i][j][q] = 0.0;
}
}
}
}
for ( i = 1; i < steps; i++ )
{
Q[i][mode[i]] = 0.0;
for ( j = i; j < steps; j++ )
{
Q[i][mode[i]] = Q[i][mode[i]] + r[j] * pow(gam,(double)(j-i));
}
}
V_for_Policy = Q[1][mode[1]];
{ //
char error_text[256];
FILE *fp;
if ((fp = fopen("rf.txt", "a")) == NULL)
{
sprintf(error_text, "Couldn't open \"%s\"\n", "rf.txt");
My_Error(error_text);
}
fprintf(fp,"%g\n",V_for_Policy);
fclose(fp);
printf("%d: %g\n",Num_of_Grad_Calculations,V_for_Policy);
}
for ( i = 1; i < steps; i++ )
{
g_tot = 0.0;
for ( j = 0; j < num_of_gaussians; j++ )
{
g_tot = g_tot + g[i][j];
}
for ( j = 0; j < dim; j++ )
{
states_visited[i-1][j] = s[i][j];
}
p_pi[i][mode[i]] = g[i][mode[i]]/g_tot;
if ( p_pi[i][mode[i]] < MIN_PI )
{
p_pi[i][mode[i]] = MIN_PI;
}
avaluate_total_gradient(dim, s[i], cen[mode[i]], var[mode[i]],
dpdc_t, dpdv_t, wrk, mode[i], g[i], num_of_gaussians, g_tot);
for ( j = 0; j < dim; j++ )
{
for ( q = 0; q < num_of_gaussians; q++ )
{
dpdc[i][mode[i]][j][q] = dpdc_t[j][q];
dpdv[i][mode[i]][j][q] = dpdv_t[j][q];
}
}
}
Episodes_Per_Parameter_Update = steps - 1;
modes_visited = (int **)My_Malloc((long)Episodes_Per_Parameter_Update * sizeof(int*));
for ( i = 1; i < Episodes_Per_Parameter_Update; i++ )
{
modes_visited[i] = (int *)My_Malloc((long)num_of_gaussians * sizeof(int));
for ( j = 0; j < num_of_gaussians; j++ )
{
modes_visited[i][j] = 0;
}
modes_visited[i][mode[i]] = 1;
}
Update_Policy_Parameters++;
Step_To_Execute_Mode = 1;
for ( i = 0; i < num_of_gaussians; i++ )
{
if ( modes_visited[Step_To_Execute_Mode][i] == 0 )
{
Mode_Execute = i;
//modes_visited[Step_To_Execute_Mode][i] = 1;
break;
}
}
}
else if ( steps <= Step_To_Execute_Mode )
{ // update the gradient
Update_Policy_Parameters = Episodes_Per_Parameter_Update;
}
else
{
double Q_tmp;
int inr_step;
#ifdef TMP777777
int prev_total_states_visited;
double **tmp_states_visited;
prev_total_states_visited = total_states_visited;
tmp_states_visited = (double **)My_Malloc((long)total_states_visited * sizeof(double*));
for ( k = 0; k < total_states_visited; k++ )
{
tmp_states_visited[k] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
for ( j = 0; j < dim; j++ )
{
tmp_states_visited[k][j] = states_visited[k][j];
}
}
for ( k = 0; k < total_states_visited; k++ )
{
free(states_visited[k]);
}
free(states_visited);
total_states_visited = total_states_visited + steps - 1;
states_visited = (double **)My_Malloc((long)total_states_visited * sizeof(double*));
for ( k = 0; k < total_states_visited; k++ )
{
states_visited[k] = (double *)My_Malloc((long)num_of_gaussians * sizeof(double));
for ( j = 0; j < dim; j++ )
{
if ( k < prev_total_states_visited )
{
states_visited[k][j] = tmp_states_visited[k][j];
}
else
{
states_visited[k][j] = s[k+1-prev_total_states_visited][j];
}
}
}
for ( k = 0; k < prev_total_states_visited; k++ )
{
free(tmp_states_visited[k]);
}
free(tmp_states_visited);
#endif
Q_tmp = 0.0;
for ( j = Step_To_Execute_Mode; j < steps; j++ )
{
Q_tmp = Q_tmp + r[j] * pow(gam,(double)(j-Step_To_Execute_Mode));
}
i = Step_To_Execute_Mode;
Q[i][mode[i]] = Q_tmp;
if ( Q[i][mode[i]] == 0.0 )
{
printf("Q_tmp[%d][%d] = %g\n",i,mode[i],Q[i][mode[i]]);
}
g_tot = 0.0;
for ( j = 0; j < num_of_gaussians; j++ )
{
g_tot = g_tot + g[i][j];
}
p_pi[i][mode[i]] = g[i][mode[i]]/g_tot;
if ( p_pi[i][mode[i]] < MIN_PI )
{
p_pi[i][mode[i]] = MIN_PI;
}
avaluate_total_gradient(dim, s[i], cen[mode[i]], var[mode[i]],
dpdc_t, dpdv_t, wrk, mode[i], g[i], num_of_gaussians, g_tot);
for ( j = 0; j < dim; j++ )
{
for ( q = 0; q < num_of_gaussians; q++ )
{
dpdc[i][mode[i]][j][q] = dpdc_t[j][q];
dpdv[i][mode[i]][j][q] = dpdv_t[j][q];
}
}
modes_visited[Step_To_Execute_Mode][mode[i]] = 1;
inr_step = 1;
for ( i = 0; i < num_of_gaussians; i++ )
{
if ( modes_visited[Step_To_Execute_Mode][i] == 0 )
{
inr_step = 0;
Mode_Execute = i;
//modes_visited[Step_To_Execute_Mode][i] = 1;
break;
}
}
if ( inr_step == 1)
{
Step_To_Execute_Mode++;
Update_Policy_Parameters++;
if ( Update_Policy_Parameters < Episodes_Per_Parameter_Update )
{
for ( i = 0; i < num_of_gaussians; i++ )
{
if ( modes_visited[Step_To_Execute_Mode][i] == 0 )
{
inr_step = 0;
Mode_Execute = i;
//modes_visited[Step_To_Execute_Mode][i] = 1;
break;
}
}
}
}
}
if ( Episodes_Per_Parameter_Update <= Update_Policy_Parameters )
{
double tdc=0.0, tdv=0.0, **Q_a;
int cont_grad = 1,cnt;
for ( j = 0; j < num_of_gaussians; j++ )
{
for ( i = 0; i < dim; i++ )
{
drhodc[j][i] = 0.0;
drhodv[j][i] = 0.0;
}
}
#ifdef BIAS_FA
{
double t1;
for ( k = 1; k < Episodes_Per_Parameter_Update && (cont_grad == 1); k++ )
{
t1 = 0.0;
for ( n = 0; n < num_of_gaussians; n++ )
{
t1 = t1 + Q[k][n];
}
t1 = t1 / (double)num_of_gaussians;
for ( n = 0; n < num_of_gaussians; n++ )
{
Q[k][n] = Q[k][n] - t1;
}
}
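/*
 * Code-viewer page residue (not part of the program) -- keyboard shortcut help:
 *   Copy code             Ctrl + C
 *   Search code           Ctrl + F
 *   Full-screen mode      F11
 *   Toggle theme          Ctrl + Shift + D
 *   Show shortcuts        ?
 *   Increase font size    Ctrl + =
 *   Decrease font size    Ctrl + -
 */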