// cpolicygradient.cpp
// (Web code-viewer header residue: "Font size:" control label.)
// Builds a trial weight vector for one step of the line search.
//
// workParameters is first overwritten with a copy of startParameters (as many
// entries as updateFunction reports weights); afterwards, for every entry of
// the gradient list, stepSize * factor is added at the entry's featureIndex.
// Only workParameters is written; the update function's weights are not
// touched here.
void CLineSearchPolicyGradientUpdater::setWorkingParamters(CFeatureList *gradient, rlt_real stepSize, rlt_real *startParameters, rlt_real *workParameters)
{
    DebugPrint('l', "Applying StepSize: %f\n", stepSize);

    // Start from an untouched copy of the reference weights.
    memcpy(workParameters, startParameters, sizeof(rlt_real) * updateFunction->getNumWeights());

    // Displace only the weights that actually appear in the gradient list.
    CFeatureList::iterator entry = gradient->begin();
    while (entry != gradient->end())
    {
        workParameters[(*entry)->featureIndex] += stepSize * (*entry)->factor;
        ++entry;
    }
}
// Sets up a line-search updater.
//
// updateFunction    forwarded to the CPolicyGradientUpdater base; also used
//                   here to size the two weight buffers.
// policyEvaluator   stored as the evaluator used to score trial weights.
// l_startStepSizes  array of l_numStepSizes step sizes; the values are copied,
//                   so the caller keeps ownership of its array.
// maxSteps          stored as the evaluation budget of one line search.
//
// Also registers the "LineSearchStepSizeScale" parameter with value 1.0.
CLineSearchPolicyGradientUpdater::CLineSearchPolicyGradientUpdater(CGradientUpdateFunction *updateFunction, CPolicySameStateEvaluator *policyEvaluator, rlt_real *l_startStepSizes, int l_numStepSizes, int maxSteps) : CPolicyGradientUpdater(updateFunction)
{
    // Plain configuration values first.
    evaluator = policyEvaluator;
    numStepSizes = l_numStepSizes;
    this->maxSteps = maxSteps;

    // Scratch buffers, one slot per weight of the update function.
    startParameters = new rlt_real[updateFunction->getNumWeights()];
    workParameters = new rlt_real[updateFunction->getNumWeights()];

    // Private copy of the caller's start step sizes.
    startStepSizes = new rlt_real[numStepSizes];
    memcpy(startStepSizes, l_startStepSizes, sizeof(rlt_real) * numStepSizes);

    addParameter("LineSearchStepSizeScale", 1.0);
}
// Releases the three buffers allocated in the constructor.
//
// All of them were created with new[], so they must be released with
// delete[]; using scalar delete on an array allocation is undefined behaviour.
CLineSearchPolicyGradientUpdater::~CLineSearchPolicyGradientUpdater()
{
    delete [] startParameters;
    delete [] workParameters;
    delete [] startStepSizes;
}
// Performs a line search along `gradient` and installs the best weights found.
//
// Phase 1: each configured start step size, multiplied by the
// "LineSearchStepSizeScale" parameter, is applied to the weights captured at
// entry and scored with evaluator->evaluatePolicy(). A candidate replaces the
// current best only if it beats it by more than 0.01% of |best|.
//
// Phase 2: if fewer than maxSteps evaluations have been used and the best
// start step has a neighbour on both sides, the three-point bracket
// (left, best, right) is refined by evaluating the midpoint of one of its
// halves until maxSteps evaluations have been made in total.
//
// Finally the update function's weights are set to
// start weights + chosen step size * gradient.
//
// Any evaluation that throws a CMyException* is scored with a very low value
// instead of aborting the search.
void CLineSearchPolicyGradientUpdater::updateWeights(CFeatureList *gradient)
{
    // Score assigned to a step size whose evaluation threw.
    const rlt_real failedEvaluationValue = - 100000000;

    int maxIndex = 0;
    rlt_real maxValue = 0.0;
    rlt_real maxLearnRate = 0.0;
    rlt_real *values = new rlt_real[numStepSizes];
    rlt_real searchValues[3];
    rlt_real searchStepSizes[3];

    // Remember the weights every trial step is measured from.
    updateFunction->getWeights(startParameters);
    printf("Searching in Gradient Direction, %d start points\n", numStepSizes);
    //evaluator->getNewStartStates();

    int i = 0;
    DebugPrint('l', "Beginning Line Search\n");
    DebugPrint('l', "Gradient: ");
    if (DebugIsEnabled('l'))
    {
        gradient->saveASCII(DebugGetFileHandle('l'));
        DebugPrint('l', "Gradient Norm: %f\n", gradient->multFeatureList(gradient));
        DebugPrint('l', "\n");
    }

    // Phase 1: score every start step size.
    for (i = 0; i < numStepSizes; i++)
    {
        const rlt_real scaledStepSize = startStepSizes[i] * getParameter("LineSearchStepSizeScale");

        setWorkingParamters(gradient, scaledStepSize, startParameters, workParameters);
        updateFunction->setWeights(workParameters);

        try
        {
            values[i] = evaluator->evaluatePolicy();
        }
        catch (CMyException *)
        {
            // NOTE(review): ownership of the thrown pointer is not visible
            // here, so it is not deleted - confirm against the throw site.
            values[i] = failedEvaluationValue;
        }

        printf("StepSize %f : %f\n", scaledStepSize, values[i]);
        // Log the step size that was actually applied (the scaled one).
        DebugPrint('l', "Finished Evaluation of StepSize %f : Value %f\n", scaledStepSize, values[i]);

        // Require a small relative improvement so noise does not move the maximum.
        if (i == 0 || values[i] > maxValue + (fabs(maxValue) * 0.0001))
        {
            maxIndex = i;
            maxValue = values[i];
            maxLearnRate = scaledStepSize;
            printf("Found New Maximum\n");
        }
    }

    // Phase 2: refine around the best start step while evaluations remain.
    if (i < maxSteps)
    {
        if (maxIndex == 0 || maxIndex == numStepSizes - 1)
        {
            // No neighbour on one side, so there is no bracket to refine.
            printf("Maximum outside the start step intervall, not searching further\n");
        }
        else
        {
            // Bracket: [0] left neighbour, [1] current best, [2] right neighbour.
            for (int j = 0; j < 3; j ++)
            {
                searchValues[j] = values[maxIndex + j - 1];
                searchStepSizes[j] = startStepSizes[maxIndex + j - 1] * getParameter("LineSearchStepSizeScale");
            }

            while (i < maxSteps)
            {
                i ++;

                // Choose which half of the bracket gets the next sample.
                const bool refineLowerHalf = searchValues[0] / (searchStepSizes[1] - searchStepSizes[0]) > searchValues[2] / (searchStepSizes[2] - searchStepSizes[1]);
                // `outer` is the bracket end of the refined half, `opposite` the other end.
                const int outer = refineLowerHalf ? 0 : 2;
                const int opposite = 2 - outer;

                const rlt_real newStepSize = (searchStepSizes[outer] + searchStepSizes[1]) * 0.5;
                setWorkingParamters(gradient, newStepSize, startParameters, workParameters);
                updateFunction->setWeights(workParameters);

                // Both halves are evaluated with the same exception handling, so a
                // failing evaluation can neither leak `values` nor leave trial
                // weights installed.
                rlt_real newValue = 0.0;
                try
                {
                    newValue = evaluator->evaluatePolicy();
                }
                catch (CMyException *)
                {
                    newValue = failedEvaluationValue;
                }

                printf("StepSize %f : %f\n", newStepSize, newValue);
                DebugPrint('l', "Finished Evaluation of StepSize %f : Value %f\n", newStepSize, newValue);

                if (newValue > searchValues[1])
                {
                    // The midpoint becomes the new best; the old best becomes the
                    // bracket end on the far side.
                    searchValues[opposite] = searchValues[1];
                    searchValues[1] = newValue;
                    searchStepSizes[opposite] = searchStepSizes[1];
                    searchStepSizes[1] = newStepSize;
                    printf("Found New Maximum\n");
                }
                else
                {
                    // The midpoint is not better: it tightens the near bracket end.
                    searchValues[outer] = newValue;
                    searchStepSizes[outer] = newStepSize;
                }
            }
            maxLearnRate = searchStepSizes[1];
        }
    }

    delete [] values;

    DebugPrint('l', "End Line Search, applying step Size %f\n", maxLearnRate);
    printf("Applying maximum stepsize %f\n", maxLearnRate);

    // Install the weights for the chosen step size.
    setWorkingParamters(gradient, maxLearnRate, startParameters, workParameters);
    updateFunction->setWeights(workParameters);
}
// Wires a gradient calculator and a gradient updater into a learner.
//
// The parameters of both collaborators are added to this object's parameter
// set, followed by "GradientResolution" (initialised with epsilon) and
// "PolicyGradientWeightDecay" (initialised with 0.0). Three empty feature
// lists are allocated for the current gradient and the two auxiliary
// gradients used by learnPolicy.
CPolicyGradientLearner::CPolicyGradientLearner(CPolicyGradientCalculator *gradientCalculator, CPolicyGradientUpdater *gradientUpdater, rlt_real epsilon)
{
    // Collaborators (not allocated here).
    this->gradientCalculator = gradientCalculator;
    this->gradientUpdater = gradientUpdater;

    // Working feature lists owned by this object.
    gradient = new CFeatureList();
    hGradient = new CFeatureList();
    gGradient = new CFeatureList();

    // Parameter registration, in the same order as before.
    addParameters(gradientCalculator);
    addParameters(gradientUpdater);
    addParameter("GradientResolution", epsilon);
    addParameter("PolicyGradientWeightDecay", 0.0);
}
// Frees the three feature lists created by the constructor. The gradient
// calculator and updater are not deleted here.
CPolicyGradientLearner::~CPolicyGradientLearner()
{
    delete this->gradient;
    delete this->hGradient;
    delete this->gGradient;
}
// Applies one gradient step through the gradient updater and, if the
// "PolicyGradientWeightDecay" parameter is positive, subtracts
// decay * (weight before the step) from every weight after the step.
void CPolicyGradientLearner::doUpdate(CFeatureList *gradient)
{
    const rlt_real weightDecay = getParameter("PolicyGradientWeightDecay");

    // Snapshot buffers for the weights before and after the update.
    rlt_real *weightsBefore = new rlt_real[gradientUpdater->getUpdateFunction()->getNumWeights()];
    rlt_real *weightsAfter = new rlt_real[gradientUpdater->getUpdateFunction()->getNumWeights()];

    gradientUpdater->getUpdateFunction()->getWeights(weightsBefore);
    gradientUpdater->updateWeights(gradient);

    if (weightDecay > 0.0)
    {
        gradientUpdater->getUpdateFunction()->getWeights(weightsAfter);
        printf("Updating Gradient with weight decay %f\n", weightDecay);

        int w = 0;
        while (w < gradientUpdater->getUpdateFunction()->getNumWeights())
        {
            // The decay is based on the pre-update weight, not the updated one.
            weightsAfter[w] = weightsAfter[w] - weightDecay * weightsBefore[w];
            ++w;
        }

        gradientUpdater->getUpdateFunction()->setWeights(weightsAfter);
    }

    delete [] weightsBefore;
    delete [] weightsAfter;
}
// Runs gradient updates until the gradient norm value drops to the
// "GradientResolution" parameter or maxGradientUpdates updates were made.
//
// maxGradientUpdates  upper bound on the number of updates; the loop body is a
//                     do/while, so one update is always performed.
// evaluator           optional; when non-null the policy is evaluated before
//                     every update (and after the final extra update) and the
//                     result is printed.
// useOldGradient      when false, the two auxiliary gradients (hGradient,
//                     gGradient) kept from a previous call are cleared first.
//
// Returns the most recent value obtained from `evaluator`, or 0.0 when no
// evaluator was given.
//
// Each iteration fetches a fresh gradient from the gradient calculator. If a
// previous gradient is stored in gGradient, the update direction hGradient is
// formed as gamma * hGradient + gradient, and reset to the plain gradient when
// it points against the gradient. Otherwise the direction is hGradient plus
// the new gradient.
// NOTE(review): this looks like a conjugate-gradient style direction update,
// but gamma is computed from the sum of old and new gradient rather than by
// one of the textbook formulas - confirm this is intended.
rlt_real CPolicyGradientLearner::learnPolicy(int maxGradientUpdates, CPolicyEvaluator *evaluator, bool useOldGradient)
{
    rlt_real epsilon = getParameter("GradientResolution");

    gradient->clear();
    if (!useOldGradient)
    {
        hGradient->clear();
        gGradient->clear();
    }

    rlt_real normG = gGradient->multFeatureList(gGradient);
    DebugPrint('g', "Gradient-Norm: %f\n", normG);
    printf("Gradient-Norm: %f\n", normG);

    int gradientUpdates = 0;
    rlt_real value = 0.0;

    do
    {
        if (evaluator)
        {
            value = evaluator->evaluatePolicy();
            printf("Value after %d Gradient Update: %f\n", gradientUpdates, value);
        }

        gradient->clear();
        gradientCalculator->getGradient(gradient);

        if (gGradient->size() > 0)
        {
            // gGradient holds the previous gradient at this point.
            gGradient->add(gradient);
            normG = gGradient->multFeatureList(gGradient);
            gGradient->multFactor(-1.0);
            gGradient->add(gradient, 1.0);

            // Guard the division: with normG == 0 the quotient would be inf/NaN
            // and would poison the direction and, through it, the weights.
            // gamma = 0 falls back to the plain gradient direction.
            rlt_real gamma = 0.0;
            if (normG != 0.0)
            {
                gamma = gGradient->multFeatureList(gradient) / normG;
            }

            DebugPrint('g', "Calculated Gradient :\n");
            if (DebugIsEnabled('g'))
            {
                gradient->saveASCII(DebugGetFileHandle('g'));
            }
            DebugPrint('g', "PGLearner: Gamma %f", gamma);

            hGradient->multFactor(gamma);
            hGradient->add(gradient);

            // Restart from the plain gradient if the combined direction points
            // against it.
            if (hGradient->multFeatureList(gradient) < 0)
            {
                hGradient->clear();
                hGradient->add(gradient);
            }

            DebugPrint('g', "Update-Gradient :\n");
            if (DebugIsEnabled('g'))
            {
                hGradient->saveASCII(DebugGetFileHandle('g'));
            }
        }
        else
        {
            // No previous gradient stored: use the gradient as direction.
            hGradient->add(gradient);
            normG = hGradient->multFeatureList(hGradient);
        }

        // Remember the current gradient for the next iteration.
        gGradient->clear();
        gGradient->add(gradient);

        DebugPrint('g', "Gradient-Norm: %f\n", normG);
        printf("Gradient-Norm: %f\n", normG);
        printf("Updating Gradient...");
        doUpdate(hGradient);
        gradientUpdates ++;
    }
    while (normG > epsilon && gradientUpdates < maxGradientUpdates);

    // The loop stopped on the norm criterion with budget left: apply the last
    // direction once more and re-evaluate.
    if (gradientUpdates < maxGradientUpdates)
    {
        printf("Updating Gradient...");
        doUpdate(hGradient);
        gradientUpdates ++;
        if (evaluator)
        {
            // Assign to the function-level `value`; a fresh local declared here
            // would shadow it and the final evaluation would never be returned.
            value = evaluator->evaluatePolicy();
            printf("Value after %d Gradient Update: %f\n", gradientUpdates, value);
        }
    }

    return value;
}
// Creates a listener that decays the weights of updateFunction.
//
// Stores the update function, allocates a scratch buffer with one slot per
// weight, and registers the "PolicyGradientWeightDecay" parameter initialised
// with weightdecay.
CPolicyGradientWeightDecayListener::CPolicyGradientWeightDecayListener(CGradientUpdateFunction *updateFunction, rlt_real weightdecay)
{
    this->updateFunction = updateFunction;
    this->parameters = new rlt_real[updateFunction->getNumWeights()];

    addParameter("PolicyGradientWeightDecay", weightdecay);
}
// Frees the scratch weight buffer allocated by the constructor.
CPolicyGradientWeightDecayListener::~CPolicyGradientWeightDecayListener()
{
    delete [] this->parameters;
}
void CPolicyGradientWeightDecayListener::newEpisode()
{
updateFunction->getWeights(parameters);
rlt_real factor = 1 - getParameter("PolicyGradientWeightDecay");
for (int i = 0; i < updateFunction->getNumWeights(); i++)
{
parameters[i] = factor * parameters[i];
}
updateFunction->setWeights(parameters);
}
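// --- Web code-viewer residue (not part of the source file) ---
// Keyboard shortcuts:
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -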