/**
 * i_QLearner_id.java
 */

package EDU.gatech.cc.is.learning;

import java.io.*;
import java.util.*;

/**
 * An object that learns to select from several actions based on
 * a reward.  Uses the Q-learning method as defined by Watkins.
 * <P>
 * The module will learn to select a discrete output based on
 * state and a continuous reinforcement input.  The "i"s in front
 * of and behind the name imply that this class takes integers as
 * input and output.  The "d" indicates a double for the reinforcement
 * input (i.e. a continuous value).
 * <P>
 * Copyright (c)2000 Tucker Balch
 *
 * @author Tucker Balch (tucker@cc.gatech.edu)
 * @version $Revision: 1.1 $
 */
public class i_QLearner_id extends i_ReinforcementLearner_id
        implements Cloneable, Serializable {

    /**
     * Used to indicate the learner uses average rewards.
     */
    public static final int AVERAGE = 0;

    /**
     * Used to indicate the learner uses discounted rewards.
     */
    public static final int DISCOUNTED = 1;

    private int criteria = DISCOUNTED;     // assume discounted rewards
    private double q[][];                  // the q-values
    private double p[][];                  // count of times in each state/action
    private double profile[][];            // count of times in each state/action for this trial
    private int last_policy[];             // used to count changes in policy
    private int changes = 0;               // used to count changes in policy per trial
    private int queries = 0;               // queries per trial
    private double total_reward = 0;       // reward over trial
    private int first_of_trial = 1;        // indicates if first time
    private double gamma = 0.8;            // discount rate
    private double alpha = 0.2;            // learning rate
    private double randomrate = 0.1;       // frequency of random actions
    private double randomratedecay = 0.99; // decay rate of random actions
    private Random rgen;                   // the random number generator
    private int xn;                        // last state
    private int an;                        // last action
    private long seed = 0;                 // random number seed
    private static final boolean DEBUG = false;

    /**
     * Instantiate a Q learner using default parameters.
     * Parameters may be adjusted using accessor methods.
     *
     * @param numstatesin  int, the number of states the system could be in.
     * @param numactionsin int, the number of actions or outputs to
     *                     select from.
     * @param criteriain   int, should be DISCOUNTED or AVERAGE.
     * @param seedin       long, the seed.
     */
    public i_QLearner_id(int numstatesin, int numactionsin,
            int criteriain, long seedin) {
        super(numstatesin, numactionsin);
        if ((criteriain != DISCOUNTED) && (criteriain != AVERAGE)) {
            System.out.println("i_QLearner_id: invalid criteria");
            criteria = DISCOUNTED;
        } else {
            criteria = criteriain;
        }
        if (criteria == DISCOUNTED)
            System.out.println("i_QLearner_id: DISCOUNTED");
        else
            System.out.println("i_QLearner_id: AVERAGE");
        seed = seedin;
        rgen = new Random(seed);
        q = new double[numstates][numactions];
        profile = new double[numstates][numactions];
        p = new double[numstates][numactions];
        last_policy = new int[numstates];
        for (int i = 0; i < numstates; i++) {
            for (int j = 0; j < numactions; j++) {
                q[i][j] = rgen.nextDouble()*2 - 1;
                p[i][j] = 0;
                profile[i][j] = 0;
            }
            last_policy[i] = 0;
        }
        xn = an = 0;
    }

    /**
     * Instantiate a Q learner using default parameters.
     * This version assumes you will use a seed of 0.
     * Parameters may be adjusted using accessor methods.
     *
     * @param numstatesin  int, the number of states the system could be in.
     * @param numactionsin int, the number of actions or outputs to
     *                     select from.
     * @param criteriain   int, should be DISCOUNTED or AVERAGE.
     */
    public i_QLearner_id(int numstatesin, int numactionsin, int criteriain) {
        super(numstatesin, numactionsin);
        if ((criteriain != DISCOUNTED) && (criteriain != AVERAGE)) {
            System.out.println("i_QLearner_id: invalid criteria");
            criteria = DISCOUNTED;
        } else {
            criteria = criteriain;
        }
        if (criteria == DISCOUNTED)
            System.out.println("i_QLearner_id: DISCOUNTED");
        else
            System.out.println("i_QLearner_id: AVERAGE");
        rgen = new Random(seed);
        q = new double[numstates][numactions];
        profile = new double[numstates][numactions];
        p = new double[numstates][numactions];
        last_policy = new int[numstates];
        for (int i = 0; i < numstates; i++) {
            for (int j = 0; j < numactions; j++) {
                q[i][j] = rgen.nextDouble()*2 - 1;
                p[i][j] = 0;
                profile[i][j] = 0;
            }
            last_policy[i] = 0;
        }
        xn = an = 0;
    }

    /**
     * Instantiate a Q learner using default parameters.
     * This version assumes you will use discounted rewards.
     * Parameters may be adjusted using accessor methods.
     *
     * @param numstatesin  int, the number of states the system could be in.
     * @param numactionsin int, the number of actions or outputs to
     *                     select from.
     */
    public i_QLearner_id(int numstatesin, int numactionsin) {
        super(numstatesin, numactionsin);
        System.out.println("i_QLearner_id: DISCOUNTED");
        criteria = DISCOUNTED;
        rgen = new Random(seed);
        q = new double[numstates][numactions];
        profile = new double[numstates][numactions];
        p = new double[numstates][numactions];
        last_policy = new int[numstates];
        for (int i = 0; i < numstates; i++) {
            for (int j = 0; j < numactions; j++) {
                q[i][j] = rgen.nextDouble()*2 - 1;
                p[i][j] = 0;
                profile[i][j] = 0;
            }
            last_policy[i] = 0;
        }
        xn = an = 0;
    }

    /**
     * Set gamma for the Q-learner.
     * This is the discount rate; 0.8 is a typical value.
     * It should be between 0 and 1.
     *
     * @param g double, the new value for gamma (0 < g < 1).
     */
    public void setGamma(double g) {
        if ((g < 0) || (g > 1)) {
            System.out.println("i_QLearner_id.setGamma: illegal value");
            return;
        }
        gamma = g;
    }

    /**
     * Set alpha for the Q-learner.
     * This reflects how quickly it should learn.
     * Alpha should be between 0 and 1.
     *
     * @param a double, the new value for alpha (0 < a < 1).
     */
    public void setAlpha(double a) {
        alpha = a;
    }

    /**
     * Set the random rate for the Q-learner.
     * This reflects how frequently it picks a random action.
     * Should be between 0 and 1.
     *
     * @param r double, the new value for random rate (0 < r < 1).
     */
    public void setRandomRate(double r) {
        randomrate = r;
    }

    /**
     * Set the random rate decay for the Q-learner.
     * This reflects how quickly the rate of choosing random actions
     * decays.  1 would never decay; 0 would cause it to immediately
     * quit choosing random values.
     * Should be between 0 and 1.
     *
     * @param r double, the new value for randomratedecay (0 < r < 1).
     */
    public void setRandomRateDecay(double r) {
        randomratedecay = r;
    }

    /**
     * Generate a String that describes the current state of the
     * learner.
     *
     * @return a String describing the learner.
     */
    public String toString() {
        int i, j;
        String retval = super.toString();
        retval = retval + "type = i_QLearner_id alpha = " + alpha
                + " gamma = " + gamma + "\n";
        for (i = 0; i < numstates; i++) {
            for (j = 0; j < numactions; j++) {
                retval = retval + q[i][j] + " ";
            }
            if (i < (numstates - 1))
                retval += "\n";
        }
        return retval;
    }

    /**
     * Select an output based on the state and reward.
     *
     * @param yn int, the current state.
     * @param rn double, reward for the last output; positive
     *           numbers are "good."
     * @return int, the selected action.
     */
    public int query(int yn, double rn) {
        //System.out.println("state "+yn+" reward "+rn);
        total_reward += rn;
        queries++;

        // yn is present state, rn is present reward
        double pick;
        int action;

        if (yn > (numstates - 1)) {  // very bad
            System.out.println("i_QLearner_id.query: state " + yn
                    + " is out of range.");
            return 0;
        }

        /*
         * Find approximate value of present state, and best action.
         *
         * ie: max q[yn][i] over all i, i is the best action.
         */
        double Vn = -9999999999f;  // very bad
        action = 0;
        for (int i = 0; i < numactions; i++) {
            if (q[yn][i] > Vn) {
                Vn = q[yn][i];
                action = i;
            }
        }

        /*
         * Now update according to Watkins' iteration:
         */
        if (first_of_trial != 1) {
            if (DEBUG)
                System.out.println("xn =" + xn + " an =" + an + " rn=" + rn);
            if (criteria == DISCOUNTED) {
                // Watkins update rule:
                q[xn][an] = (1 - alpha)*q[xn][an] + alpha*(rn + gamma*Vn);
            } else {  // criteria == AVERAGE
                // Average update rule
                q[xn][an] = (p[xn][an] * q[xn][an] + rn + Vn)
                        / (p[xn][an] + 2);
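The listing above shows the core of the learner: query() scores the new state yn by taking Vn = max over i of q[yn][i], and (for the DISCOUNTED criterion) applies the Watkins update q[xn][an] = (1 - alpha)*q[xn][an] + alpha*(rn + gamma*Vn) to the previous state/action pair; the rest of the method is not shown here. As a rough illustration of how the class is driven from outside, here is a minimal usage sketch; it assumes only the constructor, the setters, and query(int, double) shown above, and the environment, its state encoding, and its reward signal are hypothetical.

import EDU.gatech.cc.is.learning.i_QLearner_id;

public class QLearnerUsageSketch {
    public static void main(String[] args) {
        // Hypothetical problem: 10 discrete states, 4 discrete actions,
        // discounted rewards, random seed 42.
        i_QLearner_id learner =
                new i_QLearner_id(10, 4, i_QLearner_id.DISCOUNTED, 42L);
        learner.setAlpha(0.2);            // learning rate
        learner.setGamma(0.8);            // discount rate
        learner.setRandomRate(0.1);       // frequency of random actions
        learner.setRandomRateDecay(0.99); // decay of the random rate

        int state = 0;       // hypothetical starting state index (0..9)
        double reward = 0.0; // no reward before the first action
        for (int step = 0; step < 1000; step++) {
            // query() both learns from the previous (state, action, reward)
            // transition and returns the next action to take.
            int action = learner.query(state, reward);

            // Apply 'action' to the (hypothetical) environment here, then
            // observe the next state and the reward for it, e.g.:
            // state  = env.nextState(action);
            // reward = env.rewardFor(state);
        }
    }
}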