?? myhmm.c
字號:
/************************************************************************
* *
* Program packages 'myHMM' : *
* *
* myHMM.c *
* -the main program for myHMM hidden Markov model *
* *
* Version 0.1 *
* Date: 3 May 2003 *
* *
* NOTE: This program package is copyrighted in the sense that it *
* may be used for scientific purposes. The package as a whole, or *
* parts thereof, cannot be included or used in any commercial *
* application without written permission granted by its producents. *
* No programs contained in this package may be copied for commercial *
* distribution. *
* *
* All comments concerning this program package may be sent to the *
* e-mail address 'dcslgl@nus.edu.sg'. *
* *
************************************************************************/
/*************************************************************************
NAME
myHMM - train the hidden Markov model
SYNOPSIS
myHMM -hmm [HMM] [-train trainFile] [-trainedHmm trainedHmmFile]
[-test testFile] [-tested testedFile] [-mode mode]
DESCRIPTION
Input:
required: pre-defined hidden Markov model
optional: training data, testing data
Output:
optional: trained hidden Markov model, tested result
Global variables:
N: integer, number of states
M: integer, number of observations
T: integer, the length of the longest input sequence
S: integer, finite set of possible states
O: integer, finite set of possible observations
t: double, transition matrix: N x N
e: integer, emission matrix: N x M
pi: integer, initial state distribution: 1 x N
%% A hidden Markov model (HMM) is a five-tuple (S,O,t,e,pi).
%% Let lambda = {t,e,pi} denote the parameters for a given HMM
%% with fixed S and O.
%% There are three basic problems in HMM:
%% 1) probability of the observation sequence given the HMM model
%% -> forward algorithm & backward algorithm
%% 2) the most probable state path of the observation sequence
%% given the HMM model
%% -> Viterbi algorithm
%% 3) Building the model given a training set of sequence
%% -> Baum-Welch algorithm
OPTIONS
-hmm [HMM]
pre-defined hidden Markov Model
-train trainFile
training file
-trainedHmmFile trainedHmmFile
trained hidden Markov model
-test testFile
testing file
-tested testedFile
tested result file
-mode mode
mode must be one of 1, 2 or 3. '1' represents TRAINING, '2'
represents TESTING, and '3' represents TRAINING_TESTING.
*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <math.h>
#include "tools.h"
#include "myhmm.h"
#include "arg.h"
static char usage[] =
"myHMM - hidden Markov model\n"
"command: myHMM -hmm <hmm> [option]\n"
"Required parameters:\n"
" -hmm filename the file for hmm definition\n"
"Optional parameters:\n"
" -train filename the file for training\n"
" -test filename the file for testing\n"
" -trainedHmm filename the file for trained hmm\n"
" -tested filename the file for tested result\n"
" -mode int the mode for training, testing or both\n"
" 1: training, 2: testing, 3: both\n";
/************************************************************************
NAME
main - main function in the program
DESCRIPTION
This function extracts parameters from the argument list and perform
the specified actions by the parameters.
Input:
arguments in the command line
Output:
specified output(s): trained HMM file and/or tesed result file
Global variable list:
iTrain, iTest, T, trainData, testData, observationDefined,
extraSpace.
*************************************************************************/
int main(int argc, char** argv)
{
char* hmmFile;
char* trainFile;
char* testFile;
char* trainedHmmFile;
char* testedFile;
int mode;
/*---------- get the parameters ----------*/
if (argc <= 1 || extract_parameter(argc, argv, "-help", OPTION2) || extract_parameter(argc, argv, "-h", OPTION2) )
{
printf("%s", usage);
exit(0);
}
hmmFile = extract_parameter(argc, argv, HMM_FILE, ALWAYS);
trainFile = extract_parameter(argc, argv, TRAINING_FILE, OPTION);
testFile = extract_parameter(argc, argv, TESTING_FILE, OPTION);
trainedHmmFile = extract_parameter(argc, argv, TRAINED_HMM_FILE, OPTION);
testedFile = extract_parameter(argc, argv, TESTED_FILE, OPTION);
mode = oatoi(extract_parameter(argc, argv, MODE, OPTION), TRAINING);
/* check whether the mode is consistant with the provided files */
check_mode(mode, trainFile, testFile);
/* default values for trainedHmmFile and testedFile */
if(trainedHmmFile == NULL)
trainedHmmFile = defTrainedHmmFile;
if(testedFile == NULL)
testedFile = defTestedFile;
/*---------- load HMM ----------*/
loadHMM(hmmFile);
/*---------- train the HMM model ----------*/
if(mode == TRAINING || mode == TRAINING_TESTING)
{
/*---------- load training data ----------*/
iTrain = cal_lines(trainFile);
T = getLengthOfLongestLine(trainFile) + extraSpace;
loadSeq(&trainData, trainFile);
if(observationsDefined == FALSE)
{
readObservations(trainData);
}
/*---------- initialization ----------*/
init();
/*---------- train HMM ----------*/
baumWelch();
/*---------- save trained HMM ----------*/
saveHmm(trainedHmmFile);
}
/*---------- test with the HMM model ----------*/
if(mode == TESTING || mode == TRAINING_TESTING)
{
/*---------- 5. load testing data ----------*/
if(mode == TESTING)
{
T = getLengthOfLongestLine(testFile) + extraSpace;
init();
}
iTest = cal_lines(testFile);
loadSeq(&testData, testFile);
/*---------- test ----------*/
test(testedFile);
}
/* post-processing */
post_processing();
return 0;
}
/************************************************************************
NAME
post_processing - post processing after the main process
DESCRIPTION
This function releases the allocated memery for the global variables.
Input:
pointers to global variables.
Output:
None.
Global variable list:
transitions, emissions, pi, observations, alpha, beta, trainData,
testData.
*************************************************************************/
void post_processing()
{
int i;
/* free space for transition matrix */
if(transitions != NULL)
{
for(i=0; i < N; i++)
{
if(transitions[i] != NULL) free(transitions[i]);
}
free(transitions);
}
/* free space for emission matrix */
if(emissions != NULL)
{
for(i=0; i < N; i++)
{
if(emissions[i] != NULL) free(emissions[i]);
}
free(emissions);
}
/* free space for initial distribution vector */
if(pi != NULL) free(pi);
/* free space for observation list */
if(observations != NULL) free(observations);
/* free space for matrix alpha, used in forward algorithm */
if(alpha != NULL)
{
for(i=0; i < T; i++)
{
if(alpha[i] != NULL) free(alpha[i]);
}
free(alpha);
}
/* free space for matrix beta, used in backward algorithm */
if(beta != NULL)
{
for(i=0; i < T; i++)
{
if(beta[i] != NULL) free(beta[i]);
}
free(beta);
}
/* free space for training data */
if(trainData != NULL)
{
for(i=0; i < iTrain; i++)
{
if(trainData[i] != NULL) free(trainData[i]);
}
free(trainData);
}
/* free space for testing data */
if(testData != NULL)
{
for(i=0; i < iTest; i++)
{
if(testData[i] != NULL) free(testData[i]);
}
free(testData);
}
}
/************************************************************************
NAME
saveHmm - save the trained hidden Markov model in pre-defined format
DESCRIPTION
This function saves the trained hidden Markov model in pre-defined
format.
Input:
trained HMM & output file name
Output:
output file
Global variables:
N, M, T, transitions, emissions, pi.
*************************************************************************/
void saveHmm(char *out)
{
FILE *fp;
int i, j;
/* Open for write (will fail if inputfile does not exist) */
if( (fp = fopen( out, "w" )) == NULL )
{
printf( "The file '%s' was not opened\n", out);
exit(1);
}
fprintf(fp, "%d %d %d\n", N, M, T);
fprintf(fp, "// Observation list\n");
fprintf(fp, "O: ");
for(i=0; i < M; i++)
{
fprintf(fp, "%c ", observations[i]);
}
fprintf(fp, "\n");
/* transition matrix */
fprintf(fp, "// Transition matrix\n");
for(i=0; i < N; i++)
{
for(j=0; j < N; j++)
{
fprintf(fp, "%6.4f ", transitions[i][j]);
}
fputc('\n', fp);
}
/* emission matrix */
fprintf(fp, "// Emission matrix\n");
// emission matrix
for(i=0; i < N; i++)
{
for(j=0; j < M; j++)
{
fprintf(fp, "%6.4f ", emissions[i][j]);
}
fputc('\n', fp);
}
/* initial state distribution */
fprintf(fp, "// Initial state distribution\n");
for(i=0; i < N; i++)
{
fprintf(fp, "%6.4f ", pi[i]);
}
fputc('\n', fp);
fclose( fp );
}
/************************************************************************
NAME
test - test the test file with given hidden Markov model
DESCRIPTION
This function tests the test file with given hidden Markov model.
Input:
trained HMM & tested file name
Output:
tested file
Global variables list:
T, iTest, testData, extraObservations.
*************************************************************************/
void test(char* testedFile)
{
FILE *out_fp;
int i;
double prob;
/* the most probable state path of the current observation sequence */
char *buffer;
/* the most probable state paths of all the observation sequences */
double score;
buffer = (char *) malloc(T * sizeof(char));
/* Open for write (will fail if inputfile does not exist) */
if( (out_fp = fopen( testedFile, "w" )) == NULL )
{
printf( "The file '%s' was not opened\n", testedFile);
exit(1);
}
for(i=0; i < iTest; i++)
{
prob = forward( testData[i] );
/* score is equal to the natural logarithm of the probability
divided by the length */
score = log(prob) / (double)(strlen(testData[i]) - extraObservations);
viterbi(testData[i], buffer);
/* output probability and the most probable path */
fprintf(out_fp, "%f\n%s\n%s\n", score, testData[i], buffer);
}
free( buffer );
fclose( out_fp );
}
/************************************************************************
NAME
getSymbol - get the symbol corresponding to each state
DESCRIPTION
This function .. There are only 52 distinct symbols in this function.
Input:
state number
Output:
corresponding symbol.
Global variables list:
None.
*************************************************************************/
char getSymbol(int k)
{
char result;
if(k >= 0 && k < 26)
result = 'a' + k;
else if(k >= 26 && k < 52)
result = 'A' + k - 26;
else
result = '-';
return result;
}
/************************************************************************
NAME
getObservation - get the order of the specified observation
DESCRIPTION
This function gets the order of the specified observation in the
observation list with binary search
Input:
specified observation
Output:
the order of the specified observation
Global variables list:
M.
*************************************************************************/
int getObservation(char ch)
{
int start, end, middle;
/* initial value */
start = 0;
end = M-1;
middle= -1;
//exception: the input observation is not in the observation list
if(ch < observations[start] || ch > observations[end])
{
printf("the input observation is not in the observation list!\n");
exit(1);
}
if(observations[start] == ch)
return start;
if(observations[end] == ch)
return end;
middle = (start + end)/2;
/* why middle != start:
if middle == start, it means end = start + 1, and 'ch' is a
character which lies between observation 'start' and
observation 'end' */
while(observations[middle]!=ch && start < end && middle != start)
{
if(observations[middle] < ch)
start = middle;
else
end = middle;
middle = (start + end) / 2;
}
if(observations[middle] != ch)
{
printf("The char is not in observation list!\n");
exit(1);
}
return middle;
}
/************************************************************************
NAME
forward - forward algorithm
DESCRIPTION
This function calculates the probability of the observation sequence
with forward algorithm given the HMM model.
Input:
HMM model & the observation sequence
Output:
Probability
Global variables list:
N, alpha, pi, transitions, emissions.
*************************************************************************/
double forward(char *line)
{
int i, j, t;
int length = strlen( line );
/* the order of the real observation in the observation set */
int iObservation;
/* transition probability to state i given the observation sequence */
double sumTransProb;
double prob;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -