icsiboost.c
/* Copyright (C) (2007) (Benoit Favre) <favre@icsi.berkeley.edu>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

#if HAVE_CONFIG_H
# include <config.h>
#endif

#define USE_THREADS
//#define USE_FLOATS

#include "utils/common.h"
#include "utils/debug.h"
#include "utils/vector.h"
#include "utils/string.h"
#include "utils/hashtable.h"
#include "utils/mapped.h"
#include "utils/array.h"

#ifdef USE_THREADS
#include "utils/threads.h"
#include <pthread.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>

#ifdef USE_FLOATS
#define double float
#endif

// declare vectors that are interesting for us
vector_implement_functions_for_type(float, 0.0);
vector_implement_functions_for_type(int32_t, 0);

/*int num_EXP=0;
double EXP(double number){ num_EXP++; if(isnan(number))die("exp(): had a NAN"); return exp(number);}
int num_SQRT=0;
double SQRT(double number){ num_SQRT++; if(isnan(number))die("sqrt(): had a NAN"); return sqrt(number);}
int num_LOG=0;
double LOG(double number){ num_LOG++; if(isnan(number))die("log(): had a NAN"); return log(number);}*/

#define EXP(a) exp(a)
#define LOG(a) log(a)

inline double SQRT(double number){ if(number<0)return 0; return sqrt(number);}

/*
WARNING:
- garbage collection is not working yet => do not activate it
TODO:
- free memory and remove leaks (and go through that again and again)
- frequency cutoff
- sampleing and ranking
- more COMMENTS !!!
NOTES ON THE ORIGINAL:
- text features are treated as bag-of-ngrams
- shyp file: boostexter saves non-text discrete features as their id
- shyp file: when combining the same classifiers, boostexter adds the alphas and averages the C_js
*/

#define FEATURE_TYPE_IGNORE 0
#define FEATURE_TYPE_CONTINUOUS 1
#define FEATURE_TYPE_TEXT 2
#define FEATURE_TYPE_SCORED_TEXT 3
#define FEATURE_TYPE_SET 4

typedef struct template { // a column definition
    int column;
    string_t* name;
    int type;
    hashtable_t* dictionary;  // dictionary for token based template
    vector_t* tokens;         // the actual tokens (if the feature is text; contains tokeninfo)
    vector_t* values;         // continuous values (if the feature is continuous)
    int32_t* ordered;         // ordered example ids according to value for continuous columns
    vector_t* classifiers;    // classifiers related to that template (in classification mode only)
} template_t;

typedef struct example { // an instance
    //vector_t* features;    // vector of (int or float according to templates)
    int class;
    double* weight;           // example weight by class
    double* score;            // example score by class, updated iteratively
} example_t;

#define CLASSIFIER_TYPE_THRESHOLD 1
#define CLASSIFIER_TYPE_TEXT 2

typedef struct weakclassifier { // a weak learner
    template_t* template;     // the corresponding template
    int type;                 // type in CLASSIFIER_TYPE_TEXT or CLASSIFIER_TYPE_THRESHOLD
    int32_t token;            // token for TEXT classifier
    int column;               // redundant with template
    double threshold;         // threshold for THRESHOLD classifier
    double alpha;             // alpha from the paper
    double objective;         // Z() value from minimization
    double *c0;               // weight by class, unknown
    double *c1;               // weight by class, token absent or below threshold
    double *c2;               // weight by class, token present or above threshold
} weakclassifier_t;

typedef struct tokeninfo { // store info about a word (or token)
    int32_t id;
    char* key;
    size_t count;
    vector_t* examples;
} tokeninfo_t;

double smoothing=0.5;         // -E <number> in boostexter
int verbose=0;
int output_weights=0;
int output_scores=0;
//int *random_sequence;

#define y_l(x,y) (x->class==y?1.0:-1.0)  // y_l() from the paper
#define b(x,y) (x->class==y?1:0)         // b() from the paper (binary class match)

weakclassifier_t* train_text_stump(double min_objective, template_t* template, vector_t* examples, double** sum_of_weights, int num_classes)
{
    int i,l;
    int column=template->column;
    size_t num_tokens=template->tokens->length;
    int32_t t;
    double* weight[2][num_classes]; // weight[b][label][token] (=> D() in the paper), only for token presence (absence is infered from sum_of_weights)
    for(l=0;l<num_classes;l++)
    {
        weight[0][l]=MALLOC(sizeof(double)*num_tokens);
        weight[1][l]=MALLOC(sizeof(double)*num_tokens);
    }
    for(t=1;t<num_tokens;t++) // initialize
    {
        for(l=0;l<num_classes;l++)
        {
            weight[0][l][t]=0.0;
            weight[1][l][t]=0.0;
        }
        tokeninfo_t* tokeninfo=(tokeninfo_t*)vector_get(template->tokens, t);
        //fprintf(stdout,"%s [%s] %d\n",template->name->data,tokeninfo->key,tokeninfo->examples->length);
        for(i=0;i<tokeninfo->examples->length;i++) // compute the presence weights
        {
            example_t* example=(example_t*)vector_get(examples,vector_get_int32_t(tokeninfo->examples,i));
            for(l=0;l<num_classes;l++)
            {
                weight[b(example,l)][l][t]+=example->weight[l];
            }
        }
    }
    weakclassifier_t* classifier=NULL; // init an empty classifier
    classifier=MALLOC(sizeof(weakclassifier_t));
    classifier->template=template;
    classifier->threshold=NAN;
    classifier->alpha=1.0;
    classifier->type=CLASSIFIER_TYPE_TEXT;
    classifier->token=0;
    classifier->column=column;
    classifier->objective=1.0;
    classifier->c0=MALLOC(sizeof(double)*num_classes);
    classifier->c1=MALLOC(sizeof(double)*num_classes);
    classifier->c2=MALLOC(sizeof(double)*num_classes);
    double epsilon=smoothing/(num_classes*examples->length);
    //min_objective=1;
    for(t=1;t<num_tokens;t++)
    {
        double objective=0;
        for(l=0;l<num_classes;l++) // compute the objective function Z()=sum_j(sum_l(SQRT(W+*W-))
        {
            /*if(weight[0][l][t]<0)weight[0][l][t]=0.0;
            if(weight[1][l][t]<0)weight[1][l][t]=0.0;*/
            objective+=SQRT((sum_of_weights[1][l]-weight[1][l][t])*(sum_of_weights[0][l]-weight[0][l][t]));
            objective+=SQRT(weight[1][l][t]*weight[0][l][t]);
        }
        objective*=2;
        //fprintf(stdout,"DEBUG: column=%d token=%d obj=%f\n",column,t,objective);
        if(objective-min_objective<-1e-11) // select the argmin()
        {
            min_objective=objective;
            classifier->token=t;
            classifier->objective=objective;
            for(l=0;l<num_classes;l++) // update c0, c1 and c2 => c0 and c1 are the same for text stumps
            {
                classifier->c0[l]=0.5*LOG((sum_of_weights[1][l]-weight[1][l][t]+epsilon)/(sum_of_weights[0][l]-weight[0][l][t]+epsilon));
                classifier->c1[l]=classifier->c0[l];
                //classifier->c0[l]=0;
                //classifier->c1[l]=0.5*LOG((sum_of_weights[1][l]-weight[1][l][t]+epsilon)/(sum_of_weights[0][l]-weight[0][l][t]+epsilon));
                classifier->c2[l]=0.5*LOG((weight[1][l][t]+epsilon)/(weight[0][l][t]+epsilon));
            }
        }
    }
    for(l=0;l<num_classes;l++) // free memory
    {
        FREE(weight[0][l]);
        FREE(weight[1][l]);
    }
    //tokeninfo_t* info=vector_get(template->tokens,classifier->token);
    //fprintf(stdout,"DEBUG: column=%d token=%s obj=%f %s\n",column,info->key,classifier->objective,template->name->data);
    if(classifier->token==0) // no better classifier has been found
    {
        FREE(classifier->c0);
        FREE(classifier->c1);
        FREE(classifier->c2);
        FREE(classifier);
        return NULL;
    }
    return classifier;
}

weakclassifier_t* train_continuous_stump(double min_objective, template_t* template, vector_t* examples_vector, int num_classes)
{
    size_t i,j,l;
    int column=template->column;
    float* values=(float*)template->values->data;
    int32_t* ordered=template->ordered;
    example_t** examples=(example_t**)examples_vector->data;
    if(ordered==NULL) // only order examples once, then keep the result in the template
    {
        ordered=MALLOC(sizeof(int32_t)*examples_vector->length);
        int32_t index=0;
        for(index=0;index<examples_vector->length;index++)ordered[index]=index;
        //for(i=0;i<examples->length && i<4;i++)fprintf(stdout,"%d %f\n",i,vector_get_float(((example_t*)vector_get(examples,i))->features,column));
        int local_comparator(const void* a, const void* b)
        {
            int32_t aa=*(int32_t*)a;
            int32_t bb=*(int32_t*)b;
            float aa_value=values[aa];
            float bb_value=values[bb];
            //float aa_value=vector_get_float(template->values,(size_t)aa);
            //float bb_value=vector_get_float(template->values,(size_t)bb);
            //float aa_value=vector_get_float(((example_t*)vector_get(examples,(size_t)aa))->features,column);
            //float bb_value=vector_get_float(((example_t*)vector_get(examples,(size_t)bb))->features,column);
            //fprintf(stdout,"%d(%f) <=> %d(%f)\n",aa,aa_value,bb,bb_value);
            //if(aa_value<bb_value)return -1;
            //if(aa_value>bb_value)return 1;
            if(isnan(aa_value) || aa_value>bb_value)return 1; // put the NAN (unknown values) at the end of the list
            if(isnan(bb_value) || aa_value<bb_value)return -1;
            return 0;
        }
        qsort(ordered,examples_vector->length,sizeof(int32_t),local_comparator);
        //vector_sort(ordered,local_comparator);
        template->ordered=ordered;
    }
    double weight[3][2][num_classes]; // D(j,b,l)
    for(j=0;j<3;j++)
        for(l=0;l<num_classes;l++)
        {
            weight[j][0][l]=0.0;
            weight[j][1][l]=0.0;
        }
    for(i=0;i<examples_vector->length;i++) // compute the "unknown" weights and the weight of examples after threshold
    {
        int32_t example_id=ordered[i];
        example_t* example=examples[example_id];
        //fprintf(stdout,"%d %f\n",column,vector_get_float(example->features,column));
        for(l=0;l<num_classes;l++)
        {
            if(isnan(values[example_id]))
                weight[0][b(example,l)][l]+=example->weight[l];
            else
                weight[2][b(example,l)][l]+=example->weight[l];
        }
    }
    weakclassifier_t* classifier=NULL; // new classifier
    classifier=MALLOC(sizeof(weakclassifier_t));
    classifier->template=template;
    classifier->threshold=NAN;
    classifier->alpha=1.0;
    classifier->type=CLASSIFIER_TYPE_THRESHOLD;
    classifier->token=0;
    classifier->column=column;
    classifier->objective=1.0;
    classifier->c0=MALLOC(sizeof(double)*num_classes);
    classifier->c1=MALLOC(sizeof(double)*num_classes);
    classifier->c2=MALLOC(sizeof(double)*num_classes);
    double epsilon=smoothing/(num_classes*examples_vector->length);
    for(i=0;i<examples_vector->length-1;i++) // compute the objective function at every possible threshold (in between examples)
    {
        int32_t example_id=ordered[i];
        example_t* example=examples[example_id];
        //fprintf(stdout,"%zd %zd %f\n",i,vector_get_int32_t(ordered,i),vector_get_float(template->values,example_id));
        if(isnan(values[example_id]))break; // skip unknown values
        //example_t* next_example=(example_t*)vector_get(examples,(size_t)next_example_id);
        for(l=0;l<num_classes;l++) // update the objective function by putting the current example the other side of the threshold
        {
            weight[1][b(example,l)][l]+=example->weight[l];
            weight[2][b(example,l)][l]-=example->weight[l];
        }
        int next_example_id=ordered[i+1];
        if(values[example_id]==values[next_example_id])continue; // same value
        double objective=0;
        for(l=0;l<num_classes;l++) // compute objective Z()
        {
            /*if(weight[0][1][l]<0)weight[0][1][l]=0.0;
            if(weight[0][0][l]<0)weight[0][0][l]=0.0;
            if(weight[1][1][l]<0)weight[1][1][l]=0.0;
            if(weight[1][0][l]<0)weight[1][0][l]=0.0;
            if(weight[2][1][l]<0)weight[2][1][l]=0.0;
            if(weight[2][0][l]<0)weight[2][0][l]=0.0;*/
            objective+=SQRT(weight[0][1][l]*weight[0][0][l]);
            objective+=SQRT(weight[1][1][l]*weight[1][0][l]);
            objective+=SQRT(weight[2][1][l]*weight[2][0][l]);
        }
        objective*=2;
        //fprintf(stdout,"DEBUG: column=%d threshold=%f obj=%f\n",column,(vector_get_float(next_example->features,column)+vector_get_float(example->features,column))/2,objective);
        if(objective-min_objective<-1e-11) // get argmin
        {
            classifier->objective=objective;
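The listing above stops inside train_continuous_stump(), in the middle of the argmin update. To make the stump-selection arithmetic concrete, here is a minimal, self-contained sketch (not part of icsiboost.c; the class weights, example count, and smoothing value are made up) that scores a single candidate token the way train_text_stump() does: it accumulates Z = 2 * sum over classes of sqrt(W+ * W-) for the token-absent and token-present partitions, then derives the smoothed half-log-ratio votes c1 (token absent) and c2 (token present).

/* z_sketch.c (illustration only, not from the icsiboost sources) */
#include <stdio.h>
#include <math.h>

int main(void)
{
    const int num_classes = 2;
    /* sum_of_weights[b][l]: total weight of examples with b(example,l)==b */
    double sum_of_weights[2][2] = {{0.30, 0.20}, {0.25, 0.25}};
    /* weight[b][l]: weight of the examples that contain the candidate token */
    double weight[2][2] = {{0.05, 0.15}, {0.20, 0.02}};
    /* epsilon = smoothing / (num_classes * num_examples), here 100 examples */
    double epsilon = 0.5 / (num_classes * 100);

    double objective = 0.0;
    for (int l = 0; l < num_classes; l++) {
        /* token absent: remaining weight once the token's share is removed */
        objective += sqrt((sum_of_weights[1][l] - weight[1][l])
                        * (sum_of_weights[0][l] - weight[0][l]));
        /* token present */
        objective += sqrt(weight[1][l] * weight[0][l]);
    }
    objective *= 2; /* same Z() as in train_text_stump() */

    for (int l = 0; l < num_classes; l++) {
        double c1 = 0.5 * log((sum_of_weights[1][l] - weight[1][l] + epsilon)
                            / (sum_of_weights[0][l] - weight[0][l] + epsilon));
        double c2 = 0.5 * log((weight[1][l] + epsilon) / (weight[0][l] + epsilon));
        printf("class %d: c1=%+.3f (token absent)  c2=%+.3f (token present)\n", l, c1, c2);
    }
    printf("Z = %.4f (kept only if it beats the current min_objective)\n", objective);
    return 0;
}

Compiled on its own (e.g. gcc -std=c99 z_sketch.c -lm), it prints the per-class votes and the Z value that train_text_stump() would compare against min_objective when selecting the best token.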