26 RegisterNode< ClusterTreeNode > ClusterTreeNode::registerModule(
"ClusterTreeNode");
29 RegisterClustererModule< ClusterTree > ClusterTree::registerModule(
"ClusterTree");
31 ClusterTree::ClusterTree(
const UINT numSplittingSteps,
const UINT minNumSamplesPerNode,
const UINT maxDepth,
const bool removeFeaturesAtEachSpilt,
const UINT trainingMode,
const bool useScaling,
const double minRMSErrorPerNode){
34 this->numSplittingSteps = numSplittingSteps;
35 this->minNumSamplesPerNode = minNumSamplesPerNode;
36 this->maxDepth = maxDepth;
37 this->removeFeaturesAtEachSpilt = removeFeaturesAtEachSpilt;
38 this->trainingMode = trainingMode;
39 this->minRMSErrorPerNode = minRMSErrorPerNode;
40 Clusterer::classType =
"ClusterTree";
41 clustererType = Clusterer::classType;
42 Clusterer::debugLog.setProceedingText(
"[DEBUG ClusterTree]");
43 Clusterer::errorLog.setProceedingText(
"[ERROR ClusterTree]");
44 Clusterer::trainingLog.setProceedingText(
"[TRAINING ClusterTree]");
45 Clusterer::warningLog.setProceedingText(
"[WARNING ClusterTree]");
51 Clusterer::classType =
"ClusterTree";
52 clustererType = Clusterer::classType;
53 Clusterer::debugLog.setProceedingText(
"[DEBUG ClusterTree]");
54 Clusterer::errorLog.setProceedingText(
"[ERROR ClusterTree]");
55 Clusterer::trainingLog.setProceedingText(
"[TRAINING ClusterTree]");
56 Clusterer::warningLog.setProceedingText(
"[WARNING ClusterTree]");
75 this->numSplittingSteps = rhs.numSplittingSteps;
76 this->minNumSamplesPerNode = rhs.minNumSamplesPerNode;
77 this->maxDepth = rhs.maxDepth;
78 this->removeFeaturesAtEachSpilt = rhs.removeFeaturesAtEachSpilt;
79 this->trainingMode = rhs.trainingMode;
80 this->minRMSErrorPerNode = rhs.minRMSErrorPerNode;
91 if( clusterer == NULL )
return false;
105 this->numSplittingSteps = ptr->numSplittingSteps;
106 this->minNumSamplesPerNode = ptr->minNumSamplesPerNode;
107 this->maxDepth = ptr->maxDepth;
108 this->removeFeaturesAtEachSpilt = ptr->removeFeaturesAtEachSpilt;
109 this->trainingMode = ptr->trainingMode;
110 this->minRMSErrorPerNode = ptr->minRMSErrorPerNode;
124 const unsigned int M = trainingData.
getNumRows();
125 const unsigned int N = trainingData.
getNumCols();
128 Clusterer::errorLog <<
"train_(MatrixDouble &trainingData) - Training data has zero samples!" << endl;
132 numInputDimensions = N;
133 numOutputDimensions = 1;
139 trainingData.
scale(0, 1);
143 vector< UINT > features(N);
144 for(UINT i=0; i<N; i++){
149 UINT clusterLabel = 0;
151 tree = buildTree( trainingData, NULL, features, clusterLabel, nodeID );
156 Clusterer::errorLog <<
"train_(MatrixDouble &trainingData) - Failed to build tree!" << endl;
166 clusterLabels[i] = i+1;
168 clusterLikelihoods.resize(numClusters,0);
169 clusterDistances.resize(numClusters,0);
177 Clusterer::errorLog <<
"predict_(VectorDouble &inputVector) - Model Not Trained!" << endl;
182 Clusterer::errorLog <<
"predict_(VectorDouble &inputVector) - DecisionTree pointer is null!" << endl;
186 if( inputVector.size() != numInputDimensions ){
187 Clusterer::errorLog <<
"predict_(VectorDouble &inputVector) - The size of the input vector (" << inputVector.size() <<
") does not match the num features in the model (" << numInputDimensions << endl;
192 for(UINT n=0; n<numInputDimensions; n++){
193 inputVector[n] =
scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0, 1);
197 VectorDouble clusterLabel(1);
198 if( !tree->
predict( inputVector, clusterLabel ) ){
199 Clusterer::errorLog <<
"predict_(VectorDouble &inputVector) - Failed to predict!" << endl;
223 return tree->
print();
229 if( !file.is_open() )
231 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - The file is not open!" << endl;
236 file <<
"GRT_CLUSTER_TREE_MODEL_FILE_V1.0" << endl;
240 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << endl;
244 file <<
"NumSplittingSteps: " << numSplittingSteps << endl;
245 file <<
"MinNumSamplesPerNode: " << minNumSamplesPerNode << endl;
246 file <<
"MaxDepth: " << maxDepth << endl;
247 file <<
"RemoveFeaturesAtEachSpilt: " << removeFeaturesAtEachSpilt << endl;
248 file <<
"TrainingMode: " << trainingMode << endl;
249 file <<
"MinRMSErrorPerNode: " << minRMSErrorPerNode << endl;
250 file <<
"TreeBuilt: " << (tree != NULL ? 1 : 0) << endl;
255 Clusterer::errorLog <<
"saveModelToFile(fstream &file) - Failed to save tree to file!" << endl;
269 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not open file to load model" << endl;
277 if(word !=
"GRT_CLUSTER_TREE_MODEL_FILE_V1.0"){
278 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find Model File Header" << endl;
284 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Failed to load base settings from file!" << endl;
289 if(word !=
"NumSplittingSteps:"){
290 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the NumSplittingSteps!" << endl;
293 file >> numSplittingSteps;
296 if(word !=
"MinNumSamplesPerNode:"){
297 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MinNumSamplesPerNode!" << endl;
300 file >> minNumSamplesPerNode;
303 if(word !=
"MaxDepth:"){
304 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MaxDepth!" << endl;
310 if(word !=
"RemoveFeaturesAtEachSpilt:"){
311 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the RemoveFeaturesAtEachSpilt!" << endl;
314 file >> removeFeaturesAtEachSpilt;
317 if(word !=
"TrainingMode:"){
318 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the TrainingMode!" << endl;
321 file >> trainingMode;
324 if(word !=
"MinRMSErrorPerNode:"){
325 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the MinRMSErrorPerNode!" << endl;
328 file >> minRMSErrorPerNode;
331 if(word !=
"TreeBuilt:"){
332 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the TreeBuilt!" << endl;
340 Clusterer::errorLog <<
"loadModelFromFile(string filename) - Could not find the Tree!" << endl;
349 Clusterer::errorLog <<
"loadModelFromFile(fstream &file) - Failed to create new RegressionTreeNode!" << endl;
353 tree->setParent( NULL );
356 Clusterer::errorLog <<
"loadModelFromFile(fstream &file) - Failed to load tree from file!" << endl;
363 clusterLabels[i] = i+1;
365 clusterLikelihoods.resize(numClusters,0);
366 clusterDistances.resize(numClusters,0);
390 return minRMSErrorPerNode;
394 this->minRMSErrorPerNode = minRMSErrorPerNode;
421 node->initNode( parent, depth, nodeID );
424 if( features.size() == 0 || M < minNumSamplesPerNode || depth >= maxDepth ){
430 node->setIsLeafNode(
true );
433 node->
set( M, 0, 0, clusterLabel );
435 Clusterer::trainingLog <<
"Reached leaf node. Depth: " << depth <<
" NumSamples: " << M << endl;
441 UINT featureIndex = 0;
442 double threshold = 0;
444 if( !computeBestSpilt( trainingData, features, featureIndex, threshold, minError ) ){
449 Clusterer::trainingLog <<
"Depth: " << depth <<
" FeatureIndex: " << featureIndex <<
" Threshold: " << threshold <<
" MinError: " << minError << endl;
452 if( minError <= minRMSErrorPerNode ){
457 node->setIsLeafNode(
true );
460 node->
set( M, featureIndex, threshold, clusterLabel );
462 Clusterer::trainingLog <<
"Reached leaf node. Depth: " << depth <<
" NumSamples: " << M << endl;
468 node->
set( M, featureIndex, threshold, 0 );
471 if( removeFeaturesAtEachSpilt ){
472 for(
size_t i=0; i<features.size(); i++){
473 if( features[i] == featureIndex ){
474 features.erase( features.begin()+i );
484 for(UINT i=0; i<M; i++){
491 node->setLeftChild( buildTree( lhs, node, features, clusterLabel, nodeID ) );
492 node->setRightChild( buildTree( rhs, node, features, clusterLabel, nodeID ) );
497 bool ClusterTree::computeBestSpilt(
const MatrixDouble &trainingData,
const vector< UINT > &features, UINT &featureIndex,
double &threshold,
double &minError ){
499 switch( trainingMode ){
500 case BEST_ITERATIVE_SPILT:
501 return computeBestSpiltBestIterativeSpilt( trainingData, features, featureIndex, threshold, minError );
503 case BEST_RANDOM_SPLIT:
504 return computeBestSpiltBestRandomSpilt( trainingData, features, featureIndex, threshold, minError );
507 Clusterer::errorLog <<
"Uknown trainingMode!" << endl;
515 bool ClusterTree::computeBestSpiltBestIterativeSpilt(
const MatrixDouble &trainingData,
const vector< UINT > &features, UINT &featureIndex,
double &threshold,
double &minError ){
517 const UINT M = trainingData.getNumRows();
518 const UINT N = (UINT)features.size();
522 if( N == 0 )
return false;
524 minError = numeric_limits<double>::max();
525 UINT bestFeatureIndex = 0;
527 double bestThreshold = 0;
532 vector< UINT > groupIndex(M);
533 vector< double > groupCounter(2);
534 vector< double > groupMean(2);
535 vector< double > groupMSE(2);
536 vector< MinMax > ranges = trainingData.getRanges();
539 for(UINT n=0; n<N; n++){
540 minRange = ranges[n].minValue;
541 maxRange = ranges[n].maxValue;
542 step = (maxRange-minRange)/
double(numSplittingSteps);
543 threshold = minRange;
544 featureIndex = features[n];
546 while( threshold <= maxRange ){
549 groupCounter[0] = groupCounter[1] = 0;
550 groupMean[0] = groupMean[1] = 0;
551 groupMSE[0] = groupMSE[1] = 0;
554 for(UINT i=0; i<M; i++){
555 groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
556 groupIndex[i] = groupID;
559 groupMean[ groupID ] += trainingData[i][featureIndex];
560 groupCounter[ groupID ]++;
564 groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
565 groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
568 for(UINT i=0; i<M; i++){
569 groupMSE[ groupIndex[i] ] += MLBase::SQR( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
571 groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
572 groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
574 error = sqrt( groupMSE[0] + groupMSE[1] );
577 if( error < minError ){
579 bestThreshold = threshold;
580 bestFeatureIndex = featureIndex;
589 featureIndex = bestFeatureIndex;
590 threshold = bestThreshold;
595 bool ClusterTree::computeBestSpiltBestRandomSpilt(
const MatrixDouble &trainingData,
const vector< UINT > &features, UINT &featureIndex,
double &threshold,
double &minError ){
597 const UINT M = trainingData.getNumRows();
598 const UINT N = (UINT)features.size();
600 Clusterer::debugLog <<
"computeBestSpiltBestRandomSpilt() M: " << M << endl;
602 if( N == 0 )
return false;
604 minError = numeric_limits<double>::max();
605 UINT bestFeatureIndex = 0;
607 double bestThreshold = 0;
609 vector< UINT > groupIndex(M);
610 vector< double > groupCounter(2);
611 vector< double > groupMean(2);
612 vector< double > groupMSE(2);
613 vector< MinMax > ranges = trainingData.getRanges();
616 for(UINT n=0; n<N; n++){
617 featureIndex = features[n];
619 for(UINT m=0; m<numSplittingSteps; m++){
624 groupCounter[0] = groupCounter[1] = 0;
625 groupMean[0] = groupMean[1] = 0;
626 groupMSE[0] = groupMSE[1] = 0;
629 for(UINT i=0; i<M; i++){
630 groupID = trainingData[i][featureIndex] >= threshold ? 1 : 0;
631 groupIndex[i] = groupID;
634 groupMean[ groupID ] += trainingData[i][featureIndex];
635 groupCounter[ groupID ]++;
639 groupMean[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
640 groupMean[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
643 for(UINT i=0; i<M; i++){
644 groupMSE[ groupIndex[i] ] += MLBase::SQR( groupMean[ groupIndex[i] ] - trainingData[i][featureIndex] );
646 groupMSE[0] /= (groupCounter[0] > 0 ? groupCounter[0] : 1);
647 groupMSE[1] /= (groupCounter[1] > 0 ? groupCounter[1] : 1);
649 error = sqrt( groupMSE[0] + groupMSE[1] );
652 if( error < minError ){
654 bestThreshold = threshold;
655 bestFeatureIndex = featureIndex;
661 featureIndex = bestFeatureIndex;
662 threshold = bestThreshold;
virtual bool train_(MatrixDouble &trainingData)
virtual Node * deepCopyNode() const
virtual bool loadModelFromFile(fstream &file)
bool loadClustererSettingsFromFile(fstream &file)
virtual bool saveModelToFile(fstream &file) const
This class implements a Cluster Tree. This can be used to automatically build a cluster model (where ...
virtual ~ClusterTree(void)
ClusterTreeNode * deepCopyTree() const
virtual bool loadFromFile(fstream &file)
UINT numClusters
Number of clusters in the model.
unsigned int getNumCols() const
ClusterTree(const UINT numSplittingSteps=100, const UINT minNumSamplesPerNode=5, const UINT maxDepth=10, const bool removeFeaturesAtEachSpilt=false, const UINT trainingMode=BEST_ITERATIVE_SPILT, const bool useScaling=false, const double minRMSErrorPerNode=0.01)
virtual bool saveToFile(fstream &file) const
bool scale(const double minTarget, const double maxTarget)
bool saveClustererSettingsToFile(fstream &file) const
virtual bool predict(const VectorDouble &x)
bool copyBaseVariables(const Clusterer *Clusterer)
virtual bool deepCopyFrom(const Clusterer *cluster)
ClusterTree & operator=(const ClusterTree &rhs)
double scale(const double &x, const double &minSource, const double &maxSource, const double &minTarget, const double &maxTarget, const bool constrain=false)
UINT getPredictedClusterLabel() const
string getClustererType() const
bool set(const UINT nodeSize, const UINT featureIndex, const double threshold, const UINT clusterLabel)
double getRandomNumberUniform(double minRange=0.0, double maxRange=1.0)
virtual bool predict(const VectorDouble &x)
bool setMinRMSErrorPerNode(const double minRMSErrorPerNode)
virtual bool print() const
const ClusterTreeNode * getTree() const
unsigned int getNumRows() const
double getMinRMSErrorPerNode() const
std::vector< MinMax > getRanges() const
std::vector< T > getRowVector(const unsigned int r) const
virtual bool predict_(VectorDouble &inputVector)
virtual bool print() const
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict( )