// Definition of the static member KMeans::registerModule, constructed with the name "KMeans".
// NOTE(review): presumably this is what lets a KMeans instance be created by name through
// the Clusterer base class - confirm against RegisterClustererModule.
26 RegisterClustererModule< KMeans > KMeans::registerModule(
"KMeans");
29 KMeans::KMeans(
const UINT numClusters,
const UINT minNumEpochs,
const UINT maxNumEpochs,
const double minChange,
const bool computeTheta){
32 this->minNumEpochs = minNumEpochs;
33 this->maxNumEpochs = maxNumEpochs;
34 this->minChange = minChange;
35 this->computeTheta = computeTheta;
40 numTrainingIterationsToConverge = 0;
44 clustererType = classType;
45 debugLog.setProceedingText(
"[DEBUG KMeans]");
46 errorLog.setProceedingText(
"[ERROR KMeans]");
47 trainingLog.setProceedingText(
"[TRAINING KMeans]");
48 warningLog.setProceedingText(
"[WARNING KMeans]");
54 clustererType = classType;
55 debugLog.setProceedingText(
"[DEBUG KMeans]");
56 errorLog.setProceedingText(
"[ERROR KMeans]");
57 trainingLog.setProceedingText(
"[TRAINING KMeans]");
58 warningLog.setProceedingText(
"[WARNING KMeans]");
64 this->computeTheta = rhs.computeTheta;
65 this->finalTheta = rhs.finalTheta;
66 this->clusters = rhs.clusters;
67 this->assign = rhs.assign;
68 this->count = rhs.count;
69 this->thetaTracker = rhs.thetaTracker;
86 this->computeTheta = rhs.computeTheta;
87 this->finalTheta = rhs.finalTheta;
88 this->clusters = rhs.clusters;
89 this->assign = rhs.assign;
90 this->count = rhs.count;
91 this->thetaTracker = rhs.thetaTracker;
102 if( clusterer == NULL )
return false;
110 this->computeTheta = ptr->computeTheta;
111 this->finalTheta = ptr->finalTheta;
112 this->clusters = ptr->clusters;
113 this->assign = ptr->assign;
114 this->count = ptr->count;
115 this->thetaTracker = ptr->thetaTracker;
126 errorLog <<
"train_(ClassificationData &trainingData) - The training data is empty!" << endl;
137 for(UINT i=0; i<M; i++){
138 for(UINT j=0; j<N; j++){
139 data[i][j] = trainingData[i][j];
153 for(UINT i=0; i<M; i++){
154 for(UINT j=0; j<N; j++){
155 data[i][j] = trainingData[i][j];
167 errorLog <<
"train_(MatrixDouble &data) - Failed to train model. NumClusters is zero!" << endl;
172 errorLog <<
"train_(MatrixDouble &data) - The number of rows or columns in the data is zero!" << endl;
187 std::random_shuffle(randIndexs.begin(), randIndexs.end());
191 for(UINT j=0; j<numInputDimensions; j++){
192 clusters[k][j] = data[ randIndexs[k] ][j];
205 if( inputVector.size() != numInputDimensions ){
210 for(UINT n=0; n<numInputDimensions; n++){
211 inputVector[n] =
scale(inputVector[n], ranges[n].minValue, ranges[n].maxValue, 0, 1);
215 const double sigma = 1.0;
216 const double gamma = 1.0 / (2*SQR(sigma));
220 bestDistance = numeric_limits<double>::max();
232 for(UINT j=0; j<numInputDimensions; j++){
233 dist += SQR( inputVector[j]-clusters[i][j] );
236 clusterDistances[i] = dist;
237 clusterLikelihoods[i] = exp( - SQR(gamma * dist) );
239 sum += clusterLikelihoods[i];
241 if( dist < bestDistance ){
249 clusterLikelihoods[i] /= sum;
253 maxLikelihood = clusterLikelihoods[ minIndex ];
261 errorLog <<
"trainModel(MatrixDouble &data) - Failed to train model. NumClusters is zero!" << endl;
266 errorLog <<
"trainModel(MatrixDouble &data) - Failed to train model. The number of rows in the cluster matrix does not match the number of clusters! You should need to initalize the clusters matrix first before calling this function!" << endl;
270 if( clusters.
getNumCols() != numInputDimensions ){
271 errorLog <<
"trainModel(MatrixDouble &data) - Failed to train model. The number of columns in the cluster matrix does not match the number of input dimensions! You should need to initalize the clusters matrix first before calling this function!" << endl;
276 UINT currentIter = 0;
278 bool keepTraining =
true;
280 double lastTheta = 0;
282 double startTime = 0;
283 thetaTracker.clear();
285 numTrainingIterationsToConverge = 0;
302 while( keepTraining ){
306 numChanged = estep( data );
316 theta = calculateTheta(data);
317 delta = lastTheta - theta;
319 }
else theta = delta = 0;
322 if( numChanged == 0 && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
323 if( currentIter >= maxNumEpochs ){ keepTraining =
false; }
324 if( fabs( delta ) < minChange && computeTheta && currentIter > minNumEpochs ){ converged =
true; keepTraining =
false; }
325 if( computeTheta ) thetaTracker.push_back( theta );
327 trainingLog <<
"Epoch: " << currentIter <<
"/" << maxNumEpochs;
328 trainingLog <<
" Epoch time: " << (timer.
getMilliSeconds()-startTime)/1000.0 <<
" seconds";
329 trainingLog <<
" Theta: " << theta <<
" Delta: " << delta << endl;
331 trainingLog <<
"Model Trained at epoch: " << currentIter <<
" with a theta value of: " << theta << endl;
334 numTrainingIterationsToConverge = currentIter;
338 clusterLabels.resize(numClusters);
340 clusterLabels[i] = i+1;
342 clusterLikelihoods.resize(numClusters,0);
343 clusterDistances.resize(numClusters,0);
362 for (n=0; n < numInputDimensions; n++)
363 d += SQR( data[m][n]-clusters[k][n] );
364 if (d <= dmin){ dmin = d; kmin = k; }
366 if ( kmin != assign[m] ){
375 void KMeans::mstep(
const MatrixDouble &data) {
380 for (n=0;n<numInputDimensions;n++)
385 for(n=0; n < numInputDimensions; n++)
386 clusters[ assign[m] ][n] += data[m][n];
390 for (n=0; n < numInputDimensions; n++){
391 clusters[k][n] /= double(count[k]);
397 double KMeans::calculateTheta(
const MatrixDouble &data){
405 for(n=0; n < numInputDimensions; n++){
406 sum += SQR(clusters[k][n] - data[m][n]);
418 if( !file.is_open() ){
419 errorLog <<
"saveModelToFile(fstream &file) - Failed to save model, file is not open!" << endl;
423 file <<
"GRT_KMEANS_MODEL_FILE_V1.0\n";
426 errorLog <<
"saveModelToFile(fstream &file) - Failed to save clusterer settings to file!" << endl;
431 file <<
"Clusters:\n";
434 for(UINT n=0; n<numInputDimensions; n++){
435 file << clusters[k][n] <<
"\t";
450 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << endl;
456 if( word !=
"GRT_KMEANS_MODEL_FILE_V1.0" ){
461 errorLog <<
"loadModelFromFile(string filename) - Failed to open file!" << endl;
467 if( word !=
"Clusters:" ){
472 clusters.
resize(numClusters,numInputDimensions);
476 for(UINT n=0; n<numInputDimensions; n++){
477 file >> clusters[k][n];
488 numTrainingSamples = 0;
491 thetaTracker.clear();
501 numTrainingSamples = 0;
504 thetaTracker.clear();
512 bool KMeans::setComputeTheta(
const bool computeTheta){
513 this->computeTheta = computeTheta;
521 this->clusters = clusters;
UINT getNumSamples() const
bool loadClustererSettingsFromFile(fstream &file)
UINT numTrainingSamples
Number of training examples.
KMeans & operator=(const KMeans &rhs)
UINT getNumDimensions() const
UINT numClusters
Number of clusters in the model.
unsigned int getNumCols() const
UINT getNumSamples() const
bool scale(const double minTarget, const double maxTarget)
bool saveClustererSettingsToFile(fstream &file) const
KMeans(const UINT numClusters=10, const UINT minNumEpochs=5, const UINT maxNumEpochs=1000, const double minChange=1.0e-5, const bool computeTheta=true)
UINT getNumClasses() const
virtual bool deepCopyFrom(const Clusterer *clusterer)
bool copyBaseVariables(const Clusterer *Clusterer)
This class implements the KMeans clustering algorithm.
double scale(const double &x, const double &minSource, const double &maxSource, const double &minTarget, const double &maxTarget, const bool constrain=false)
signed long getMilliSeconds() const
string getClustererType() const
bool trainModel(MatrixDouble &data)
virtual bool saveModelToFile(fstream &file) const
virtual bool train_(MatrixDouble &data)
unsigned int getNumRows() const
UINT getNumDimensions() const
bool setClusters(const MatrixDouble &clusters)
virtual bool predict_(VectorDouble &inputVector)
virtual bool loadModelFromFile(fstream &file)
std::vector< MinMax > getRanges() const
UINT nchg
Number of values that changed.
virtual bool resize(const unsigned int r, const unsigned int c)
UINT predictedClusterLabel
Stores the predicted cluster label from the most recent predict() call.