26 RegisterClustererModule< HierarchicalClustering > HierarchicalClustering::registerModule(
"HierarchicalClustering");
30 classType =
"HierarchicalClustering";
31 clustererType = classType;
32 debugLog.setProceedingText(
"[DEBUG HierarchicalClustering]");
33 errorLog.setProceedingText(
"[ERROR HierarchicalClustering]");
34 trainingLog.setProceedingText(
"[TRAINING HierarchicalClustering]");
35 warningLog.setProceedingText(
"[WARNING HierarchicalClustering]");
39 classType =
"HierarchicalClustering";
40 clustererType = classType;
41 debugLog.setProceedingText(
"[DEBUG HierarchicalClustering]");
42 errorLog.setProceedingText(
"[ERROR HierarchicalClustering]");
43 trainingLog.setProceedingText(
"[TRAINING HierarchicalClustering]");
44 warningLog.setProceedingText(
"[WARNING HierarchicalClustering]");
58 this->clusters = rhs.clusters;
59 this->distanceMatrix = rhs.distanceMatrix;
70 if( clusterer == NULL )
return false;
78 this->clusters = ptr->clusters;
79 this->distanceMatrix = ptr->distanceMatrix;
101 distanceMatrix.
clear();
117 for(UINT i=0; i<M; i++){
118 for(UINT j=0; j<N; j++){
119 data[i][j] = trainingData[i][j];
137 for(UINT i=0; i<M; i++){
138 for(UINT j=0; j<N; j++){
139 data[i][j] = trainingData[i][j];
143 return train( data );
150 distanceMatrix.
clear();
161 distanceMatrix.
resize(M,M);
164 for(UINT i=0; i<M; i++){
165 for(UINT j=0; j<M; j++){
166 if( i== j ) distanceMatrix[i][j] = numeric_limits<double>::max();
168 distanceMatrix[i][j] = squaredEuclideanDistance(data[i], data[j]);
174 UINT uniqueClusterID = 0;
175 vector< ClusterInfo > clusterData(M);
176 for(UINT i=0; i<M; i++){
177 clusterData[i].uniqueClusterID = uniqueClusterID++;
178 clusterData[i].addSampleToCluster(i);
181 trainingLog <<
"Starting clustering..." << endl;
186 newLevel.level = level;
187 for(UINT i=0; i<M; i++){
188 newLevel.clusters.push_back( clusterData[i] );
190 clusters.push_back( newLevel );
194 bool keepClustering =
true;
196 while( keepClustering ){
199 double minDist = numeric_limits<double>::max();
200 vector< vector< UINT > > clusterPairs;
201 UINT K = (UINT)clusterData.size();
202 for(UINT i=0; i<K; i++){
203 for(UINT j=0; j<K; j++){
205 double dist = computeClusterDistance( clusterData[i], clusterData[j] );
207 if( dist < minDist ){
209 vector< UINT > clusterPair(2);
212 clusterPairs.clear();
213 clusterPairs.push_back( clusterPair );
220 if( minDist == numeric_limits<double>::max() ){
221 keepClustering =
false;
222 warningLog <<
"train_(MatrixDouble &data) - Failed to find any cluster at level: " << level << endl;
228 newLevel.level = level;
232 newCluster.uniqueClusterID = uniqueClusterID++;
234 const UINT numClusterPairs = (UINT)clusterPairs.size();
236 for(UINT k=0; k<numClusterPairs; k++){
238 UINT numSamplesInClusterA = clusterData[ clusterPairs[k][0] ].getNumSamplesInCluster();
239 for(UINT i=0; i<numSamplesInClusterA; i++){
240 UINT index = clusterData[ clusterPairs[k][0] ][ i ];
241 newCluster.addSampleToCluster( index );
245 UINT numSamplesInClusterB = clusterData[ clusterPairs[k][1] ].getNumSamplesInCluster();
246 for(UINT i=0; i<numSamplesInClusterB; i++){
247 UINT index = clusterData[ clusterPairs[k][1] ][ i ];
248 newCluster.addSampleToCluster( index );
252 newCluster.clusterVariance = computeClusterVariance( newCluster, data );
255 UINT idA = clusterData[ clusterPairs[k][0] ].getUniqueClusterID();
256 UINT idB = clusterData[ clusterPairs[k][1] ].getUniqueClusterID();
258 vector< ClusterInfo >::iterator iter = clusterData.begin();
259 while( iter != clusterData.end() ){
260 if( iter->getUniqueClusterID() == idA || iter->getUniqueClusterID() == idB ){
261 iter = clusterData.erase( iter );
262 if( ++numRemoved >= 2 )
break;
268 clusterData.push_back( newCluster );
271 newLevel.clusters.push_back( newCluster );
273 clusters.push_back( newLevel );
281 keepClustering =
false;
284 if( clusterData.size() == 0 ){
285 keepClustering =
false;
288 trainingLog <<
"Cluster level: " << level <<
" Number of clusters: " << clusters.back().getNumClusters() << endl;
297 clusterLabels[i] = i+1;
299 clusterLikelihoods.resize(numClusters,0);
300 clusterDistances.resize(numClusters,0);
305 bool HierarchicalClustering::printModel(){
307 UINT K = (UINT)clusters.size();
309 cout <<
"Hierarchical Clustering Model\n\n";
310 for(UINT k=0; k<K; k++){
314 numSamples += clusters[k][i].getNumSamplesInCluster();
318 cout <<
"Level: " << clusters[k].level <<
"\tNumClusters: " << numClusters <<
"\tNumSamples: " << numSamples << endl;
320 cout <<
"ClusterVariance: " << clusters[k][i].clusterVariance << endl;
322 UINT numSamplesInCluster = clusters[k][i].getNumSamplesInCluster();
323 for(UINT j=0; j<numSamplesInCluster; j++){
324 cout << clusters[k][i][j] <<
"\t";
333 double HierarchicalClustering::squaredEuclideanDistance(
const double *a,
const double *b){
335 for(UINT i=0; i<N; i++){
336 dist += SQR( a[i] - b[i] );
341 double HierarchicalClustering::computeClusterDistance(
const ClusterInfo &clusterA,
const ClusterInfo &clusterB ){
343 double minDist = numeric_limits<double>::max();
344 const UINT numSamplesA = clusterA.getNumSamplesInCluster();
345 const UINT numSamplesB = clusterB.getNumSamplesInCluster();
348 for(UINT i=0; i<numSamplesA; i++){
349 for(UINT j=0; j<numSamplesB; j++){
350 if( distanceMatrix[ clusterA[i] ][ clusterB[j] ] < minDist ){
351 minDist = distanceMatrix[ clusterA[i] ][ clusterB[j] ];
359 double HierarchicalClustering::computeClusterVariance(
const ClusterInfo &cluster,
const MatrixDouble &data ){
361 VectorDouble mean(N,0);
362 VectorDouble
std(N,0);
365 UINT numSamples = cluster.getNumSamplesInCluster();
366 for(UINT j=0; j<N; j++){
367 for(UINT i=0; i<numSamples; i++){
368 UINT index = cluster[i];
369 mean[j] += data[ index ][j];
371 mean[j] /= double( numSamples );
375 for(UINT j=0; j<N; j++){
376 for(UINT i=0; i<numSamples; i++){
377 std[j] += SQR( data[ cluster[i] ][j] - mean[j] );
379 std[j] = sqrt(
std[j] /
double( numSamples-1 ) );
383 for(UINT j=0; j<N; j++){
391 if( !file.is_open() ){
392 errorLog <<
"saveModelToFile(string filename) - Failed to open file!" << endl;
396 file <<
"GRT_HIERARCHICAL_CLUSTERING_FILE_V1.0\n";
399 errorLog <<
"saveModelToFile(fstream &file) - Failed to save cluster settings to file!" << endl;
404 file <<
"M: " << M << endl;
405 file <<
"N: " << N << endl;
406 file <<
"NumLevels: " << clusters.size() << endl;
408 for(UINT i=0; i<clusters.size(); i++){
409 file <<
"Level: " << clusters[i].getLevel() << endl;
410 file <<
"NumClusters: " << clusters[i].getNumClusters() << endl;
426 if( word !=
"GRT_HIERARCHICAL_CLUSTERING_FILE_V1.0" ){
431 errorLog <<
"loadModelFromFile(fstream &file) - Failed to load cluster settings from file!" << endl;
UINT getNumSamples() const
HierarchicalClustering & operator=(const HierarchicalClustering &rhs)
bool loadClustererSettingsFromFile(fstream &file)
virtual bool train_(MatrixDouble &data)
UINT getNumDimensions() const
UINT numClusters
Number of clusters in the model.
unsigned int getNumCols() const
UINT getNumSamples() const
virtual bool train(ClassificationData trainingData)
bool saveClustererSettingsToFile(fstream &file) const
virtual bool loadModelFromFile(fstream &file)
bool copyBaseVariables(const Clusterer *Clusterer)
virtual bool deepCopyFrom(const Clusterer *clusterer)
virtual bool saveModelToFile(fstream &file) const
string getClustererType() const
unsigned int getNumRows() const
UINT getNumDimensions() const
virtual ~HierarchicalClustering()
virtual bool resize(const unsigned int r, const unsigned int c)
This class implements a basic Hierarchial Clustering algorithm.