// Constructor: stores the requested dimensionality, dataset name and info text, starts
// with zero samples (via the initializer list), and leaves cross-validation and
// external-range scaling switched off. Each log object is then given its prefix text.
// NOTE(review): the embedded numbering jumps from 29 to 31 and stops at 35, so part of
// the body and the closing brace are not shown in this listing.
25 RegressionData::RegressionData(
const UINT numInputDimensions,
const UINT numTargetDimensions,
const string datasetName,
const string infoText):totalNumSamples(0){
// The parameters shadow the members of the same name, hence the explicit this->.
26 this->numInputDimensions = numInputDimensions;
27 this->numTargetDimensions = numTargetDimensions;
28 this->datasetName = datasetName;
29 this->infoText = infoText;
// No k-fold split exists yet and no externally supplied ranges are in use.
31 crossValidationSetup =
false;
32 useExternalRanges =
false;
// Prefix strings handed to the debug, error and warning logs.
33 debugLog.setProceedingText(
"[DEBUG LRD]");
34 errorLog.setProceedingText(
"[ERROR LRD]");
35 warningLog.setProceedingText(
"[WARNING LRD]");
// Member-wise copy from rhs. Presumably the body of operator=(const RegressionData &rhs);
// the function header, any self-assignment guard and the return statement are not
// shown in this listing — confirm against the full file.
// Descriptive metadata and dimensionality.
46 this->datasetName = rhs.datasetName;
47 this->infoText = rhs.infoText;
48 this->numInputDimensions = rhs.numInputDimensions;
49 this->numTargetDimensions = rhs.numTargetDimensions;
50 this->totalNumSamples = rhs.totalNumSamples;
// Cross-validation state and external range settings.
51 this->kFoldValue = rhs.kFoldValue;
52 this->crossValidationSetup = rhs.crossValidationSetup;
53 this->useExternalRanges = rhs.useExternalRanges;
54 this->externalInputRanges = rhs.externalInputRanges;
55 this->externalTargetRanges = rhs.externalTargetRanges;
// The samples themselves and the per-fold index lists.
56 this->data = rhs.data;
57 this->crossValidationIndexs = rhs.crossValidationIndexs;
// The three log objects are copied as well.
58 this->debugLog = rhs.debugLog;
59 this->errorLog = rhs.errorLog;
60 this->warningLog = rhs.warningLog;
// Fragment whose function header is not shown in this listing.
// NOTE(review): presumably part of the routine that empties the dataset — confirm.
// The visible statements drop any existing cross-validation setup: the flag is cleared
// and the per-fold index lists are emptied.
68 crossValidationSetup =
false;
70 crossValidationIndexs.clear();
// setInputAndTargetDimensions: accepts the new dimensionality only when both values are
// greater than zero; otherwise an error is logged. The function header and the return
// statements are not shown in this listing.
75 if( numInputDimensions > 0 && numTargetDimensions > 0 ){
76 this->numInputDimensions = numInputDimensions;
77 this->numTargetDimensions = numTargetDimensions;
// Previously stored external ranges are discarded along with the flag that enables them.
80 useExternalRanges =
false;
81 externalInputRanges.clear();
82 externalTargetRanges.clear();
// Reached when either dimension count is zero (the lines between are not shown).
85 errorLog <<
"setInputAndTargetDimensions(UINT numInputDimensions,UINT numTargetDimensions) - The number of input and target dimensions should be greater than zero!" << endl;
// setDatasetName: stores the name only if it contains no space character; a name with
// a space is rejected with an error message. Header and return statements are not shown.
92 if( datasetName.find(
" ") == string::npos ){
93 this->datasetName = datasetName;
// Reached when the name contains a space (the intervening lines are not shown).
97 errorLog <<
"setDatasetName(const string &datasetName) - The dataset name cannot contain any spaces!" << endl;
// setInfoText: stores the supplied info text; no validation is visible in this listing.
102 this->infoText = infoText;
// addSample: a sample is accepted only when both vectors have exactly the configured
// sizes. The statements that store the sample (embedded lines 108-111) are not shown.
107 if( inputVector.size() == numInputDimensions && targetVector.size() == numTargetDimensions ){
// The dataset has changed, so any existing k-fold split is invalidated.
112 crossValidationSetup =
false;
113 crossValidationIndexs.clear();
// Size mismatch: logged as an error.
116 errorLog <<
"addSample(const VectorDouble &inputVector,const VectorDouble &targetVector) - The inputVector size or targetVector size does not match the size of the numInputDimensions or numTargetDimensions" << endl;
// removeLastSample: erases the final element of data when at least one sample exists,
// otherwise logs a warning. Header and return statements are not shown in this listing.
121 if( totalNumSamples > 0 ){
// Erase the last element, then resynchronise the cached sample count with the container.
123 data.erase(data.end()-1);
124 totalNumSamples = (UINT)data.size();
// The dataset has changed, so any existing k-fold split is invalidated.
127 crossValidationSetup =
false;
128 crossValidationIndexs.clear();
// Nothing to remove.
131 warningLog <<
"removeLastSample() - There are no samples to remove!" << endl;
// Presumably the tail of reserve(const UINT N): reports success once the capacity of
// data is at least N. The reserve call itself and the failure path are not shown here.
139 if( data.capacity() >= N )
return true;
// setExternalRanges: rejects range vectors whose lengths do not match the configured
// input/target dimensionality, then stores both vectors together with the enable flag.
// Header and final return are not shown in this listing.
146 if( externalInputRanges.size() != numInputDimensions )
return false;
147 if( externalTargetRanges.size() != numTargetDimensions )
return false;
// Parameters shadow the members of the same name, hence the explicit this->.
149 this->externalInputRanges = externalInputRanges;
150 this->externalTargetRanges = externalTargetRanges;
151 this->useExternalRanges = useExternalRanges;
// enableExternalRangeScaling(const bool useExternalRanges): switches use of the stored
// external ranges on or off. Header and return statements are not shown in this listing.
// Fix: the guard used `!=` for both comparisons, so the flag could only be changed when
// the stored ranges did NOT match the dataset dimensionality. The flag must only be
// changed when both stored range vectors have the expected sizes, matching the checks
// performed when the ranges are set.
157 if( externalInputRanges.size() == numInputDimensions && externalTargetRanges.size() == numTargetDimensions ){
158 this->useExternalRanges = useExternalRanges;
// Tail of the two-argument scale(minTarget,maxTarget): forwards to the overload that
// takes explicit input and target ranges. The lines that obtain inputRanges and
// targetRanges are not shown in this listing.
167 return scale(inputRanges,targetRanges,minTarget,maxTarget);
// Rescales every stored sample in place. Each input and target value is passed to
// Util::scale with the min/max of its dimension as the source range and
// [minTarget, maxTarget] as the target range.
// The work is only done when both range vectors have one entry per dimension; the
// return statements and the failure path are not shown in this listing.
170 bool RegressionData::scale(
const vector< MinMax > &inputVectorRanges,
const vector< MinMax > &targetVectorRanges,
const double minTarget,
const double maxTarget){
171 if( inputVectorRanges.size() == numInputDimensions && targetVectorRanges.size() == numTargetDimensions ){
// Scratch vectors, allocated once and overwritten for every sample.
173 VectorDouble scaledInputVector(numInputDimensions,0);
174 VectorDouble scaledTargetVector(numTargetDimensions,0);
175 for(UINT i=0; i<totalNumSamples; i++){
// Scale each input dimension of sample i.
178 for(UINT j=0; j<numInputDimensions; j++){
179 scaledInputVector[j] =
Util::scale(data[i].getInputVectorValue(j),inputVectorRanges[j].minValue,inputVectorRanges[j].maxValue,minTarget,maxTarget);
// Scale each target dimension of sample i.
182 for(UINT j=0; j<numTargetDimensions; j++){
183 scaledTargetVector[j] =
Util::scale(data[i].getTargetVectorValue(j),targetVectorRanges[j].minValue,targetVectorRanges[j].maxValue,minTarget,maxTarget);
// Write the scaled vectors back over the original sample.
186 data[i].set(scaledInputVector,scaledTargetVector);
// getInputRanges: returns the externally supplied input ranges when they are enabled.
// Otherwise builds one MinMax per input dimension by scanning every sample.
// Header, closing braces and the final return are not shown in this listing.
196 if( useExternalRanges )
return externalInputRanges;
198 vector< MinMax > ranges(numInputDimensions);
// With no samples the scan below is skipped and the entries keep their constructed values.
200 if( totalNumSamples > 0 ){
201 for(UINT j=0; j<numInputDimensions; j++){
// Seed both bounds from the first sample, then widen them over all samples.
202 ranges[j].minValue = data[0].getInputVectorValue(j);
203 ranges[j].maxValue = data[0].getInputVectorValue(j);
204 for(UINT i=0; i<totalNumSamples; i++){
205 if( data[i].getInputVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getInputVectorValue(j); }
206 else if( data[i].getInputVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getInputVectorValue(j); }
// getTargetRanges: returns the externally supplied target ranges when they are enabled.
// Otherwise builds one MinMax per target dimension by scanning every sample.
// Header, closing braces and the final return are not shown in this listing.
215 if( useExternalRanges )
return externalTargetRanges;
217 vector< MinMax > ranges(numTargetDimensions);
// With no samples the scan below is skipped and the entries keep their constructed values.
219 if( totalNumSamples > 0 ){
220 for(UINT j=0; j<numTargetDimensions; j++){
// Seed both bounds from the first sample, then widen them over all samples.
221 ranges[j].minValue = data[0].getTargetVectorValue(j);
222 ranges[j].maxValue = data[0].getTargetVectorValue(j);
223 for(UINT i=0; i<totalNumSamples; i++){
224 if( data[i].getTargetVectorValue(j) < ranges[j].minValue ){ ranges[j].minValue = data[i].getTargetVectorValue(j); }
225 else if( data[i].getTargetVectorValue(j) > ranges[j].maxValue ){ ranges[j].maxValue = data[i].getTargetVectorValue(j); }
// Builds a multi-line, tab-separated text summary of the dataset: name, info text,
// dimensionality, sample count, and then per-dimension input and target ranges.
// The declaration of statsText, the code that obtains inputRanges/targetRanges, the
// bodies of the two range loops and the return statement are not shown in this listing.
232 string RegressionData::getStatsAsString()
const{
// One "label:<TAB>value" line per header field.
235 statsText +=
"DatasetName:\t" + datasetName +
"\n";
236 statsText +=
"DatasetInfo:\t" + infoText +
"\n";
237 statsText +=
"Number of Input Dimensions:\t" +
Util::toString( numInputDimensions ) +
"\n";
238 statsText +=
"Number of Target Dimensions:\t" +
Util::toString( numTargetDimensions ) +
"\n";
239 statsText +=
"Number of Samples:\t" +
Util::toString( totalNumSamples ) +
"\n";
// Section heading followed by one iteration per input dimension.
243 statsText +=
"Dataset Input Dimension Ranges:\n";
244 for(UINT j=0; j<inputRanges.size(); j++){
// Section heading followed by one iteration per target dimension.
250 statsText +=
"Dataset Target Dimension Ranges:\n";
251 for(UINT j=0; j<targetRanges.size(); j++){
// Writes the text produced by getStatsAsString() to cout. The return statement and
// closing brace are not shown in this listing.
257 bool RegressionData::printStats()
const{
258 cout << getStatsAsString();
// partition(const UINT trainingSizePercentage): shuffles the sample indices, copies the
// first numTrainingExamples samples into trainingSet and the rest into testSet.
// The header, the declaration of testSet, the statement that picks randomIndex, the
// lines that overwrite this dataset and the return are not shown in this listing.
// NOTE(review): no visible check bounds trainingSizePercentage to 100. A larger value
// makes numTrainingExamples exceed totalNumSamples, and the first copy loop would then
// index past the end of indexs — confirm whether a guard exists on a line not shown.
// Number of training samples: the requested percentage of the total, rounded down.
269 const UINT numTrainingExamples = (UINT) floor(
double(totalNumSamples) / 100.0 * double(trainingSizePercentage) );
271 RegressionData trainingSet(numInputDimensions,numTargetDimensions);
273 vector< UINT > indexs( totalNumSamples );
// Start from the identity ordering, then swap every position with indexs[randomIndex].
277 UINT randomIndex = 0;
278 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
279 for(UINT x=0; x<totalNumSamples; x++){
281 SWAP( indexs[ x ] , indexs[ randomIndex ] );
// The first numTrainingExamples shuffled indices go to the training set.
285 for(UINT i=0; i<numTrainingExamples; i++){
286 trainingSet.
addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
// The remaining indices go to the test set.
288 for(UINT i=numTrainingExamples; i<totalNumSamples; i++){
289 testSet.
addSample( data[ indexs[i] ].getInputVector(), data[ indexs[i] ].getTargetVector() );
// Any existing k-fold split is invalidated.
297 crossValidationSetup =
false;
298 crossValidationIndexs.clear();
// merge: appends every sample of regressionData to this dataset. The two error messages
// below report a mismatch in input or target dimensionality. The function header, the
// conditions guarding each message, the loop header and the return statements are not
// shown in this listing.
// Fix: both messages contained the mis-encoded text "®ressionData"; the parameter
// spelling "&regressionData" is restored so the log output is readable.
306 errorLog <<
"merge(RegressionData &regressionData) - The number of input dimensions in the regressionData (" << regressionData.
getNumInputDimensions() <<
") does not match the number of input dimensions of this dataset (" << numInputDimensions <<
")" << endl;
311 errorLog <<
"merge(RegressionData &regressionData) - The number of target dimensions in the regressionData (" << regressionData.
getNumTargetDimensions() <<
") does not match the number of target dimensions of this dataset (" << numTargetDimensions <<
")" << endl;
// Copy sample i of the other dataset into this one.
317 addSample(regressionData[i].getInputVector(), regressionData[i].getTargetVector());
// The dataset has changed, so any existing k-fold split is invalidated.
321 crossValidationSetup =
false;
322 crossValidationIndexs.clear();
// spiltDataIntoKFolds(const UINT K): shuffles the sample indices and distributes them
// over K folds of floor(totalNumSamples / K) samples each, with the final fold taking
// whatever remains. The header, the declarations of counter and foldIndex, the statement
// that picks randomIndex and the return statements are not shown in this listing.
// Start from a clean state: any previous split is discarded.
329 crossValidationSetup =
false;
330 crossValidationIndexs.clear();
// Fix: this guard previously repeated the `K > totalNumSamples` test used below, so
// K == 0 was never rejected (and an oversized K was reported with the wrong message).
// K == 0 must stop here because K is used as a divisor further down.
333 if( K == 0 ){
334 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be zero!" << endl;
// There cannot be more folds than samples.
339 if( K > totalNumSamples ){
340 errorLog <<
"spiltDataIntoKFolds(UINT K) - K can not be larger than the total number of samples in the dataset!" << endl;
346 vector< UINT > indexs( totalNumSamples );
// Samples per fold, rounded down.
349 UINT numSamplesPerFold = (UINT) floor( totalNumSamples/
double(K) );
352 crossValidationIndexs.resize(K);
// Start from the identity ordering, then swap every position with indexs[randomIndex].
356 UINT randomIndex = 0;
359 for(UINT i=0; i<totalNumSamples; i++) indexs[i] = i;
360 for(UINT x=0; x<totalNumSamples; x++){
365 SWAP( indexs[ x ] , indexs[ randomIndex ] );
// Deal the shuffled indices into the folds.
370 for(UINT i=0; i<totalNumSamples; i++){
372 crossValidationIndexs[ foldIndex ].push_back( indexs[i] );
// Move on once a fold is full, except for the last fold, which absorbs the remainder.
375 if( ++counter == numSamplesPerFold && foldIndex < K-1 ){
381 crossValidationSetup =
true;
// getTrainingFoldData(const UINT foldIndex): builds a dataset from every fold except
// foldIndex. The header, the declarations of trainingData and index, and the final
// return are not shown in this listing.
// A split must have been created first; otherwise an error is logged.
389 if( !crossValidationSetup ){
390 errorLog <<
"getTrainingFoldData(UINT foldIndex) - Cross Validation has not been setup! You need to call the spiltDataIntoKFolds(UINT K,bool useStratifiedSampling) function first before calling this function!" << endl;
// An out-of-range fold index returns trainingData as it stands at this point.
394 if( foldIndex >= kFoldValue )
return trainingData;
// Copy the samples of every fold other than foldIndex.
400 for(UINT k=0; k<kFoldValue; k++){
401 if( k != foldIndex ){
402 for(UINT i=0; i<crossValidationIndexs[k].size(); i++){
404 index = crossValidationIndexs[k][i];
405 trainingData.
addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
// getTestFoldData(const UINT foldIndex): builds a dataset from the samples of fold
// foldIndex only. The header, the declarations of testData and index, and the final
// return are not shown in this listing.
// Without a split, or with an out-of-range fold index, testData is returned as it
// stands at this point.
416 if( !crossValidationSetup )
return testData;
418 if( foldIndex >= kFoldValue )
return testData;
// Copy every sample whose index was assigned to the requested fold.
424 for(UINT i=0; i<crossValidationIndexs[ foldIndex ].size(); i++){
426 index = crossValidationIndexs[ foldIndex ][i];
427 testData.
addSample( data[ index ].getInputVector(), data[ index ].getTargetVector() );
// Sorts the samples and removes each sample whose mean squared difference from the
// sample before it (over all input and target dimensions) is at most minDist.
// Returns numSamplesRemoved. The declaration of dist, the statement that updates the
// removal counter, the non-duplicate branch and the closing braces are not shown in
// this listing.
// Fixes:
//  - nextSample was initialised with data.begin()+1 even for an empty container, which
//    is undefined behaviour; it now starts at end() in that case.
//  - after erasing a duplicate, currentSample was moved onto the element following the
//    erased one and nextSample to currentSample+1. That could step past end(), and it
//    skipped comparing the kept sample with its new neighbour, so runs of three or more
//    duplicates were only partly removed. currentSample now stays put and nextSample
//    takes the iterator returned by erase.
433 UINT RegressionData::removeDuplicateSamples(){
435 UINT numSamplesRemoved = 0;
// Sorting places candidate duplicates next to each other.
438 sort(data.begin(),data.end(),RegressionSample::sortByInputVectorAscending );
// Threshold on the per-dimension mean of the squared differences.
441 double minDist = 1.0e-5;
443 double totalDimensions = numInputDimensions + numTargetDimensions;
444 bool keepSearching =
true;
445 vector< RegressionSample >::iterator currentSample = data.begin();
446 vector< RegressionSample >::iterator nextSample = data.empty() ? data.end() : data.begin()+1;
// Fewer than two samples: nothing to compare.
448 if( currentSample == data.end() ) keepSearching =
false;
449 if( nextSample == data.end() ) keepSearching =
false;
451 while( keepSearching ){
// Accumulate squared differences over the input and target dimensions.
453 for(UINT i=0; i<numInputDimensions; i++){
454 dist += SQR( currentSample->getInputVectorValue(i) - nextSample->getInputVectorValue(i) );
456 for(UINT i=0; i<numTargetDimensions; i++){
457 dist += SQR( currentSample->getTargetVectorValue(i) - nextSample->getTargetVectorValue(i) );
459 dist /= totalDimensions;
460 if( dist <= minDist ){
// erase() only invalidates iterators at or after the erased position, so currentSample
// (which precedes it) remains valid and is compared against the new neighbour next.
462 nextSample = data.erase( nextSample );
465 debugLog <<
"Removing sample with dist: " << dist << endl;
// Stop once either iterator has reached the end of the container.
471 if( currentSample == data.end() ) keepSearching =
false;
472 if( nextSample == data.end() ) keepSearching =
false;
475 return numSamplesRemoved;
// saveDatasetToFile(const string &filename): writes the dataset as text. The output is
// a fixed header tag, one "Key: value" line per metadata field, the external ranges
// when they are in use, and then one tab-separated line of values per sample.
// The function header, the declaration of file, the per-sample line terminator, the
// close call and the return statements are not shown in this listing.
503 file.open(filename.c_str(), std::ios::out);
505 if( !file.is_open() ){
506 errorLog <<
"saveDatasetToFile(const string &filename) - Failed to open file!" << endl;
// File-format tag; the loader compares against this exact string.
510 file <<
"GRT_LABELLED_REGRESSION_DATA_FILE_V1.0\n";
511 file <<
"DatasetName: " << datasetName << endl;
512 file <<
"InfoText: " << infoText << endl;
513 file <<
"NumInputDimensions: "<<numInputDimensions<<endl;
514 file <<
"NumTargetDimensions: "<<numTargetDimensions<<endl;
515 file <<
"TotalNumTrainingExamples: "<<totalNumSamples<<endl;
516 file <<
"UseExternalRanges: " << useExternalRanges << endl;
// External ranges are written only when enabled: "min<TAB>max" per dimension, inputs
// first, then targets.
518 if( useExternalRanges ){
519 for(UINT i=0; i<externalInputRanges.size(); i++){
520 file << externalInputRanges[i].minValue <<
"\t" << externalInputRanges[i].maxValue << endl;
522 for(UINT i=0; i<externalTargetRanges.size(); i++){
523 file << externalTargetRanges[i].minValue <<
"\t" << externalTargetRanges[i].maxValue << endl;
// Marker line that precedes the sample rows.
527 file <<
"RegressionData:\n";
// Per sample: every input value followed by a tab, then the target values separated
// by tabs with no tab after the last one.
529 for(UINT i=0; i<totalNumSamples; i++){
530 for(UINT j=0; j<numInputDimensions; j++){
531 file << data[i].getInputVectorValue(j) <<
"\t";
533 for(UINT j=0; j<numTargetDimensions; j++){
534 file << data[i].getTargetVectorValue(j);
535 if( j!= numTargetDimensions-1 ) file <<
"\t";
// loadDatasetFromFile(const string &filename): parses the text format written by
// saveDatasetToFile. It expects the header tag, then each "Key:" token in a fixed order
// followed by its value, optional external ranges, the data marker and one row of
// values per sample. Every failed expectation is logged.
// The function header, the declarations of file and word, the reads that fill word,
// the return statements and the sizing of data are not shown in this listing.
// Fixes:
//  - the info-text loop now also stops when the stream has failed; previously a file
//    that ended before "NumInputDimensions:" would leave word unchanged forever.
//  - the UseExternalRanges failure message named DatasetName (copy/paste slip).
547 file.open(filename.c_str(), std::ios::in);
550 if( !file.is_open() ){
551 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to open file!" << endl;
// File-format tag.
559 if(word !=
"GRT_LABELLED_REGRESSION_DATA_FILE_V1.0"){
560 errorLog <<
"loadDatasetFromFile(const string &filename) - Unknown file header!" << endl;
567 if(word !=
"DatasetName:"){
568 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find DatasetName!" << endl;
575 if(word !=
"InfoText:"){
576 errorLog <<
"loadDatasetFromFile(const string &filename) - failed to find InfoText!" << endl;
// The info text may span several words: collect them, space separated, until the next
// key is reached. The read of the next word sits on an embedded line not shown here.
// The stream check guarantees termination on a truncated file; the check that follows
// then reports the missing key.
584 while( !file.fail() && word !=
"NumInputDimensions:" ){
585 infoText += word +
" ";
// word already holds the token that ended the loop above.
590 if(word !=
"NumInputDimensions:"){
591 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find NumInputDimensions!" << endl;
595 file >> numInputDimensions;
599 if(word !=
"NumTargetDimensions:"){
600 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find NumTargetDimensions!" << endl;
604 file >> numTargetDimensions;
608 if(word !=
"TotalNumTrainingExamples:"){
609 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find TotalNumTrainingExamples!" << endl;
613 file >> totalNumSamples;
617 if(word !=
"UseExternalRanges:"){
618 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find UseExternalRanges!" << endl;
622 file >> useExternalRanges;
// When enabled, read "min max" per dimension: inputs first, then targets.
625 if( useExternalRanges ){
626 externalInputRanges.resize(numInputDimensions);
627 externalTargetRanges.resize(numTargetDimensions);
628 for(UINT i=0; i<externalInputRanges.size(); i++){
629 file >> externalInputRanges[i].minValue;
630 file >> externalInputRanges[i].maxValue;
632 for(UINT i=0; i<externalTargetRanges.size(); i++){
633 file >> externalTargetRanges[i].minValue;
634 file >> externalTargetRanges[i].maxValue;
// Two spellings of the data marker are accepted.
640 if( word !=
"RegressionData:" && word !=
"LabelledRegressionData:" ){
641 errorLog <<
"loadDatasetFromFile(const string &filename) - Failed to find RegressionData!" << endl;
// Scratch vectors reused for every row.
646 VectorDouble inputVector(numInputDimensions);
647 VectorDouble targetVector(numTargetDimensions);
// One row per sample: input values first, then target values.
650 for(UINT i=0; i<totalNumSamples; i++){
652 for(UINT j=0; j<numInputDimensions; j++){
653 file >> inputVector[j];
655 for(UINT j=0; j<numTargetDimensions; j++){
656 file >> targetVector[j];
658 data[i].set(inputVector, targetVector);
// saveDatasetToCSVFile(const string &filename): writes one comma-separated row per
// sample, input values first and target values after, with no header row visible.
// The function header, the declaration of file, the per-row line terminator, the close
// call and the return statements are not shown in this listing.
668 file.open(filename.c_str(), std::ios::out );
670 if( !file.is_open() ){
671 errorLog <<
"saveDatasetToCSVFile(const string &filename) - Failed to open file!" << endl;
676 for(UINT i=0; i<totalNumSamples; i++){
// Every input value is followed by a comma.
677 for(UINT j=0; j<numInputDimensions; j++){
678 file << data[i].getInputVector()[j] <<
",";
// Target values are comma separated, with no comma after the last one.
680 for(UINT j=0; j<numTargetDimensions; j++){
681 file << data[i].getTargetVector()[j];
682 if( j != numTargetDimensions-1 ) file <<
",";
// loadDatasetFromCSVFile(filename, numInputDimensions, numTargetDimensions): parses a
// CSV file and adds each row as a sample. Each row must hold exactly
// numInputDimensions + numTargetDimensions columns.
// The function header, the declaration of parser, the bodies of the two per-column
// loops and the return statements are not shown in this listing.
// The dataset name is set to the placeholder value.
697 datasetName =
"NOT_SET";
// NOTE(review): the meaning of the second argument to parseCSVFile is not visible here.
706 if( !parser.parseCSVFile(filename,
true) ){
707 errorLog <<
"loadDatasetFromCSVFile(...) - Failed to parse CSV file!" << endl;
// Every row must have the same number of columns.
711 if( !parser.getConsistentColumnSize() ){
712 errorLog <<
"loadDatasetFromCSVFile(...) - The CSV file does not have a consistent number of columns!" << endl;
// The column count must equal inputs plus targets.
716 if( parser.getColumnSize() != numInputDimensions+numTargetDimensions ){
717 errorLog <<
"loadDatasetFromCSVFile(...) - The number of columns in the CSV file (" << parser.getColumnSize() <<
")";
718 errorLog <<
" does not match the number of input dimensions plus the number of target dimensions (" << numInputDimensions+numTargetDimensions <<
")" << endl;
// Scratch vectors reused for every row.
726 VectorDouble inputVector(numInputDimensions);
727 VectorDouble targetVector(numTargetDimensions);
728 for(UINT i=0; i<parser.getRowSize(); i++){
// Fill the input values, then the target values, for row i (loop bodies not shown).
734 for(UINT j=0; j<numInputDimensions; j++){
739 for(UINT j=0; j<numTargetDimensions; j++){
// A row that cannot be added is reported as a warning.
744 if( !
addSample(inputVector, targetVector) ){
745 warningLog <<
"loadDatasetFromCSVFile(string filename) - Could not add sample " << i <<
" to the dataset!" << endl;
static std::string toString(const int &i)
bool saveDatasetToCSVFile(const string &filename) const
bool loadDatasetFromCSVFile(const string &filename, const UINT numInputDimensions, const UINT numTargetDimensions)
bool enableExternalRangeScaling(const bool useExternalRanges)
bool setInfoText(const string &infoText)
bool loadDatasetFromFile(const string &filename)
bool setInputAndTargetDimensions(const UINT numInputDimensions, const UINT numTargetDimensions)
static double scale(const double &x, const double &minSource, const double &maxSource, const double &minTarget, const double &maxTarget, const bool constrain=false)
bool load(const string &filename)
vector< MinMax > getInputRanges() const
static double stringToDouble(const std::string &s)
bool reserve(const UINT N)
UINT getNumSamples() const
bool setExternalRanges(const vector< MinMax > &externalInputRanges, const vector< MinMax > &externalTargetRanges, const bool useExternalRanges)
bool saveDatasetToFile(const string &filename) const
RegressionData getTrainingFoldData(const UINT foldIndex) const
int getRandomNumberInt(int minRange, int maxRange)
vector< MinMax > getTargetRanges() const
RegressionData & operator=(const RegressionData &rhs)
The RegressionData is the main data structure for recording, labeling, managing, saving, and loading datasets that can be used to train and test the GRT supervised regression algorithms.
bool merge(const RegressionData &regressionData)
vector< RegressionSample > getData() const
bool spiltDataIntoKFolds(const UINT K)
bool save(const string &filename) const
RegressionData(const UINT numInputDimensions=0, const UINT numTargetDimensions=0, const string datasetName="NOT_SET", const string infoText="")
RegressionData partition(const UINT trainingSizePercentage)
UINT getNumTargetDimensions() const
bool addSample(const VectorDouble &inputVector, const VectorDouble &targetVector)
static bool stringEndsWith(const std::string &str, const std::string &ending)
bool setDatasetName(const string &datasetName)
UINT getNumInputDimensions() const
RegressionData getTestFoldData(const UINT foldIndex) const
bool scale(const double minTarget, const double maxTarget)