Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
params.h
00001 /**************************************************************************
00002  * Copyright (c) 2002-2011 T. M. Murali                                   *
00003  *                                                                        *
00004  * This file is part of Biorithm.                                         *
00005  *                                                                        *
00006  * Biorithm is free software: you can redistribute it and/or modify       *
00007  * it under the terms of the GNU General Public License as published by   *
00008  * the Free Software Foundation, either version 3 of the License, or      *
00009  * (at your option) any later version.                                    *
00010  *                                                                        *
00011  * Biorithm is distributed in the hope that it will be useful,            *
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00014  * GNU General Public License for more details.                           *
00015  *                                                                        *
00016  * You should have received a copy of the GNU General Public License      *
00017  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00018  *                                                                        *
00019  **************************************************************************/
00020 
00021 // Purpose: functions to deal with parameters that govern the clustering algorithm.
00022 
00023 #ifndef _PARAMS_H
00024 #define _PARAMS_H
00025 
00026 #include <math.h>
00027 #include <fstream>
00028 #include <map>
00029 #include <set>
00030 
00031 #include "format.h"
00032 #include "global.h"
00033 
00034 //#include "cluster_gp.h"
00035 
00036 // file created by gengetopt. need to include it for MyClusterParams::set(gengetopt_args_info& cl) 
00037 //#include "cmdline.h"
00038 
// Forward declaration so that cmdline.h (the file generated by
// gengetopt) does not have to be included here; when kmeans is
// compiled there is no corresponding cmdline.h for it.
class gengetopt_args_info;

// Kinds of input data that can be selected with -d, --datatype.
enum MyDataType
{
  aff,
  cog,
  mic
};

// Verbosity levels selectable with -v, --verbose.
enum MyVerbosityLevel
{
  verbosityZero,
  verbosityOne,
  verbosityTwo,
  verbosityThree,
  verbosityFour
};

// Maps an integer level in [0, MAX_VERBOSITY_LEVEL] to the enumerator
// with the same position.
const MyVerbosityLevel MY_VERBOSITY_LEVEL_TABLE[] =
{
  verbosityZero,
  verbosityOne,
  verbosityTwo,
  verbosityThree,
  verbosityFour
};

// Largest valid index into MY_VERBOSITY_LEVEL_TABLE. Derived from the
// last enumerator (value 4) so it cannot drift away from the enum.
const int MAX_VERBOSITY_LEVEL = verbosityFour;


const int MAX_LINE_LENGTH = 1024;

// Sentinel values: a parameter that still holds its DEFAULT_* value is
// treated as "not changed on the command line" and is derived later
// (see MyClusterParams::set(int, int, unsigned int)).
const int DEFAULT_NUM_SEEDS = -1;
const int DEFAULT_SIZE_DISC = -1;
const int DEFAULT_NUM_DISCS = -1;
const int DEFAULT_MAX_COUNT = -1;

// forward declarations.
class MyCluster;
class MyPointSet;
//struct MyAffyFileFormat;
00060 
00061 // various paramaters needed when computing a cluster. it is easier to
00062 // have a struct like this than to pass around a bunch of single
00063 // values.
00064 struct MyClusterParams
00065 {
00066   friend class MyCluster;
00067   
00068   
00069 public:
00070   MyClusterParams();
00071   
00072   // -a, --minimumSupport
00073   MyNT getMinimumSupport() const
00074     {
00075       return(minimumSupport);
00076     }
00077   // -A, --anneal
00078   bool getAnneal() const
00079     {
00080       return(anneal);
00081     }
00082   
00083   // --apriori
00084   bool getApriori() const
00085   {
00086     return(runAprioriAlgo);
00087   }
00088 
00089   // -b, --beta
00090   MyNT getBeta() const
00091     {
00092       return(beta);
00093     }
00094   // -c, --class
00095   string getClassFileName() const
00096     {
00097       return(classFile);
00098     }
00099   
00100   string getCommandLine() const
00101     {
00102       return(commandLine);
00103     }
00104   
00105   void setCommandLine(int argc, char **argv)
00106     {
00107       for (int i = 0; i < argc; i++)
00108         {
00109           commandLine += argv[i];
00110           commandLine += " ";
00111         }
00112     }
00113 
00114   // -C, --config
00115   string getConfigFileName() const
00116     {
00117       return(configFile);
00118     }
00119   void readConfigFile();
00120 
00121   // --correlation
00122   bool getComputeCorrelations() const
00123     {
00124       return(computeCorrelations);
00125     }
00126   MyNT getCorrelationThreshold() const
00127     {
00128       return(correlationThreshold);
00129     }
00130   
00131   
00132   // -d, --datatype
00133   MyDataType getDataType() const
00134     {
00135       return(dataType);
00136     }
00137   char getDelimiter() const
00138     {
00139       return(delimiter);
00140     }
00141 
00142   // have to define this function in params.C since MyAffyFileFormat
00143   // is incomplete here.
00144   MyAffyFileFormat getFileFormat(unsigned int which) const
00145     {
00146       return(fileFormats[which]);
00147     }
00148 
00149 
00150 
00151   
00152 
00153   // -f, --flip
00154   bool getFlip() const
00155     {
00156       return(flipPointsAndCoords);
00157     }
00158 
00159   // --gaussian
00160   bool getUseGaussian() const
00161     {
00162       return(useGaussian);
00163     }
00164   
00165 
00166   // -g, --greedy
00167   bool getGreedy() const
00168     {
00169       return(findClustersGreedily);
00170     }
00171   
00172   // -G, --generate
00173   bool getGenerateData() const
00174     {
00175       return(generateData);
00176     }
00177   
00178   // --genes
00179   string getGenesFileName() const
00180     {
00181       return(genesFile);
00182     }
00183 
00184   // --gene-id-column
00185   string getGeneIdColumn(unsigned int whichPointSet) const
00186     {
00187       return(geneIdColumns[whichPointSet]);
00188     }
00189 
00190   
00191   // --gene-name-column
00192   string getGeneNameColumn(unsigned int whichPointSet) const
00193     {
00194       return(geneNameColumns[whichPointSet]);
00195     }
00196 
00197   
00198   // -i, --input
00199   string getInputFileName(int index = 0) const
00200     {
00201       return(inputFiles[index]);
00202          //      return(inputFile);
00203     }
00204 
00205   // -I, --itemsets
00206   bool getComputeItemsets(unsigned int which = 0) const
00207   {
00208     //    return(computeItemsets[which]);
00209     return((0 != computeItemsets[which]));
00210   
00211   }
00212   
00213   int getNumLimitSets() const
00214     {
00215          return(limitFiles.size());
00216     }
00217   
00218   // -l, --limit
00219   string getLimitFileName(unsigned int which = 0) const
00220   {
00221     return(limitFiles[which]);
00222   }
00223 
00224   
00225   // --lb
00226   MyNT getLowerBoundForFiltering() const
00227     {
00228       return(lowerBound);
00229     }
00230   bool getLowerBoundGiven() const
00231     {
00232       return(lowerBoundGiven);
00233     }
00234   
00235   
00236   // --log
00237   bool getUseLogarithm() const
00238     {
00239       return(useLogarithm);
00240     }
00241   
00242 
00243   // -m, --memory
00244   bool getSaveMemory() const
00245     {
00246       return(saveMemory);
00247     }
00248 
00249   // --mc
00250   int getMaxCount() const
00251     {
00252       return(maxCount);
00253       
00254     }
00255   void setMaxCount(int m)
00256     {
00257       maxCount = m;
00258     }
00259 
00260   // --max-down-regulated
00261   MyNT getMaximumDownRegulated() const
00262   {
00263     return(maxDownRegulatedValue);
00264   }
00265   bool maximumDownRegulatedGiven() const
00266     {
00267       return(maxDownRegulatedGiven);
00268     }
00269   
00270   // --min-up-regulated
00271   MyNT getMinimumUpRegulated() const
00272   {
00273     return(minUpRegulatedValue);
00274   }
00275   bool minimumUpRegulatedGiven() const
00276     {
00277       return(minUpRegulatedGiven);
00278     }
00279   
00280   
00281   // --min
00282   bool getMinimiseClusterSize() const
00283     {
00284       return(minimiseClusterSize);
00285     }
00286   
00287   // --mq
00288   MyNT getMinimumHomogeneity() const
00289     {
00290       return(minimumClusterHomogeneity);
00291     }
00292   void setMinimumHomogeneity(MyNT q)
00293     {
00294       minimumClusterHomogeneity = q;
00295     }
00296   
00297   
00298   int getNumNames() const
00299     {
00300       return(numNames);
00301     }
00302 
00303   // -n, --number
00304   int getNumPointsToRead() const
00305     {
00306       return(numPointsToRead);
00307     }
00308   
00309   // --nb
00310   int getNumBestClusters() const
00311     {
00312       return(numBestClusters);
00313     }
00314 
00315   // --nc
00316   int getNumClustersToCompute() const
00317     {
00318       return(numClusters);
00319     }
00320 
00321   // --nd
00322   int getNumDiscs(unsigned int index  = 0) const
00323     {
00324       return(numDiscs[index]);
00325     }
00326   
00327   unsigned int getNumPointSets() const
00328     {
00329          unsigned int retval;
00330          // some weird bug make numPointSets zero.
00331          retval = numPointSets;
00332          retval = inputFiles.size();
00333          cout << "\tMyClusterParams::getNumPointSets(): Returning " 
00334                  << retval << endl;
00335          // some weird bug make this number zero.
00336          //      return(numPointSets);
00337          return(retval);
00338     }
00339   
00340   // --ns
00341   int getNumSeeds(unsigned int index = 0) const
00342     {
00343       return(numSeeds[index]);
00344     }
00345 
00346   // -o, --output
00347   string getOutputFileName() const
00348     {
00349       return(outputFile);
00350     }
00351 
00352   // --oi 
00353   string getOutputInternalValuesFileName() const
00354     {
00355       return(outputInternalValuesFile);
00356     }
00357   
00358   
00359   // -p, --partition
00360   // badly named function.
00361   bool getPartition() const
00362     {
00363       return(partitionPoints);
00364     }
00365 
00366   // --pc
00367   int getMaxPointCount() const
00368     {
00369       return(maxPointCount);
00370     }
00371   
00372   // --pv
00373   MyNT getMaxPValue() const
00374     {
00375       return(maxPValue);
00376     }
00377   
00378   // -r, --repeat
00379   string getRepeatFileName() const
00380     {
00381       return(repeatFile);
00382     }
00383 
00384   // --rf
00385   string getWidthRangeFileName() const
00386     {
00387       return(widthRangeFile);
00388     }
00389 
00390   // --row-name-column
00391   string getRowNameColumn() const
00392     {
00393       return(rowNameColumn);
00394     }
00395   
00396   
00397   // -s, --siginificant
00398   bool getUseSignificantIntervals(unsigned int which = 0) const
00399     {
00400       return(useSignificantIntervals[which]);
00401     }
00402   void setUseSignificantIntervals(unsigned int which = 0, bool val = true)
00403   {
00404       useSignificantIntervals[which] = val;
00405     }
00406   
00407 
00408   // -S, --statistics.
00409   bool getComputeStatistics() const
00410     {
00411       return(computeStatistics);
00412     }
00413   
00414 
00415 
00416   // not really related to --sd option but still putting these
00417   // functions according to the minimumSupportbetical order of command-line
00418   // options.
00419   int getSizeDisc(unsigned int index = 0) const
00420     {
00421       return(sizeDiscs[index]);
00422          //      return(sizeDisc);
00423     }
00424 
00425   void setSizeDisc(int sd, unsigned int index = 0)
00426     {
00427          sizeDiscs[index] = sd;
00428          //      sizeDisc = sd;
00429     }
00430   
00431   // --st
00432   MyNT getSimilarityThreshold() const
00433     {
00434       return(clusterSimilarityThreshold);
00435     }
00436   
00437   // --shamir
00438   bool getShamir() const
00439     {
00440       return(runShamirAlgo);
00441     }
00442   
00443   void set(gengetopt_args_info &cl, unsigned int whichPointSet = 0,
00444                  // if this arg is true, then i have already called set()
00445                  // before so i need not expect all the variables i expect
00446                  // the first time.
00447                  bool update = false);
00448 
00449 
00450   // --uniform
00451   bool getUseUniform() const
00452     {
00453       return(useUniform);
00454     }
00455   
00456   // --ub
00457   MyNT getUpperBoundForFiltering() const
00458     {
00459       return(upperBound);
00460     }
00461   bool getUpperBoundGiven() const
00462     {
00463       return(upperBoundGiven);
00464     }
00465 
00466   // -v, --verbose
00467   MyVerbosityLevel getVerbosity() const
00468     {
00469       return(verbose);
00470     }
00471 
00472   // -w, --width
00473   MyNT getWidth() const
00474     {
00475       return(width);
00476     }
00477 
00478   // --wf
00479   string getWidthFileName() const
00480     {
00481       return(widthFile);
00482     }
00483   
00484 
00485   void print(ostream& ostr) const;
00486 
00487   // set those params that depend on the number of points.
00488   void set(int numPoints, int numDimensions, unsigned int index = 0)
00489     {
00490       setNumSeeds(numPoints, numDimensions, index);
00491       // set sizeDisc before setting numDiscs.
00492       setSizeDisc(numPoints, numDimensions, index);
00493       setNumDiscs(numPoints, numDimensions, index);
00494       // set maxCount. this way, i don't have to remember to set the
00495       // max count in the command line when i want it to be the number
00496       // of points.  
00497       if (DEFAULT_MAX_COUNT == maxCount)
00498         maxCount = numPoints;
00499       
00500     }
00501 
00502 
00503 private:
00504   void setDataType(const string& type)
00505     {
00506       if (("a" == type) || ("aff" == type) || ("affy" == type) || ("affymetrix" == type))
00507         {
00508           dataType = aff;
00509           delimiter = '\t';
00510           numNames = 2;
00511         }
00512       else if (("c" == type) || ("cog" == type))
00513         {
00514           dataType = cog;
00515           delimiter = ' ';
00516         }
00517       else if (("m" == type) || ("mic" == type) || ("microarray" == type))
00518         {
00519           dataType = mic;
00520           delimiter = ' ';
00521           numNames = 1;
00522         }
00523       else
00524         cerr << "ERROR! Unknown data type " << type << ". Exiting.\n";
00525     }
00526   
00527   void setNumSeeds(unsigned int numPoints, 
00528                             unsigned int numDimensions, unsigned int index = 0)
00529     {
00530       // i can generate at most numPoints seeds.
00531       if (numSeeds[index] > numPoints)
00532         numSeeds[index] = numPoints;
00533     }
00534   
00535   void setSizeDisc(unsigned int numPoints, unsigned int numDimensions, 
00536                             unsigned int index = 0)
00537     {
00538       if (DEFAULT_SIZE_DISC == sizeDiscs[index])
00539         // sizeDisc has not been changed on the command line.
00540         //
00541         // $\log (d/(1 probDisc)) /\log (1/(2 \beta))$
00542         sizeDiscs[index] = static_cast<int>(10*log(numDimensions/(1 - probDisc))/log(1/(2*beta)));
00543       if (sizeDiscs[index] > numPoints)
00544         // the number of points in the discriminant should not be
00545         // greater than the number of points! but setting it equal to
00546         // #points seems wrong since i will never find a cluster. i am
00547         // hacking the problem away by setting sizeDisc to half the
00548         // number of points.
00549         sizeDiscs[index] = numPoints/2;
00550     }
00551   
00552   void setNumDiscs(unsigned int numPoints, unsigned int numDimensions, 
00553                             unsigned int index)
00554     {
00555       if (DEFAULT_NUM_DISCS == numDiscs[index])
00556         // numDiscs has not been changed on the command line.
00557         //
00558         // $\log (1 - probCluster) / \log (1 - \minimumSupport^{sizeDisc})$. the
00559         // formula is the same as that for numSeeds, except that \minimumSupport
00560         // is replaced by \minimumSupport^{sizeDisc}.
00561         numDiscs[index] = static_cast<int>(log(1 - probCluster)/log(1 - pow((double)minimumSupport, sizeDiscs[index]/10.0)));
00562     }
00563 
00564   void processRepeatFile(ifstream& rstr, int& localArgc, char **&localArgv);
00565   
00566 private:
00567 
00568   // data members
00569 
00570   // -a, --minimumSupport
00571   MyNT minimumSupport;
00572   // -A, --anneal
00573   bool anneal;
00574 
00575   // --apriori
00576   bool runAprioriAlgo;
00577 
00578   // -b, --beta
00579   MyNT beta;
00580 
00581   // -c, --class
00582   string classFile;
00583   
00584   string commandLine;
00585 
00586   // -C, --config
00587   string configFile;
00588   map< string, string > configInfo;
00589   
00590   // --correlations
00591   bool computeCorrelations;
00592   MyNT correlationThreshold;
00593   
00594 
00595   // -d, --datatype
00596   MyDataType dataType;
00597 
00598   // delimiter that separates fields.
00599   char delimiter;
00600 
00601   // file formats for each file to read.
00602   vector< MyAffyFileFormat > fileFormats;
00603   
00604   // -f, --flip
00605   bool flipPointsAndCoords;
00606 
00607   // --gaussian
00608   bool useGaussian;
00609   
00610   // -g, --greedy
00611   bool findClustersGreedily;
00612   
00613   // -G, --generate
00614   bool generateData;
00615   
00616   // --genes
00617   string genesFile;
00618 
00619   // --gene-id-column
00620   //  string geneIdColumn;
00621   vector< string > geneIdColumns;
00622 
00623   // --gene-name-column
00624   //  string geneNameColumn;
00625   vector< string > geneNameColumns;
00626   
00627   // -i, --input
00628   // file to read input from.
00629   string inputFile;
00630   vector< string > inputFiles;
00631 
00632   // -I, --itemset
00633   //  bool computeItemsets;
00634   //   vector< bool > computeItemsets;
00635    vector< unsigned int > computeItemsets;
00636 
00637   // --ignore-column-name
00638   // names of columns to ignore.
00639   //
00640   // set does not compile for some reason!!! "set is used as a type
00641   // but not defined as a type." what is going on? 
00642   // set< string, ltstr > ignoredColumns;
00643   map< string, bool > ignoredColumns;
00644 
00645   // --ignore-row-name
00646   // names of rows to ignore.
00647   //  set< string, ltstr > ignoredRows;
00648   map< string, bool > ignoredRows;
00649 
00650 
00651   // -l, --limit
00652   vector< string > limitFiles;
00653   
00654   // --lb
00655   MyNT lowerBound;
00656   bool lowerBoundGiven;
00657   
00658   // --log
00659   bool useLogarithm;
00660   
00661   // -m, --memory
00662   bool saveMemory;
00663 
00664   // --mc
00665   int maxCount;
00666 
00667   // --min
00668   bool minimiseClusterSize;
00669   
00670   // --max-down-regulated
00671   MyNT maxDownRegulatedValue;
00672   bool maxDownRegulatedGiven;
00673   
00674   // --min-up-regulated
00675   MyNT minUpRegulatedValue;
00676   bool minUpRegulatedGiven;
00677   
00678   // --mq
00679   MyNT minimumClusterHomogeneity;
00680   
00681   // number of names in the input file for each gene.
00682   int numNames;
00683 
00684   // -n, --number
00685   int numPointsToRead;
00686 
00687   // --nb. return the numBestClusters computed clusters. the idea is
00688   // not just to return one cluster per round but to store the
00689   // numBestClusters over all rounds.
00690   int numBestClusters;
00691   
00692   // --nc
00693   int numClusters;
00694   // --nd
00695   //  int numDiscs;
00696   vector< unsigned int > numDiscs;
00697 
00698   // this parameter is not set by any command-line parameter. i will
00699   // use the number of -i and --sd options to set it.
00700   unsigned int numPointSets;
00701 
00702   // --ns
00703   //  int numSeeds;
00704   vector< unsigned int > numSeeds;
00705 
00706   // -o, --output
00707   string outputFile;
00708 
00709   // --oi
00710   // if true, output internal values of the experiment, usually for a subsequent run.
00711   bool outputInternalValues;
00712   string outputInternalValuesFile;
00713   
00714   // -p, --partition
00715   bool partitionPoints;
00716 
00717   // --pc, the maximum number of clusters that a point can appear in.
00718   // usually used with the -p option so that i don't partition points
00719   // between clusters. when i don't use the -p option, i set this
00720   // variable to be 1. the default value is -1, which means a point
00721   // can appear in as many clusters as it wants.
00722   int maxPointCount;
00723   
00724   // -pc
00725   // probability that a random sample is in the cluster.
00726   MyNT probCluster;
00727   // --pd
00728   // probability that a random sample of size sizeDisc is a discriminant.
00729   MyNT probDisc;
00730   // --ps
00731   // probability that a random point is a good seed (in the cluster).
00732   MyNT probSeed;
00733   
00734   // --pv
00735   MyNT maxPValue;
00736   
00737   // -r, --repeat
00738   // if true, repeat a previous experiment from parameters stored in given file.
00739   bool repeatExperiment;
00740   string repeatFile;
00741 
00742   // --rf
00743   // the name of the file containing widths corresponding to significant intervals. 
00744   string widthRangeFile;
00745 
00746   // --row-name-column
00747   string rowNameColumn;
00748   
00749 
00750   // -s, --siginificant. if
00751   // useSignificantIntervals is 0, then i don't have to compare
00752   // cluster with siginificant intervals at all. if
00753   // useSignificantIntervals is 1, then i require that the cluster's
00754   // bounded intervals be completely inside significant intervals.
00755   //  bool useSignificantIntervals;
00756   vector< unsigned int > useSignificantIntervals;
00757   
00758   // -S, --statistics
00759   bool computeStatistics;
00760   
00761   
00762   // --sd
00763   //  int sizeDisc;
00764   vector< unsigned int > sizeDiscs;
00765 
00766   // --shamir.
00767   bool runShamirAlgo;
00768   
00769   // --st. threshold beyond which two clusters are similar. used with
00770   // the --nb option.
00771   MyNT clusterSimilarityThreshold;
00772 
00773   // --ub
00774   MyNT upperBound;
00775   bool upperBoundGiven;
00776   
00777   
00778   // --uniform
00779   bool useUniform;
00780   
00781   // -v, --verbose
00782   MyVerbosityLevel verbose;
00783   
00784   // -w, --width
00785   MyNT width;
00786   // --wf
00787   string widthFile;
00788   
00789 };
00790 
00791 
00792 #endif // _PARAMS_H 
 All Classes Functions Variables Typedefs Friends