Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
point.h
00001 /**************************************************************************
00002  * Copyright (c) 2002-2011 T. M. Murali                                   *
00003  * Copyright (c) 2010-2011 Christopher L. Poirel                          *
00004  *                                                                        *
00005  * This file is part of Biorithm.                                         *
00006  *                                                                        *
00007  * Biorithm is free software: you can redistribute it and/or modify       *
00008  * it under the terms of the GNU General Public License as published by   *
00009  * the Free Software Foundation, either version 3 of the License, or      *
00010  * (at your option) any later version.                                    *
00011  *                                                                        *
00012  * Biorithm is distributed in the hope that it will be useful,            *
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00015  * GNU General Public License for more details.                           *
00016  *                                                                        *
00017  * You should have received a copy of the GNU General Public License      *
00018  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00019  *                                                                        *
00020  **************************************************************************/
00021 
00022 // Purpose: defines point class.
00023 
00024 #ifndef POINT_H
00025 #define POINT_H
00026 
00027 #include <map>
00028 #include <set>
00029 
00030 #include "constants.h"
00031 // Test for GCC >= 4.3. from http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html. See definition of GCC_VERSION macro in libutil/constants.h
00032 #if (GCC_VERSION >= 40300)
00033 // see http://gcc.gnu.org/gcc-4.3/changes.html
00034 #include <tr1/unordered_set>
00035 using namespace std::tr1;
00036 #else
00037 #include <ext/hash_set>
00038 using namespace __gnu_cxx;
00039 #endif
00040 
00041 #include "myiterator.h"
00042 
00043 #include "format.h"
00044 #include "global.h"
00045 #include "histogram.h"
00046 #include "info.h"
00047 #include "interval.h"
00048 #include "params.h"
00049 #include "random.h"
00050 
00051 // forward declarations to avoid including params.h
00052 // class MyClusterParams;
00053 // enum MyVerbosityLevel;
00054 
00055 
00056 // do not flip rows and columns in data. lots of code has to change as
00057 // a result so i am guarding the new code with #define.
00058 //#define DO_NOT_FLIP_DATA 1
00059 
00060 // a base class that stores an n-dimensional point.
00061 template <class NT>
00062 class Point_d 
00063 {
00064 public:
00065   Point_d(int d=0)
00066     : dimension(0), missingDims(), coords()
00067     {}
00068   Point_d (int d, const vector< NT> &c, 
00069                  const set< unsigned int > *missing = NULL)
00070     : dimension(d), missingDims(), coords(c)
00071     {
00072          if (missing)
00073            missingDims = *missing;
00074     }
00075   Point_d(const Point_d& other)
00076       : dimension(other.dimension), missingDims(other.missingDims), coords(other.coords)
00077     {}
00078   
00079   
00080   virtual ~Point_d()
00081     {}
00082   
00083   unsigned int getNumDimensions() const
00084     {
00085       return(dimension);
00086     }
00087   
00088   NT operator[](int i) const
00089     {
00090       // TODO: BUG. check if i is valid.
00091       return(coords[i]);
00092     }
00093 
00094   // is the coord for dim i missing in this point?
00095   bool isDimMissing(unsigned int i ) const
00096   {
00097     // HACKHACKHACK. i get a segfault here, probably only when
00098     // missingDims has no entries. i am adding a check for the size,
00099     // which is quite an ugly HACKHACKHACK.
00100     return((0 != missingDims.size()) && (missingDims.end() != missingDims.find(i)));
00101 //    return(missingDims.end() != missingDims.find(i));
00102   }
00103 
00106   unsigned int getNumMissingDimensions() const
00107     {
00108       return(missingDims.size());
00109     }
00110   
00115   void standardise()
00116     {
00117       MyNT mean = computeMean(coords, &missingDims);
00118       MyNT variance = computeVariance(coords, mean, &missingDims);
00119       standardise(mean, sqrt(variance));
00120     }
00121   
00122 
00127   void standardise(MyNT m, MyNT s)
00128     {
00129       for (unsigned int i = 0; i < coords.size(); i++)
00130         if (!isDimMissing(i))
00131           coords[i] = (coords[i] - m)/s;
00132     }
00133   
00134   
00135 
00136   void print(ostream& ostr) const
00137     {
00138       for (typename vector< NT>::const_iterator itr = coords.begin(); itr != coords.end(); itr++)
00139         ostr << "\t" << *itr;
00140       ostr << endl;
00141     }
00142 
00143   friend ostream& operator<<(ostream& ostr,  const Point_d &pt)
00144     {
00145       pt.print(ostr);
00146       return(ostr);
00147     }
00148   
00149   friend istream& operator>>(istream& istr, Point_d &pt)
00150     {
00151       cerr << "ERROR! istream& operator>>(istream& istr, Point_d &pt) not implemented. Exiting.\n";
00152       exit(-1);
00153       return(istr);
00154     }
00155   
00161   virtual void randomise()
00162     {
00163       random_shuffle(coords.begin(), coords.end());
00164     }
00165   
00166 //protected:
00167     // TODO: DESIGN. should i allow user to modify any coords?
00168   NT& operator[](int i)
00169     {
00170       return(coords[i]);
00171     }
00172   
00173 protected:
00174 
00175   void addMissingDim(unsigned int i)
00176   {
00177     missingDims.insert(i);
00178   }
00179 
00180   void deleteMissingDim(unsigned int i)
00181   {
00182     missingDims.erase(i);
00183   }
00184 
00185 
00186 protected:
00187   unsigned int dimension;
00188   // list of coords with missing values.
00189   set< unsigned int > missingDims;
00190   vector< NT > coords;
00191 };
00192 
00193 typedef Point_d< MyNT > _MyPoint;
00194 
00195 class MyPointInfo : public MyInfo
00196 {
00197 public:
00198   MyPointInfo(const MyString &id = "", const MyString &sid = "", const string &cname = "") 
00199       : MyInfo(id, sid), className(cname)
00200     {}
00201   MyPointInfo(const MyPointInfo& info)
00202       : MyInfo(info), className(info.className)
00203     {}
00204   MyPointInfo(const MyInfo& info)
00205       : MyInfo(info), className()
00206     {}
00207   virtual ~MyPointInfo()
00208     {}
00209 
00210   // return a clone of this. this function is a way of providing a
00211   // virtual constructor for the MyInfo class hierarchy. 
00212   virtual MyPointInfo* clone() const // = 0;
00213     {
00214       MyPointInfo* clone = new MyPointInfo(*this);
00215       clone->setClassName(className);
00216       return(clone);
00217     }
00218 
00219   virtual void setClassName(string name)
00220     {
00221       className = name;
00222     }
00223   virtual string getClassName() const
00224     {
00225       return(className);
00226     }
00227   
00228   virtual void print(ostream& ostr) const
00229     {
00230       MyInfo::print(ostr);
00231       ostr << "Class name = " << className << endl;
00232     }
00233   
00234 private:
00235   // the name of the class the point belongs to.
00236   string className;
00237 };
00238 
00239 // define MyDimInfo to be exactly the same as MyPointInfo. this class
00240 // is useful when i don't flip a point set.
00241 typedef MyPointInfo MyDimInfo;
00242 // MyDimInfo is a plain old string.
00243 // typedef string MyDimInfo;
00244 
00245 // the real point class with bells and whistles (e.g., info).
00246 class MyPoint : public _MyPoint
00247 {
00248   friend class MyPointSet;
00249   
00250 public:
00251   MyPoint()
00252       : _MyPoint(), info(NULL), chosen(false)
00253     {}
00254   // need constructors that use both MyInfo (for readCOGData in read.C) and MyPointInfo
00255   MyPoint(int dim, const vector< MyNT >& coords, MyInfo *inf)
00256       : _MyPoint(dim, coords), info(NULL), chosen(false), hidden(false)
00257     {
00258       if (inf)
00259         setInfo(inf);
00260     }
00261 
00262   MyPoint(int dim, const vector< MyNT >& coords, 
00263                 //MyPointInfo *inf, p
00264                 MyInfo *inf, 
00265                 set< unsigned int > *missing = NULL)
00266     : _MyPoint(dim, coords, missing), info(NULL), chosen(false), hidden(false)
00267     {
00268       if (inf)
00269         setInfo(inf);
00270     }
00271 
00272   MyPoint(const _MyPoint &pt, MyPointInfo *inf)
00273       : _MyPoint(pt), info(NULL), chosen(false), hidden(false)
00274     {
00275       if (inf)
00276         setInfo(inf);
00277     }
00278   MyPoint(const MyPoint& copy)
00279       : _MyPoint(copy), info(NULL), chosen(copy.chosen), hidden(copy.hidden)
00280     {
00281       if (copy.info)
00282         setInfo(copy.info);
00283     }
00284   const MyPoint& operator=(const MyPoint& rhs)
00285     {
00286       if (this != &rhs)
00287         {
00288           this->_MyPoint::operator=(rhs);
00289           if (rhs.info)
00290             setInfo(rhs.info);
00291           chosen = rhs.chosen;
00292           hidden = rhs.hidden;
00293         }
00294       return(*this);
00295     }
00296   
00297   virtual ~MyPoint()
00298     {
00299       delete info;
00300     }
00301   
00302   const MyInfo* getInfo() const
00303     {
00304       return(info);
00305     }
00306 
00307   MyString getName() const
00308     {
00309       if (info)
00310         {
00311           // return short name if it exists.
00312           if ("" != info->getShortName())
00313             return(info->getShortName());
00314           else
00315             return(info->getName());
00316         }
00317       else
00318         return(string("NONAME"));
00319     }
00320 
00321   // TODO: DESIGN. i have to set info's members when i read
00322   // MyAffyGene. is there a way to still keep info a const.
00323   //
00324   void setInfo(MyPointInfo* inf)
00325     {
00326 #if 0      
00327       info = inf;
00328 #endif
00329       delete info;
00330       
00331 //       if (!info)
00332         info = inf->clone();
00333       // TODO: LEAK. is there a leak here? what happens to the old
00334       // values of name and shortName?
00335 //       info->setName(inf->getName());
00336 //       info->setShortName(inf->getShortName());
00337     }
00338   void setInfo(MyInfo* inf)
00339     {
00340       delete info;
00341       // TODO: DESIGN. why do i need to clone when a new works?
00342       info = new MyPointInfo(*inf);
00343     }
00344 
00345 #if 0      
00346   void setInfo(const MyString &name, const MyString &shortName,
00347                const string className = "")
00348     {
00349       if (!info)
00350         info = new MyPointInfo;
00351       // TODO: LEAK. is there a leak here? what happens to the old
00352       // values of name and shortName?
00353       info->setName(name);
00354       info->setShortName(shortName);
00355       info->setClassName(className);
00356     }
00357 #endif
00358 
00359   string getClassName() const
00360     {
00361       if (info)
00362         return(info->getClassName());
00363       else
00364         return("NONAME");
00365 //        cerr << "Point " << getName() << " has no associated info to store class name with.\n";
00366     }
00367   
00368   void setClassName(string name)
00369     {
00370       if (info)
00371         info->setClassName(name);
00372       else
00373         cerr << "Point " << getName() << " has no associated info to store class name with.\n";
00374     }
00375   bool getChosen() const
00376     {
00377       return(chosen);
00378     }
00379 
00380   bool getHidden() const
00381     {
00382       return(hidden);
00383     }
00384 
00389   MyNT getMaxCoord() const
00390     {
00391       if (getNumMissingDimensions() == getNumDimensions())
00392         return(0);
00393       unsigned int i = 0;
00394       MyNT max;
00395       while (isDimMissing(i))
00396         i++;
00397       // coords[i] is not missing.
00398       max = coords[i];
00399       while (i < getNumDimensions())
00400         {
00401           if (!isDimMissing(i) && (coords[i] > max))
00402             max = coords[i];
00403           i++;
00404         }
00405       return(max);
00406     }
00407   
00412   MyNT getMinCoord() const
00413     {
00414       if (getNumMissingDimensions() == getNumDimensions())
00415         return(0);
00416       unsigned int i = 0;
00417       MyNT min;
00418       while (isDimMissing(i))
00419         i++;
00420       // coords[i] is not missing.
00421       min = coords[i];
00422       while (i < getNumDimensions())
00423         {
00424           if (!isDimMissing(i) && (coords[i] < min))
00425             min = coords[i];
00426           i++;
00427         }
00428       return(min);
00429     }
00430 
00435   MyNT computeNorm(unsigned int p) const;
00436   
00444   MyNT computeSparsityHoyer() const;
00445   
00446 
00447   
00459   int computeDiscreteCorrelation(const MyPoint &other,
00460                                  bool computeSign = false) const;
00461 
00491   MyNT computePearsonsCorrelation(const MyPoint &other,
00492                                   const MyNT *mean1 = NULL, const MyNT *stdDev1 = NULL,
00493                                   const MyNT *mean2 = NULL, const MyNT *stdDev2 = NULL)
00494     const;
00495 
00496 
00502   void computeRanks(MyPoint &rankedPoint) const;
00503   
00504 private:
00505 
00506   // only friends and derived classes can set chosen and hidden bits.
00507   //
00508   // called by MyPointSet::choosePoints().
00509   void setChosen(bool c)
00510     {
00511       chosen = c;
00512     }
00513   
00514   // called by MyPointSet::hidePoints().
00515   void setHidden(bool h)
00516     {
00517       hidden = h;
00518     }
00519 
00520 
00521   // distance related functions.
00522   MyNT distance(const MyPoint &other)
00523     {
00524       MyNT distance = 0;
00525       for (unsigned int i = 0; i < getNumDimensions(); i++)
00526         // no need to ignore missing dimensions since they contribute 0.
00527         distance += coords[i]*other.coords[i];
00528       return(sqrt(distance));
00529     }
00530 
00531   MyNT mean() const
00532     {
00533       MyNT _mean = 0;
00534       for (unsigned int i = 0; i < getNumDimensions(); i++)
00535         {
00536           // ignore missing dimensions.
00537           if (isDimMissing(i))
00538             continue;
00539           _mean += coords[i];
00540         }
00541       _mean /= (getNumDimensions() - getNumMissingDimensions());
00542       return(_mean);
00543     }
00544   
00545   MyNT variance(MyNT mean) const
00546     {
00547       MyNT _variance = 0;
00548       for (unsigned int i = 0; i < getNumDimensions(); i++)
00549         {
00550           // ignore missing dimensions.
00551           if (isDimMissing(i))
00552             continue;
00553           _variance += (coords[i] - mean)*(coords[i] - mean);
00554         }
00555       _variance /= (getNumDimensions() - getNumMissingDimensions());
00556       return(_variance);
00557     }
00558   
00559   MyNT standardDeviation(MyNT mean) const
00560     {
00561       return(sqrt(variance(mean)));
00562     }
00563   
00564   
00565 private:
00566   MyPointInfo* info;
00567   // to keep track of whether a point belongs to any cluster or not. i
00568   // need two variables for the following reasons: i use chosen just
00569   // as a statistic (to keep a count of how many points have been
00570   // picked). i use hidden to "hide" points and thus to control the
00571   // behaviour of the gropram based on the value of
00572   // params.getPartition(). if i use only hidden or chosen, then i have
00573   // to pass params.getPartition() to the MyCluster constructor, which
00574   // seems totally wrong. by using both chosen and hidden, i can set
00575   // hidden inside MyCluster::hideContainedPoints() and chosen inside
00576   // MyCluster::findContainedPoints().
00577   bool chosen;
00578   bool hidden;
00579 };
00580 
00581 
00582 typedef pair< int, MyNT > MyEntropy;
00583 
00584 struct compareEntropies : public binary_function< MyEntropy, MyEntropy, bool> 
00585 {
00586   bool operator()(const MyEntropy& __x, const MyEntropy& __y) const
00587     {
00588       return __x.second < __y.second; }
00589 };
00590 
00591 // inline ostream& operator<<(ostream& ostr, const MyEntropy& ent)
00592 // {
00593 //   ostr << "(" << ent.first << ", " << ent.second << ")";
00594 //   return(ostr);
00595 // }
00596 
00597 
00598 
00599 
00600 typedef map< string, string, ltstr >  MyClassesType;
00601 typedef set< string, ltstr > MyClassNamesType;
00602 
00603 const unsigned int NUM_CORRELATION_HISTOGRAM_BINS = 100;
00604 const MyNT MIN_CORRELATION = -1;
00605 const MyNT MAX_CORRELATION = 1;
00606 const MyNT CORRELATION_RANGE = MAX_CORRELATION - MIN_CORRELATION;
00607 
00608 
00614 class MyPointSet
00615 {
00626   friend class MyCluster;
00627   
00628   typedef map< MyNT, unsigned int > MyDimCountsType;
00629   typedef MyDimCountsType::const_iterator MyDimCountsConstPtr;
00630   typedef MyDimCountsType::iterator MyDimCountsPtr;
00631   
00632 public:
00633 
00634   MyPointSet()
00635       : numPoints(0), numDimensions(0), numNotChosen(0), numNotHidden(0),
00636         points(),
00637         hiddenPoints(), hiddenPointsByClass(), 
00638            flipped(false),
00639         namesToIndices(), classNamesExist(false), 
00640            classNames(),
00641            classesToIndices(), classCounts(),
00642         dimInfo(), dimLows(0), dimHighs(0), dimMeans(0), dimSigmas(0), dimCounts(0),
00643         dimEntropies(), sortedDims(), _pointMeans(), _pointStandardDeviations()
00644     {}
00645   MyPointSet(const MyPointSet& copy)
00646       : numPoints(copy.numPoints), numDimensions(copy.numDimensions),
00647         numNotChosen(copy.numNotChosen),
00648         numNotHidden(copy.numNotHidden),
00649         points(copy.points),
00650         hiddenPoints(copy.hiddenPoints), 
00651         hiddenPointsByClass(copy.hiddenPointsByClass), 
00652            flipped(copy.flipped),
00653         namesToIndices(copy.namesToIndices),
00654         classNamesExist(copy.classNamesExist),
00655            classNames(copy.classNames),
00656         classesToIndices(copy.classesToIndices),
00657         classCounts(copy.classCounts),
00658         dimInfo(copy.dimInfo), dimLows(copy.dimLows), dimHighs(copy.dimHighs),
00659         dimMeans(copy.dimMeans), dimSigmas(copy.dimSigmas),
00660         dimCounts(copy.dimCounts),
00661         dimEntropies(copy.dimEntropies), sortedDims(copy.sortedDims),
00662         _pointMeans(copy._pointMeans),
00663         _pointStandardDeviations(copy._pointStandardDeviations)
00664     {}
00665   const MyPointSet& operator=(const MyPointSet& rhs)
00666     {
00667       if (this != &rhs)
00668         {
00669           numPoints = rhs.numPoints;
00670           numDimensions = rhs.numDimensions;
00671           numNotChosen = rhs.numNotChosen;
00672           numNotHidden = rhs.numNotHidden;
00673           points.clear();
00674           points = rhs.points;
00675           hiddenPoints.clear();
00676           hiddenPoints = rhs.hiddenPoints;
00677                 hiddenPointsByClass.clear();
00678           hiddenPointsByClass = rhs.hiddenPointsByClass;
00679           flipped = rhs.flipped,
00680           namesToIndices.clear();
00681           namesToIndices = rhs.namesToIndices;
00682           classNamesExist = rhs.classNamesExist;
00683                 classNames.clear();
00684                 classNames = rhs.classNames;
00685                 classesToIndices.clear();
00686           classesToIndices = rhs.classesToIndices;
00687                 classCounts.clear();
00688           classCounts = rhs.classCounts;
00689 
00690           dimInfo.clear();
00691           dimInfo = rhs.dimInfo;
00692           dimLows.clear();
00693           dimLows = rhs.dimLows;
00694           dimHighs.clear();
00695           dimHighs = rhs.dimHighs;
00696           dimMeans.clear();
00697           dimMeans = rhs.dimMeans;
00698           dimSigmas.clear();
00699           dimSigmas = rhs.dimSigmas;
00700           
00701           dimCounts.clear();
00702           dimCounts = rhs.dimCounts;
00703           
00704           dimEntropies.clear();
00705           dimEntropies = rhs.dimEntropies;
00706           sortedDims.clear();
00707           sortedDims = rhs.sortedDims;
00708           _pointMeans = rhs._pointMeans;
00709           _pointStandardDeviations = rhs._pointStandardDeviations;
00710   }
00711       return(*this);
00712     }
00713   
00714   virtual ~MyPointSet()
00715     {}
00716 
00717 
00731   void compareTTest(string name, const MyPointSet &other, MyNT &tstat,
00732                     MyNT &pvalue) const;
00733   
00738   MyNT computeAverageExpressionValue(const set< string > &subset);
00739 
00741   void computeCoordinateDistribution(MyHistogram &histogram) const;
00742 
00744   MyNT computeMean(string point) const
00745     {
00746       MyPoint mean;
00747       computeMean(point, mean);
00748       return(::computeMean(mean.coords, &mean.missingDims));
00749     }
00750 
00752   bool computeMean(string point, MyPoint &mean) const;
00753 
00768   bool computeMeanAndVariance(string point, MyNT &mu, MyNT &var,
00769                               unsigned int &numCoords) const;
00770   
00771 
00772   
00775   void computeMeanAndStandardDeviation(MyNT &mean, MyNT &stddev);
00776   
00777   void computeSignificantValues(MyNT pvalueThreshold,
00778                                 MyHistogram &histogram, MyNT &value);
00779   
00781   void convertToRanks();
00782   
00783 
00785   void print(string outfile);
00786   
00788   void print(ostream &ostr);
00789   
00790   // when params can have multiple input files, i need some variable
00791   // to say which one to read!
00792   virtual void read(MyClusterParams& params, unsigned int whichPointSet = 0);
00793   // read just with an infile. call readAffyData() directly.
00794   virtual void read(string infile, const MyAffyFileFormat &format = MyAffyFileFormat());
00795 
00796   virtual void insert(const MyPoint& point)
00797     {
00798       points.push_back(point);
00799       numPoints++;
00800     }
00801 
00802   virtual unsigned int size() const
00803     {
00804       return(points.size());
00805     }
00806 
00807   virtual unsigned int getNumPoints() const
00808     {
00809       return(numPoints);
00810     }
00811 
00812   virtual unsigned int getNumDimensions() const
00813     {
00814       return(numDimensions);
00815     }
00816 
00817   virtual unsigned int getNumSamples() const
00818     {
00819       if (_isFlipped())
00820         return(numPoints);
00821       return(numDimensions);
00822     }
00823 
00824   virtual unsigned int getNumGenes() const
00825     {
00826       if (_isFlipped())
00827         return(numDimensions);
00828       return(numPoints);
00829     }
00835   virtual bool isPoint(string name) const
00836     {
00837       return(namesToIndices.end() != namesToIndices.find(name));
00838     }
00839   
00840   virtual vector< unsigned int > getPointIndex(string name) const
00841     {
00842       map< string, vector< unsigned int >, ltstr >::const_iterator itr =
00843         namesToIndices.find(name);
00844       if (namesToIndices.end() == itr)
00845         {
00846 #ifdef DEBUG
00847 //          cerr << "Cannot find point named \"" << name << "\"" << endl;
00848 #endif          
00849 //          return(getNumPoints());
00850           // return a vector of size 1.
00851           return(vector<unsigned int>(1, getNumPoints()));
00852         }
00853       return(itr->second);
00854     }
00855   
00856   
00857   virtual const MyPoint& getSample(unsigned int i ) const;
00858   
00859 
00860   // is there class info stored with the points.
00861   bool doClassNamesExist() const
00862     {
00863       return(classNamesExist);
00864     }
00865 
00866   // get the class name of sample i. it is better to have a separate
00867   // function since the result depends on whether the point set is
00868   // flipped or not. if i didnt have this function and the point set
00869   // was flipped, i could call points[i].getClassName(). but i would
00870   // have to know in the calling function whether the point set was
00871   // flipped or not, which is messy.
00872   string getClassName(int i) const;
00873   // get the sample name of sample i.
00874   string getSampleName(int i) const;
00875   // get the gene name of gene i.
00876   string getGeneName(int i) const;
00877   
00878   // return the number of points in a particular class.
00879   unsigned int numInClass(string className) const
00880     {
00881       map< string, int, ltstr >::const_iterator itr;
00882       itr = classCounts.find(className);
00883       return((*itr).second);
00884 //      return(classCounts[className]);
00885     }
00886   
00887   // set the class names.
00888   void setClasses(string className);
00889   void setClasses(const MyClassesType& classes);
00890   // read classes from a file. return the number of classes read.
00891   unsigned int readClasses(string fileName);
00892 
00893   void getClasses(MyClassNamesType &names)
00894     {
00895       names = classNames;
00896     }
00897   
00898   
00899   // return the info stored about all dimensions.
00900   virtual const vector< MyDimInfo >& getDimensionInfo() const
00901     {
00902       return(dimInfo);
00903     }
00904 
00905   // return info stored about dimension i.
00906   virtual MyDimInfo getDimensionInfo(int i) const
00907     {
00908       return(dimInfo[i]);
00909     }
00910 
00912   virtual void getDimensionNames(set< string > &names) const
00913     {
00914       for (unsigned int i = 0; i < getNumDimensions(); i++)
00915         names.insert(dimInfo[i].getName());
00916     }
00917 
00918   virtual void getDimCoords(int dim, vector< MyNT >& coords) const
00919     {
00920       coords = dimensions[dim];
00921 //       for (unsigned int i = 0; i < numPoints; i++)
00922 //         coords.push_back(points[i][dim]);
00923     }
00924   
00925   virtual MyNT getDimensionLow(int i) const
00926     {
00927       return(dimLows[i]);
00928     }
00929   virtual MyNT getDimensionHigh(int i) const
00930     {
00931       return(dimHighs[i]);
00932     }
00933 
00934   virtual MyNT getDimensionMean(int i) const
00935     {
00936       return(dimMeans[i]);
00937     }
00938 
00939   virtual MyNT getDimensionSigma(int i) const
00940     {
00941       return(dimSigmas[i]);
00942     }
00943 
00944   // get a vector of the distinct values for each dimension
00945   virtual void getDimensionValues(unsigned int i, vector< MyNT > &values)
00946     const;
00947   // return the number of coords in dimension i with coordinate = value.
00948   virtual unsigned int getDimensionCount(unsigned int i, MyNT value) const
00949     {
00950       return(_getDimensionCount(dimCounts[i].find(value)));
00951     }
00952   
00953   
00954   
00955   virtual const MyPoint& operator[](int i) const
00956     {
00957       // TODO: BUG. not checking if i is in the right range.
00958       return(points[i]);
00959     }
00960   
00961   virtual MyPoint& operator[](int i)
00962     {
00963       // TODO: BUG. not checking if i is in the right range.
00964       return(points[i]);
00965     }
00966 
00967   // find the min and max of the points indexed by discriminant along
00968   // dimension dim.
00969   virtual void getInterval(const vector< int > &discIndices, 
00970                                           unsigned int dim, MyNT &low, MyNT &high) const;
00971 
00972 
00973   // suggest a class (to use in MyPointSet::generate()). the current
00974   // algo simply iterates over all classes.
00975   virtual string suggestClass() const
00976   {
00977     static set< string, ltstr>::const_iterator classNameItr = 
00978          classNames.begin();
00979     ++classNameItr;
00980     // reset, if necessary.
00981     if (classNames.end() == classNameItr)
00982          classNameItr = classNames.begin();
00983     return(*classNameItr);
00984   }
00985   // generate a random index.
00986   virtual bool generate(int& index, bool forbidChosen = true) const;
00987   // generate a random set of indices.
00988   virtual bool generate(int num, vector< int >& indices, 
00989                                     // if true, don't generate indices for hidden points.
00990                                     bool forbidHidden = true,
00991                                     // only generate indices of points in this class.
00992                                     string classToUse = "") const;
00993   
00994   virtual int getNumNotChosen() const
00995     {
00996       return(numNotChosen);
00997     }
00998   
00999   virtual int getNumNotHidden() const
01000     {
01001       return(numNotHidden);
01002     }
01003   
01005   virtual void getPointNames(set< string > &names) const
01006     {
01007       for (unsigned int i = 0; i < getNumPoints(); i++)
01008         names.insert(points[i].getName());
01009     }
01010 
01011   // flipping coordinates and points.
01012   virtual void flip();
01013 
01021   void splitByClass(vector< MyPointSet > &splitPointSets,
01022                     map< string, unsigned int > &classToIndex);
01023   
01024   
01027   void randomise()
01028     {
01029       for (unsigned int i = 0; i < points.size(); i++)
01030         points[i].randomise();
01031     }
01032   
01033       
01034 
01035   
01036   // functions to set chosen and hidden bits. see comments before
01037   // definition of MyPoint::chosen to see the difference between
01038   // hiding and choosing a point.
01039   //
01040   // called by MyCluster::chooseContainedPoints().
01041   virtual void choosePoints(const vector< int >& containedPoints);
01042   
01043   // called by MyCluster::hideContainedPoints().
01044   virtual void hidePoints(const vector< int >& containedPoints);
01045   // called by MyCluster::hide().
01046   virtual void hide(const vector< int >& containedPoints,
01047                                 const vector< MyDimensionInterval > &intervals);
01048 
01049   virtual MyNT computeEntropy(int dim, MyVerbosityLevel verbosity) const;
01050   virtual void computeEntropies(MyVerbosityLevel verbosity);
01051   // do some simple filtering on points.
01052   virtual unsigned int filter(const MyClusterParams &params);
01053   // a version for code that does not want to use MyClusterParams.
01054   unsigned int filter(MyNT maxDownRegulated, MyNT minUpRegulated);
01055 
01056 
01087   virtual bool computeCorrelations(string point1, string point2,
01088                                    vector< MyNT > &correlations,
01089                                    string method = "Pearsons") const;
01090   
01117   virtual void computeCorrelations(MyNT threshold, string id,
01118                                    bool absoluteValues = true,
01119                                    map< string, map< string, MyNT > >*pairs = NULL);
01157   virtual void printCorrelations(MyNT threshold, string outputFile,
01158                                  bool absoluteValues = true,
01159                                  map< string, map< string, MyNT > >*pairs = NULL);
01160 
01170   MyHistogram printCorrelationsHistogram(string outputFile) const;
01171 
01183   MyHistogram computeCorrelationsHistogram(map< string, map< string, MyNT > >*pairs = NULL)
01184     const;
01185 
01192   virtual MyHistogram
01193   computeRandomisedCorrelationsHistogram(unsigned int numRandomisations, string outputFile);
01194 
01195 
01196   /* Standardise each gene in the data set.
01197 
01198   Linearly transform each genes's coordinates to have mean 0 and standard deviation 1.
01199 
01200   */
01201   void standardise()
01202     {
01203       for (unsigned int i = 0; i < points.size(); i++)
01204         points[i].standardise();      
01205     }
01206   
01207   /* Standardise each gene in the data set to have mean m and standard deviation s.
01208 
01209   Linearly transform each genes's coordinates to have mean m and standard deviation s.
01210 
01211   */
01212   void standardise(MyNT m, MyNT s)
01213     {
01214       for (unsigned int i = 0; i < points.size(); i++)
01215         points[i].standardise(m, s);      
01216     }
01217 
01218   /* Store translations of the row names into a new name space.
01219 
01220      \param[in] rowNameAliases a two dimensional map, each of whose
01221      keys is a row name, and each of whose values is a set of aliases
01222      for this row name in the new name space.
01223 
01224      \note The method internally stores these translations, so that a
01225      subsequent call to MyPointSet::isPoint(name),
01226      MyPointSet::getPointIndex(name), or MyPointSet::computeMean(name)
01227      will work correctly when name belongs to the new name space.
01228 
01229      \warning Ensure that the keys in rowNameAliases are from the same
01230      namespace as the row names stored in the invocant.
01231    */
01232   void storeRowNameTranslations(const map< string, set< string > > & rowNameAliases);
01233   
01234 
01235   
01236 
01237 
01238   bool _isFlipped() const
01239     {
01240       return(flipped);
01241     }
01242   
01243 private:
01244 
01245   // count the number of points that have not yet been chosen in a
01246   // cluster. see comments before definition of MyPoint::chosen to see
01247   // the difference between hiding and choosing a point.
01248   virtual void _countNumNotChosen();
01249 
01250   // count the number of points that have not yet been hidden in a
01251   // cluster. see comments before definition of MyPoint::chosen to see
01252   // the difference between hiding and choosing a point.
01253   virtual void _countNumNotHidden();
01254 
01255   // functions to compute and store stats on dimensions.
01256   void _fillIn();
01257 
01258   // create a map going to a point's name to its index.
01259   void _createNameToIndicesMap();
01260 
01261   // functions to return a value or its count for a dimension.
01262   MyNT _getCoord(MyDimCountsConstPtr itr) const
01263     {
01264       return((*itr).first);
01265     }
01266   
01267   void _computeDimCounts();
01268   
01269   unsigned int _getDimensionCount(MyDimCountsConstPtr itr) const
01270     {
01271       return((*itr).second);
01272     }
01273 
01274   void _computeClassCounts();
01275 
01276 private:  
01277 
01278   unsigned  int numPoints;
01279   unsigned int numDimensions;
01280   unsigned int numNotChosen;
01281   unsigned int numNotHidden;
01282   vector< MyPoint > points;
01283   // it is useful to store the coords by dimension too. i am blowing
01284   // up space by a factor of two but i am saving on time since the
01285   // calls to MyPointSet::getDimCoords() are much faster.
01286   vector< vector< MyNT > > dimensions;
01287   // a hash set of indices corresponding to those points that have been hidden.
01288 #if GCC_VERSION >= 40300
01289   unordered_set< int > hiddenPoints;
01290   // for each class, indices that are hidden.
01291   map< string, unordered_set< int > > hiddenPointsByClass;
01292 #else
01293   hash_set< int > hiddenPoints;
01294   // for each class, indices that are hidden.
01295   map< string, hash_set< int > > hiddenPointsByClass;
01296 #endif
01297   // have i flipped the point and dimensions?
01298   bool flipped;
01299   
01300   // map from a point's name to its index.
01301 //  map< string, int, ltstr > namesToIndices;
01302 
01303   // for some data sets, e.g., human Affy chips, there are multiple
01304   // indices for each point, e.g., an entrez-gene ID, so the second
01305   // type in the map is a vector.
01306   map< string, vector< unsigned int >, ltstr > namesToIndices;
01307 
01308   bool classNamesExist;
01309 
01310   // just a collection of class names.
01311   set< string, ltstr> classNames;
01312   // map from a class to the indices of the points in that class.
01313   map< string, vector< int >, ltstr > classesToIndices;
01314   
01315   // map storing counts for each class.
01316   map< string, int, ltstr > classCounts;
01317 
01318   
01319 
01320   // something to store info about dimensions.
01321   // names and ids of dimensions.
01322   vector< MyDimInfo > dimInfo;
01323   // smallest coordinate for each dim.
01324   vector< MyNT > dimLows;
01325   // greatest coordinate for each dim.
01326   vector< MyNT > dimHighs;
01327   // the mean for each dim (useful in computing Gaussian p-values).
01328   vector< MyNT > dimMeans;
01329   // the sigma for each dim (useful in computing Gaussian p-values).
01330   vector< MyNT > dimSigmas;
01331   
01332   // for each dim, a count of how many times each value appears in a
01333   // point along that dim.
01334   vector< MyDimCountsType > dimCounts;
01335   
01336   // entropy of info for each dimension.
01337   vector< MyNT > dimEntropies;
01338   // dimension sorted in order of increasing entropy.
01339   vector< int > sortedDims;
01340 
01341   // point means
01342   vector< MyNT > _pointMeans;
01343   // point standard deviations.
01344   vector< MyNT > _pointStandardDeviations;
01345   
01346 };
01347 
01348 
01358 class MySetOfPointSets
01359 {
01360 private:
01361   // all the point sets.
01362   vector< MyPointSet > _pointSets;
01363   // the class/condition associated with each point set.
01364   vector< string > _classes;
01365   // the inverse of the previous map.
01366   map< string, unsigned int > _classToIndex;
01367   
01368 public:
01369   // typedef vector< MyPointSet >::iterator MySetOfPointSets::iterator;
01370   // typedef vector< MyPointSet >::const_iterator MySetOfPointSets::const_iterator;
01371   
01372   MySetOfPointSets()
01373       : _pointSets(), _classToIndex()
01374     {}
01375   
01376   virtual ~MySetOfPointSets()
01377     {}
01378 
01380   void insert(MyPointSet &p)
01381     {
01382       _pointSets.push_back(p);
01383       // make sure _classes is the same length as _pointSets.
01384       _classes.push_back("");
01385     }
01386 
01388   void insert(MyPointSet &p, string condition)
01389     {
01390       _pointSets.push_back(p);
01391       _classes.push_back(condition);
01392       _classToIndex[condition] = _pointSets.size() - 1;
01393     }
01394 
01401   void assignClass(unsigned int index, string condition)
01402     {
01403       if (index >= _pointSets.size())
01404         {
01405           cerr << "Assinging class " << condition
01406                << "to a non-existant point set index " << index << "." << endl;
01407           return;
01408         }
01409       // forget the old info.
01410       _classToIndex.erase(_classes[index]);
01411       // record the new info.
01412       _classes[index] = condition;
01413       _classToIndex[condition] = index;
01414     }
01415   
01423   bool getPointSet(string condition, MyPointSet *pointSet)
01424     {
01425       map< string, unsigned int >::iterator citr = _classToIndex.find(condition);
01426       if (_classToIndex.end() == citr)
01427         {
01428           pointSet = NULL;
01429           return(false);
01430         }
01431       pointSet = &(_pointSets[citr->second]);
01432       return(true);
01433     }
01434 
01440   // /// \param[out] pointSet, 
01443   const MyPointSet * getPointSet(string condition) const
01444     {
01445       map< string, unsigned int >::const_iterator citr = _classToIndex.find(condition);
01446       if (_classToIndex.end() == citr)
01447         {
01448           return(NULL);
01449           // pointSet = NULL;
01450           // return(false);
01451         }
01452       return(&(_pointSets[citr->second]));
01453       // pointSet = &(_pointSets[citr->second]);
01454       // return(true);
01455     }
01456 };
01457 
01458 // next() will return a MyPointSet&.
01459 class MySetOfPointSetsIterator: public  MyIterator< MyPointSet >
01460   {
01461   private:
01462     typedef vector< MyPointSet >::iterator _MySetOfPointSetsIterator;
01463 
01464     _MySetOfPointSetsIterator start;
01465     _MySetOfPointSetsIterator end;
01466     _MySetOfPointSetsIterator current;
01467     vector< MyPointSet > &pointSets;
01468     
01469   public:
01470     // ban this contstructor.
01471     MySetOfPointSetsIterator(_MySetOfPointSetsIterator _start,
01472                              _MySetOfPointSetsIterator _end);
01473     
01474 
01475     MySetOfPointSetsIterator(_MySetOfPointSetsIterator _start,
01476                              _MySetOfPointSetsIterator _end,
01477                              vector< MyPointSet > &_pointSets)
01478         : start(_start), end(_end), current(_start), pointSets(_pointSets)
01479       {}
01480     
01481     virtual bool hasNext()
01482       {
01483         return(current != end);
01484       }
01485     
01486     MyPointSet &next()
01487       {
01488         MyPointSet &_next = *current;
01489         current++;
01490         return(_next);
01491       }
01492   };
01493 
01494 
01495 
01496 #endif //POINT_H
 All Classes Functions Variables Typedefs Friends