Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
projcluster.h
00001 /**************************************************************************
00002  * Copyright (c) 2002-2011 T. M. Murali                                   *
00003  *                                                                        *
00004  * This file is part of Biorithm.                                         *
00005  *                                                                        *
00006  * Biorithm is free software: you can redistribute it and/or modify       *
00007  * it under the terms of the GNU General Public License as published by   *
00008  * the Free Software Foundation, either version 3 of the License, or      *
00009  * (at your option) any later version.                                    *
00010  *                                                                        *
00011  * Biorithm is distributed in the hope that it will be useful,            *
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00014  * GNU General Public License for more details.                           *
00015  *                                                                        *
00016  * You should have received a copy of the GNU General Public License      *
00017  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00018  *                                                                        *
00019  **************************************************************************/
00020 
00021 #ifndef _PROJCLUSTER_H
00022 #define _PROJCLUSTER_H
00023 
00024 #include <fstream>
00025 #include <iostream>
00026 // for ostream_iterator
00027 #include <iterator>
00028 #include <set>
00029 
00030 using namespace std;
00031 
00032 #include "global.h"
00033 #include "boundeddim.h"
00034 #include "interval.h"
00035 #include "params.h"
00036 #include "point.h"
00037 #include "read.h"
00038 
00039 // since i moved findProjectiveCluster() to MyCluster::find(),
00040 // findProjectiveCluster() need not be a friend of
00041 // MyClusterParams. therefore, i can move this declaration to its
00042 // rightful place (projcluster.h) from params.h.
00043 enum MyProjectiveClusterReturnType { ZERO_POINTS, ZERO_SEEDS,
00044                                      NO_RANDOM_SEEDS, NO_RANDOM_INDICES,
00045                                      NO_QUALITY_CLUSTERS, NO_SIGNIFICANT_CLUSTERS,
00046                                      CLUSTER_FOUND };
00047 
00048 
00049 // struct storing the key params i use to control
00050 // MyCluster::find(). these params may also be stored in
00051 // MyClusterParams but i want to store the key subset in this
00052 // structure and to store a vector of this subset sorted by round to
00053 // keep a history of the parameter values.
00054 struct MyKeyClusterParams
00055 {
00056   int  numSeeds;
00057   int  sizeDisc;
00058   int  numDiscs;
00059   int  whichRound;
00060   MyNT minimumHomogeneity;
00061   MyProjectiveClusterReturnType found;
00062 };
00063 
00064 
00065 
00066 class MyPointPtr;
00067 
00068 
00069 // what i assume is the default point set when there are multiple.
00070 const unsigned int MY_DEFAULT_POINT_SET = 0;
00071 
00072 
00073 
00074 
00075 
00076 class MyCluster;
00077 
00078 // class for box with some bounded dimensions and some unbounded dimensions.
00079 class MyBox
00080 {
00081   friend class MyCluster;
00082 public:
00083   MyBox()
00084       : numBounded(0), intervals(), realIntervals()
00085     {}
00086   
00087 
00088   // construct from list of bounded dimensions and bounds.
00089   MyBox(const vector< MyNT > &lows, const vector< MyNT > &highs,
00090         const vector< bool > &bounded);
00091   MyBox(const MyPoint &centre, const vector< MyNT > &width, const vector< bool > &bounded);
00092 
00093   MyBox(const MyBox& copy)
00094       : numBounded(copy.numBounded),
00095         intervals(copy.intervals), realIntervals(copy.realIntervals)
00096     {}
00097   const MyBox& operator=(const MyBox& rhs)
00098     {
00099       if (this != &rhs)
00100         {
00101           numBounded = rhs.numBounded;
00102           intervals.clear();
00103           intervals = rhs.intervals;
00104           realIntervals.clear();
00105           realIntervals = rhs.realIntervals;
00106         }
00107       return(*this);
00108     }
00109   // see p 62, item 14 of effective c++ for why i shouldn't declare a
00110   // virtual destructor in a non-base class.
00111 //  
00112 //  virtual
00113   ~MyBox()
00114     {}
00115   // free up memory used by MyBox.
00116 //  virtual
00117   void free()
00118     {
00119       intervals.clear();
00120       realIntervals.clear();
00121     }
00122   
00123   bool operator==(const MyBox& rhs) const;
00124   
00125   unsigned int getNumBounded() const
00126     {
00127       return(numBounded);
00128     }
00129   
00130   
00131   // check if point lies inside the box.
00132   bool contains(const MyPoint &point) const
00133     {
00134       for (unsigned int i = 0; i < numBounded; i++)
00135 //        if (!::contains(lows[i], highs[i], point[boundedDims[i]]))
00136 
00137         // should not check against realLows and realHighs since they
00138         // may not be defined yet! i might be calling this function
00139         // from MyCluster::findContainedPoints(). 
00140         if (!intervals[i].contains(point[intervals[i].getDimension()]))
00141           return(false);
00142       return(true);
00143     }
00144 
00145   // check whether dim is bounded in the cluster.
00146   bool checkBounded(int dim) const
00147     {
00148       return(realIntervals.end() != _findDimension(dim));
00149     }
00150   
00151 
00152 
00153 
00154   // checks if box is significant. if there is any bounded dimension
00155   // for which the box's side is not equal to any of the intervals for
00156   // that bounded dimension, returns false. otherwise, returns true. 
00157   // also see the comments before the implementatiom of this function
00158   // in projcluster.C
00159   bool isSignificant(vector< vector< MyInterval > >& intervals);
00160 
00161   // checks what fraction of the volume (should it be perimeter?) of box is contained within significant intervals. 
00162   MyNT isSoftContained(vector< vector< MyInterval > >& intervals);
00163 
00164   // return true iff every bounded interval has significance greater than maxPValue.  
00165   bool isSignificant(const MyPointSet& points, MyNT maxPValue);
00166   
00167   void getPValues(const MyPointSet& points, vector< MyNT >& pvalues)
00168     {
00169       for (unsigned int i = 0; i < numBounded; i++)
00170         pvalues.push_back(computePValue(points, i));
00171     }
00172       
00173   
00174   // set realLows and realHighs to be the bounding box of points.
00175   void update(const MyPointSet& points, const vector< int >& indices);
00176   // update realLows and realHighs to reflect addition of points[index].
00177   void update(const MyPointSet& points, int index);
00178 
00179 #ifdef DEBUG  
00180   // don't initialise by default to NULL since the two versions of
00181   // MyBox::check() become ambiguous.
00182   bool check(const vector< MyNT >* lows,
00183              const vector< MyNT >* highs) const;
00184   bool check(const vector< MyNT >* width) const;
00185 #endif
00186   
00187   void print(ostream& ostr, const vector< MyDimInfo >& dimensionInfo,
00188              MyVerbosityLevel verbosity) const;
00189 
00190 private:
00191   void construct(const vector< MyNT > &lows,
00192                  const vector< MyNT > &highs, const vector< bool > &bounded);
00193   void construct(const MyPoint &centre, const vector< MyNT > &width,
00194                  const vector< bool > &bounded);
00195 
00196   // compute the p-value of the ith bounded interval.
00197   MyNT computePValue(const MyPointSet& points, int i);
00198 
00199 
00200   
00201   // find the entry in realIntervals that whose dimension == dim.
00202   vector< MyDimensionInterval >::const_iterator _findDimension(unsigned int dim) const
00203     {
00204       vector< MyDimensionInterval >::const_iterator itr;
00205       // find a MyDimensionInterval whose dimension == dim.
00206       for (itr = realIntervals.begin(); itr != realIntervals.end(); itr++)
00207         if (dim == (*itr).getDimension())
00208           break;
00209       return(itr);
00210     }
00211   
00212   // find the entry in realIntervals that whose dimension == dim.
00213   vector< MyDimensionInterval >::iterator _findDimension(unsigned int dim)
00214     {
00215       vector< MyDimensionInterval >::iterator itr;
00216       // find a MyDimensionInterval whose dimension == dim.
00217       for (itr = realIntervals.begin(); itr != realIntervals.end(); itr++)
00218         if (dim == (*itr).getDimension())
00219           break;
00220       return(itr);
00221     }
00222   
00223 private:
00224   unsigned int numBounded;
00225   // intervals are the seed -/+ per-dim interval that the point is
00226   // associated with.
00227   vector< MyDimensionInterval > intervals;
00228   // realIntervals are the actual bounding box of the
00229   // contained points.
00230   vector< MyDimensionInterval > realIntervals;
00231 };
00232 
00233 #ifdef USE_CLUSTER_DETAILS
00234 // detail of how the cluster is created.
00235 struct MyClusterDetails
00236 {
00237 public:
00238   
00239   // how should i store a pointer to the seed? 
00240   //
00241   // (i) it is not possible to store a const MyPoint& because when i
00242   // construct a MyCluster, the seed i have could be allocated
00243   // temporarily. the seed might get overwritten or go out of scope
00244   // making the reference inside MyClusterDetails invalid. another
00245   // issue is that the seed as a const MyPoint& means that i have to
00246   // store a *pointer* to MyClusterDetails in MyCluster. i hate this
00247   // fact and i dont know if i can do anything better. the reason is
00248   // as follows: i want to create arrays of MyClusters (e.g., the
00249   // allClusters variable in cluster.C, i have to be able to call the
00250   // no-parameter MyCluster constructor. then when i assign a real
00251   // MyCluster to an "empty" MyCluster, i have to call the
00252   // MyClusterDetails assignment operator, which does not allow me to
00253   // assign to the const reference to MyPoint in MyClusterDetails. by
00254   // sotring a pointer to MyClusterDetails in MyCluster, i can use new
00255   // to assign to MyClusterDetails when i call the MyCluster
00256   // assignment operator.
00257   //
00258   // (ii) i could store an iterator that points to the seed inside
00259   // MyPointSet. however the iterator can become invalid if the vector
00260   // containing the points inside MyPointSet is reallocated.
00261   //
00262   // (iii) i store an index to the point. this index is invalid only
00263   // if i delete or insert a point with smaller index, which i believe
00264   // will not happen. of course, storing the index means that i have
00265   // to change the MyCluster constructor to accept an index.
00266 
00267   typedef int MyPointPtr;
00268 
00269   MyClusterDetails()
00270       : seed(), discIndices()
00271     {}
00272   MyClusterDetails(MyPointPtr s, const vector< int >& ind)
00273       : seed(s), discIndices(ind)
00274     {}
00275   MyClusterDetails(const MyClusterDetails& copy)
00276       : seed(copy.seed), discIndices(copy.discIndices)
00277     {}
00278   ~MyClusterDetails()
00279     {}
00280   // not providing an implementation.
00281   const MyClusterDetails& operator=(const MyClusterDetails& rhs);
00282   
00283 //     const MyClusterDetails& operator=(const MyClusterDetails& rhs)
00284 //     {
00285 //       if (this != &rhs)
00286 //         {
00287 //           seed = rhs.seed;
00288 //           discIndices.clear();
00289 //           discIndices = rhs.discIndices;
00290 //         }
00291 //       return(*this);
00292 //     }
00293   
00294   const MyPoint& getSeed(const MyPointSet& points) const
00295     {
00296       return(points[seed]);
00297     }
00298   
00299 //   const MyPointPtr& getSeed(const MyPointSet& points) const
00300 //     {
00301 //       return(seed);
00302 //     }
00303   
00304 public:
00305   MyPointPtr seed;
00306 //  int index;
00307   vector< int > discIndices;
00308 };
00309 #endif // USE_CLUSTER_DETAILS
00310 
00311 
00312 // forward declarations.
00313 // struct MyClusterLess;
00314 // struct MyClusterMore;
00315 class MyClusterSet;
00316 
00317 
00318 // stores box and points (or pointers to them) that lie inside the box. 
00319 class MyCluster
00320 {
00321 public:
00322 #ifdef USE_CLUSTER_DETAILS
00323   typedef MyClusterDetails::MyPointPtr MyPointPtr;
00324 #endif  
00325   
00326   MyCluster()
00327       : box(), numContained(0), containedPoints(), realContainedPoints(),
00328 #ifdef USE_CLUSTER_DETAILS
00329         details(NULL), 
00330 #endif     
00331            filledIn(false), pointsFlipped(false)
00332     {}
00333 
00334 #ifdef USE_BOUNDED_DIM_TYPE
00335   // construct from seed and MyBoundedDim.
00336   MyCluster(
00337 #ifdef USE_CLUSTER_DETAILS                
00338                   const MyPointPtr seedPtr, 
00339 #endif // USE_CLUSTER_DETAILS
00340                   const MyBoundedDim &boundedDimInfo,
00341             const MyPointSet& points, const vector< int >& indices,
00342             const MyClusterParams &params);
00343 #else
00344 
00345   // construct from list of bounded dimensions and bounds.
00346   MyCluster(
00347 #ifdef USE_CLUSTER_DETAILS
00348                   const MyPointPtr seedPtr, 
00349 #endif // USE_CLUSTER_DETAILS             
00350                   const vector< MyNT > &lows,
00351             const vector< MyNT > &highs, const MyPointSet& points,
00352             const vector< int > &indices,
00353             const MyClusterParams &params);
00354 //            bool saveMemory = false);
00355   MyCluster(
00356 #ifdef USE_CLUSTER_DETAILS                
00357                   const MyPointPtr seedPtr, 
00358 #endif // USE_CLUSTER_DETAILS             
00359                   const vector< MyNT > &width,
00360             const MyPointSet& points,
00361             const vector< int > &indices,
00362             const MyClusterParams &params);
00363 //            bool saveMemory = false);
00364 
00365   // SEEDLESS version.
00366   MyCluster(const vector< MyNT > &lows, const vector< MyNT > &highs,
00367             const MyPointSet& points, bool saveMemory = false);
00368 
00369   // SEEDLESS, LOWLESS, HIGHLESS version
00370   MyCluster(const MyPointSet& points,
00371             const vector< int > &indices,
00372             const MyClusterParams &params);
00373 #endif
00374   
00375   MyCluster(const MyCluster& copy)
00376       : box(copy.box), numContained(copy.numContained),
00377         containedPoints(copy.containedPoints),
00378         realContainedPoints(copy.realContainedPoints),
00379 #ifdef USE_CLUSTER_DETAILS
00380         details(copy.details), 
00381 #endif
00382            filledIn(copy.filledIn),
00383 
00384         pointsFlipped(copy.pointsFlipped)
00385     {}
00386   
00387   const MyCluster& operator=(const MyCluster& rhs)
00388     {
00389       if (this != &rhs)
00390         {
00391           box = rhs.box;
00392           numContained = rhs.numContained;
00393           containedPoints.clear();
00394           containedPoints = rhs.containedPoints;
00395           realContainedPoints.clear();
00396           realContainedPoints = rhs.realContainedPoints;
00397           // don't delete details because i dont allocate details
00398           // using new when use operator=. if i delete, then i can
00399           // delete a pointer i actually need (e.g., when i assign
00400           // inside MY_forall).
00401 //          delete details;
00402 #ifdef USE_CLUSTER_DETAILS
00403           details = rhs.details;
00404 #endif
00405           filledIn = rhs.filledIn;
00406           pointsFlipped = rhs.pointsFlipped;
00407         }
00408       return(*this);
00409     }
00410 //  virtual
00411   ~MyCluster()
00412     {
00413 //      delete details;
00414     }
00415 
00416   bool operator==(const MyCluster& rhs) const
00417     {
00418       if (this == &rhs)
00419         return(true);
00420       // enough to compare the boxes since the boxes store the real bounding box.
00421       return(box == rhs.box);
00422     }
00423 
00424     bool operator!=(const MyCluster& rhs) const
00425     {
00426       return(!(*this == rhs));
00427     }
00428 
00429   // is *this < other.
00430   bool MyCluster::operator<(const MyCluster &other) const
00431     {
00432       // i could use MyCluster::measure() but since it can depend on
00433       // shiftValue (which is yucky in its own way), i am not.
00434       return(getNumBounded()*getNumContained() <
00435              other.getNumBounded()*other.getNumContained());
00436     }
00437   
00438   // is *this > other.
00439   bool MyCluster::operator>(const MyCluster &other) const
00440     {
00441       return(!(*this < other));
00442     }
00443 
00444 #ifdef USE_BOUNDED_DIM_TYPE
00445   void construct(const MyBoundedDim &boundedDimInfo,
00446                  const MyPointSet& points, const vector< int >& indices,
00447                  const MyClusterParams &params, bool brandNew = false);
00448 #else  
00449   void construct(const vector< MyNT > &width, const MyPointSet& points,
00450                  bool brandNew = false);
00451 
00452   void construct(const MyPointSet& points,
00453                  const vector< int > &discIndices,
00454                  const MyClusterParams &params,
00455                  bool brandNew = false);
00456 #endif // USE_BOUNDED_DIM_TYPE
00457   
00458   // i need this for find_greedy().
00459   void construct(const vector< MyNT > &lows, const vector< MyNT > &highs,
00460                  const MyPointSet& points, bool brandNew = false);
00461 
00462   // used to be findProjectiveCluster() in cluster.C. find a
00463   // projective cluster and set the found cluster to be *this.
00464   MyProjectiveClusterReturnType
00465   find(const MyPointSet& points, MyClusterParams& params, ofstream &ostr,
00466        const MyBoundedDim &boundedDimInfo,
00467 //        const vector< MyNT >& width,
00468 //        vector< vector< MyInterval > >* intervals = NULL,
00469 //        vector< vector< vector< int > > >* intervalsForPoints = NULL,
00470        // passing MyClusterMore since i want clusters to be sorted
00471        // in decreasing order of measure.
00472        MyClusterSet *bestClusters = NULL
00473        )
00474   {
00475 
00476     cerr << "This version of MyCluster::find() is not implemented!" << endl;
00477     exit(-1);
00478     // call the other find() with vectors.
00479 //     return(find(vector< MyPointSet >(1, points), params, ostr, 
00480 //                       vector< MyBoundedDim >(1, boundedDimInfo), bestClusters));
00481   }
00482 
00483   MyProjectiveClusterReturnType
00484   find(vector< MyPointSet > &pointSets, 
00485           MyClusterParams& params, ofstream &ostr,
00486           //       vector< MyBoundedDim > &boundedDimInfo,
00487        vector< const MyBoundedDim* > &boundedDimInfo,
00488        MyClusterSet *bestClusters = NULL
00489        );
00490 
00491   // find a cluster using a greedy algorithm.
00492   MyProjectiveClusterReturnType
00493   find_greedy(const MyPointSet& points, MyClusterParams& params,
00494               ofstream &ostr,
00495               const vector< MyNT >& widths,
00496               vector< vector< MyInterval > >* intervals,
00497               vector< vector< vector< int > > >* intervalsForPoints);
00498   
00499 
00500   // return true if *this and other are similar, whatever that might
00501   // mean.
00502   bool MyCluster::similar(const MyCluster &other) const;
00503   // this functions uses an argument for the threshold for similarity.
00504   bool MyCluster::similar(const MyCluster &other, MyNT threshold) const;
00505   
00506   // compare this and rhs. currently, i just compute the size of the
00507   // symmetric difference between the two sets of contained points (in
00508   // numDiff) and the number of points that are in rhs but not in this
00509   // (in numNew).
00510   void compare(const MyCluster& rhs, int& numDiff, int& numNew) const;
00511   bool compare(vector< MyCluster >& clusters, MyVerbosityLevel verbose);
00512   void merge(const MyCluster& cluster);
00513 
00514   // compute a Shamir-like p-value based on the #points from each
00515   // class in the cluster.
00516   MyNT pValueShamir(const MyPointSet& points) const;
00517   // compute the p-value of the cluster based on the approximation to
00518   // the gaussian PDF using the central limit theorem.
00519   MyNT pValueGaussian(const MyPointSet &points) const;
00520   
00521   bool quality(MyNT alpha, int numPoints, int numDims) const;
00522   bool support(MyNT alpha, int numPoints, int numDims) const;
00523   MyNT homogeneity(const MyPointSet& points) const;
00524   // compute the type of this cluster. there is no point calling this
00525   // function if the class names are not provided in the command-line.
00526   string type(const MyPointSet &points) const;
00527   
00528   void getContainedPoints(vector< int > &cont) const
00529   {
00530     cont = containedPoints;
00531   }
00532   unsigned int getNumContained() const
00533     {
00534       return(numContained);
00535     }
00536 
00537   unsigned int getNumBounded() const
00538     {
00539       return(box.getNumBounded());
00540     }
00541 
00542   // count the number of points in cluster that are contained in this cluster.
00543   int countNumContained(const MyPointSet& points, const MyCluster& cluster)
00544     const;
00545   
00546   // count the number of points (referred to by indices) that are
00547   // contained in this cluster.
00548   int countNumContained(const MyPointSet& points, const vector< int >& indices)
00549     const;
00550 
00551   // compute the set of points contained in cluster that are contained
00552   // in this cluster. store the resulting set of indices in
00553   // pointsInside and return the number of such points.
00554   int findContainedPoints(const MyPointSet& points, const MyCluster& cluster,
00555                           vector< int >& pointsInside);
00556   
00557   // compute the set of points referred to by indices that are
00558   // contained in this cluster. store the resulting set of indices in
00559   // pointsInside and return the number of such points.
00560   int findContainedPoints(const MyPointSet& points,
00561                           const vector< int >& indices,
00562                           vector< int >& pointsInside) const;
00563 
00564   // compute the points in *this not contained in any cluster in clusters.
00565   int findNotContainedPoints(const MyPointSet& points,
00566                              const vector< MyCluster> &clusters,
00567                              vector< int >& pointsOutside) const;
00568   
00569   // compute the set of points contained in *this that are not
00570   // contained in cluster. store the resulting set of indices in
00571   // pointsOutside and return the number of such points.
00572   int findNotContainedPoints(const MyPointSet& points, 
00573                              const MyCluster& cluster,
00574                              vector< int >& pointsOutside) const;
00575   
00576   
00577   // compute the set of points referred to by indices that are not
00578   // contained in this cluster. store the resulting set of indices in
00579   // pointsOutside and return the number of such points.
00580   int findNotContainedPoints(const MyPointSet& points,
00581                              const vector< int >& indices,
00582                              vector< int >& pointsOutside) const;
00583   
00584 
00585   // compute the measure of this cluster.
00586   //
00587   // GCC3: COMMENTING OUT because gcc3 complains about ambiguous calls
00588   // to log10 and pow. i don't need this function anyway.
00589   //
00590 //   MyNT measure(int shiftValue) const
00591 //     {
00592 //       // return a number equal to the concatenation of
00593 //       // getNumContained() and getNumBounded().
00594 //       int base = static_cast< int > (1 + log10(shiftValue));
00595 //       return(getNumBounded()*pow(10, base) + getNumContained());
00596 //     }
00597 
00598   // compute the measure of this cluster.
00599   MyNT measure() const
00600     {
00601       // return a number equal to the product of
00602       // getNumContained() and getNumBounded().
00603       return(getNumBounded()*getNumContained());
00604     }
00605 
00606   // see comments before definition of MyPoint::chosen to see
00607   // the difference between hiding and choosing a point.
00608   //
00609   // called in main() in cluster.C after a cluster is selected (when
00610   // CLUSTER_FOUND == found) but only when params.getPartition() is
00611   // true, i.e., when points are being partitioned between the
00612   // clusters. 
00613   void hideContainedPoints(MyPointSet& points) const
00614     {
00615       points.hidePoints(containedPoints);
00616     }
00617 
00618   // called in main() in cluster.C after a cluster is selected (when
00619   // CLUSTER_FOUND == found).
00620   void chooseContainedPoints(MyPointSet& points) const
00621     {
00622       points.choosePoints(containedPoints);
00623     }
00624 
00625   // hide just the cluster. i do so by replacing the coords
00626   // corresponding to the points and dims in the cluster by random
00627   // numbers.
00628   void hide(MyPointSet& points) const
00629     {
00630       points.hide(containedPoints, box.intervals);
00631     }
00632 
00633 
00634 
00635 
00636   // add 1 to pointCounts[i] if point i is contained in *this
00637   void updatePointCounts(vector< unsigned int > &pointCounts) const
00638     {
00639       const vector< int > &contPoints = realContainedPoints;
00640 //    const vector< int > &contPoints = containedPoints;
00641       
00642       for (unsigned int i = 0; i < contPoints.size(); i++)
00643         pointCounts[contPoints[i]]++;
00644     }
00645   
00646         
00647   void print(ostream& ostr, MyVerbosityLevel verbosity) const;
00648   // see comments for definition of print() in cluster.C about whether
00649   // print() can be const or not.
00650   void print(ostream& ostr, const MyPointSet& points,
00651              MyVerbosityLevel verbosity, bool printItemSetFormat = false) const;
00652   // print info on p-values, type, quality, etc. some of them depend
00653   // on the existence of class names.
00654   void printExtra(ostream& ostr, const MyPointSet& points) const;
00655 
00656 
00657   // functions for adding/deleting poitns and dimensions.
00658   bool addPoint(const MyPointSet& points, int index);
00659   bool deletePoint(const MyPointSet& points, int index);
00660   bool addDimension(int dim, MyNT start, MyNT end, const MyPointSet& points,
00661                     bool check = false);
00662   bool addDimension(int dim, MyNT start, MyNT end, const vector< int >& contPoints);
00663   bool addDimension(const MyDimensionInterval& interval,
00664                     const vector< int >& contPoints);
00665 
00666   
00667   // TODO: DESIGN. do i need this function?  
00668   bool addDimension(int dim, MyNT width, const MyPointSet& points);
00669   bool deleteDimension(int dim, const MyPointSet& points);
00670   
00671   void anneal(const MyPointSet& points, const vector< MyNT >& widths,
00672                        MyClusterParams& params);
00673   
00674   bool isSignificant(vector< vector< MyInterval > >& intervals)
00675     {
00676       return(box.isSignificant(intervals));
00677     }
00678   
00679   MyNT isSoftContained(vector< vector< MyInterval > >& intervals)
00680     {
00681       return(box.isSoftContained(intervals));
00682     }
00683 
00684   bool isSignificant(const MyPointSet& points, MyNT maxPValue)
00685     {
00686       return(box.isSignificant(points, maxPValue));
00687     }
00688   
00689   void getPValues(const MyPointSet& points, vector< MyNT >& pvalues)
00690     {
00691       return(box.getPValues(points, pvalues));
00692     }
00693       
00694   // returns the number of containedPoints contained in [start, end]
00695   // along dimension dim.
00696   int countNumContainedForDim(int dim, MyNT start, MyNT end, const MyPointSet &points) const
00697     {
00698       int numIn = 0;
00699       for (unsigned int i = 0; i < numContained; i++)
00700         numIn += ::contains(start, end, points[containedPoints[i]][dim]);
00701       return(numIn);
00702     }
00703 
00704 private:
00705 
00706   // functions that return the real index of a given sample or a given
00707   // gene. these functions are the only ones that actually refer to
00708   // gene expression data in the function name. in all of them, the
00709   // flipped argument indicates whether the point set has been flipped
00710   // or not. if not, the samples correspond to the bounded dimensions,
00711   // otherwise to the genes.
00712   unsigned int getNumSamples(bool flipped) const
00713     {
00714       if (flipped)
00715         return(getNumContained());
00716       else
00717         return(getNumBounded());
00718     }
00719   
00720   unsigned int getNumGenes(bool flipped) const
00721     {
00722       if (flipped)
00723         return(getNumBounded());
00724       else
00725         return(getNumContained());
00726     }
00727   
00728   unsigned int getSampleIndex(int i, bool flipped) const
00729     {
00730       if (flipped)
00731         return(containedPoints[i]);
00732       else
00733         return(box.intervals[i].getDimension());
00734     }
00735   unsigned  int getGeneIndex(int i, bool flipped) const
00736     {
00737       if (flipped)
00738         return(box.intervals[i].getDimension());
00739       else
00740         return(containedPoints[i]);
00741     }
00742   
00743 
00744   
00745   // free up memory used by MyCluster
00746 //  virtual
00747   void free()
00748     {
00749       containedPoints.clear();
00750       box.free();
00751       filledIn = false;
00752     }
00753 
00754 #ifdef DEBUG
00755   bool check(const MyPointSet& points) const;
00756 #endif
00757   
00758   // find the largest class of points in the cluster.
00759   void _findLargestClass(const MyPointSet& points,
00760                          unsigned int& maxNum,
00761                          string& maxClass, bool real = true) const;
00762 
00763   // take list of points and store those that lie inside. if
00764   // partitionPoints is true, then i only include points in this
00765   // cluster that have not already been "chosen" (put in some other
00766   // cluster).
00767   void findContainedPoints(const MyPointSet &points)
00768     {
00769       for (unsigned int i = 0; i < points.size(); i++)
00770         // only consider points that have not already been hidden.
00771         // whether the hidden bit is set or not depends on the -p
00772         // command line parameter (params.getPartition()). a point's
00773         // hidden bit is set in MyCluster::hideContainedPoints().
00774         if (box.contains(points[i]))
00775           {
00776             realContainedPoints.push_back(i);
00777             if (!points[i].getHidden())
00778               containedPoints.push_back(i);
00779           }
00780       
00781     }
00782   
00783   void findContainedPoints(const MyPointSet &points, vector< int >& contPoints) const
00784     {
00785       for (unsigned int i = 0; i < points.size(); i++)
00786         // only consider points that have not already been
00787         // chosen. whether the chosen bit is set or not depend on the
00788         // command line parameter. a point's chosen bit is set in the
00789         // main function that calls findProjectiveCluster().
00790         if (!points[i].getHidden() && (box.contains(points[i])))
00791           contPoints.push_back(i);
00792     }
00793 
00794   // returns a vector of points in containedPoints that lie inside
00795   // [start, end] along dimension dim.
00796   inline int findContainedPointsForDim(int dim, MyNT start, MyNT end,
00797                                        const MyPointSet &points, vector< int >& newContained)
00798     {
00799       for (unsigned int i = 0; i < numContained; i++)
00800         if (::contains(start, end, points[containedPoints[i]][dim]))
00801           newContained.push_back(containedPoints[i]);
00802       return(newContained.size());
00803     }
00804 
00805   int findMaxNumContainedForDim(int dim, const vector< MyInterval >* intervals,
00806                                 MyNT width, const MyPointSet& points,
00807                                 MyNT& maxStart, MyNT& maxEnd);
00808   
00809 
00810 
00811 
00812   void mergeContainedPoints(const vector< int >& pts);
00813 
00814   void computeBoundedDimensions(const MyPointSet& points,
00815                                 const MyPoint& seed, const vector< int >&discIndices,
00816                                 const vector< MyNT > &lows, const vector< MyNT > &highs,
00817                                 vector< bool >& bounded) const;
00818   void computeBoundedDimensions(const MyPointSet& points,
00819                                 const MyPoint& seed, const vector< int >&discIndices,
00820                                 const vector< MyNT > &width,
00821                                 vector< bool >& bounded) const;
00822 
00823   void improveDimensions(const MyPointSet& points, MyClusterParams& params,
00824                          vector< vector< MyInterval > >* intervals);
00825   
00826 
00827 private:
00828   MyBox box;
00829   unsigned int numContained;
00830   // the difference between these two vectors is that containedPoints
00831   // has points contained in box but not hidden. realContainedPoints
00832   // does not care when the points are hidden or not.
00833   vector< int > containedPoints;
00834   vector< int > realContainedPoints;
00835 
00836 #ifdef USE_CLUSTER_DETAILS
00837   MyClusterDetails* details;
00838 #endif
00839   // are all fields filled in?
00840   bool filledIn;
00841 
00842   // were the points flipped before constructing the cluster. storing
00843   // this bit with each cluster is a waste of space but the advantage
00844   // is that i can set the bit when constructing the cluster and not
00845   // have to worry about passing the bit to all the functions that
00846   // need it.
00847   bool pointsFlipped;
00848 };
00849 
00850 
00851 // define after defining MyCluster.
00852 struct MyClusterLess :
00853   public binary_function< const MyCluster&, const MyCluster&, bool >
00854 {
00855   bool operator()(const MyCluster &lhs, const MyCluster &rhs) const
00856   {
00857     return(lhs < rhs);
00858   }
00859 };
00860 
00861 struct MyClusterMore :
00862   public binary_function< const MyCluster&, const MyCluster&, bool >
00863 {
00864   bool operator()(const MyCluster &lhs, const MyCluster &rhs) const
00865   {
00866     return(lhs > rhs);
00867   }
00868 };
00869 
00870 
00871 
00872 // a set of sorted clusters.
00873 struct MyClusterSet
00874 {
00875 public:
00876   MyClusterSet(int max = 0)
00877     // i hope it is ok to initialise minMeasure as 0.
00878       :  clusters(), maxNum(max), minMeasure(0), minPtr()
00879     {}
00880 
00881   ~MyClusterSet()
00882     {}
00883   
00884   int size() const
00885     {
00886       return(clusters.size());
00887     }
00888   int maxSize() const
00889     {
00890       return(maxNum);
00891     }
00892   bool empty() const
00893     {
00894       return(clusters.empty());
00895     }
00896   // is the set full? if so, i cannot add any more clusters.
00897   bool full() const
00898     {
00899       return(maxNum == clusters.size());
00900     }
00901   bool insert(const MyCluster &cluster, const MyPointSet &points,
00902               const MyClusterParams &params);
00903   void print(ostream &ostr) const;
00904   void printAll(ostream &ostr, const MyPointSet &points,
00905                  MyClusterParams &params) const;
00906   void printAllTypes(ostream &ostr, const MyPointSet &points) const;
00907   
00908   
00909     
00910 private:
00911   
00912 private:
00913   // using MyClusterMore to store clusters in decreasing order of
00914   // measure.
00915   typedef set< MyCluster, MyClusterMore > _MyClusterSet;
00916 
00917   set< MyCluster, MyClusterMore > clusters;
00918   // the maximum number i want to store. if -1, i can store as many as
00919   // i want.
00920   int maxNum;
00921   // the smallest measure stored in clusters.
00922   MyNT minMeasure;
00923   // a pointer to the cluster with measure minMeasure.
00924   _MyClusterSet::iterator minPtr;
00925   
00926 };
00927 
00928 
00929 
00930 #endif // _PROJCLUSTER_H 
 All Classes Functions Variables Typedefs Friends