00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifndef _PROJCLUSTER_H
00022 #define _PROJCLUSTER_H
00023
00024 #include <fstream>
00025 #include <iostream>
00026
00027 #include <iterator>
00028 #include <set>
00029
00030 using namespace std;
00031
00032 #include "global.h"
00033 #include "boundeddim.h"
00034 #include "interval.h"
00035 #include "params.h"
00036 #include "point.h"
00037 #include "read.h"
00038
00039
00040
00041
00042
00043 enum MyProjectiveClusterReturnType { ZERO_POINTS, ZERO_SEEDS,
00044 NO_RANDOM_SEEDS, NO_RANDOM_INDICES,
00045 NO_QUALITY_CLUSTERS, NO_SIGNIFICANT_CLUSTERS,
00046 CLUSTER_FOUND };
00047
00048
00049
00050
00051
00052
00053
00054 struct MyKeyClusterParams
00055 {
00056 int numSeeds;
00057 int sizeDisc;
00058 int numDiscs;
00059 int whichRound;
00060 MyNT minimumHomogeneity;
00061 MyProjectiveClusterReturnType found;
00062 };
00063
00064
00065
00066 class MyPointPtr;
00067
00068
00069
00070 const unsigned int MY_DEFAULT_POINT_SET = 0;
00071
00072
00073
00074
00075
00076 class MyCluster;
00077
00078
00079 class MyBox
00080 {
00081 friend class MyCluster;
00082 public:
00083 MyBox()
00084 : numBounded(0), intervals(), realIntervals()
00085 {}
00086
00087
00088
00089 MyBox(const vector< MyNT > &lows, const vector< MyNT > &highs,
00090 const vector< bool > &bounded);
00091 MyBox(const MyPoint ¢re, const vector< MyNT > &width, const vector< bool > &bounded);
00092
00093 MyBox(const MyBox& copy)
00094 : numBounded(copy.numBounded),
00095 intervals(copy.intervals), realIntervals(copy.realIntervals)
00096 {}
00097 const MyBox& operator=(const MyBox& rhs)
00098 {
00099 if (this != &rhs)
00100 {
00101 numBounded = rhs.numBounded;
00102 intervals.clear();
00103 intervals = rhs.intervals;
00104 realIntervals.clear();
00105 realIntervals = rhs.realIntervals;
00106 }
00107 return(*this);
00108 }
00109
00110
00111
00112
00113 ~MyBox()
00114 {}
00115
00116
00117 void free()
00118 {
00119 intervals.clear();
00120 realIntervals.clear();
00121 }
00122
00123 bool operator==(const MyBox& rhs) const;
00124
00125 unsigned int getNumBounded() const
00126 {
00127 return(numBounded);
00128 }
00129
00130
00131
00132 bool contains(const MyPoint &point) const
00133 {
00134 for (unsigned int i = 0; i < numBounded; i++)
00135
00136
00137
00138
00139
00140 if (!intervals[i].contains(point[intervals[i].getDimension()]))
00141 return(false);
00142 return(true);
00143 }
00144
00145
00146 bool checkBounded(int dim) const
00147 {
00148 return(realIntervals.end() != _findDimension(dim));
00149 }
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159 bool isSignificant(vector< vector< MyInterval > >& intervals);
00160
00161
00162 MyNT isSoftContained(vector< vector< MyInterval > >& intervals);
00163
00164
00165 bool isSignificant(const MyPointSet& points, MyNT maxPValue);
00166
00167 void getPValues(const MyPointSet& points, vector< MyNT >& pvalues)
00168 {
00169 for (unsigned int i = 0; i < numBounded; i++)
00170 pvalues.push_back(computePValue(points, i));
00171 }
00172
00173
00174
00175 void update(const MyPointSet& points, const vector< int >& indices);
00176
00177 void update(const MyPointSet& points, int index);
00178
00179 #ifdef DEBUG
00180
00181
00182 bool check(const vector< MyNT >* lows,
00183 const vector< MyNT >* highs) const;
00184 bool check(const vector< MyNT >* width) const;
00185 #endif
00186
00187 void print(ostream& ostr, const vector< MyDimInfo >& dimensionInfo,
00188 MyVerbosityLevel verbosity) const;
00189
00190 private:
00191 void construct(const vector< MyNT > &lows,
00192 const vector< MyNT > &highs, const vector< bool > &bounded);
00193 void construct(const MyPoint ¢re, const vector< MyNT > &width,
00194 const vector< bool > &bounded);
00195
00196
00197 MyNT computePValue(const MyPointSet& points, int i);
00198
00199
00200
00201
00202 vector< MyDimensionInterval >::const_iterator _findDimension(unsigned int dim) const
00203 {
00204 vector< MyDimensionInterval >::const_iterator itr;
00205
00206 for (itr = realIntervals.begin(); itr != realIntervals.end(); itr++)
00207 if (dim == (*itr).getDimension())
00208 break;
00209 return(itr);
00210 }
00211
00212
00213 vector< MyDimensionInterval >::iterator _findDimension(unsigned int dim)
00214 {
00215 vector< MyDimensionInterval >::iterator itr;
00216
00217 for (itr = realIntervals.begin(); itr != realIntervals.end(); itr++)
00218 if (dim == (*itr).getDimension())
00219 break;
00220 return(itr);
00221 }
00222
00223 private:
00224 unsigned int numBounded;
00225
00226
00227 vector< MyDimensionInterval > intervals;
00228
00229
00230 vector< MyDimensionInterval > realIntervals;
00231 };
00232
00233 #ifdef USE_CLUSTER_DETAILS
00234
00235 struct MyClusterDetails
00236 {
00237 public:
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267 typedef int MyPointPtr;
00268
00269 MyClusterDetails()
00270 : seed(), discIndices()
00271 {}
00272 MyClusterDetails(MyPointPtr s, const vector< int >& ind)
00273 : seed(s), discIndices(ind)
00274 {}
00275 MyClusterDetails(const MyClusterDetails& copy)
00276 : seed(copy.seed), discIndices(copy.discIndices)
00277 {}
00278 ~MyClusterDetails()
00279 {}
00280
00281 const MyClusterDetails& operator=(const MyClusterDetails& rhs);
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294 const MyPoint& getSeed(const MyPointSet& points) const
00295 {
00296 return(points[seed]);
00297 }
00298
00299
00300
00301
00302
00303
00304 public:
00305 MyPointPtr seed;
00306
00307 vector< int > discIndices;
00308 };
00309 #endif // USE_CLUSTER_DETAILS
00310
00311
00312
00313
00314
00315 class MyClusterSet;
00316
00317
00318
00319 class MyCluster
00320 {
00321 public:
00322 #ifdef USE_CLUSTER_DETAILS
00323 typedef MyClusterDetails::MyPointPtr MyPointPtr;
00324 #endif
00325
00326 MyCluster()
00327 : box(), numContained(0), containedPoints(), realContainedPoints(),
00328 #ifdef USE_CLUSTER_DETAILS
00329 details(NULL),
00330 #endif
00331 filledIn(false), pointsFlipped(false)
00332 {}
00333
00334 #ifdef USE_BOUNDED_DIM_TYPE
00335
00336 MyCluster(
00337 #ifdef USE_CLUSTER_DETAILS
00338 const MyPointPtr seedPtr,
00339 #endif
00340 const MyBoundedDim &boundedDimInfo,
00341 const MyPointSet& points, const vector< int >& indices,
00342 const MyClusterParams ¶ms);
00343 #else
00344
00345
00346 MyCluster(
00347 #ifdef USE_CLUSTER_DETAILS
00348 const MyPointPtr seedPtr,
00349 #endif
00350 const vector< MyNT > &lows,
00351 const vector< MyNT > &highs, const MyPointSet& points,
00352 const vector< int > &indices,
00353 const MyClusterParams ¶ms);
00354
00355 MyCluster(
00356 #ifdef USE_CLUSTER_DETAILS
00357 const MyPointPtr seedPtr,
00358 #endif
00359 const vector< MyNT > &width,
00360 const MyPointSet& points,
00361 const vector< int > &indices,
00362 const MyClusterParams ¶ms);
00363
00364
00365
00366 MyCluster(const vector< MyNT > &lows, const vector< MyNT > &highs,
00367 const MyPointSet& points, bool saveMemory = false);
00368
00369
00370 MyCluster(const MyPointSet& points,
00371 const vector< int > &indices,
00372 const MyClusterParams ¶ms);
00373 #endif
00374
00375 MyCluster(const MyCluster& copy)
00376 : box(copy.box), numContained(copy.numContained),
00377 containedPoints(copy.containedPoints),
00378 realContainedPoints(copy.realContainedPoints),
00379 #ifdef USE_CLUSTER_DETAILS
00380 details(copy.details),
00381 #endif
00382 filledIn(copy.filledIn),
00383
00384 pointsFlipped(copy.pointsFlipped)
00385 {}
00386
00387 const MyCluster& operator=(const MyCluster& rhs)
00388 {
00389 if (this != &rhs)
00390 {
00391 box = rhs.box;
00392 numContained = rhs.numContained;
00393 containedPoints.clear();
00394 containedPoints = rhs.containedPoints;
00395 realContainedPoints.clear();
00396 realContainedPoints = rhs.realContainedPoints;
00397
00398
00399
00400
00401
00402 #ifdef USE_CLUSTER_DETAILS
00403 details = rhs.details;
00404 #endif
00405 filledIn = rhs.filledIn;
00406 pointsFlipped = rhs.pointsFlipped;
00407 }
00408 return(*this);
00409 }
00410
00411 ~MyCluster()
00412 {
00413
00414 }
00415
00416 bool operator==(const MyCluster& rhs) const
00417 {
00418 if (this == &rhs)
00419 return(true);
00420
00421 return(box == rhs.box);
00422 }
00423
00424 bool operator!=(const MyCluster& rhs) const
00425 {
00426 return(!(*this == rhs));
00427 }
00428
00429
00430 bool MyCluster::operator<(const MyCluster &other) const
00431 {
00432
00433
00434 return(getNumBounded()*getNumContained() <
00435 other.getNumBounded()*other.getNumContained());
00436 }
00437
00438
00439 bool MyCluster::operator>(const MyCluster &other) const
00440 {
00441 return(!(*this < other));
00442 }
00443
00444 #ifdef USE_BOUNDED_DIM_TYPE
00445 void construct(const MyBoundedDim &boundedDimInfo,
00446 const MyPointSet& points, const vector< int >& indices,
00447 const MyClusterParams ¶ms, bool brandNew = false);
00448 #else
00449 void construct(const vector< MyNT > &width, const MyPointSet& points,
00450 bool brandNew = false);
00451
00452 void construct(const MyPointSet& points,
00453 const vector< int > &discIndices,
00454 const MyClusterParams ¶ms,
00455 bool brandNew = false);
00456 #endif // USE_BOUNDED_DIM_TYPE
00457
00458
00459 void construct(const vector< MyNT > &lows, const vector< MyNT > &highs,
00460 const MyPointSet& points, bool brandNew = false);
00461
00462
00463
00464 MyProjectiveClusterReturnType
00465 find(const MyPointSet& points, MyClusterParams& params, ofstream &ostr,
00466 const MyBoundedDim &boundedDimInfo,
00467
00468
00469
00470
00471
00472 MyClusterSet *bestClusters = NULL
00473 )
00474 {
00475
00476 cerr << "This version of MyCluster::find() is not implemented!" << endl;
00477 exit(-1);
00478
00479
00480
00481 }
00482
00483 MyProjectiveClusterReturnType
00484 find(vector< MyPointSet > &pointSets,
00485 MyClusterParams& params, ofstream &ostr,
00486
00487 vector< const MyBoundedDim* > &boundedDimInfo,
00488 MyClusterSet *bestClusters = NULL
00489 );
00490
00491
00492 MyProjectiveClusterReturnType
00493 find_greedy(const MyPointSet& points, MyClusterParams& params,
00494 ofstream &ostr,
00495 const vector< MyNT >& widths,
00496 vector< vector< MyInterval > >* intervals,
00497 vector< vector< vector< int > > >* intervalsForPoints);
00498
00499
00500
00501
00502 bool MyCluster::similar(const MyCluster &other) const;
00503
00504 bool MyCluster::similar(const MyCluster &other, MyNT threshold) const;
00505
00506
00507
00508
00509
00510 void compare(const MyCluster& rhs, int& numDiff, int& numNew) const;
00511 bool compare(vector< MyCluster >& clusters, MyVerbosityLevel verbose);
00512 void merge(const MyCluster& cluster);
00513
00514
00515
00516 MyNT pValueShamir(const MyPointSet& points) const;
00517
00518
00519 MyNT pValueGaussian(const MyPointSet &points) const;
00520
00521 bool quality(MyNT alpha, int numPoints, int numDims) const;
00522 bool support(MyNT alpha, int numPoints, int numDims) const;
00523 MyNT homogeneity(const MyPointSet& points) const;
00524
00525
00526 string type(const MyPointSet &points) const;
00527
00528 void getContainedPoints(vector< int > &cont) const
00529 {
00530 cont = containedPoints;
00531 }
00532 unsigned int getNumContained() const
00533 {
00534 return(numContained);
00535 }
00536
00537 unsigned int getNumBounded() const
00538 {
00539 return(box.getNumBounded());
00540 }
00541
00542
00543 int countNumContained(const MyPointSet& points, const MyCluster& cluster)
00544 const;
00545
00546
00547
00548 int countNumContained(const MyPointSet& points, const vector< int >& indices)
00549 const;
00550
00551
00552
00553
00554 int findContainedPoints(const MyPointSet& points, const MyCluster& cluster,
00555 vector< int >& pointsInside);
00556
00557
00558
00559
00560 int findContainedPoints(const MyPointSet& points,
00561 const vector< int >& indices,
00562 vector< int >& pointsInside) const;
00563
00564
00565 int findNotContainedPoints(const MyPointSet& points,
00566 const vector< MyCluster> &clusters,
00567 vector< int >& pointsOutside) const;
00568
00569
00570
00571
00572 int findNotContainedPoints(const MyPointSet& points,
00573 const MyCluster& cluster,
00574 vector< int >& pointsOutside) const;
00575
00576
00577
00578
00579
00580 int findNotContainedPoints(const MyPointSet& points,
00581 const vector< int >& indices,
00582 vector< int >& pointsOutside) const;
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599 MyNT measure() const
00600 {
00601
00602
00603 return(getNumBounded()*getNumContained());
00604 }
00605
00606
00607
00608
00609
00610
00611
00612
00613 void hideContainedPoints(MyPointSet& points) const
00614 {
00615 points.hidePoints(containedPoints);
00616 }
00617
00618
00619
00620 void chooseContainedPoints(MyPointSet& points) const
00621 {
00622 points.choosePoints(containedPoints);
00623 }
00624
00625
00626
00627
00628 void hide(MyPointSet& points) const
00629 {
00630 points.hide(containedPoints, box.intervals);
00631 }
00632
00633
00634
00635
00636
00637 void updatePointCounts(vector< unsigned int > &pointCounts) const
00638 {
00639 const vector< int > &contPoints = realContainedPoints;
00640
00641
00642 for (unsigned int i = 0; i < contPoints.size(); i++)
00643 pointCounts[contPoints[i]]++;
00644 }
00645
00646
00647 void print(ostream& ostr, MyVerbosityLevel verbosity) const;
00648
00649
00650 void print(ostream& ostr, const MyPointSet& points,
00651 MyVerbosityLevel verbosity, bool printItemSetFormat = false) const;
00652
00653
00654 void printExtra(ostream& ostr, const MyPointSet& points) const;
00655
00656
00657
00658 bool addPoint(const MyPointSet& points, int index);
00659 bool deletePoint(const MyPointSet& points, int index);
00660 bool addDimension(int dim, MyNT start, MyNT end, const MyPointSet& points,
00661 bool check = false);
00662 bool addDimension(int dim, MyNT start, MyNT end, const vector< int >& contPoints);
00663 bool addDimension(const MyDimensionInterval& interval,
00664 const vector< int >& contPoints);
00665
00666
00667
00668 bool addDimension(int dim, MyNT width, const MyPointSet& points);
00669 bool deleteDimension(int dim, const MyPointSet& points);
00670
00671 void anneal(const MyPointSet& points, const vector< MyNT >& widths,
00672 MyClusterParams& params);
00673
00674 bool isSignificant(vector< vector< MyInterval > >& intervals)
00675 {
00676 return(box.isSignificant(intervals));
00677 }
00678
00679 MyNT isSoftContained(vector< vector< MyInterval > >& intervals)
00680 {
00681 return(box.isSoftContained(intervals));
00682 }
00683
00684 bool isSignificant(const MyPointSet& points, MyNT maxPValue)
00685 {
00686 return(box.isSignificant(points, maxPValue));
00687 }
00688
00689 void getPValues(const MyPointSet& points, vector< MyNT >& pvalues)
00690 {
00691 return(box.getPValues(points, pvalues));
00692 }
00693
00694
00695
00696 int countNumContainedForDim(int dim, MyNT start, MyNT end, const MyPointSet &points) const
00697 {
00698 int numIn = 0;
00699 for (unsigned int i = 0; i < numContained; i++)
00700 numIn += ::contains(start, end, points[containedPoints[i]][dim]);
00701 return(numIn);
00702 }
00703
00704 private:
00705
00706
00707
00708
00709
00710
00711
00712 unsigned int getNumSamples(bool flipped) const
00713 {
00714 if (flipped)
00715 return(getNumContained());
00716 else
00717 return(getNumBounded());
00718 }
00719
00720 unsigned int getNumGenes(bool flipped) const
00721 {
00722 if (flipped)
00723 return(getNumBounded());
00724 else
00725 return(getNumContained());
00726 }
00727
00728 unsigned int getSampleIndex(int i, bool flipped) const
00729 {
00730 if (flipped)
00731 return(containedPoints[i]);
00732 else
00733 return(box.intervals[i].getDimension());
00734 }
00735 unsigned int getGeneIndex(int i, bool flipped) const
00736 {
00737 if (flipped)
00738 return(box.intervals[i].getDimension());
00739 else
00740 return(containedPoints[i]);
00741 }
00742
00743
00744
00745
00746
00747 void free()
00748 {
00749 containedPoints.clear();
00750 box.free();
00751 filledIn = false;
00752 }
00753
00754 #ifdef DEBUG
00755 bool check(const MyPointSet& points) const;
00756 #endif
00757
00758
00759 void _findLargestClass(const MyPointSet& points,
00760 unsigned int& maxNum,
00761 string& maxClass, bool real = true) const;
00762
00763
00764
00765
00766
00767 void findContainedPoints(const MyPointSet &points)
00768 {
00769 for (unsigned int i = 0; i < points.size(); i++)
00770
00771
00772
00773
00774 if (box.contains(points[i]))
00775 {
00776 realContainedPoints.push_back(i);
00777 if (!points[i].getHidden())
00778 containedPoints.push_back(i);
00779 }
00780
00781 }
00782
00783 void findContainedPoints(const MyPointSet &points, vector< int >& contPoints) const
00784 {
00785 for (unsigned int i = 0; i < points.size(); i++)
00786
00787
00788
00789
00790 if (!points[i].getHidden() && (box.contains(points[i])))
00791 contPoints.push_back(i);
00792 }
00793
00794
00795
00796 inline int findContainedPointsForDim(int dim, MyNT start, MyNT end,
00797 const MyPointSet &points, vector< int >& newContained)
00798 {
00799 for (unsigned int i = 0; i < numContained; i++)
00800 if (::contains(start, end, points[containedPoints[i]][dim]))
00801 newContained.push_back(containedPoints[i]);
00802 return(newContained.size());
00803 }
00804
00805 int findMaxNumContainedForDim(int dim, const vector< MyInterval >* intervals,
00806 MyNT width, const MyPointSet& points,
00807 MyNT& maxStart, MyNT& maxEnd);
00808
00809
00810
00811
00812 void mergeContainedPoints(const vector< int >& pts);
00813
00814 void computeBoundedDimensions(const MyPointSet& points,
00815 const MyPoint& seed, const vector< int >&discIndices,
00816 const vector< MyNT > &lows, const vector< MyNT > &highs,
00817 vector< bool >& bounded) const;
00818 void computeBoundedDimensions(const MyPointSet& points,
00819 const MyPoint& seed, const vector< int >&discIndices,
00820 const vector< MyNT > &width,
00821 vector< bool >& bounded) const;
00822
00823 void improveDimensions(const MyPointSet& points, MyClusterParams& params,
00824 vector< vector< MyInterval > >* intervals);
00825
00826
00827 private:
00828 MyBox box;
00829 unsigned int numContained;
00830
00831
00832
00833 vector< int > containedPoints;
00834 vector< int > realContainedPoints;
00835
00836 #ifdef USE_CLUSTER_DETAILS
00837 MyClusterDetails* details;
00838 #endif
00839
00840 bool filledIn;
00841
00842
00843
00844
00845
00846
00847 bool pointsFlipped;
00848 };
00849
00850
00851
00852 struct MyClusterLess :
00853 public binary_function< const MyCluster&, const MyCluster&, bool >
00854 {
00855 bool operator()(const MyCluster &lhs, const MyCluster &rhs) const
00856 {
00857 return(lhs < rhs);
00858 }
00859 };
00860
00861 struct MyClusterMore :
00862 public binary_function< const MyCluster&, const MyCluster&, bool >
00863 {
00864 bool operator()(const MyCluster &lhs, const MyCluster &rhs) const
00865 {
00866 return(lhs > rhs);
00867 }
00868 };
00869
00870
00871
00872
00873 struct MyClusterSet
00874 {
00875 public:
00876 MyClusterSet(int max = 0)
00877
00878 : clusters(), maxNum(max), minMeasure(0), minPtr()
00879 {}
00880
00881 ~MyClusterSet()
00882 {}
00883
00884 int size() const
00885 {
00886 return(clusters.size());
00887 }
00888 int maxSize() const
00889 {
00890 return(maxNum);
00891 }
00892 bool empty() const
00893 {
00894 return(clusters.empty());
00895 }
00896
00897 bool full() const
00898 {
00899 return(maxNum == clusters.size());
00900 }
00901 bool insert(const MyCluster &cluster, const MyPointSet &points,
00902 const MyClusterParams ¶ms);
00903 void print(ostream &ostr) const;
00904 void printAll(ostream &ostr, const MyPointSet &points,
00905 MyClusterParams ¶ms) const;
00906 void printAllTypes(ostream &ostr, const MyPointSet &points) const;
00907
00908
00909
00910 private:
00911
00912 private:
00913
00914
00915 typedef set< MyCluster, MyClusterMore > _MyClusterSet;
00916
00917 set< MyCluster, MyClusterMore > clusters;
00918
00919
00920 int maxNum;
00921
00922 MyNT minMeasure;
00923
00924 _MyClusterSet::iterator minPtr;
00925
00926 };
00927
00928
00929
00930 #endif // _PROJCLUSTER_H