00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00035
00036
00037
00038 #ifndef _ENRICHMENT_ALGORITHM_H
00039 #define _ENRICHMENT_ALGORITHM_H
00040
00041 #include "graph.h"
00042 #include "old-annotations.h"
00043
00044 #include "../libexpression/point.h"
00045 #include "constants.h"
00046
00047
00048 #if GCC_VERSION >= 40300
00049
00050 #include <tr1/unordered_set>
00051 #include <tr1/unordered_map>
00052 using namespace std::tr1;
00053 #else
00054 #include <ext/hash_map>
00055 using namespace __gnu_cxx;
00056 #define unordered_set hash_set;
00057 #define unordered_map hash_map;
00058 #endif
00059
00060 using namespace __gnu_cxx;
00061
00069 class LogHelper
00070 {
00071 private:
00072 mutable vector<double> _logFactCache;
00073 public:
00074 LogHelper();
00075 double factorial(unsigned int) const;
00076 double factorialLog(unsigned int) const;
00077 double factorialEstimate(unsigned int) const;
00078 double factorialEstimateLog(unsigned int) const;
00079 };
00080
00081
00092 class FunctionalEnrichmentAlgorithm
00093 {
00094 protected:
00096 set<MyNodeId> _clusterGenes;
00097
00099 set<MyNodeId> _universeGenes;
00100
00102 MyAnnotations& _annotations;
00103
00105 const GeneOntology *_go;
00106
00108 string _category;
00109
00110
00112
00113
00114
00115 static LogHelper _logHelper;
00116
00117
00118 public:
00128 FunctionalEnrichmentAlgorithm(const set< MyNodeId >& clusterGenes, const set< MyNodeId >& universeGenes,
00129 MyAnnotations& annotations, const GeneOntology& go, string category);
00130
00138 FunctionalEnrichmentAlgorithm(MyAnnotations& annotations, const GeneOntology& go, string category)
00139 : _annotations(annotations), _go(&go), _category(category) {}
00140
00142 virtual ~FunctionalEnrichmentAlgorithm()
00143 {}
00144
00150 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00151
00156 virtual void computeEnrichmentForFunction(const string funcType, const string funcId) = 0;
00157
00162 virtual void printEnrichmentResults(ostream &ostr) = 0;
00163
00164 };
00165
00166
00172 class RandomFunctionNetworkBasedEnrichmentAlgorithm: public FunctionalEnrichmentAlgorithm
00173 {
00174 private:
00175
00177 MyGraph _clusterGraph;
00179 MyGraph _universeGraph;
00180
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196 struct randomFunctionEnrichmentResult{
00197 string functionId;
00198 string functionType;
00199 unsigned int v_f;
00200 unsigned int c_f;
00201 double lexComponentsByNodesPValue;
00202 double lexComponentsByEdgesPValue;
00203 double analyticalHypPValue;
00204 double empiricalHypPValue;
00205 double densityPValue;
00206 };
00207
00211 vector< randomFunctionEnrichmentResult > _enrichmentResults;
00212
00216 map< string, vector< int > > _componentSizesByNodes;
00217
00221 map< string, vector< int > > _componentSizesByEdges;
00222
00226 map< string, double > _densities;
00227
00232 map< string, pair< unsigned int, unsigned int > > _functionSizes;
00233
00241 map< string, pair< int, int > > _lexComponentsByNodesRecords;
00242
00250 map< string, pair< int, int > > _lexComponentsByEdgesRecords;
00251
00259 map< string, pair< int, int > > _empiricalHypRecords;
00260
00261
00269 map< string, pair< int, int > > _densityRecords;
00270
00273 unsigned int _numRandomGraphs;
00274
00275
00276 void initialize(const set< string >& functionsToProcess);
00277
00278 public:
00292 RandomFunctionNetworkBasedEnrichmentAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00293 MyAnnotations& annotations, const GeneOntology& go, string category,
00294 unsigned int numRandomGraphs);
00295
00297 virtual ~RandomFunctionNetworkBasedEnrichmentAlgorithm()
00298 {}
00299
00301 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00302
00303
00304 virtual void computeEnrichmentForFunction(const string funcType, const string funcId);
00305
00310 void printEnrichmentResult(ostream &ostr, randomFunctionEnrichmentResult result);
00311
00316 virtual void printEnrichmentResults(ostream &ostr);
00317
00318 };
00319
00320
00325 class RandomUniverseNetworkBasedEnrichmentAlgorithm: public FunctionalEnrichmentAlgorithm
00326 {
00327 private:
00328
00330 MyGraph _clusterGraph;
00332 MyGraph _universeGraph;
00334 MyGraph _randomGraph;
00335
00341 struct randomUniverseEnrichmentResult{
00342 string functionId;
00343 string functionType;
00344 unsigned int v_f;
00345 unsigned int c_f;
00346 double lexComponentsByNodesPValue;
00347 double lexComponentsByEdgesPValue;
00348 double analyticalHypPValue;
00349 double empiricalHypPValue;
00350 double densityPValue;
00351 };
00352
00356 vector< randomUniverseEnrichmentResult > _enrichmentResults;
00357
00361 map< string, vector< int > > _componentSizesByNodes;
00362
00366 map< string, vector< int > > _componentSizesByEdges;
00367
00371 map< string, double > _densities;
00372
00377 map< string, pair< unsigned int, unsigned int > > _functionSizes;
00378
00386 map< string, pair< int, int > > _lexComponentsByNodesRecords;
00387
00395 map< string, pair< int, int > > _lexComponentsByEdgesRecords;
00396
00404 map< string, pair< int, int > > _empiricalHypRecords;
00405
00413 map< string, pair< int, int > > _densityRecords;
00414
00417 unsigned int _numRandomGraphs;
00418
00421 unsigned int _numRandomizationIterations;
00422
00423 void updateEnrichmentForFunction(string funcType, string funcId);
00424
00425
00426 void initialize(const set< string >& functionsToProcess);
00427
00428 public:
00442 RandomUniverseNetworkBasedEnrichmentAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00443 MyAnnotations& annotations, const GeneOntology& go, string category,
00444 unsigned int numRandomGraphs, unsigned int numRandomizationIterations);
00445
00447 virtual ~RandomUniverseNetworkBasedEnrichmentAlgorithm()
00448 {}
00449
00451 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00452
00453 void computeEnrichmentForFunction(const string funcType, const string funcId);
00454
00459 void printEnrichmentResult(ostream &ostr, randomUniverseEnrichmentResult result);
00460
00465 virtual void printEnrichmentResults(ostream &ostr);
00466
00467 };
00468
00469
00473 class HypergeometricAlgorithm: public FunctionalEnrichmentAlgorithm
00474 {
00475 public:
00481 struct hypergeometricEnrichmentResult{
00482 string functionId;
00483 string functionType;
00484 unsigned int v_f;
00485 unsigned int c_f;
00486 double hypergeometricPvalue;
00487 };
00488
00489 private:
00490
00493 vector< hypergeometricEnrichmentResult > _hypergeometricEnrichmentResults;
00494
00495 public:
00496
00505 HypergeometricAlgorithm(const set<string> &clusterGraph, const set<string> &universeSet,
00506 MyAnnotations &annotations, const GeneOntology &go,
00507 string category);
00508
00509
00518 HypergeometricAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00519 MyAnnotations& annotations, const GeneOntology& go,
00520 string category);
00521
00523 virtual ~HypergeometricAlgorithm()
00524 {}
00525
00531 virtual void computeEnrichmentForFunction(const string funcType, const string funcId);
00532
00537 virtual void printEnrichmentResults(ostream &ostr);
00538
00547 static long double getHypergeometricPvalue(int v, int c, int v_f, int c_f);
00548 };
00549
00550
00551
00556 class PAGEAlgorithm : public FunctionalEnrichmentAlgorithm
00557 {
00558
00559 private:
00560
00561 MyPointSet _universePoints;
00562
00563 map< string, unsigned int > _indexMap;
00564
00565 double _mean;
00566 double _stddev;
00567
00568 double getValue(string gene)
00569 {
00570 return _universePoints[0][_indexMap[gene]];
00571 }
00572
00573 struct enrichmentResult{
00574 string funcId;
00575 string funcType;
00576 unsigned int v_f;
00577 double zscore;
00578 double pval;
00579 };
00580
00581 vector< enrichmentResult > _enrichmentResults;
00582
00583
00584 public:
00585
00592 PAGEAlgorithm(const MyPointSet &universePoints,
00593 MyAnnotations &annotations, const GeneOntology &go, string category);
00594
00596 virtual ~PAGEAlgorithm()
00597 {}
00598
00603 virtual void computeEnrichmentForFunction(const string funcType, const string funcId);
00604
00609 virtual void printEnrichmentResults(ostream &ostr);
00610 };
00611
00612
00613
00618 class GenGOAlgorithm: public FunctionalEnrichmentAlgorithm
00619 {
00620 private:
00621
00623 double _p;
00625 double _q;
00627 double _penalty;
00628
00632 struct FunctionInfo
00633 {
00634 set<MyNodeId> clusterGenes;
00635 int numNotClusterGenes;
00636 };
00637 map<string, FunctionInfo> _allFunctionInfo;
00638 vector<string> _activeFunctions;
00639 set<string> _notActiveFunctions;
00640
00642 map<string, int> _clusterGenesToNumActiveFunctions;
00643
00645 int _numClusterGenesWithSomeActiveFunction;
00646
00648 int _numClusterGenesWithNoActiveFunction;
00649
00651 int _numEdgesNotClusterGeneWithActiveFunction;
00652
00654 int _numEdgesNotClusterGeneWithNotActiveFunction;
00655
00659 void _activateFunction(string func);
00660
00664 void _deactivateFunction(string func);
00665
00667 double _calcLogLikelihood();
00668
00669 public:
00682 GenGOAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00683 MyAnnotations& annotations, const GeneOntology& go, string category,
00684 double p, double q, double penalty);
00686 virtual ~GenGOAlgorithm()
00687 {}
00688
00692 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00693
00694
00695 virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00696 {}
00697
00702 virtual void printEnrichmentResults(ostream &ostr);
00703
00704 };
00705
00706
00719 class GroupHypAlgorithm: public FunctionalEnrichmentAlgorithm
00720 {
00721 private:
00722 MyGraph _universeGraph;
00723 MyGraph _clusterGraph;
00724
00726 map< string, set<MyNodeId> > _functionToAnnotatedClusterGenes;
00728 map< string, set<MyNodeId> > _functionToAnnotatedNotClusterGenes;
00729
00731 vector<string> _activeFunctions;
00733 set<string> _notActiveFunctions;
00734
00736 map<string, int> _clusterGenesToNumActiveFunctions;
00738 map<string, int> _notClusterGenesToNumActiveFunctions;
00739
00740 int _numClusterGenesWithSomeActiveFunction;
00741 int _numNotClusterGenesWithSomeActiveFunction;
00742
00746 void _activateFunction(string func);
00747
00751 void _deactivateFunction(string func);
00752
00755 double _calcGroupHypPvalue() const;
00756
00757 public:
00767 GroupHypAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00768 MyAnnotations& annotations, const GeneOntology& go, string category);
00769
00771 virtual ~GroupHypAlgorithm()
00772 {}
00773
00774 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00775
00776
00777 virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00778 {}
00779
00784 virtual void printEnrichmentResults(ostream &ostr);
00785
00786 };
00787
00788
00789
00790
00795 class EdgeGroupHyp: public FunctionalEnrichmentAlgorithm
00796 {
00797 private:
00798 MyGraph _universeGraph;
00799 MyGraph _clusterGraph;
00800
00801 set<string> _clusterEdges;
00802 set<string> _notClusterEdges;
00803
00804
00805 map< string, set<string> > _functionToAnnotatedClusterGenes;
00806
00808 map< string, set<string> > _functionToAnnotatedClusterEdges;
00810 map< string, set<string> > _functionToAnnotatedNotClusterEdges;
00811
00813 vector<string> _activeFunctions;
00815 set<string> _notActiveFunctions;
00816
00818 map<string, int> _clusterEdgesToNumActiveFunctions;
00820 map<string, int> _notClusterEdgesToNumActiveFunctions;
00821
00822 int _numClusterEdgesWithSomeActiveFunction;
00823 int _numNotClusterEdgesWithSomeActiveFunction;
00824
00825 void _initialize();
00826
00830 void _activateFunction(string func);
00831
00835 void _deactivateFunction(string func);
00836
00839 long double _calcGroupHypPvalue() const;
00840
00841 public:
00851 EdgeGroupHyp(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00852 MyAnnotations& annotations, const GeneOntology& go, string category);
00853
00855 virtual ~EdgeGroupHyp()
00856 {}
00857
00858 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00859
00860
00861 virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00862 {}
00863
00868 virtual void printEnrichmentResults(ostream &ostr);
00869
00870 };
00871
00872
00873
00874
00875
00880 class FuncAssociate: public FunctionalEnrichmentAlgorithm
00881 {
00882 private:
00883
00885 MyPointSet _clusterRanks;
00886
00887 vector< pair< MyNodeId, MyNT > > _currClusterRank;
00888 string _currClusterId;
00889
00890 ofstream _outputStream;
00891
00893 struct FunctionInfo{
00894 BioFunction function;
00895 set< MyNodeId > funcGenesInUniverse;
00896 unsigned int numFuncGenesInCluster;
00897 double hypPval;
00898 double bestHypPval;
00899 set< MyNodeId > genesInBestPrefix;
00900 set< MyNodeId > funcGenesInBestPrefix;
00901 pair< unsigned int, unsigned int > adjustedPval;
00902 };
00903
00909 void computeBestPval(const BioFunction* function);
00910
00913 void computeAdjustedPvals();
00914
00915
00916 void updateCurrClusterInfo(unsigned int i);
00917
00918 void printHeader(string sample);
00919
00920 void printEnrichmentForFuncInfo(FunctionInfo &fi);
00921
00922
00923
00924
00925
00926 public:
00940 FuncAssociate(const MyPointSet &clusterPoints, const set<string> &universeSet,
00941 MyAnnotations& annotations, const GeneOntology& go, string category,
00942 string outfile);
00943
00945 virtual ~FuncAssociate()
00946 {}
00947
00953 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00954
00955
00956 virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00957 {}
00958
00963 virtual void printEnrichmentResults(ostream &ostr);
00964
00965 };
00966
00967
00968
00969
00974 class SetCoverEnrichment: public FunctionalEnrichmentAlgorithm
00975 {
00976 private:
00977
00978 MyGraph _universeGraph;
00979 MyGraph _clusterGraph;
00980
00981 set<string> _currClusterEdges;
00982
00983 struct FuncStats{
00984 set<string> inEdges;
00985 set<string> outEdges;
00986 MyNT score;
00987 };
00988
00989 map<string, FuncStats> _stats;
00990
00991 vector<string> _selectedFunctions;
00992 set<string> _selectedInEdges;
00993 set<string> _selectedOutEdges;
00994
00995
00996
00997 void _initialize();
00998
00999 MyNT _score(unsigned int in, unsigned int out);
01000
01001 string _getHighestScoringFunc();
01002
01003 void _removeFunc(string func);
01004
01005
01006 public:
01016 SetCoverEnrichment(const MyGraph& clusterGraph, const MyGraph& universeGraph,
01017 MyAnnotations& annotations, const GeneOntology& go, string category);
01018
01020 virtual ~SetCoverEnrichment()
01021 {}
01022
01023
01029 virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
01030
01032 virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
01033 { }
01034
01039 virtual void printEnrichmentResults(ostream &ostr);
01040
01041 };
01042
01043
01044
01045
01046
01047
01048
01051 class FunctionalEnrichmentAlgorithmDispatcher
01052 {
01053 public:
01054
01055 enum FunctionalEnrichmentAlgorithmType
01056 {
01057 NO_ALGORITHM,
01058 FUNCASSOCIATE,
01059 GENGO,
01060 GROUP_HYP,
01061 EDGE_GROUP_HYP,
01062 HYPERGEOMETRIC,
01063 PAGE,
01064 RANDOM_FUNCTION,
01065 RANDOM_UNIVERSE,
01066 SET_COVER,
01067 MAX_ALGORITHM
01068 };
01069
01070 static FunctionalEnrichmentAlgorithm* getFunctionalEnrichmentAlgorithm(
01071 unsigned int algorithmType,
01072 const set<MyNodeId> &clusterSet, const MyGraph &clusterNet, const MyPointSet &clusterPoints,
01073 const set<MyNodeId> &universeSet, const MyGraph &universeNet, const MyPointSet &universePoints,
01074 MyAnnotations &annotations, const GeneOntology &go, string category,
01075 unsigned int numRandomNets, unsigned int numRandomizationIterations,
01076 double genGOp, double genGOq, double genGOpenalty, string outfile);
01077
01078 };
01079
01080
01083 class EnrichmentPostprocess
01084 {
01085 private:
01086
01087 struct enrichmentResult{
01088 GOFunction* function;
01089 double lexComponentsByNodesPValue;
01090 double lexComponentsByEdgesPValue;
01091 double densityPValue;
01092 };
01093
01095 GeneOntology * _go;
01096
01098 vector< GOFunction* > _topoSortedGOFunctions;
01099
01101 double _pvalCutoff;
01102
01105 map< string, enrichmentResult > _results;
01106
01113 map< string, set< string > > _functionsBelowCutoff;
01114
01115
01119 map< string, set< string > > _leavesBelowCutoff;
01120
01121 public:
01123 EnrichmentPostprocess(GeneOntology& go, string resultsFile, double pvalCutoff);
01124
01126 ~EnrichmentPostprocess()
01127 {}
01128
01133 void readEnrichment(string inFile);
01134
01135 void print(ostream &ostr);
01136
01137 };
01138
01139
01140 #endif // _ENRICHMENT_ALGORITHM_H