Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
enrichment-algorithm.h
00001 /**************************************************************************
00002  * Copyright (c) 2004-2011 T. M. Murali                                   *
00003  * Copyright (c) 2007-2011 Naveed Massjouni                               *
00004  * Copyright (c) 2009-2011 Christopher L. Poirel                          *
00005  * Copyright (c) 2011 Phillip Whisenhunt                                  *
00006  * Copyright (c) 2011 David Badger                                        *
00007  * Copyright (c) 2010 Clifford Conley Owens III                           *
00008  *                                                                        *
00009  * This file is part of Biorithm.                                         *
00010  *                                                                        *
00011  * Biorithm is free software: you can redistribute it and/or modify       *
00012  * it under the terms of the GNU General Public License as published by   *
00013  * the Free Software Foundation, either version 3 of the License, or      *
00014  * (at your option) any later version.                                    *
00015  *                                                                        *
00016  * Biorithm is distributed in the hope that it will be useful,            *
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00019  * GNU General Public License for more details.                           *
00020  *                                                                        *
00021  * You should have received a copy of the GNU General Public License      *
00022  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00023  *                                                                        *
00024  **************************************************************************/
00025 
00035 // Purpose: define classes that implement various algorithms for
00036 // functional enrichment.
00037 
00038 #ifndef _ENRICHMENT_ALGORITHM_H
00039 #define _ENRICHMENT_ALGORITHM_H
00040 
00041 #include "graph.h"
00042 #include "old-annotations.h"
00043 
00044 #include "../libexpression/point.h"
00045 #include "constants.h"
00046 // Test for GCC >= 4.3. from http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html.
00047 // See definition of GCC_VERSION macro in libutil/constants.h
00048 #if GCC_VERSION >= 40300
00049 // see http://gcc.gnu.org/gcc-4.3/changes.html
00050 #include <tr1/unordered_set>
00051 #include <tr1/unordered_map>
00052 using namespace std::tr1;
00053 #else
00054 #include <ext/hash_map>
00055 using namespace __gnu_cxx;
00056 #define unordered_set hash_set;
00057 #define unordered_map hash_map;
00058 #endif
00059 
00060 using namespace __gnu_cxx;
00061 
00069 class LogHelper
00070 {
00071 private:
00072   mutable vector<double> _logFactCache;
00073 public:
00074   LogHelper();
00075   double factorial(unsigned int) const;
00076   double factorialLog(unsigned int) const;
00077   double factorialEstimate(unsigned int) const;
00078   double factorialEstimateLog(unsigned int) const;
00079 };
00080 
00081 
00092 class FunctionalEnrichmentAlgorithm 
00093 {
00094 protected:
00096   set<MyNodeId> _clusterGenes;
00097 
00099   set<MyNodeId> _universeGenes;
00100 
00102   MyAnnotations& _annotations;
00103 
00105   const GeneOntology *_go;
00106 
00108   string _category;
00109 
00110 
00112   // This is declared as static because the LogHelper caches factorial logs
00113   // as they are computed. Now when any instance of FunctionalEnrichmentAlgorithm
00114   // is instantiated, it can share this cache (and the associated LogHelper).
00115   static LogHelper _logHelper;
00116 
00117   
00118 public:
00128   FunctionalEnrichmentAlgorithm(const set< MyNodeId >& clusterGenes, const set< MyNodeId >& universeGenes,
00129                                 MyAnnotations& annotations, const GeneOntology& go, string category);
00130 
00138   FunctionalEnrichmentAlgorithm(MyAnnotations& annotations, const GeneOntology& go, string category)
00139     : _annotations(annotations), _go(&go), _category(category) {}
00140 
00142   virtual ~FunctionalEnrichmentAlgorithm()
00143     {}
00144 
00150   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00151 
00156   virtual void computeEnrichmentForFunction(const string funcType, const string funcId) = 0;
00157 
00162   virtual void printEnrichmentResults(ostream &ostr) = 0;
00163 
00164 };
00165 
00166 
00172 class RandomFunctionNetworkBasedEnrichmentAlgorithm: public FunctionalEnrichmentAlgorithm
00173 {
00174 private:
00175 
00177   MyGraph _clusterGraph;
00179   MyGraph _universeGraph;
00180 
00186 //  struct randomFunctionEnrichmentResult{
00187 //    string functionId;
00188 //    string functionType;
00189 //    unsigned int v_f;
00190 //    unsigned int c_f;
00191 //    double lexComponentsByNodesPValue;
00192 //    double lexComponentsByEdgesPValue;
00193 //    double densityPValue;
00194 //  };
00195 
00196   struct randomFunctionEnrichmentResult{
00197     string functionId;
00198     string functionType;
00199     unsigned int v_f;
00200     unsigned int c_f;
00201     double lexComponentsByNodesPValue;
00202     double lexComponentsByEdgesPValue;
00203     double analyticalHypPValue;
00204     double empiricalHypPValue;
00205     double densityPValue;
00206   };
00207 
00211   vector< randomFunctionEnrichmentResult > _enrichmentResults;
00212 
00216   map< string, vector< int > > _componentSizesByNodes;
00217 
00221   map< string, vector< int > > _componentSizesByEdges;
00222 
00226   map< string, double > _densities;
00227 
00232   map< string, pair< unsigned int, unsigned int > > _functionSizes;
00233 
00241   map< string, pair< int, int > > _lexComponentsByNodesRecords;
00242 
00250   map< string, pair< int, int > > _lexComponentsByEdgesRecords;
00251 
00259   map< string, pair< int, int > > _empiricalHypRecords;
00260 
00261 
00269   map< string, pair< int, int > > _densityRecords;
00270 
00273   unsigned int _numRandomGraphs;
00274 
00275   // a helper function for initializing _componentSizesByNodes, _densities
00276   void initialize(const set< string >& functionsToProcess);
00277 
00278 public:
00292   RandomFunctionNetworkBasedEnrichmentAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00293                  MyAnnotations& annotations, const GeneOntology& go, string category,
00294                  unsigned int numRandomGraphs);
00295 
00297   virtual ~RandomFunctionNetworkBasedEnrichmentAlgorithm()
00298     {}
00299 
00301   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00302 
00303 
00304   virtual void computeEnrichmentForFunction(const string funcType, const string funcId);
00305 
00310   void printEnrichmentResult(ostream &ostr, randomFunctionEnrichmentResult result);
00311 
00316   virtual void printEnrichmentResults(ostream &ostr);
00317 
00318 };
00319 
00320 
00325 class RandomUniverseNetworkBasedEnrichmentAlgorithm: public FunctionalEnrichmentAlgorithm
00326 {
00327 private:
00328 
00330   MyGraph _clusterGraph;
00332   MyGraph _universeGraph;
00334   MyGraph _randomGraph;
00335 
00341   struct randomUniverseEnrichmentResult{
00342     string functionId;
00343     string functionType;
00344     unsigned int v_f;
00345     unsigned int c_f;
00346     double lexComponentsByNodesPValue;
00347     double lexComponentsByEdgesPValue;
00348     double analyticalHypPValue;
00349     double empiricalHypPValue;
00350     double densityPValue;
00351   };
00352 
00356   vector< randomUniverseEnrichmentResult > _enrichmentResults;
00357 
00361   map< string, vector< int > > _componentSizesByNodes;
00362 
00366   map< string, vector< int > > _componentSizesByEdges;
00367 
00371   map< string, double > _densities;
00372 
00377   map< string, pair< unsigned int, unsigned int > > _functionSizes;
00378 
00386   map< string, pair< int, int > > _lexComponentsByNodesRecords;
00387 
00395   map< string, pair< int, int > > _lexComponentsByEdgesRecords;
00396 
00404   map< string, pair< int, int > > _empiricalHypRecords;
00405 
00413   map< string, pair< int, int > > _densityRecords;
00414 
00417   unsigned int _numRandomGraphs;
00418 
00421   unsigned int _numRandomizationIterations;
00422 
00423   void updateEnrichmentForFunction(string funcType, string funcId);
00424 
00425   // a helper function for initializing _componentSizesByNodes, _densities
00426   void initialize(const set< string >& functionsToProcess);
00427 
00428 public:
00442   RandomUniverseNetworkBasedEnrichmentAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00443                  MyAnnotations& annotations, const GeneOntology& go, string category,
00444                  unsigned int numRandomGraphs, unsigned int numRandomizationIterations);
00445 
00447   virtual ~RandomUniverseNetworkBasedEnrichmentAlgorithm()
00448     {}
00449 
00451   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00452 
00453   void computeEnrichmentForFunction(const string funcType, const string funcId);
00454 
00459   void printEnrichmentResult(ostream &ostr, randomUniverseEnrichmentResult result);
00460 
00465   virtual void printEnrichmentResults(ostream &ostr);
00466 
00467 };
00468 
00469 
00473 class HypergeometricAlgorithm: public FunctionalEnrichmentAlgorithm
00474 {
00475 public:
00481   struct hypergeometricEnrichmentResult{
00482     string functionId;
00483     string functionType;
00484     unsigned int v_f;
00485     unsigned int c_f;
00486     double hypergeometricPvalue;
00487   };
00488 
00489 private:
00490 
00493   vector< hypergeometricEnrichmentResult > _hypergeometricEnrichmentResults;
00494 
00495 public:
00496 
00505   HypergeometricAlgorithm(const set<string> &clusterGraph, const set<string> &universeSet,
00506                           MyAnnotations &annotations, const GeneOntology &go,
00507                           string category);
00508 
00509 
00518   HypergeometricAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00519                           MyAnnotations& annotations, const GeneOntology& go,
00520                           string category);
00521 
00523   virtual ~HypergeometricAlgorithm()
00524     {}
00525 
00531   virtual void computeEnrichmentForFunction(const string funcType, const string funcId);
00532 
00537   virtual void printEnrichmentResults(ostream &ostr);
00538 
00547   static long double getHypergeometricPvalue(int v, int c, int v_f, int c_f);
00548 };
00549 
00550 
00551 
00556 class PAGEAlgorithm : public FunctionalEnrichmentAlgorithm
00557 {
00558 
00559 private:
00560 
00561   MyPointSet _universePoints;
00562 
00563   map< string, unsigned int > _indexMap;
00564 
00565   double _mean;
00566   double _stddev;
00567 
00568   double getValue(string gene)
00569     {
00570       return _universePoints[0][_indexMap[gene]];
00571     }
00572 
00573   struct enrichmentResult{
00574     string funcId;
00575     string funcType;
00576     unsigned int v_f;
00577     double zscore;
00578     double pval;
00579   };
00580 
00581   vector< enrichmentResult > _enrichmentResults;
00582 
00583 
00584 public:
00585 
00592   PAGEAlgorithm(const MyPointSet &universePoints,
00593                 MyAnnotations &annotations, const GeneOntology &go, string category);
00594 
00596   virtual ~PAGEAlgorithm()
00597     {}
00598 
00603   virtual void computeEnrichmentForFunction(const string funcType, const string funcId);
00604 
00609   virtual void printEnrichmentResults(ostream &ostr);
00610 };
00611 
00612 
00613 
00618 class GenGOAlgorithm: public FunctionalEnrichmentAlgorithm
00619 {
00620 private:
00621 
00623   double _p;
00625   double _q;
00627   double _penalty;
00628 
00632   struct FunctionInfo
00633   {
00634     set<MyNodeId> clusterGenes;
00635     int numNotClusterGenes;
00636   };
00637   map<string, FunctionInfo> _allFunctionInfo;
00638   vector<string> _activeFunctions;
00639   set<string> _notActiveFunctions;
00640 
00642   map<string, int> _clusterGenesToNumActiveFunctions;
00643 
00645   int _numClusterGenesWithSomeActiveFunction;
00646 
00648   int _numClusterGenesWithNoActiveFunction;
00649 
00651   int _numEdgesNotClusterGeneWithActiveFunction;
00652 
00654   int _numEdgesNotClusterGeneWithNotActiveFunction;
00655 
00659   void _activateFunction(string func);
00660 
00664   void _deactivateFunction(string func);
00665 
00667   double _calcLogLikelihood();
00668 
00669 public:
00682   GenGOAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00683                  MyAnnotations& annotations, const GeneOntology& go, string category,
00684                  double p, double q, double penalty);
00686   virtual ~GenGOAlgorithm()
00687     {}
00688 
00692   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00693 
00694   // not used
00695   virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00696     {}
00697 
00702   virtual void printEnrichmentResults(ostream &ostr);
00703 
00704 };
00705 
00706 
00719 class GroupHypAlgorithm: public FunctionalEnrichmentAlgorithm
00720 {
00721 private:
00722   MyGraph _universeGraph;
00723   MyGraph _clusterGraph;
00724 
00726   map< string, set<MyNodeId> > _functionToAnnotatedClusterGenes;
00728   map< string, set<MyNodeId> > _functionToAnnotatedNotClusterGenes;
00729 
00731   vector<string> _activeFunctions;
00733   set<string> _notActiveFunctions;
00734 
00736   map<string, int> _clusterGenesToNumActiveFunctions;
00738   map<string, int> _notClusterGenesToNumActiveFunctions;
00739 
00740   int _numClusterGenesWithSomeActiveFunction;
00741   int _numNotClusterGenesWithSomeActiveFunction;
00742 
00746   void _activateFunction(string func);
00747 
00751   void _deactivateFunction(string func);
00752 
00755   double _calcGroupHypPvalue() const;
00756 
00757 public:
00767   GroupHypAlgorithm(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00768                     MyAnnotations& annotations, const GeneOntology& go, string category);
00769 
00771   virtual ~GroupHypAlgorithm()
00772     {}
00773 
00774   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00775 
00776   // not used
00777   virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00778     {}
00779 
00784   virtual void printEnrichmentResults(ostream &ostr);
00785 
00786 };
00787 
00788 
00789 
00790 
00795 class EdgeGroupHyp: public FunctionalEnrichmentAlgorithm
00796 {
00797 private:
00798   MyGraph _universeGraph;
00799   MyGraph _clusterGraph;
00800 
00801   set<string> _clusterEdges;
00802   set<string> _notClusterEdges;
00803 
00804 
00805   map< string, set<string> > _functionToAnnotatedClusterGenes;
00806 
00808   map< string, set<string> > _functionToAnnotatedClusterEdges;
00810   map< string, set<string> > _functionToAnnotatedNotClusterEdges;
00811 
00813   vector<string> _activeFunctions;
00815   set<string> _notActiveFunctions;
00816 
00818   map<string, int> _clusterEdgesToNumActiveFunctions;
00820   map<string, int> _notClusterEdgesToNumActiveFunctions;
00821 
00822   int _numClusterEdgesWithSomeActiveFunction;
00823   int _numNotClusterEdgesWithSomeActiveFunction;
00824 
00825   void _initialize();
00826 
00830   void _activateFunction(string func);
00831 
00835   void _deactivateFunction(string func);
00836 
00839   long double _calcGroupHypPvalue() const;
00840 
00841 public:
00851   EdgeGroupHyp(const MyGraph& clusterGraph, const MyGraph& universeGraph,
00852                MyAnnotations& annotations, const GeneOntology& go, string category);
00853 
00855   virtual ~EdgeGroupHyp()
00856     {}
00857 
00858   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00859 
00860   // not used
00861   virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00862     {}
00863 
00868   virtual void printEnrichmentResults(ostream &ostr);
00869 
00870 };
00871 
00872 
00873 
00874 
00875 
00880 class FuncAssociate: public FunctionalEnrichmentAlgorithm
00881 {
00882 private:
00883 
00885   MyPointSet _clusterRanks;
00886 
00887   vector< pair< MyNodeId, MyNT > > _currClusterRank;
00888   string _currClusterId;
00889 
00890   ofstream _outputStream;
00891 
00893   struct FunctionInfo{
00894     BioFunction function;
00895     set< MyNodeId > funcGenesInUniverse;
00896     unsigned int numFuncGenesInCluster;
00897     double hypPval;
00898     double bestHypPval;
00899     set< MyNodeId > genesInBestPrefix;
00900     set< MyNodeId > funcGenesInBestPrefix;
00901     pair< unsigned int, unsigned int > adjustedPval;
00902   };
00903 
00909   void computeBestPval(const BioFunction* function);
00910 
00913   void computeAdjustedPvals();
00914 
00915   // Updates _currClusterRank with the ranking given by sample i in _clusterRanks
00916   void updateCurrClusterInfo(unsigned int i);
00917 
00918   void printHeader(string sample);
00919 
00920   void printEnrichmentForFuncInfo(FunctionInfo &fi);
00921 
00922   /*
00923   void printEnrichmentForSample(ostream &ostr, string sample);
00924   */
00925 
00926 public:
00940   FuncAssociate(const MyPointSet &clusterPoints, const set<string> &universeSet,
00941                 MyAnnotations& annotations, const GeneOntology& go, string category,
00942                 string outfile);
00943 
00945   virtual ~FuncAssociate()
00946     {}
00947 
00953   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
00954 
00955   // not used
00956   virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
00957    {}
00958 
00963   virtual void printEnrichmentResults(ostream &ostr);
00964 
00965 };
00966 
00967 
00968 
00969 
00974 class SetCoverEnrichment: public FunctionalEnrichmentAlgorithm
00975 {
00976 private:
00977 
00978   MyGraph _universeGraph;
00979   MyGraph _clusterGraph;
00980 
00981   set<string> _currClusterEdges;
00982 
00983   struct FuncStats{
00984     set<string> inEdges;
00985     set<string> outEdges;
00986     MyNT score;
00987   };
00988 
00989   map<string, FuncStats> _stats;
00990 
00991   vector<string> _selectedFunctions;
00992   set<string> _selectedInEdges;
00993   set<string> _selectedOutEdges;
00994 
00995 
00996 
00997   void _initialize();
00998 
00999   MyNT _score(unsigned int in, unsigned int out);
01000 
01001   string _getHighestScoringFunc();
01002 
01003   void _removeFunc(string func);
01004 
01005 
01006 public:
01016   SetCoverEnrichment(const MyGraph& clusterGraph, const MyGraph& universeGraph,
01017                  MyAnnotations& annotations, const GeneOntology& go, string category);
01018 
01020   virtual ~SetCoverEnrichment()
01021     {}
01022 
01023 
01029   virtual void computeEnrichedFunctions(const set< string >& functionsToProcess);
01030 
01032   virtual void computeEnrichmentForFunction(const string funcType, const string funcId)
01033     { }
01034 
01039   virtual void printEnrichmentResults(ostream &ostr);
01040 
01041 };
01042 
01043 
01044 
01045 
01046 
01047 
01048 
01051 class FunctionalEnrichmentAlgorithmDispatcher
01052 {
01053 public:
01054 
01055   enum FunctionalEnrichmentAlgorithmType
01056     {
01057       NO_ALGORITHM,
01058       FUNCASSOCIATE,
01059       GENGO,
01060       GROUP_HYP,
01061       EDGE_GROUP_HYP,
01062       HYPERGEOMETRIC,
01063       PAGE,
01064       RANDOM_FUNCTION,
01065       RANDOM_UNIVERSE,
01066       SET_COVER,
01067       MAX_ALGORITHM
01068     };
01069 
01070   static FunctionalEnrichmentAlgorithm* getFunctionalEnrichmentAlgorithm(
01071         unsigned int algorithmType,
01072         const set<MyNodeId> &clusterSet, const MyGraph &clusterNet, const MyPointSet &clusterPoints,
01073         const set<MyNodeId> &universeSet, const MyGraph &universeNet, const MyPointSet &universePoints,
01074         MyAnnotations &annotations, const GeneOntology &go, string category,
01075         unsigned int numRandomNets, unsigned int numRandomizationIterations,
01076         double genGOp, double genGOq, double genGOpenalty, string outfile);
01077 
01078 };
01079 
01080 
01083 class EnrichmentPostprocess
01084 {
01085 private:
01086 
01087   struct enrichmentResult{
01088     GOFunction* function;
01089     double lexComponentsByNodesPValue;
01090     double lexComponentsByEdgesPValue;
01091     double densityPValue;
01092   };
01093 
01095   GeneOntology * _go;
01096 
01098   vector< GOFunction* > _topoSortedGOFunctions;
01099 
01101   double _pvalCutoff;
01102 
01105   map< string, enrichmentResult > _results;
01106 
01113   map< string, set< string > > _functionsBelowCutoff;
01114 
01115 
01119   map< string, set< string > > _leavesBelowCutoff;
01120 
01121 public:
01123   EnrichmentPostprocess(GeneOntology& go, string resultsFile, double pvalCutoff);
01124 
01126   ~EnrichmentPostprocess()
01127     {}
01128 
01133   void readEnrichment(string inFile);
01134 
01135   void print(ostream &ostr);
01136 
01137 };
01138 
01139 
01140 #endif // _ENRICHMENT_ALGORITHM_H
 All Classes Functions Variables Typedefs Friends