Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
enrichment.h
00001 /**************************************************************************
00002  * Copyright (c) 2004-2011 T. M. Murali                                   *
00003  *                                                                        *
00004  * This file is part of Biorithm.                                         *
00005  *                                                                        *
00006  * Biorithm is free software: you can redistribute it and/or modify       *
00007  * it under the terms of the GNU General Public License as published by   *
00008  * the Free Software Foundation, either version 3 of the License, or      *
00009  * (at your option) any later version.                                    *
00010  *                                                                        *
00011  * Biorithm is distributed in the hope that it will be useful,            *
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00014  * GNU General Public License for more details.                           *
00015  *                                                                        *
00016  * You should have received a copy of the GNU General Public License      *
00017  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00018  *                                                                        *
00019  **************************************************************************/
00020 
00027 #ifndef _ENRICHMENT_H_
00028 #define _ENRICHMENT_H_
00029 
00030 // for tolower.
00031 #include <cctype>
00032 #include <iostream>
00033 #include <string>
00034 #include <sstream>
00035 #include <map>
00036 #include <set>
00037 #include <vector>
00038 #include <cmath>
00039 #include <algorithm>
00040 #include <utility>
00041 #include <cstdlib>
00042 
00043 using namespace std;
00044 
00045 // copying definition in murali-xmotif/global.h since histogram.* are
00046 // now in libutil and histogram.h includes enrichment.h.
00047 typedef double MyNT;
00048 
00049 
00064 enum LIBENRICHMENT_TEST_TYPE {LIBENRICHMENT_NONE, LIBENRICHMENT_BONFERRONI, LIBENRICHMENT_HOLMS, LIBENRICHMENT_FALSE_DISCOVERY_RATE};
00065 
00066 //map< LIBENRICHMENT_TEST_TYPE, string > _testTypeStrings;
00067 
00069 inline string convertTestTypeToString(LIBENRICHMENT_TEST_TYPE test)
00070 {
00071   if (LIBENRICHMENT_NONE == test)
00072     return("no");
00073   else if (LIBENRICHMENT_BONFERRONI == test)
00074     return("Bonferroni");
00075   else if (LIBENRICHMENT_HOLMS == test)
00076     return("Holms");
00077   else if (LIBENRICHMENT_FALSE_DISCOVERY_RATE == test)
00078     return("False discovery rate");
00079   else
00080     return("");
00081 //  return(_testTypeStrings[test]);
00082 }
00083 
00085 inline string convertTestTypeToShortString(LIBENRICHMENT_TEST_TYPE test)
00086 {
00087   if (LIBENRICHMENT_NONE == test)
00088     return("no");
00089   else if (LIBENRICHMENT_BONFERRONI == test)
00090     return("Bonferroni");
00091   else if (LIBENRICHMENT_HOLMS == test)
00092     return("Holms");
00093   else if (LIBENRICHMENT_FALSE_DISCOVERY_RATE == test)
00094     return("FDR");
00095   else
00096     return("");
00097 //  return(_testTypeStrings[test]);
00098 }
00099 
00114 inline LIBENRICHMENT_TEST_TYPE  convertStringToTestType(string test)
00115 {
00116 //  test = tolower(test);
00117   if ("" == test)
00118     return(LIBENRICHMENT_NONE);
00119   else if ("Bonferroni" == test)
00120     return(LIBENRICHMENT_BONFERRONI);
00121   else if ("Holms" == test)
00122     return(LIBENRICHMENT_HOLMS);
00123   else if ("FDR" == test)
00124     return(LIBENRICHMENT_FALSE_DISCOVERY_RATE);
00125   else
00126     {
00127       cerr << "Unknown enrichment test \"" << test << "\". Will not do any correction for multiple hypothesis testing." << endl;
00128       return(LIBENRICHMENT_NONE);
00129     }
00130   
00131 //  return(_testTypeStrings[test]);
00132 }
00133 
00138 template <class S,class T> struct EnrichmentRecord
00139 {
00141   double score;
00143   T relation;
00145   int gTotal;
00147   int gTrue;
00149   int lTotal;
00151   int lTrue;
00153   set<S> annotated;
00154 
00155   string itoa(int in)
00156     {
00157       stringstream ss;
00158       ss<<in;
00159       string out;
00160       ss>>out;
00161       return out;
00162         }
00163   void print(ostream &fstr)
00164     {
00165       // local is the cluster. global is the universe.
00166       fstr << lTrue << "\t" << lTotal << "\t" << gTrue << "\t" << gTotal;
00167     }
00168 
00170   string reason()
00171         {
00172           string empty;
00173           return empty+itoa(lTrue)+"/"+itoa(lTotal)+" vs. "+itoa(gTrue)+"/"+itoa(gTotal);
00174         }
00177   const bool operator<(const EnrichmentRecord<S,T> &comp) const
00178     {
00179       if(comp.score!=score)
00180         return score<comp.score;
00181       return relation<comp.relation;
00182     }
00183 
00186   const bool operator==(const EnrichmentRecord<S,T> &comp) const
00187     {
00188       return (comp.score==score && relation==comp.relation);
00189     }
00190 };
00191 
00192 // comparator for EnrichmentRecords.
00193 // struct compareEnrichmentRecords :
00194 //   public binary_function< EnrichmentRecord<class S,class T>,
00195 //                           EnrichmentRecord<class S,class T>, bool> 
00196 // {
00197 //   bool operator()(const EnrichmentRecord<class S,class T> &__x,
00198 //                   const EnrichmentRecord<class S,class T> & __y) const
00199 //     {
00200 //       return __x.score < __y.score; }
00201 // };
00202 
00203 
00204 
00206 template <class S,class T> class Enrichment {
00207 public:
00209   Enrichment();
00210 
00219   void clear();
00220   
00236   double computeHyperGeometricProbability(int globTotal, int globTrue, int locTotal, int locTrue);
00237 
00242   void addUnannotated(int numS,int numT);
00243 
00264   bool check(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests, unsigned int rank);
00265 
00286   double correct(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests, unsigned int rank);
00287 
00308   void check(vector<EnrichmentRecord<S,T> > &in, vector<EnrichmentRecord<S,T> > &out,
00309              LIBENRICHMENT_TEST_TYPE test, double alpha);
00310                 
00311 
00319   void getEnrichments(set<S> &in, vector<EnrichmentRecord<S,T> > &out);
00320 
00321   void getEnrichmentsGSEA(set<S> &in, vector<EnrichmentRecord<S,T> > &out);
00322 
00329   bool isPair(const S &s, const T &t) const;
00330 
00337   void loadPairs(const vector<pair<S,T> > &in);
00338 
00345   void loadPairs(const map< S, set<T> > &in);
00346 
00347 private:
00348   double HyperGeoHelper(int globTotal, int globTrue, int locTotal, int locTrue);
00349   double factorial(int);
00350   double factorialLog(int);
00351   double factorialEstimate(int);
00352   double factorialEstimateLog(int);
00353   map<S,set<T> > sXt;
00354   map<T,set<S> > tXs;
00355   int unS,unT;
00356   string itoa(int in)
00357     {
00358       stringstream ss;
00359       ss<<in;
00360       string out;
00361       ss>>out;
00362       return out;
00363     }
00364   vector<double> logFactCache;
00365 };
00366 
00367 #ifndef PI_DEFINED
00368 #define PI_DEFINED 1
00369 const double PI = 4*atan2(1.0, 1.0);
00370 //#define PI 3.14159265
00371 #endif //
00372 
00373 template <class S,class T>
00374 Enrichment<S,T>::Enrichment()
00375     : sXt(), tXs(), unS(0), unT(0)
00376 {
00377         
00378 }
00379 
00380 template <class S,class T>
00381 void Enrichment<S,T>::clear()
00382 {
00383   sXt.clear();
00384   tXs.clear();
00385   unS=unT=0;
00386 }
00387 
00388 template <class S,class T>
00389 bool Enrichment<S,T>::isPair(const S &s, const T &t) const
00390 {
00391   typename map<S,set<T> >::const_iterator sitr;
00392   typename set< T >::const_iterator titr;
00393   sitr = sXt.find(s);
00394   if (sXt.end() == sitr)
00395     return(false);
00396   titr = sitr->second.find(t);
00397   if (sitr->second.end() == titr)
00398     return(false);
00399   return(true);
00400 }
00401 
00402 
00403 template <class S,class T>
00404 void Enrichment<S,T>::loadPairs(const vector<pair<S,T> > &in)
00405 {
00406         for(int i=0;i<in.size();i++)
00407         {
00408                 sXt[in[i].first].insert(in[i].second);
00409                 tXs[in[i].second].insert(in[i].first);
00410         }
00411 }
00412 
00413 template <class S,class T>
00414 void Enrichment<S,T>::loadPairs(const map < S, set<T> > &in)
00415 {
00416   sXt = in;
00417   for (typename map< S, set<T> >::const_iterator itr = in.begin(); itr != in.end(); itr++)
00418     {
00419       for (typename set<T>::const_iterator titr = itr->second.begin(); titr != itr->second.end(); titr++)
00420         tXs[*titr].insert(itr->first);
00421     }
00422 }
00423 
00424 template <class S,class T>
00425 void Enrichment<S,T>::addUnannotated(int a,int b)
00426 {
00427         unS=a;
00428         unT=b;
00429 }
00430 
00431 template <class S,class T>
00432 void Enrichment<S,T>::getEnrichments(set<S> &in, vector<EnrichmentRecord<S,T> > &out)
00433 {
00434   map<T,int> counts;
00435   map<T,set<S> > annot;
00436 //  vector<EnrichmentRecord<S,T> > out;
00437   int locTotal=0;
00438   for(typename set<S>::iterator i=in.begin();i!=in.end();i++)
00439     // loop over every element (gene) in in.
00440     {
00441       if(sXt.find(*i)!=sXt.end())
00442         // *i has an annotation of type T.
00443         {
00444 //           typename set<T>::iterator begin=sXt[*i].begin();
00445 //           typename set<T>::iterator end=sXt[*i].end();
00446           for(typename set<T>::iterator j = sXt[*i].begin(); j != sXt[*i].end(); j++)
00447             {
00448               // increment count for type T (function) element *j.
00449               counts[*j]++;
00450               // keep in mind that *j annotates *i.
00451               annot[*j].insert(*i);
00452             }
00453           // increment #elements (genes) in in that have an annotation of type T.
00454           locTotal++;
00455         }
00456     }
00457   for(typename map<T,int>::iterator i=counts.begin();i!=counts.end();i++)
00458     {
00459       int locTrue=i->second;
00460       int globTrue=tXs[i->first].size();
00461       int globTotal=sXt.size()+unS;
00462       double value= computeHyperGeometricProbability(globTotal, globTrue, locTotal, locTrue);
00463       if(value<=1)
00464         {
00465           EnrichmentRecord<S,T> e;
00466           e.score=value;
00467           e.gTotal=globTotal;
00468           e.gTrue=globTrue;
00469           e.lTotal=locTotal;
00470           e.lTrue=locTrue;
00471           e.relation=i->first;
00472           e.annotated=annot[i->first];
00473           out.push_back(e);
00474         }
00475     }
00476   sort(out.begin(),out.end());//, compareEnrichmentRecords);
00477 //  return out;         
00478 }
00479 
00480 
00481 template <class S,class T>
00482 void Enrichment<S,T>::getEnrichmentsGSEA(set<S> &in,
00483                                          // sorted order 
00484 
00485                                          vector<EnrichmentRecord<S,T> > &out)
00486 {
00487   map<T,int> counts;
00488   map<T,set<S> > annot;
00489 //  vector<EnrichmentRecord<S,T> > out;
00490   int locTotal=0;
00491   for(typename set<S>::iterator i=in.begin();i!=in.end();i++)
00492     // loop over every element (gene) in in.
00493     {
00494       if(sXt.find(*i)!=sXt.end())
00495         // *i has an annotation of type T.
00496         {
00497 //           typename set<T>::iterator begin=sXt[*i].begin();
00498 //           typename set<T>::iterator end=sXt[*i].end();
00499           for(typename set<T>::iterator j = sXt[*i].begin(); j != sXt[*i].end(); j++)
00500             {
00501               // increment count for type T (function) element *j.
00502               counts[*j]++;
00503               // keep in mind that *j annotates *i.
00504               annot[*j].insert(*i);
00505             }
00506           // increment #elements (genes) in in that have an annotation of type T.
00507           locTotal++;
00508         }
00509     }
00510   for(typename map<T,int>::iterator i=counts.begin();i!=counts.end();i++)
00511     {
00512       int locTrue=i->second;
00513       int globTrue=tXs[i->first].size();
00514       int globTotal=sXt.size()+unS;
00515       double value= computeHyperGeometricProbability(globTotal, globTrue, locTotal, locTrue);
00516       if(value<=1)
00517         {
00518           EnrichmentRecord<S,T> e;
00519           e.score=value;
00520           e.gTotal=globTotal;
00521           e.gTrue=globTrue;
00522           e.lTotal=locTotal;
00523           e.lTrue=locTrue;
00524           e.relation=i->first;
00525           e.annotated=annot[i->first];
00526           out.push_back(e);
00527         }
00528     }
00529   sort(out.begin(),out.end());//, compareEnrichmentRecords);
00530 //  return out;         
00531 }
00532 
00533 
00534 template <class S,class T>
00535 bool
00536 Enrichment<S,T>::check(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests,
00537                          unsigned int rank)
00538 {
00539   double correctedValue = correct(value, test, alpha, numTests, rank);
00540   return(correctedValue <= alpha);
00541 }
00542 
00543 template <class S,class T>
00544 double
00545 Enrichment<S,T>::correct(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests,
00546                          unsigned int rank)
00547 {
00548   double correctedValue = value;
00549   switch(test)
00550     {
00551       // don't change alpha! keep alpha the same but change the
00552       // score/pvalue.
00553       case LIBENRICHMENT_NONE:
00554 //            myAlpha=alpha;
00555         break;
00556       case LIBENRICHMENT_BONFERRONI:
00557 //            myAlpha=alpha/n;
00558         correctedValue = numTests*correctedValue;
00559             break;
00560           case LIBENRICHMENT_HOLMS:
00561 //            myAlpha=alpha/(n-i2+1);
00562             correctedValue = (numTests - rank + 1)*correctedValue;
00563             break;
00564           case LIBENRICHMENT_FALSE_DISCOVERY_RATE:
00565 //            myAlpha=(alpha*i2)/n;
00566             correctedValue = (numTests*1.0/rank)*correctedValue;
00567             break;
00568         }
00569   return(correctedValue);
00570 }
00571 
00572 
00573 template <class S,class T>
00574 void Enrichment<S,T>::check(vector<EnrichmentRecord<S,T> > &in,
00575                             vector<EnrichmentRecord<S,T> > &out,
00576                             LIBENRICHMENT_TEST_TYPE test, double alpha)
00577 {
00578 //  vector<EnrichmentRecord<S,T> > out;
00579   unsigned int n=in.size();
00580   for(unsigned int i=0;i<n;i++)
00581     {
00582       double myAlpha=alpha;
00583       unsigned int i2= i + 1;
00584       in[i].score = correct(in[i].score, test, alpha, n, i2);
00585       if(in[i].score>myAlpha)
00586         break;
00587       out.push_back(in[i]);
00588     }
00589 //  return out;
00590 }       
00591 
00592 template <class S,class T>
00593 double Enrichment<S,T>::factorial( int n ){
00594   double result = 1;
00595   for(int i=n; i > 1; i-- )
00596     result*=i;
00597   return( result );
00598 }
00599 
00600 template <class S,class T>
00601 double Enrichment<S,T>::factorialLog(int n){
00602   logFactCache.resize(n + 1);
00603   // while loop is just resizing!
00604 //   while(logFactCache.size()<n+1)
00605 //     logFactCache.push_back(0);
00606   if(logFactCache[n]!=0)
00607     return logFactCache[n];
00608   
00609   double result=0;
00610   for(int i=1;i<=n;i++)
00611     result+=log(double(i));
00612   logFactCache[n]=result;
00613   return result;
00614 }
00615 
00616 template <class S,class T>
00617 double Enrichment<S,T>::factorialEstimate( int n ){
00618   //Note: This method computes an approximation of the factorial using
00619   //Stirling's formula.
00620   if( n <= 0 )
00621     return( 1 );
00622   // be exact if n is small, since the Stirling approximation is
00623   // not good for small n.
00624   if (n <= 20){ return (factorial(n)); }
00625   double result = sqrt(2*PI*n)*pow(n/exp((double)1),n);
00626   return( result );
00627 }
00628 
00629 template <class S,class T>
00630 double Enrichment<S,T>::factorialEstimateLog( int n ){
00631         if( n <= 0 ){ return 0; }
00632         // be exact if n is small, since the Stirling approximation is
00633         // not good for small n.
00634         if (n <= 20){ return (factorialLog(n)); }
00635         // exp(x) is e^x, so the argument to the second log is (n/e).
00636         double result = 0.5*log(2*PI*n) + n*log(n/exp((double)1));
00637         return( result );
00638 }
00639 template <class S,class T>
00640 double Enrichment<S,T>::HyperGeoHelper( int globTotal, int globTrue, int locTotal, int locTrue ){
00641         // double result = 
00642         //      factorialEstimateLog(globTrue) + \
00643         //      factorialEstimateLog(globTotal - globTrue) + \
00644         //      factorialEstimateLog(locTotal) + \
00645         //      factorialEstimateLog(globTotal-locTotal) - \
00646         //      factorialEstimateLog(locTrue) - \
00647         //      factorialEstimateLog(globTrue-locTrue) - \
00648         //      factorialEstimateLog(globTotal-globTrue+locTrue-locTotal) - \
00649         //      factorialEstimateLog(locTotal - locTrue) - \
00650         //      factorialEstimateLog(globTotal);
00651 
00652         // I can be sure that locTrue <= globTrue and locTrue <=
00653         // locTotal, if I called this method from Enrichment::computeHyperGeometricProbability().
00654         double result = 
00655           // start with choose(locTotal, locTrue)
00656           factorialEstimateLog(locTotal) 
00657           - factorialEstimateLog(locTrue) 
00658           - factorialEstimateLog(locTotal - locTrue) 
00659           // multiply by choose(globTotal-locTotal, globTrue-locTrue)
00660           + factorialEstimateLog(globTotal-locTotal) 
00661           - factorialEstimateLog(globTrue-locTrue) 
00662           - factorialEstimateLog(globTotal-globTrue+locTrue-locTotal) 
00663           // divide by choose(globTotal globTrue)
00664           + factorialEstimateLog(globTrue)
00665           + factorialEstimateLog(globTotal - globTrue) 
00666           - factorialEstimateLog(globTotal)
00667 ;
00668 
00669         return exp(result);
00670 }
00671 template <class S,class T> double Enrichment<S,T>::computeHyperGeometricProbability(int globTotal, int globTrue,
00672                                                                                     int locTotal, int locTrue){
00673         double sum=0;
00674         for(int i=locTrue; i<=locTotal && i<=globTrue; i++ )
00675                 sum+=HyperGeoHelper(globTotal,globTrue,locTotal,i);
00676         return sum;
00677 }
00678 
00679 #endif
 All Classes Functions Variables Typedefs Friends