00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00027 #ifndef _ENRICHMENT_H_
00028 #define _ENRICHMENT_H_
00029
00030
00031 #include <cctype>
00032 #include <iostream>
00033 #include <string>
00034 #include <sstream>
00035 #include <map>
00036 #include <set>
00037 #include <vector>
00038 #include <cmath>
00039 #include <algorithm>
00040 #include <utility>
00041 #include <cstdlib>
00042
00043 using namespace std;
00044
00045
00046
00047 typedef double MyNT;
00048
00049
00064 enum LIBENRICHMENT_TEST_TYPE {LIBENRICHMENT_NONE, LIBENRICHMENT_BONFERRONI, LIBENRICHMENT_HOLMS, LIBENRICHMENT_FALSE_DISCOVERY_RATE};
00065
00066
00067
00069 inline string convertTestTypeToString(LIBENRICHMENT_TEST_TYPE test)
00070 {
00071 if (LIBENRICHMENT_NONE == test)
00072 return("no");
00073 else if (LIBENRICHMENT_BONFERRONI == test)
00074 return("Bonferroni");
00075 else if (LIBENRICHMENT_HOLMS == test)
00076 return("Holms");
00077 else if (LIBENRICHMENT_FALSE_DISCOVERY_RATE == test)
00078 return("False discovery rate");
00079 else
00080 return("");
00081
00082 }
00083
00085 inline string convertTestTypeToShortString(LIBENRICHMENT_TEST_TYPE test)
00086 {
00087 if (LIBENRICHMENT_NONE == test)
00088 return("no");
00089 else if (LIBENRICHMENT_BONFERRONI == test)
00090 return("Bonferroni");
00091 else if (LIBENRICHMENT_HOLMS == test)
00092 return("Holms");
00093 else if (LIBENRICHMENT_FALSE_DISCOVERY_RATE == test)
00094 return("FDR");
00095 else
00096 return("");
00097
00098 }
00099
00114 inline LIBENRICHMENT_TEST_TYPE convertStringToTestType(string test)
00115 {
00116
00117 if ("" == test)
00118 return(LIBENRICHMENT_NONE);
00119 else if ("Bonferroni" == test)
00120 return(LIBENRICHMENT_BONFERRONI);
00121 else if ("Holms" == test)
00122 return(LIBENRICHMENT_HOLMS);
00123 else if ("FDR" == test)
00124 return(LIBENRICHMENT_FALSE_DISCOVERY_RATE);
00125 else
00126 {
00127 cerr << "Unknown enrichment test \"" << test << "\". Will not do any correction for multiple hypothesis testing." << endl;
00128 return(LIBENRICHMENT_NONE);
00129 }
00130
00131
00132 }
00133
00138 template <class S,class T> struct EnrichmentRecord
00139 {
00141 double score;
00143 T relation;
00145 int gTotal;
00147 int gTrue;
00149 int lTotal;
00151 int lTrue;
00153 set<S> annotated;
00154
00155 string itoa(int in)
00156 {
00157 stringstream ss;
00158 ss<<in;
00159 string out;
00160 ss>>out;
00161 return out;
00162 }
00163 void print(ostream &fstr)
00164 {
00165
00166 fstr << lTrue << "\t" << lTotal << "\t" << gTrue << "\t" << gTotal;
00167 }
00168
00170 string reason()
00171 {
00172 string empty;
00173 return empty+itoa(lTrue)+"/"+itoa(lTotal)+" vs. "+itoa(gTrue)+"/"+itoa(gTotal);
00174 }
00177 const bool operator<(const EnrichmentRecord<S,T> &comp) const
00178 {
00179 if(comp.score!=score)
00180 return score<comp.score;
00181 return relation<comp.relation;
00182 }
00183
00186 const bool operator==(const EnrichmentRecord<S,T> &comp) const
00187 {
00188 return (comp.score==score && relation==comp.relation);
00189 }
00190 };
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00206 template <class S,class T> class Enrichment {
00207 public:
00209 Enrichment();
00210
00219 void clear();
00220
00236 double computeHyperGeometricProbability(int globTotal, int globTrue, int locTotal, int locTrue);
00237
00242 void addUnannotated(int numS,int numT);
00243
00264 bool check(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests, unsigned int rank);
00265
00286 double correct(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests, unsigned int rank);
00287
00308 void check(vector<EnrichmentRecord<S,T> > &in, vector<EnrichmentRecord<S,T> > &out,
00309 LIBENRICHMENT_TEST_TYPE test, double alpha);
00310
00311
00319 void getEnrichments(set<S> &in, vector<EnrichmentRecord<S,T> > &out);
00320
00321 void getEnrichmentsGSEA(set<S> &in, vector<EnrichmentRecord<S,T> > &out);
00322
00329 bool isPair(const S &s, const T &t) const;
00330
00337 void loadPairs(const vector<pair<S,T> > &in);
00338
00345 void loadPairs(const map< S, set<T> > &in);
00346
00347 private:
00348 double HyperGeoHelper(int globTotal, int globTrue, int locTotal, int locTrue);
00349 double factorial(int);
00350 double factorialLog(int);
00351 double factorialEstimate(int);
00352 double factorialEstimateLog(int);
00353 map<S,set<T> > sXt;
00354 map<T,set<S> > tXs;
00355 int unS,unT;
00356 string itoa(int in)
00357 {
00358 stringstream ss;
00359 ss<<in;
00360 string out;
00361 ss>>out;
00362 return out;
00363 }
00364 vector<double> logFactCache;
00365 };
00366
00367 #ifndef PI_DEFINED
00368 #define PI_DEFINED 1
00369 const double PI = 4*atan2(1.0, 1.0);
00370
00371 #endif //
00372
00373 template <class S,class T>
00374 Enrichment<S,T>::Enrichment()
00375 : sXt(), tXs(), unS(0), unT(0)
00376 {
00377
00378 }
00379
00380 template <class S,class T>
00381 void Enrichment<S,T>::clear()
00382 {
00383 sXt.clear();
00384 tXs.clear();
00385 unS=unT=0;
00386 }
00387
00388 template <class S,class T>
00389 bool Enrichment<S,T>::isPair(const S &s, const T &t) const
00390 {
00391 typename map<S,set<T> >::const_iterator sitr;
00392 typename set< T >::const_iterator titr;
00393 sitr = sXt.find(s);
00394 if (sXt.end() == sitr)
00395 return(false);
00396 titr = sitr->second.find(t);
00397 if (sitr->second.end() == titr)
00398 return(false);
00399 return(true);
00400 }
00401
00402
00403 template <class S,class T>
00404 void Enrichment<S,T>::loadPairs(const vector<pair<S,T> > &in)
00405 {
00406 for(int i=0;i<in.size();i++)
00407 {
00408 sXt[in[i].first].insert(in[i].second);
00409 tXs[in[i].second].insert(in[i].first);
00410 }
00411 }
00412
00413 template <class S,class T>
00414 void Enrichment<S,T>::loadPairs(const map < S, set<T> > &in)
00415 {
00416 sXt = in;
00417 for (typename map< S, set<T> >::const_iterator itr = in.begin(); itr != in.end(); itr++)
00418 {
00419 for (typename set<T>::const_iterator titr = itr->second.begin(); titr != itr->second.end(); titr++)
00420 tXs[*titr].insert(itr->first);
00421 }
00422 }
00423
00424 template <class S,class T>
00425 void Enrichment<S,T>::addUnannotated(int a,int b)
00426 {
00427 unS=a;
00428 unT=b;
00429 }
00430
00431 template <class S,class T>
00432 void Enrichment<S,T>::getEnrichments(set<S> &in, vector<EnrichmentRecord<S,T> > &out)
00433 {
00434 map<T,int> counts;
00435 map<T,set<S> > annot;
00436
00437 int locTotal=0;
00438 for(typename set<S>::iterator i=in.begin();i!=in.end();i++)
00439
00440 {
00441 if(sXt.find(*i)!=sXt.end())
00442
00443 {
00444
00445
00446 for(typename set<T>::iterator j = sXt[*i].begin(); j != sXt[*i].end(); j++)
00447 {
00448
00449 counts[*j]++;
00450
00451 annot[*j].insert(*i);
00452 }
00453
00454 locTotal++;
00455 }
00456 }
00457 for(typename map<T,int>::iterator i=counts.begin();i!=counts.end();i++)
00458 {
00459 int locTrue=i->second;
00460 int globTrue=tXs[i->first].size();
00461 int globTotal=sXt.size()+unS;
00462 double value= computeHyperGeometricProbability(globTotal, globTrue, locTotal, locTrue);
00463 if(value<=1)
00464 {
00465 EnrichmentRecord<S,T> e;
00466 e.score=value;
00467 e.gTotal=globTotal;
00468 e.gTrue=globTrue;
00469 e.lTotal=locTotal;
00470 e.lTrue=locTrue;
00471 e.relation=i->first;
00472 e.annotated=annot[i->first];
00473 out.push_back(e);
00474 }
00475 }
00476 sort(out.begin(),out.end());
00477
00478 }
00479
00480
00481 template <class S,class T>
00482 void Enrichment<S,T>::getEnrichmentsGSEA(set<S> &in,
00483
00484
00485 vector<EnrichmentRecord<S,T> > &out)
00486 {
00487 map<T,int> counts;
00488 map<T,set<S> > annot;
00489
00490 int locTotal=0;
00491 for(typename set<S>::iterator i=in.begin();i!=in.end();i++)
00492
00493 {
00494 if(sXt.find(*i)!=sXt.end())
00495
00496 {
00497
00498
00499 for(typename set<T>::iterator j = sXt[*i].begin(); j != sXt[*i].end(); j++)
00500 {
00501
00502 counts[*j]++;
00503
00504 annot[*j].insert(*i);
00505 }
00506
00507 locTotal++;
00508 }
00509 }
00510 for(typename map<T,int>::iterator i=counts.begin();i!=counts.end();i++)
00511 {
00512 int locTrue=i->second;
00513 int globTrue=tXs[i->first].size();
00514 int globTotal=sXt.size()+unS;
00515 double value= computeHyperGeometricProbability(globTotal, globTrue, locTotal, locTrue);
00516 if(value<=1)
00517 {
00518 EnrichmentRecord<S,T> e;
00519 e.score=value;
00520 e.gTotal=globTotal;
00521 e.gTrue=globTrue;
00522 e.lTotal=locTotal;
00523 e.lTrue=locTrue;
00524 e.relation=i->first;
00525 e.annotated=annot[i->first];
00526 out.push_back(e);
00527 }
00528 }
00529 sort(out.begin(),out.end());
00530
00531 }
00532
00533
00534 template <class S,class T>
00535 bool
00536 Enrichment<S,T>::check(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests,
00537 unsigned int rank)
00538 {
00539 double correctedValue = correct(value, test, alpha, numTests, rank);
00540 return(correctedValue <= alpha);
00541 }
00542
00543 template <class S,class T>
00544 double
00545 Enrichment<S,T>::correct(double value, LIBENRICHMENT_TEST_TYPE test, double alpha, unsigned int numTests,
00546 unsigned int rank)
00547 {
00548 double correctedValue = value;
00549 switch(test)
00550 {
00551
00552
00553 case LIBENRICHMENT_NONE:
00554
00555 break;
00556 case LIBENRICHMENT_BONFERRONI:
00557
00558 correctedValue = numTests*correctedValue;
00559 break;
00560 case LIBENRICHMENT_HOLMS:
00561
00562 correctedValue = (numTests - rank + 1)*correctedValue;
00563 break;
00564 case LIBENRICHMENT_FALSE_DISCOVERY_RATE:
00565
00566 correctedValue = (numTests*1.0/rank)*correctedValue;
00567 break;
00568 }
00569 return(correctedValue);
00570 }
00571
00572
00573 template <class S,class T>
00574 void Enrichment<S,T>::check(vector<EnrichmentRecord<S,T> > &in,
00575 vector<EnrichmentRecord<S,T> > &out,
00576 LIBENRICHMENT_TEST_TYPE test, double alpha)
00577 {
00578
00579 unsigned int n=in.size();
00580 for(unsigned int i=0;i<n;i++)
00581 {
00582 double myAlpha=alpha;
00583 unsigned int i2= i + 1;
00584 in[i].score = correct(in[i].score, test, alpha, n, i2);
00585 if(in[i].score>myAlpha)
00586 break;
00587 out.push_back(in[i]);
00588 }
00589
00590 }
00591
00592 template <class S,class T>
00593 double Enrichment<S,T>::factorial( int n ){
00594 double result = 1;
00595 for(int i=n; i > 1; i-- )
00596 result*=i;
00597 return( result );
00598 }
00599
00600 template <class S,class T>
00601 double Enrichment<S,T>::factorialLog(int n){
00602 logFactCache.resize(n + 1);
00603
00604
00605
00606 if(logFactCache[n]!=0)
00607 return logFactCache[n];
00608
00609 double result=0;
00610 for(int i=1;i<=n;i++)
00611 result+=log(double(i));
00612 logFactCache[n]=result;
00613 return result;
00614 }
00615
00616 template <class S,class T>
00617 double Enrichment<S,T>::factorialEstimate( int n ){
00618
00619
00620 if( n <= 0 )
00621 return( 1 );
00622
00623
00624 if (n <= 20){ return (factorial(n)); }
00625 double result = sqrt(2*PI*n)*pow(n/exp((double)1),n);
00626 return( result );
00627 }
00628
00629 template <class S,class T>
00630 double Enrichment<S,T>::factorialEstimateLog( int n ){
00631 if( n <= 0 ){ return 0; }
00632
00633
00634 if (n <= 20){ return (factorialLog(n)); }
00635
00636 double result = 0.5*log(2*PI*n) + n*log(n/exp((double)1));
00637 return( result );
00638 }
00639 template <class S,class T>
00640 double Enrichment<S,T>::HyperGeoHelper( int globTotal, int globTrue, int locTotal, int locTrue ){
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654 double result =
00655
00656 factorialEstimateLog(locTotal)
00657 - factorialEstimateLog(locTrue)
00658 - factorialEstimateLog(locTotal - locTrue)
00659
00660 + factorialEstimateLog(globTotal-locTotal)
00661 - factorialEstimateLog(globTrue-locTrue)
00662 - factorialEstimateLog(globTotal-globTrue+locTrue-locTotal)
00663
00664 + factorialEstimateLog(globTrue)
00665 + factorialEstimateLog(globTotal - globTrue)
00666 - factorialEstimateLog(globTotal)
00667 ;
00668
00669 return exp(result);
00670 }
00671 template <class S,class T> double Enrichment<S,T>::computeHyperGeometricProbability(int globTotal, int globTrue,
00672 int locTotal, int locTrue){
00673 double sum=0;
00674 for(int i=locTrue; i<=locTotal && i<=globTrue; i++ )
00675 sum+=HyperGeoHelper(globTotal,globTrue,locTotal,i);
00676 return sum;
00677 }
00678
00679 #endif