00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00033 #ifndef _GO_H
00034 #define _GO_H
00035
00036
00037 #include<iostream>
00038 #include<map>
00039 #include<set>
00040 #include<vector>
00041 #include<string>
00042 #include<sstream>
00043
00044 #include "boost/tuple/tuple_io.hpp"
00045 #include "biofunction.h"
00046 #include "enrichment.h"
00047
00048 #include "histogram.h"
00049 #include "setTemplates.C"
00050
00051 using namespace std;
00052
00053
00054 class MyAnnotations;
00055
00056
00057 enum enumGoRelationships
00058 {
00059 is_a_or_part_of,
00060 is_a,
00061 part_of,
00062 regulates,
00063 negatively_regulates,
00064 positively_regulates
00065 };
00066
00069 class GOFunction : public BioFunction {
00070 friend class GeneOntology;
00071
00072 public:
00074 GOFunction()
00075 : BioFunction()
00076 {
00077 _init();
00078 }
00079
00080 GOFunction(string cat, string id, string name = "")
00081 : BioFunction(cat, id, name)
00082 {
00083 _init();
00084 }
00085
00087 virtual ~GOFunction()
00088 {}
00089
00090
00091
00092 void _init()
00093 {
00094 integerId=-1;
00095
00096 def=comment="";
00097 obsolete=false;
00098 _siblingsComputed[0] =_siblingsComputed[1] =
00099 _siblingsComputed[2] = false;
00100
00101 }
00102
00107 MyNT computeSemanticValue() const;
00108
00109
00115 unsigned int computeSiblings(unsigned int type = 0);
00116
00117
00118
00119
00122 int getIntegerId() {return integerId;}
00123
00126 vector<int> getAltIds() {return alt_id;}
00127
00130 vector<string> getSubsets() {return subset;}
00131
00135 vector<string> getSynonyms() {return synonym;}
00136
00139 vector<string> getXrefAnalogs() {return xref_analog;}
00140
00141
00142
00143
00144
00147 string getNamespace()
00148 {return getCategory();}
00149
00150
00154 string getDef() {return def;}
00155
00158 string getComment() {return comment;}
00159
00162 bool isObsolete() {return obsolete;}
00163
00164 virtual unsigned int getNumParents(int type = 0)
00165 {
00166 return(parents[type].size());
00167 }
00168 virtual unsigned int getNumChildren(int type = 0)
00169 {
00170 return(children[type].size());
00171 }
00172
00173 virtual set< GOFunction *> getAncestors(int type = 0)
00174 {
00175
00176
00177 if(ancestors[type].size()>0)
00178 return ancestors[type];
00179 ancestors[type]= parents[type];
00180 for(set< GOFunction *>::iterator i = parents[type].begin();
00181 i != parents[type].end(); i++)
00182 ancestors[type]= setunion((*i)->getAncestors(type), ancestors[type]);
00183
00184 return ancestors[type];
00185 }
00186
00187 virtual map< GOFunction *, unsigned int > getAncestorsWithDistance(int type = 0);
00188
00189 virtual set< GOFunction *> getChildren(int type = 0)
00190 {
00191 return(children[type]);
00192 }
00193
00194 virtual set< GOFunction *> getDescendants(int type = 0)
00195 {
00196
00197 if(descendants[type].size()>0)
00198 return descendants[type];
00199 descendants[type] = children[type];
00200 for(set< GOFunction *>::iterator i = children[type].begin();
00201 i != children[type].end(); i++)
00202 descendants[type]= setunion((*i)->getDescendants(type), descendants[type]);
00203
00204 return descendants[type];
00205 }
00206
00207 virtual map< GOFunction *, unsigned int > getDescendantsWithDistance(int type = 0);
00208
00217 virtual unsigned int getDistanceToAncestor(GOFunction *ancestor,
00218 unsigned int type = 0);
00219
00220
00229 virtual unsigned int getDistanceToDescendant(GOFunction *descendant,
00230 unsigned int type = 0);
00231
00232
00233 virtual set< GOFunction *> getParents(int type = 0)
00234 {
00235 return(parents[type]);
00236 }
00237
00245 set< GOFunction * > getSiblings(unsigned int type = 0)
00246 {
00247 if (!_siblingsComputed[type])
00248 computeSiblings(type);
00249 return(_siblings[type]);
00250 }
00251
00252
00255 bool isAncestor(GOFunction *candidate)
00256 {
00257 set< GOFunction * > ancestors = getAncestors();
00258 return(ancestors.end() != ancestors.find(candidate));
00259 }
00260
00262 bool isChild(GOFunction* candidate)
00263 {
00264 set< GOFunction* > children = getChildren();
00265 return(children.end() != children.find(candidate));
00266 }
00267
00270 bool isDescendant(GOFunction *candidate)
00271 {
00272 set< GOFunction * > descendants = getDescendants();
00273 return(descendants.end() != descendants.find(candidate));
00274 }
00275
00277 bool isParent(GOFunction* candidate)
00278 {
00279 set< GOFunction* > parents = getParents();
00280 return(parents.end() != parents.find(candidate));
00281 }
00282
00283
00284 void addDepth(unsigned int depth, int type = 0)
00285 {
00286 _depths[type].insert(depth);
00287 }
00288
00290 set< unsigned int > getDepths(int type = 0)
00291 {
00292 return(_depths[type]);
00293 }
00294
00296 unsigned int getMinimumDepth(int type = 0)
00297 {
00298 return(*(_depths[type].begin()));
00299 }
00300
00302 unsigned int getMaximumDepth(int type = 0)
00303 {
00304 set< unsigned int >::iterator ditr = _depths[type].end();
00305 ditr--;
00306 return(*ditr);
00307 }
00308
00309
00310 protected:
00311 unsigned int _computeSiblingsNaively(unsigned int type = 0);
00312
00313
00314
00315 private:
00316 int integerId;
00317 vector<int> alt_id;
00318 vector<string> subset,synonym,xref_analog;
00319 string def,comment;
00320 bool obsolete;
00321
00322
00323 vector<int> tempParents[6];
00324 vector<int> tempChildren[6];
00325 set<GOFunction*> parents[6],ancestors[6];
00326 set<GOFunction*> children[6], descendants[6];
00327 set<GOFunction*> _siblings[6];
00328 bool _siblingsComputed[6];
00329 map< GOFunction*, unsigned int > ancestorsWithDistance[6], descendantsWithDistance[6];
00330
00331
00332
00333 set< unsigned int > _depths[3];
00334 };
00335
00336 enum GOGroupByDepthType {
00337 GO_GROUP_BY_MINIMUM_DEPTH = 0, GO_GROUP_BY_AVERAGE_DEPTH,
00338 GO_GROUP_BY_MAXIMUM_DEPTH
00339 };
00340
00342 class GeneOntology {
00343
00344 public:
00347 GeneOntology()
00348 : default_namespace(""), functions(), idxId(), idxName(), _functionCategoryAbbrevs(),
00349 _legalFunctionCategories(), _knownFunctionIds(), _topologicalSortComputed(false),
00350 _topologicallySortedFunctions()
00351 {
00352 _init();
00353 }
00354
00357 GeneOntology(istream *in);
00358
00362 void read(string file);
00366 void read(istream *in);
00367
00376 void readTerms(string file, vector<GOFunction *> &functions);
00377
00387 void computeEnrichmentsOntologizer(const set< string > &geneSet,
00388
00389 MyAnnotations &annotations,
00390 vector<EnrichmentRecord< string, string> > &rejalts) const;
00391
00392
00403 void computeFunctionsWithNoChildren(set< string > &functions,
00404 set< string > &childless);
00405
00413 void computeFunctionsWithNoChildren(set< GOFunction* > &functions,
00414 set< GOFunction* > &childless);
00415
00416
00417
00435 typedef boost::tuple< GOFunction *, unsigned int, unsigned int > GOLCATuple;
00436 int computeLeastCommonAncestors(GOFunction *function1, GOFunction *function2,
00437 vector< GOLCATuple >&lcas);
00438
00439
00440
00447 void computeMostSpecificFunctions(set< GOFunction * > &functions,
00448 set< GOFunction * > &specificFunctions);
00449
00450
00451 template < typename InputIterator, typename OutputIterator, typename Extractor >
00452 void computeMostSpecificFunctionsTemplated(
00453 InputIterator functionsBegin, InputIterator functionsEnd,
00454 OutputIterator specificFunctionsBegin,
00455 Extractor _extractor
00456 );
00457
00465 void computeMostSpecificFunctions(set< string > &functions,
00466 set< string > &specificFunctions);
00467
00468
00469
00481 set<GOFunction*> getAncestors(GOFunction* obj, int type=0);
00482
00483
00484
00485 void setBoolRelationship(bool r){_boolRelationship = r;}
00486
00487
00488
00489 void setStrRelationshipFile(string file){_strRelationshipFile = file;}
00490
00504 map< GOFunction*, unsigned int >getAncestorsWithDistance(GOFunction* obj,
00505 int type = 0);
00506
00516 set<GOFunction*> getParents(GOFunction* obj, int type=0);
00517
00530 set<GOFunction*> getDescendants(GOFunction* obj, int type=0);
00531
00532
00546 map< GOFunction*, unsigned int > getDescendantsWithDistance(GOFunction* obj, int type = 0);
00547
00548
00558 set<GOFunction*> getChildren(GOFunction* obj, int type=0);
00559
00563 GOFunction* getFunctionById(int id) const;
00564
00571 GOFunction* getFunctionById(string id) const
00572 {
00573 unsigned int intID;
00574 stringstream ss;
00575 ss << id;
00576 ss >> intID;
00577 return(getFunctionById(intID));
00578 }
00579
00583 GOFunction* getFunctionByName(string name) const;
00584
00585
00586
00587
00588 bool isKnownFunctionId(string id) const
00589 {
00590 if (_knownFunctionIds.find(id)==_knownFunctionIds.end())
00591 return false;
00592 else
00593 return true;
00594 }
00595
00596
00607 void computeFunctionDepths(
00608 unsigned int type = 0);
00609
00612 void computeSiblings(unsigned int type = 0);
00613
00627 void computeTopologicalSort(vector< GOFunction* > &sortedFunctions, unsigned int type = 0);
00628
00629
00631 void printFunctionDepths(ostream &fstr, unsigned int type = 0);
00632
00644 void printFunctionDescendants(ostream &fstr, unsigned int type = 0);
00645
00648 void printRelationships();
00649
00663 void groupFunctionsByDepth(map< unsigned int, set< GOFunction * > > &functionsByDepth,
00664 string functionCategory = "biological_process",
00665 GOGroupByDepthType groupingType =
00666 GO_GROUP_BY_MINIMUM_DEPTH);
00667
00668
00669 void groupFunctionsByCutoff(map< string, map< unsigned int, set< GOFunction * > > > &functionsByCutoff, const vector< unsigned int > &cutoffs, const map< BioFunction, unsigned int > &geneCounts, bool allowDescendants = false);
00670
00671
00672 void groupFunctionsByDepth2(map< string, map< unsigned int, set< GOFunction * > > > &functionsByDepth, bool allowDescendants = false);
00673
00674
00675
00676
00677 void groupFunctionsByDepth2(map< string, map< unsigned int, set< BioFunction > > > &functionsByDepth, bool allowDescendants = false);
00683 ~GeneOntology();
00684
00685 private:
00686
00687 bool _boolRelationship;
00688 string _strRelationshipFile;
00689
00690 void _init()
00691 {
00692 _functionCategoryAbbrevs["p"] = "biological_process";
00693 _functionCategoryAbbrevs["c"] = "cellular_component";
00694 _functionCategoryAbbrevs["f"] = "molecular_function";
00695 _legalFunctionCategories.insert("biological_process");
00696 _legalFunctionCategories.insert("cellular_component");
00697 _legalFunctionCategories.insert("molecular_function");
00698 }
00699
00700 void addIdx(unsigned int id,GOFunction* obj);
00701 void buildIndices();
00702
00704 bool _isLegalFunctionCategory(string name, string &expandedName) const
00705 {
00706 expandedName = name;
00707 if (_legalFunctionCategories.end() !=
00708 _legalFunctionCategories.find(name))
00709
00710 return(true);
00711
00712 map< string, string >::const_iterator aitr =
00713 _functionCategoryAbbrevs.find(name);
00714 if (_functionCategoryAbbrevs.end()
00715 != (aitr = _functionCategoryAbbrevs.find(name)))
00716
00717 {
00718 expandedName = aitr->second;
00719 return(true);
00720 }
00721
00722
00723 return(false);
00724 }
00725
00726
00727 private:
00728
00729 string default_namespace;
00730 vector<GOFunction*> functions;
00731
00732
00733
00734
00735
00736
00737 vector<GOFunction*> idxId;
00738 map<string,GOFunction*> idxName;
00739
00740
00742 map< string, string > _functionCategoryAbbrevs;
00743 set< string > _legalFunctionCategories;
00744 set< string > _knownFunctionIds;
00745
00746 bool _topologicalSortComputed;
00747 vector< GOFunction* > _topologicallySortedFunctions;
00748
00749
00750
00751
00752 map< string, map< unsigned int, set< GOFunction * > > > _functionsByCutoff;
00753 map< string, map< unsigned int, set< GOFunction * > > > _functionsByDepth;
00754
00755
00756 };
00757
00758 template < typename InputIterator, typename OutputIterator, typename Extractor >
00759 void GeneOntology::computeMostSpecificFunctionsTemplated(
00760 InputIterator functionsBegin, InputIterator functionsEnd,
00761 OutputIterator specificFunctionsBegin,
00762 Extractor _extractor)
00763 {
00764 bool isAncestor;
00765 InputIterator fitr1, fitr2;
00766 GOFunction *func1, *func2;
00767
00768 for (fitr1 = functionsBegin; fitr1 != functionsEnd; fitr1++)
00769 {
00770
00771 isAncestor = false;
00772 func1 = getFunctionById(_extractor(*fitr1));
00773 fitr2 = fitr1;
00774 fitr2++;
00775 for (; fitr2 != functionsEnd; fitr2++)
00776 {
00777 func2 = getFunctionById(_extractor(*fitr2));
00778 if (func2->isAncestor(func1))
00779
00780
00781 {
00782 isAncestor = true;
00783 break;
00784 }
00785 }
00786 if (!isAncestor)
00787
00788
00789 {
00790 *specificFunctionsBegin = *fitr1;
00791 ++specificFunctionsBegin;
00792 }
00793 }
00794 }
00795
00796 #endif // _GO_H