Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
GO.h
00001 /**************************************************************************
00002  * Copyright (c) 2004-2011 T. M. Murali                                   *
00003  * Copyright (c) 2007-2011 Naveed Massjouni                               *
00004  * Copyright (c) 2009-2011 Christopher L. Poirel                          *
00005  * Copyright (c) 2011 Phillip Whisenhunt                                  *
00006  * Copyright (c) 2011 David Badger                                        *
00007  * Copyright (c) 2010 Clifford Conley Owens III                           *
00008  *                                                                        *
00009  * This file is part of Biorithm.                                         *
00010  *                                                                        *
00011  * Biorithm is free software: you can redistribute it and/or modify       *
00012  * it under the terms of the GNU General Public License as published by   *
00013  * the Free Software Foundation, either version 3 of the License, or      *
00014  * (at your option) any later version.                                    *
00015  *                                                                        *
00016  * Biorithm is distributed in the hope that it will be useful,            *
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00019  * GNU General Public License for more details.                           *
00020  *                                                                        *
00021  * You should have received a copy of the GNU General Public License      *
00022  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00023  *                                                                        *
00024  **************************************************************************/
00025 
00026 
00033 #ifndef _GO_H
00034 #define _GO_H
00035 
00036 
00037 #include<iostream>
00038 #include<map>
00039 #include<set>
00040 #include<vector>
00041 #include<string>
00042 #include<sstream>
00043 
00044 #include "boost/tuple/tuple_io.hpp"
00045 #include "biofunction.h"
00046 #include "enrichment.h"
00047 // HACKHACKHACK. this #include is just for defining the MyNT type.
00048 #include "histogram.h"
00049 #include "setTemplates.C"
00050 
00051 using namespace std;
00052 
00053 // forward declaration
00054 class MyAnnotations;
00055 
00056 //enumated type for GO relationships; 0-is_a_or_part_of, 1-is_a, 2-part_of, 3-regulates, 4-negatively_regulates, 5-positively_regulates
00057 enum enumGoRelationships
00058 {
00059     is_a_or_part_of,
00060     is_a,
00061     part_of,
00062     regulates,
00063     negatively_regulates,
00064     positively_regulates
00065 };
00066 
00069 class GOFunction  : public BioFunction {
00070   friend class GeneOntology;
00071         
00072 public:
00074   GOFunction()
00075       : BioFunction()
00076     {
00077       _init();      
00078     }
00079 
00080   GOFunction(string cat, string id, string name = "")
00081       : BioFunction(cat, id, name)
00082     {
00083       _init();
00084     }
00085 
00087   virtual ~GOFunction()
00088     {}
00089   
00090 
00091   // set up all the internal variables.
00092   void _init()
00093     {
00094       integerId=-1;
00095 //      name=name_space=
00096       def=comment="";
00097       obsolete=false;
00098       _siblingsComputed[0] =_siblingsComputed[1] =
00099         _siblingsComputed[2] = false;
00100       
00101     }
00102 
00107   MyNT computeSemanticValue() const;
00108   
00109   
00115   unsigned int computeSiblings(unsigned int type = 0);
00116   
00117 
00118 
00119 
00122   int getIntegerId() {return integerId;}
00123 
00126   vector<int> getAltIds() {return alt_id;}
00127 
00130   vector<string> getSubsets() {return subset;}
00131 
00135   vector<string> getSynonyms() {return synonym;}
00136 
00139   vector<string> getXrefAnalogs() {return xref_analog;}
00140 
00141 //   /*! Accessor for name
00142 //    * \returns function name*/
00143 //   string getName() {return name;}
00144 
00147   string getNamespace()
00148     {return getCategory();}
00149 //    {return name_space;}
00150 
00154   string getDef() {return def;}
00155 
00158   string getComment() {return comment;}
00159 
00162   bool isObsolete() {return obsolete;}
00163   
00164   virtual unsigned int getNumParents(int type = 0)
00165     {
00166       return(parents[type].size());
00167     }
00168   virtual unsigned int getNumChildren(int type = 0)
00169     {
00170       return(children[type].size());
00171     }
00172   
00173   virtual set< GOFunction *> getAncestors(int type = 0)
00174     {
00175       
00176       //check if already computed/no parents
00177       if(ancestors[type].size()>0)
00178         return ancestors[type];
00179       ancestors[type]= parents[type];
00180       for(set< GOFunction *>::iterator i = parents[type].begin();
00181           i != parents[type].end(); i++)
00182         ancestors[type]= setunion((*i)->getAncestors(type), ancestors[type]);
00183   
00184       return ancestors[type];
00185     }
00186 
00187   virtual map< GOFunction *, unsigned int > getAncestorsWithDistance(int type = 0);
00188   
00189   virtual set< GOFunction *> getChildren(int type = 0)
00190     {
00191       return(children[type]);
00192     }
00193   
00194   virtual set< GOFunction *> getDescendants(int type = 0)
00195     {
00196       //check if already computed or no descendants.
00197       if(descendants[type].size()>0)
00198         return descendants[type];
00199       descendants[type] = children[type];
00200       for(set< GOFunction *>::iterator i = children[type].begin();
00201           i != children[type].end(); i++)
00202         descendants[type]= setunion((*i)->getDescendants(type), descendants[type]);
00203   
00204       return descendants[type];
00205     }
00206   
00207   virtual map< GOFunction *, unsigned int > getDescendantsWithDistance(int type = 0);
00208 
00217   virtual unsigned int getDistanceToAncestor(GOFunction *ancestor,
00218                                              unsigned int type = 0);
00219   
00220   
00229   virtual unsigned int getDistanceToDescendant(GOFunction *descendant,
00230                                                unsigned int type = 0);
00231   
00232   
00233   virtual set< GOFunction *> getParents(int type = 0)
00234     {
00235       return(parents[type]);
00236     }
00237   
00245   set< GOFunction * > getSiblings(unsigned int type = 0)
00246     {
00247       if (!_siblingsComputed[type])
00248         computeSiblings(type);
00249       return(_siblings[type]);
00250     }
00251   
00252 
00255   bool isAncestor(GOFunction *candidate) 
00256     {
00257       set< GOFunction * > ancestors = getAncestors();
00258       return(ancestors.end() != ancestors.find(candidate));
00259     }
00260 
00262   bool isChild(GOFunction* candidate)
00263     {
00264       set< GOFunction* > children = getChildren();
00265       return(children.end() != children.find(candidate));
00266     }
00267 
00270   bool isDescendant(GOFunction *candidate) 
00271     {
00272       set< GOFunction * > descendants = getDescendants();
00273       return(descendants.end() != descendants.find(candidate));
00274     }
00275 
00277   bool isParent(GOFunction* candidate)
00278     {
00279       set< GOFunction* > parents = getParents();
00280       return(parents.end() != parents.find(candidate));
00281     }
00282 
00283   
00284   void addDepth(unsigned int depth, int type = 0)
00285     {
00286       _depths[type].insert(depth);
00287     }
00288 
00290   set< unsigned int > getDepths(int type = 0)
00291     {
00292       return(_depths[type]);
00293     }
00294 
00296   unsigned int getMinimumDepth(int type = 0)
00297     {
00298       return(*(_depths[type].begin()));
00299     }
00300   
00302   unsigned int getMaximumDepth(int type = 0)
00303     {
00304       set< unsigned int >::iterator ditr = _depths[type].end();
00305       ditr--;
00306       return(*ditr);
00307     }
00308   
00309 
00310 protected:
00311   unsigned int _computeSiblingsNaively(unsigned int type = 0);
00312 
00313   
00314   
00315 private:
00316   int integerId;
00317   vector<int> alt_id;
00318   vector<string> subset,synonym,xref_analog;
00319   string def,comment;
00320   bool obsolete;
00321 
00322   //TODO: Make these enumerated instead of 0,1,2
00323   vector<int> tempParents[6];                   //0-all, 1-is_a, 2-part_of
00324   vector<int> tempChildren[6];                  //0-all, 1-is_a, 2-part_of
00325   set<GOFunction*> parents[6],ancestors[6];     //0-all, 1-is_a, 2-part_of
00326   set<GOFunction*> children[6], descendants[6]; //0-all, 1-is_a, 2-part_of
00327   set<GOFunction*> _siblings[6];
00328   bool _siblingsComputed[6];
00329   map< GOFunction*, unsigned int > ancestorsWithDistance[6], descendantsWithDistance[6];
00330 
00331   
00332   // other computed info.
00333   set< unsigned int > _depths[3];
00334 };
00335 
00336 enum GOGroupByDepthType {
00337   GO_GROUP_BY_MINIMUM_DEPTH = 0, GO_GROUP_BY_AVERAGE_DEPTH,
00338   GO_GROUP_BY_MAXIMUM_DEPTH 
00339 };
00340 
00342 class GeneOntology {
00343 
00344 public:
00347   GeneOntology()
00348     : default_namespace(""), functions(), idxId(), idxName(), _functionCategoryAbbrevs(),
00349       _legalFunctionCategories(), _knownFunctionIds(), _topologicalSortComputed(false),
00350       _topologicallySortedFunctions()
00351     {
00352       _init();
00353     }
00354 
00357   GeneOntology(istream *in);
00358   
00362   void read(string file);
00366   void read(istream *in);
00367 
00376   void readTerms(string file, vector<GOFunction *> &functions);
00377 
00387   void computeEnrichmentsOntologizer(const set< string > &geneSet,
00388                                      // not const becaus MyAnnotations::numAnnotatedGenes() is not a const function.
00389                                      MyAnnotations &annotations,
00390                                      vector<EnrichmentRecord< string, string> > &rejalts) const;
00391   
00392 
00403   void computeFunctionsWithNoChildren(set< string > &functions,
00404                                       set< string > &childless);
00405 
00413   void computeFunctionsWithNoChildren(set< GOFunction* > &functions,
00414                                       set< GOFunction* > &childless);
00415 
00416     
00417 
00435   typedef boost::tuple< GOFunction *, unsigned int, unsigned int > GOLCATuple;
00436   int computeLeastCommonAncestors(GOFunction *function1, GOFunction *function2,
00437                                   vector< GOLCATuple >&lcas);
00438 //                                  vector< GOFunction * > &lcas);
00439 
00440 
00447   void computeMostSpecificFunctions(set< GOFunction * > &functions,
00448                                     set< GOFunction * > &specificFunctions);
00449   
00450 
00451   template < typename InputIterator, typename OutputIterator, typename Extractor >
00452   void computeMostSpecificFunctionsTemplated(
00453     InputIterator functionsBegin, InputIterator functionsEnd,
00454     OutputIterator specificFunctionsBegin,
00455     Extractor _extractor
00456     );
00457 
00465   void computeMostSpecificFunctions(set< string > &functions,
00466                                     set< string > &specificFunctions);
00467   
00468 
00469 
00481   set<GOFunction*> getAncestors(GOFunction* obj, int type=0);
00482   
00483   /* Setter for boolRelationship
00484    * \param r used to set boolRelationship */
00485   void setBoolRelationship(bool r){_boolRelationship = r;}
00486   
00487   /* Setter for boolRelationship
00488    * \param r used to set boolRelationship */
00489   void setStrRelationshipFile(string file){_strRelationshipFile = file;}
00490   
00504   map< GOFunction*, unsigned int >getAncestorsWithDistance(GOFunction* obj,
00505                                                            int type = 0);
00506 
00516   set<GOFunction*> getParents(GOFunction* obj, int type=0);
00517 
00530   set<GOFunction*> getDescendants(GOFunction* obj, int type=0);
00531 
00532 
00546   map< GOFunction*, unsigned int > getDescendantsWithDistance(GOFunction* obj, int type = 0);
00547   
00548   
00558   set<GOFunction*> getChildren(GOFunction* obj, int type=0);
00559   
00563   GOFunction* getFunctionById(int id) const;
00564   
00571   GOFunction* getFunctionById(string id) const
00572     {
00573       unsigned int intID;
00574       stringstream ss;
00575       ss << id;
00576       ss >> intID;
00577       return(getFunctionById(intID));
00578     }
00579 
00583   GOFunction* getFunctionByName(string name) const;
00584 
00585   /* Return true if the given id is known by the instance of GO.
00586      Otherwise, return false.
00587    */
00588   bool isKnownFunctionId(string id) const
00589     {
00590       if (_knownFunctionIds.find(id)==_knownFunctionIds.end())
00591         return false;
00592       else
00593         return true;
00594     }
00595 
00596   
00607   void computeFunctionDepths(//vector< set< GOFunction* > > &functionsByDepth,
00608     unsigned int type = 0);
00609 
00612   void computeSiblings(unsigned int type = 0);
00613   
00627   void computeTopologicalSort(vector< GOFunction* > &sortedFunctions, unsigned int type = 0);
00628   
00629 
00631   void printFunctionDepths(ostream &fstr, unsigned int type = 0);
00632 
00644   void printFunctionDescendants(ostream &fstr, unsigned int type = 0);
00645 
00648   void printRelationships();
00649   
00663   void groupFunctionsByDepth(map< unsigned int, set< GOFunction * > > &functionsByDepth,
00664                              string functionCategory = "biological_process",
00665                              GOGroupByDepthType groupingType =
00666                              GO_GROUP_BY_MINIMUM_DEPTH);
00667 
00668   // for each cutoff, group the highest functions which annotate that many or fewer genes such that there are no parent-child pairs in the group
00669   void groupFunctionsByCutoff(map< string, map< unsigned int, set< GOFunction * > > > &functionsByCutoff, const vector< unsigned int > &cutoffs, const map< BioFunction, unsigned int > &geneCounts, bool allowDescendants = false);
00670   // group functions using their minimum depths such that there are no parent-child pairs in a group
00671   // this version is needed because the original doesn't prevent parent-child pairs and only considers one category at a time
00672   void groupFunctionsByDepth2(map< string, map< unsigned int, set< GOFunction * > > > &functionsByDepth, bool allowDescendants = false);
00673 
00674   // group functions using their minimum depths such that there are no
00675   // parent-child pairs in a group this version is needed because i
00676   // want to return BioFunctions rather than pointers to GOFunctions.
00677   void groupFunctionsByDepth2(map< string, map< unsigned int, set< BioFunction > > > &functionsByDepth, bool allowDescendants = false);
00683   ~GeneOntology();
00684 
00685 private:
00686 
00687   bool _boolRelationship;
00688   string _strRelationshipFile;
00689   
00690   void _init()
00691     {
00692       _functionCategoryAbbrevs["p"] = "biological_process";
00693       _functionCategoryAbbrevs["c"] = "cellular_component";
00694       _functionCategoryAbbrevs["f"] = "molecular_function";
00695       _legalFunctionCategories.insert("biological_process");
00696       _legalFunctionCategories.insert("cellular_component");
00697       _legalFunctionCategories.insert("molecular_function");
00698     }
00699   
00700   void addIdx(unsigned int id,GOFunction* obj);
00701   void buildIndices();
00702 
00704   bool _isLegalFunctionCategory(string name, string &expandedName) const
00705     {
00706       expandedName = name;
00707       if (_legalFunctionCategories.end() !=
00708           _legalFunctionCategories.find(name))
00709         // name is in _legalFunctionCategories
00710         return(true);
00711       // name may be an abbreviation for a legal category
00712       map< string, string >::const_iterator aitr =
00713         _functionCategoryAbbrevs.find(name);
00714       if (_functionCategoryAbbrevs.end()
00715           != (aitr = _functionCategoryAbbrevs.find(name)))
00716         // name is a legal abbreviation.
00717         {
00718           expandedName = aitr->second;
00719           return(true);
00720         }
00721       
00722       
00723       return(false);
00724     }
00725 
00726 
00727 private:
00728 
00729   string default_namespace;
00730   vector<GOFunction*> functions;
00731   
00732   //represent the integerId->function mapping as a vector since
00733   //it is bounded in size generally linear in count, and
00734   //vector is instant for lookups which are going to
00735   //happen alot with Ids. we can use a map but a vector
00736   //is faster at the cost of using extra space.
00737   vector<GOFunction*> idxId;
00738   map<string,GOFunction*> idxName;
00739 
00740   // various constants associated with GO.
00742   map< string, string > _functionCategoryAbbrevs;
00743   set< string > _legalFunctionCategories;
00744   set< string > _knownFunctionIds;
00745   
00746   bool _topologicalSortComputed;
00747   vector< GOFunction* > _topologicallySortedFunctions;
00748   
00749   // the first key (a string) is the category. the second
00750   // key (unsigned int) is either the number of annotations of a
00751   // function or its minimum depth.
00752   map< string, map< unsigned int, set< GOFunction * > > > _functionsByCutoff;
00753   map< string, map< unsigned int, set< GOFunction * > > > _functionsByDepth;
00754   
00755                 
00756 };
00757 
00758 template < typename InputIterator, typename OutputIterator, typename Extractor >
00759 void GeneOntology::computeMostSpecificFunctionsTemplated(
00760   InputIterator functionsBegin, InputIterator functionsEnd,
00761   OutputIterator specificFunctionsBegin,
00762   Extractor _extractor)
00763 {
00764   bool isAncestor;
00765   InputIterator fitr1, fitr2;
00766   GOFunction *func1, *func2;
00767   // compare all pairs of functions to see if one is an ancestor of the other.
00768   for (fitr1 = functionsBegin; fitr1 != functionsEnd; fitr1++)
00769     {
00770       // assume fitr1 is not an ancestor of any other function.
00771       isAncestor = false;
00772       func1 = getFunctionById(_extractor(*fitr1));
00773       fitr2 = fitr1;
00774       fitr2++;
00775       for (; fitr2 != functionsEnd; fitr2++)
00776         {
00777           func2 = getFunctionById(_extractor(*fitr2));
00778           if (func2->isAncestor(func1))
00779           // *fitr1 is an ancestor of *fitr2. i do not need to make
00780           // any more checks for *fitr1.
00781           {
00782             isAncestor = true;
00783             break;
00784           }
00785         }
00786       if (!isAncestor)
00787         // fitr1 is not an ancestor of any other functions. add it to
00788         // specificFunctionsBegin.
00789         {
00790           *specificFunctionsBegin = *fitr1;
00791           ++specificFunctionsBegin;
00792         }
00793     }
00794 }
00795 
00796 #endif // _GO_H 
 All Classes Functions Variables Typedefs Friends