Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
old-annotations.h
00001 /**************************************************************************
00002  * Copyright (c) 2004-2011 T. M. Murali                                   *
00003  * Copyright (c) 2007-2011 Naveed Massjouni                               *
00004  * Copyright (c) 2009-2011 Christopher L. Poirel                          *
00005  * Copyright (c) 2011 Phillip Whisenhunt                                  *
00006  * Copyright (c) 2011 David Badger                                        *
00007  * Copyright (c) 2010 Clifford Conley Owens III                           *
00008  *                                                                        *
00009  * This file is part of Biorithm.                                         *
00010  *                                                                        *
00011  * Biorithm is free software: you can redistribute it and/or modify       *
00012  * it under the terms of the GNU General Public License as published by   *
00013  * the Free Software Foundation, either version 3 of the License, or      *
00014  * (at your option) any later version.                                    *
00015  *                                                                        *
00016  * Biorithm is distributed in the hope that it will be useful,            *
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00019  * GNU General Public License for more details.                           *
00020  *                                                                        *
00021  * You should have received a copy of the GNU General Public License      *
00022  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00023  *                                                                        *
00024  **************************************************************************/
00025 
00032 #ifndef _ANNOTATIONS_H
00033 #define _ANNOTATIONS_H
00034 
00035 #include <iostream>
00036 #include <fstream>
00037 #include <iterator>
00038 #include <map>
00039 #include <set>
00040 #include <string>
00041 
00042 using namespace std;
00043 
00044 #include "boost/multi_index_container.hpp"
00045 #include "boost/multi_index/member.hpp"
00046 #include "boost/multi_index/mem_fun.hpp"
00047 #include "boost/multi_index/ordered_index.hpp"
00048 #include <boost/multi_index/composite_key.hpp>
00049 
00050 using boost::multi_index_container;
00051 using namespace boost::multi_index;
00052 
00053 // for is_any_of.
00054 #include "boost/algorithm/string/classification.hpp"
00055 // for string split.
00056 #include "boost/algorithm/string.hpp"
00057 
00058 #include "enrichment.h"
00059 #include "myiterator.h"
00060 
00061 #include "biofunction.h"
00062 #include "GO.h"
00063 
00064 // also declared in graph.h
00065 typedef double MyNT;
00066 
00068 //
00069 // there is no good reason old-annotations.h should have this
00070 // declaration. but MyAnnotations::computeEnrichmentsGenGO() has an
00071 // argument of type GenGOParametersOptimisationType. i cannot #include
00072 // gengo.h in this file since gengo.h refers to MyAnnotations.
00073 //
00079 enum GenGOParametersOptimisationType { GENGO_PARAMETER_OPTIMISATION_NONE = 0,
00080                                        GENGO_PARAMETER_OPTIMISATION_LOOP,
00081                                        GENGO_PARAMETER_OPTIMISATION_LEARN,
00082                                        // loop over many values and learn for each value.
00083                                        GENGO_PARAMETER_OPTIMISATION_LOOP_AND_LEARN,
00084 };
00085 
00086 
00087 
00088 
00089 enum MyGainAnnotationType { NOT_ANNOTATED_STATE = -1, HYPOTHETICAL_STATE,
00090                         ANNOTATED_STATE};
00091 
00093 inline MyGainAnnotationType makeAnnotationType(string c)
00094 {
00095   if ("-1" == c)
00096     return(NOT_ANNOTATED_STATE);
00097   if ("0" == c)
00098     return(HYPOTHETICAL_STATE);
00099   if ("1" == c)
00100     return(ANNOTATED_STATE);
00101   cerr << "Unknown annotation state" << c << endl;
00102   exit(-1);
00103 }
00104 
00105 extern map< string, string > GO_UNKNOWN_FUNCTIONS;
00106 
00107 //     *   TAS / IDA
00108 //     *
00109 //           o IMP = / IGI / IPI
00110 //           o ISS / IEP
00111 //           o NAS
00112 //           o
00113 //                 + = IEA
00114 
00118 struct GOEvidenceCodes
00119 {
00120 private:
00121   map< string, MyNT >_weights;
00122 public:
00123 
00125   GOEvidenceCodes()
00126     {
00127         /*
00128       _weights["TAS"] = 0.9;
00129       _weights["IDA"] = 0.9;
00130       _weights["IMP"] = 0.7;
00131       _weights["IGI"] = 0.7;
00132       _weights["IPI"] = 0.7;
00133       _weights["IEP"] = 0.6;
00134       _weights["ISS"] = 0.6;
00135       _weights["IC"] = 0.5;
00136       _weights["NAS"] = 0.4;
00137       _weights["RCA"] = 0.4;
00138       _weights["IEA"] = 0.1;
00139       */
00140       
00141       // Brett Tyler's recommended weightings:
00142       //
00143       // "Note that TAS is weak because the nature of the evidence is not
00144       // determined. GO considers ISS and IEP to be special cases of RCA so
00145       // all should have similar weightings. IC is used very conservatively so
00146       // is usually reliable."
00147       _weights["IDA"] = 0.9;
00148       _weights["IMP"] = 0.9;
00149       _weights["IGI"] = 0.9;
00150       _weights["IC"] = 0.8;
00151       _weights["IPI"] = 0.6;
00152       _weights["IEP"] = 0.6;
00153       _weights["ISS"] = 0.6;
00154       _weights["RCA"] = 0.6;
00155       _weights["TAS"] = 0.6;
00156       _weights["NAS"] = 0.3;
00157       _weights["IEA"] = 0.2;
00158     }
00159   
00165   MyNT getWeight(string code) const
00166     {
00167       static set< string > _unknownCodes;
00168       map< string, MyNT >::const_iterator witr = _weights.find(code);
00169       if (_weights.end() == witr)
00170         {
00171           if (_unknownCodes.end() == _unknownCodes.find(code))
00172             {
00173               cerr << "GOEvidenceCodes::getWeight(): unknown evidence code "
00174                    << code << ", returning weight of 0" << endl;
00175               _unknownCodes.insert(code);
00176             }
00177           return(0);
00178         }
00179       
00180       return(witr->second);
00181     }
00182 
00186   bool hasWeight(string code) const
00187     {
00188       map< string, MyNT >::const_iterator witr = _weights.find(code);
00189       return(_weights.end() != witr);
00190     }
00191   
00192 
00198   bool isExperimental(string code) const
00199     {
00200       if ("TAS" == code)
00201         return(true);
00202       if ("IDA" == code)
00203         return(true);
00204       if ("IMP" == code)
00205         return(true);
00206       if ("IGI" == code)
00207         return(true);
00208       if ("IPI" == code)
00209         return(true);
00210       if ("IEP" == code)
00211         return(true);
00212       if ("ISS" == code)
00213         return(false);
00214       if ("IC" == code)
00215         return(true);
00216       if ("NAS" == code)
00217         return(true);
00218       if ("RCA" == code)
00219         return(false);
00220       if ("IEA" == code)
00221         return(false);
00222     }
00223 
00229   void checkEvidenceCodeWeights(const map< string, unsigned int > &ecUsageCounts, ostream &logStream);
00230 
00231 
00242   void readEvidenceCodeWeights(string ecwFile);
00243 };
00244 
00245 // read functions from file. each line contains one function.
00246 template < class container >
00247 void readFunctions(string file, container &functions)
00248 {
00249   cout << "Reading items from " << file << endl;
00250   ifstream ifstr(file.c_str());
00251   if (!ifstr)
00252     {
00253       cerr << "Cannot open file " << file << endl;
00254       exit(-1);
00255     }
00256   string func, line;
00257   vector< string > items;
00258   while (!ifstr.eof())
00259     {
00260       getline(ifstr, line);
00261       // ignore comments.
00262       if ('#' == line[0])
00263         continue;
00264       items.clear();
00265       boost::split(items, line, boost::is_any_of("\t"));
00266       if (items.size() == 0)
00267         continue;
00268       func = items[0];
00269       if (func=="")
00270         continue;
00271       functions.insert(func);
00272     }
00273   ifstr.close();
00274   cout << "Read " << functions.size() << " items." << endl;
00275 }
00276  
00277 // tell doxygen to skip
00281 struct annotation_gene_function_evidencecode_index {};
00282 struct annotation_gene_function_index {};
00283 struct annotation_gene_functiontype_index {};
00284 struct annotation_gene_index {};
00285 struct annotation_function_index {};
00286 //struct algorithm_index {};
00287 
00288 struct annotation_function_gene_index {};
00293 
00294 
00295 
00296 
00297 
00298 
00299 struct FunctionalAnnotation
00300 {
00301 public:
00302   // these need to be public for SetOfAnnotations.
00303   string _gene;
00304   BioFunction _function;
00305   string _evidenceCode;
00306   MyGainAnnotationType _annotationStatus;
00307   // must be false when _annotationStatus is not ANNOTATED_STATE.
00308   bool _mostSpecificAnnotation;
00309   
00310 public:
00311   FunctionalAnnotation(string g, string f, string c, string e,
00312                        MyGainAnnotationType status, bool msa = false)
00313       : _gene(g), _function(c, f), _evidenceCode(e),
00314         _annotationStatus(status),_mostSpecificAnnotation(msa)
00315     {}
00316   FunctionalAnnotation(string g, const BioFunction &f, string e,
00317                        MyGainAnnotationType status, bool msa = false)
00318       : _gene(g), _function(f), _evidenceCode(e), _annotationStatus(status),
00319         _mostSpecificAnnotation(msa)
00320     {}
00321 
00323   string getFunctionType() const
00324     {
00325       return(_function.getCategory());
00326     }
00327   
00328 };
00329 
00330 
00331 // // extracts a FunctionalAnnotation's function type. See "User-defined
00332 // // key extractors" in
00333 // // http://www.boost.org/doc/libs/1_36_0/libs/multi_index/doc/tutorial/key_extraction.html
00334 // struct FunctionalAnnotationFunctionType
00335 // {
00336 //   // result_type typedef required by Key Extractor concept
00337 //   typedef string result_type; 
00338   
00339 //   string operator()(const FunctionalAnnotationFunctionType& fa) const // operator() must be const
00340 //   {
00341 //     return fa._function.getCategory();
00342 //   }
00343 // };
00344 
00345 
00355 typedef multi_index_container<
00356   // the class whose instances we will store in the multi_index_container.
00357   FunctionalAnnotation,
00358   // specify all the indices now.
00359   indexed_by<
00360 
00361     // Define multi-column primary key.  Order first by
00362     // FunctionalAnnotation::_gene, then by
00363     // FunctionalAnnotation::_function, and finally by
00364     // FunctionalAnnotation::_evidenceCode. Use
00365     // annotation_gene_function_evidencecode_index as the tag for this
00366     // index. This index is unique since a specific (function, gene)
00367     // pair may occur exactly once. Hence, it serves as the primary key for the multi_index.
00368     ordered_unique<
00369       // name by which to refer to this index.
00370       tag< annotation_gene_function_evidencecode_index >,
00371       // specify the composite key now.
00372       composite_key<
00373         // the class whose members form the composite key. typically, it is the same class as the one we are storing in the multi_index.
00374         FunctionalAnnotation,
00375         // specify how to extract each element of the key. the first
00376         // item says that we want a member of FunctionalAnnotation,
00377         // its type is string, and it can be accessed through the
00378         // _gene member variable.
00379         member< FunctionalAnnotation, string, &FunctionalAnnotation::_gene>,
00380         member< FunctionalAnnotation, BioFunction, &FunctionalAnnotation::_function >,
00381         member< FunctionalAnnotation, string, &FunctionalAnnotation::_evidenceCode >
00382         >
00383 //       composite_key_compare<
00384 //       std::less<std::string>, // genes sorted by default
00385 //       std::less<std::string>, // functions sorted by default
00386 //       std::less<std::string>  // evidencecodes sorted by default
00387 //       >
00388     >,
00389     
00390     // specify other non-primary indices now.
00391     //
00392     // there can be many evidence codes for a gene-function pair.
00393     ordered_non_unique<
00394     tag< annotation_gene_function_index >,
00395     composite_key<
00396     FunctionalAnnotation,
00397     member< FunctionalAnnotation, string, &FunctionalAnnotation::_gene >,
00398     member< FunctionalAnnotation, BioFunction, &FunctionalAnnotation::_function >
00399   >,
00400     composite_key_compare<
00401       std::less<std::string>, // genes sorted by default
00402       std::less< BioFunction >  // functions sorted by default
00403   >
00404   >,
00405 
00406     // Order first by FunctionalAnnotation::_function and then by
00407     // FunctionalAnnotation::_gene. Use annotation_function_gene_index
00408     // as the tag for this index. This index is non-unique since a
00409     // specific (function, gene) pair may occur multiple times.
00410     ordered_non_unique<
00411     tag< annotation_function_gene_index >,
00412     composite_key<
00413     FunctionalAnnotation,
00414     member< FunctionalAnnotation, BioFunction, &FunctionalAnnotation::_function >,
00415     member< FunctionalAnnotation, string, &FunctionalAnnotation::_gene >
00416       >,
00417       composite_key_compare<
00418         std::less< BioFunction >,  // functions sorted by default
00419         std::less<std::string> // genes sorted by default
00420         >
00421       >,
00422 
00423     
00424     // Order first by FunctionalAnnotation::_gene and then the value
00425     // returned by the member function
00426     // FunctionalAnnotation::getFunctionType(). Use
00427     // annotation_gene_functiontype_index as the tag for this
00428     // index. This index is non-unique since a specific (gene,
00429     // functiontype) pair may occur multiple times.
00430     ordered_non_unique<
00431       tag< annotation_gene_functiontype_index >,
00432       composite_key<
00433         FunctionalAnnotation,
00434         member< FunctionalAnnotation, string, &FunctionalAnnotation::_gene >,
00435         // see http://www.boost.org/doc/libs/1_36_0/libs/multi_index/doc/tutorial/key_extraction.html#const_mem_fun
00436         const_mem_fun< FunctionalAnnotation, string, &FunctionalAnnotation::getFunctionType >
00437         >
00438       >,
00439 
00440     // sort by less<string> on _gene
00441     ordered_non_unique< tag< annotation_gene_index >,
00442                         member< FunctionalAnnotation, std::string,
00443                                 &FunctionalAnnotation::_gene > >,
00444     
00445     // sort on _function
00446     ordered_non_unique< tag< annotation_function_index >,
00447                         member< FunctionalAnnotation, BioFunction,
00448                                 &FunctionalAnnotation::_function > >//,
00449     
00450     > // matches indexed_by 
00451   > SetOfFunctionalAnnotations;
00452 
00500 class MyAnnotations
00501 {
00502 
00503 private:
00504 //  typedef map< string, map< string, map< string, MyGainAnnotationType > > >
00505   typedef set< string > BioFunctionSet;
00506   typedef map< string, BioFunctionSet > MyGenesToBioFunctionTypes;
00507   typedef map< string,  MyGenesToBioFunctionTypes > MyGenesToFunctions;
00508   // first "string" is the function type, second "string" is the
00509   // function, third "string" is the gene/protein.
00510 //  typedef map< string, map< string, MyGainAnnotationType > > BioFunctionToGenes;
00511   typedef set< string > MyGeneSet;
00512   typedef map< string, MyGeneSet > BioFunctionToGenes;
00513   typedef map< string, BioFunctionToGenes > BioFunctionTypesToGenes;
00514 
00515   typedef BioFunctionToGenes::iterator _BioFunctionsIterator;
00516   typedef BioFunctionToGenes::const_iterator _BioFunctionsConstIterator;
00517   typedef BioFunctionTypesToGenes::iterator _NewBioFunctionsIterator;
00518 
00519   // multi_index related typedefs to make accessing various indices easier.
00520   typedef SetOfFunctionalAnnotations::index< annotation_function_index >::type FunctionIndex;
00521   typedef SetOfFunctionalAnnotations::index< annotation_function_gene_index >::type FunctionGeneIndex;
00522   typedef SetOfFunctionalAnnotations::index< annotation_gene_index >::type GeneIndex;
00523 
00524   
00525   SetOfFunctionalAnnotations _allAnnotations;
00526   
00527   SetOfFunctionalAnnotations _allNonAnnotations;
00528   
00529   // default constructor will initialise weights, which can be overridden by MyAnnotations::readEvidenceCodeWeights().
00530   GOEvidenceCodes _evidenceCodes;
00531   // the number of times each evidence code is used in an annotation.
00532   map< string, unsigned int > _evidenceCodeCountsInAllAnnotations;
00533   
00534   // list of genes annotated by at least one function of a given type.
00535 //  map< string, map< string, MyGainAnnotationType > > functionTypeAnnotatedGenes;
00536   map< string, set< string > > functionTypeAnnotatedGenes;
00537 
00538   // list of genes not annotated by at any function of a given type.
00539 //  map< string, map< string, MyGainAnnotationType > > functionTypeNonAnnotatedGenes;
00540   map< string, set< string > > functionTypeNonAnnotatedGenes;
00541   
00542   // for each function, list of genes annotating it.
00543   BioFunctionTypesToGenes functionsAnnotatedGenes;
00544 
00545   // for each function, list of genes non-annotated/hypothetical wrt function.
00546   BioFunctionTypesToGenes functionsNonAnnotatedGenes;
00547 
00548   
00549 MyGenesToFunctions geneAnnotatedFunctions;
00550   MyGenesToFunctions geneNonAnnotatedFunctions;
00551 
00552   // for each function, the number of genes annotated with the function. 
00553   map< string, unsigned int > numFunctionAnnotatedGenes;
00554   // for each function, the number of genes
00555   // non-annotated/hypothetical with the function.
00556   map< string, unsigned int > numFunctionNonAnnotatedGenes;
00557   
00558   map< string, MyNT > functionFrequencies;
00559 
00560   // GO related variables, has true path rule been applied.
00561   GeneOntology *goDAG;
00562   bool _truePathRuleAppliedUpward;
00563   bool _truePathRuleAppliedDownward;
00564   bool _truePathRuleAppliedSideways;
00565 
00566   map< string, set< string > > _functionsWithOverlappingAnnoations;
00567   
00568   bool _overlappingFunctionsFlag;
00569   
00570 public:  
00571 
00572   class NewBioFunctionsIterator;
00573   friend class NewBioFunctionsIterator;
00574   
00581   class BioFunctionsIterator: public  MyIterator< string >
00582   {
00583   private:
00584     _BioFunctionsIterator start;
00585     _BioFunctionsIterator end;
00586     _BioFunctionsIterator current;
00587     
00588   public:
00589     BioFunctionsIterator(const _BioFunctionsIterator _start,
00590                          const _BioFunctionsIterator _end)
00591         : start(_start), end(_end), current(_start)
00592       {}
00593     
00594     virtual bool hasNext()
00595       {
00596     return(current != end);
00597       }
00598     
00599     string &next()
00600       {
00601         // BUBUGBUG UGLY HACK. I have defined MyIterator::next() to return
00602         // a T& so that when T is a class, i can return a ref. but when T
00603         // is a string, i get compilation errors without this const_cast.
00604         string &_next(const_cast< string & >((*current).first));
00605         current++;
00606         return(_next);
00607       }
00608   };
00609 
00616   class BioFunctionsConstIterator: public  MyIterator< string >
00617   {
00618   private:
00619     _BioFunctionsConstIterator start;
00620     _BioFunctionsConstIterator end;
00621     _BioFunctionsConstIterator current;
00622     
00623   public:
00624     BioFunctionsConstIterator(_BioFunctionsConstIterator _start,
00625                               _BioFunctionsConstIterator _end)
00626         : start(_start), end(_end), current(_start)
00627       {}
00628     
00629     virtual bool hasNext()
00630       {
00631         return(current != end);
00632       }
00633     
00634     string &next()
00635       {
00636         // BUBUGBUG UGLY HACK. I have defined MyIterator::next() to return
00637         // a T& so that when T is a class, i can return a ref. but when T
00638         // is a string, i get compilation errors without this const_cast.
00639         string &_next(const_cast< string & >((*current).first));
00640         // string & _next((*current).first);
00641         current++;
00642         return(_next);
00643       }
00644   };
00645   
00654 class BioFunctionsIteratorMultiIndex: public  MyIterator< BioFunction >
00655 {
00656 private:
00657   // the reference to the set of all annotations passed in the constructor.
00658   const SetOfFunctionalAnnotations & _allAnnotations;
00659   // the reference to the index over the functions in _allAnnotations.
00660   const FunctionIndex &_functionIndex;
00661   FunctionIndex::iterator _startItr;
00662   FunctionIndex::iterator _endItr;
00663   // the pointer to the current function.
00664   FunctionIndex::iterator _currentItr;
00665   // store the id of the current function.
00666   BioFunction _currentFunction;
00667 
00668   
00669 public:
00671   BioFunctionsIteratorMultiIndex(const SetOfFunctionalAnnotations &_anns)
00672       : _allAnnotations(_anns), _functionIndex(_allAnnotations.get< annotation_function_index >()),
00673         _startItr(_allAnnotations.get< annotation_function_index >().begin()),
00674         _endItr(_allAnnotations.get< annotation_function_index >().end()), _currentItr(_startItr), 
00675         _currentFunction()
00676     {
00677       // sometime _startItr might be equal to _endItr, so I am being careful about initialising _currentFunction.
00678       if (_startItr != _endItr)
00679         _currentFunction = _currentItr->_function;
00680     }
00682   virtual bool hasNext()
00683   {
00684     return(_currentItr != _endItr);
00685   }
00687   BioFunction &next()
00688   {
00689     _currentFunction = _currentItr->_function;
00690     _moveCurrentItr();
00691     return(_currentFunction);
00692   }
00693 private:      
00694   // move _currentItr to point to the start of the next function.
00695   void _moveCurrentItr()
00696     {
00697       _currentItr = _functionIndex.upper_bound(_currentFunction);
00698     }
00699   
00700 };
00701   
00702   
00703 public:
00704   MyAnnotations()
00705     : functionTypeAnnotatedGenes(), functionTypeNonAnnotatedGenes(),
00706          functionsAnnotatedGenes(), functionsNonAnnotatedGenes(),
00707          geneAnnotatedFunctions(),
00708          geneNonAnnotatedFunctions(),
00709          numFunctionAnnotatedGenes(),
00710       numFunctionNonAnnotatedGenes(),
00711       _evidenceCodes(), _evidenceCodeCountsInAllAnnotations(),
00712       functionFrequencies(),
00713       goDAG(NULL), _truePathRuleAppliedUpward(false), _truePathRuleAppliedDownward(false),
00714       _truePathRuleAppliedSideways(false), _overlappingFunctionsFlag(false)
00715   {
00716     // set up static variables.
00717     _initialise();
00718   }
00719 
00720   virtual ~MyAnnotations()
00721   {}
00722 
00723   void setOverlappingFunctionsFlag()
00724   {
00725      _overlappingFunctionsFlag = true;
00726   }
00727 
00729   void addAnnotation(string gene, const BioFunction &function,
00730                      string evidenceCode,
00731                      MyGainAnnotationType status = ANNOTATED_STATE,
00732                      bool msa = false);
00733 
00735   void addAnnotation(string gene, string function, string funcType,
00736                      string evidenceCode,
00737                      MyGainAnnotationType status = ANNOTATED_STATE,
00738                      bool msa = false);
00739   
00740 
00741   
00748 
00786   void applyTruePathRule(GeneOntology &go, bool applyUpward = true, bool applyDownward = true, bool applySideways = true);
00787 
00788 
00810   bool checkTruePathRule(GeneOntology &go, ostream *ostr = NULL,
00811                          bool *annotationsOK = NULL, bool *unknownOK = NULL);// const;
00812 
00813 
00836   void computeDifference(MyAnnotations &other, MyAnnotations &difference,
00837                          bool restrictToCommonGenes = false);
00838 
00844   void computeEnrichments(const set< string > &geneSet,
00845                           const GeneOntology *go,
00846                           vector<EnrichmentRecord< string, string> > &rejalts);
00847   
00863   void computeEnrichmentsGenGO(const set< string > &geneSet,
00864                                vector<EnrichmentRecord< string, string> > &rejalts,
00865                                const GeneOntology *go = NULL,
00866                                GenGOParametersOptimisationType optType = GENGO_PARAMETER_OPTIMISATION_NONE
00867                                ) const;
00868 
00878   void computeEnrichmentsGenGO(const set< string > &geneSet,
00879                                string functionType,
00880                                vector<EnrichmentRecord< string, string> > &rejalts,
00881                                const GeneOntology *go = NULL,
00882                                GenGOParametersOptimisationType optType = GENGO_PARAMETER_OPTIMISATION_NONE
00883                                ) const;
00884 
00888   MyNT computeFunctionFrequency(string funcType, string func);
00889 
00890   // compute various statistics.
00891   void computeStatistics();
00892   
00893   // count the number of genes each function annotates (NOT the number of annotations)
00894   void computeGeneCounts(map< BioFunction, unsigned int > &geneCounts);
00895   
00896   void copyAnnotationsForFunctions(const set< BioFunction > &functions, MyAnnotations &annotations);
00897   
00898 
00904   virtual MyAnnotations::BioFunctionsIterator functions(string type)
00905     {
00906       MyAnnotations::BioFunctionsIterator itr(functionsAnnotatedGenes[type].begin(),
00907                                               functionsAnnotatedGenes[type].end());
00908       return(itr);
00909     }
00910 
00913   virtual MyAnnotations::BioFunctionsIteratorMultiIndex functionsMultiIndex() const
00914     {
00915       // MyAnnotations::BioFunctionsIteratorMultiIndex itr(_allAnnotations.begin(), _allAnnotations.end(), _allAnnotations);
00916       MyAnnotations::BioFunctionsIteratorMultiIndex itr(_allAnnotations);
00917       return(itr);
00918     }
00919   
00926   set< string > functionTypes() const
00927     {
00928       set< string > types;
00929       BioFunctionTypesToGenes::const_iterator itr;
00930       for (itr = functionsAnnotatedGenes.begin(); itr != functionsAnnotatedGenes.end(); itr++)
00931         types.insert((*itr).first);
00932       return(types);
00933     }
00934 
00949   MyNT getAnnotatedFunctionWeight(string gene, string func, string type);
00950 
00962   MyNT getAnnotatedFunctionWeightProbabilisticOr(string gene, string func, string type);
00963   
00976   map< string, MyNT > getAnnotatedFunctionWeights(string gene, string func, string type);
00977 
00995   // note: this method is used in Reporter::printECWeightedCurveDataFromCV().
00996   map< string, MyNT > getAnnotatedWeights(string gene, string type);
00997 
01004   void getAnnotatedGenes(string functype, set< string >& annotatedGenes);
01005 
01018   void getAnnotations(string type, map< string, set< string > > &annotations) const;
01019   
01033   void getAnnotationsByGene(string type, map< string, set< string > > &annotations) const;
01034   
01047   virtual void getAnnotations(string type, string function,
01048                               set< string >  &annotations) const;
01049   
01058   void getAnnotations(const GOFunction * function, set< string > &annotations) const;
01059 
01069   void getAnnotations(const set< GOFunction *>  &functions,
01070                       set< string >  &annotations) const;
01071 
01089   virtual void getAnnotationsForGene(string type, string gene,
01090                                      set< string >  &annotations) const;
01091 
01092 
01099   void getEvidenceCodeCounts(map< string, unsigned int > &ecCounts) const
01100     {
01101       ecCounts = _evidenceCodeCountsInAllAnnotations;
01102     }
01103   
01117   void getEvidenceCodeCounts(string funcType, string funcId, map< string, unsigned int > &ecCounts) const;
01118   
01119   
01120   
01122   MyNT getFunctionFrequency(string function) const
01123     {
01124       map< string, MyNT >::const_iterator itr;
01125       itr = functionFrequencies.find(function);
01126       if (itr == functionFrequencies.end())
01127         return(0);
01128       return((*itr).second);
01129     }
01130 
01140   string getFunctionType(string functionName) const;
01141   
01142 
01146   bool hasExperimentallyAnnotatedFunction(string gene, string category) const
01147     {
01148       // TODO is there a better way to do this perhaps by using a gene-evidencecode index?
01149       const GeneIndex &gi = _allAnnotations.get< annotation_gene_index >();
01150       GeneIndex::const_iterator start = gi.lower_bound(gene), end = gi.upper_bound(gene);
01151       for (GeneIndex::const_iterator gitr = start; gitr != end; gitr++)
01152         {
01153           if (_evidenceCodes.isExperimental(gitr->_evidenceCode) && gitr->_function.getCategory() == category)
01154             return (true);
01155         }
01156       return (false);
01157     }
01158   
01159         
01161   bool hasAnnotatedFunction(string gene) const
01162     {
01163       const GeneIndex &gi = _allAnnotations.get< annotation_gene_index >();
01164       return (gi.end() != gi.find(gene));
01165       //return(geneAnnotatedFunctions.end() !=
01166       //       geneAnnotatedFunctions.find(gene));
01167     }
01168 
01170   bool hasAnnotatedFunction(string gene, string category) const
01171     {
01172       return(hasAnnotatedFunction(gene) &&
01173              // how i can check in this category.
01174              (geneAnnotatedFunctions.find(gene)->second.end() !=
01175               geneAnnotatedFunctions.find(gene)->second.find(category)));
01176     }
01177         
01198   // method's name used to be hasNoFunction().
01199    bool isNotKnownToBeAnnotatedWithFunction(string gene, string func, string type)
01200     {
01201       if (!_truePathRuleAppliedDownward)
01202         // use the explicit check.
01203         return(isNotKnownToBeAnnotatedWithFunctionSansTruePathRule(gene, func, type, *goDAG));
01204     
01205       if(_overlappingFunctionsFlag)
01206       {
01207           if (haveOverlappingFunction(gene, func, type))
01208              return true;      
01209       }
01210 
01211 
01212       // since _truePathRuleAppliedDownward is false, use
01213       // geneNonAnnotatedFunctions to answer the query.
01214       MyGenesToFunctions::const_iterator gfItr;
01215       MyGenesToBioFunctionTypes::const_iterator gbftItr;
01216       // gene may have no function stored at all!
01217       if (geneAnnotatedFunctions.end() ==
01218           (gfItr = geneAnnotatedFunctions.find(gene)))
01219         return(true);
01220       // gene may have no function stored for type.
01221       if (gfItr->second.end() == (gbftItr = gfItr->second.find(type)))
01222         return(true);
01223       // function stored for type is unknown function.
01224       if (gbftItr->second.end() !=
01225           gbftItr->second.find(GO_UNKNOWN_FUNCTIONS[type]))
01226         return(true);
01227       // gene is non-annotated with respect to function, i.e. an
01228       // ancestor of function is the most specific function
01229       // annotating gene.
01230       if ((geneNonAnnotatedFunctions[gene][type].end() !=
01231            geneNonAnnotatedFunctions[gene][type].find(func))
01232 //           &&
01233 //           // this check is redundant because i put a func in
01234 //           // geneNonAnnotatedFunctions only if the state is
01235 //           // HYPOTHETICAL_STATE.
01236 //           (HYPOTHETICAL_STATE ==
01237 //            geneNonAnnotatedFunctions[gene][type][func])
01238           )
01239         return(true);
01240 
01241       // gene may have no function stored at all!
01242       if (geneAnnotatedFunctions.end() ==
01243           geneAnnotatedFunctions.find(gene))
01244         return(true);
01245       // gene may have no function stored for type.
01246       if (geneAnnotatedFunctions[gene].end() ==
01247          geneAnnotatedFunctions[gene].find(type))
01248         return(true);
01249       // function stored for type is unknown function.
01250       if (geneAnnotatedFunctions[gene][type].end() !=
01251           geneAnnotatedFunctions[gene][type].find(GO_UNKNOWN_FUNCTIONS[type]))
01252         return(true);
01253       // gene is non-annotated with respect to function, i.e. an
01254       // ancestor of function is the most specific function
01255       // annotating gene.
01256       if ((geneNonAnnotatedFunctions[gene][type].end() !=
01257            geneNonAnnotatedFunctions[gene][type].find(func))
01258 //           &&
01259 //           // this check is redundant because i put a func in
01260 //           // geneNonAnnotatedFunctions only if the state is
01261 //           // HYPOTHETICAL_STATE.
01262 //           (HYPOTHETICAL_STATE ==
01263 //            geneNonAnnotatedFunctions[gene][type][func])
01264           )
01265         return(true);
01266 
01267 
01268       return(
01269         // gene may have no function stored at all!
01270         (geneAnnotatedFunctions.end() ==
01271          geneAnnotatedFunctions.find(gene)) || 
01272         // gene may have no function stored for type.
01273         (geneAnnotatedFunctions[gene].end() ==
01274          geneAnnotatedFunctions[gene].find(type)) ||
01275         // function stored for type is unknown function.
01276         (geneAnnotatedFunctions[gene][type].end() !=
01277          geneAnnotatedFunctions[gene][type].find(GO_UNKNOWN_FUNCTIONS[type])) ||
01278         // gene is non-annotated with respect to function, i.e. an
01279         // ancestor of function is the most specific function
01280         // annotating gene.
01281         ((geneNonAnnotatedFunctions[gene][type].end() !=
01282           geneNonAnnotatedFunctions[gene][type].find(func))
01283          //          &&
01284          //          // this check is redundant because i put a func in
01285          //          // geneNonAnnotatedFunctions only if the state is
01286          //          // HYPOTHETICAL_STATE.
01287          //          (HYPOTHETICAL_STATE ==
01288          //           geneNonAnnotatedFunctions[gene][type][func])
01289          )
01290         );
01291     }
01292 
01318   bool isNotKnownToBeAnnotatedWithFunctionSansTruePathRule(string gene, string func, string type,
01319                                                          GeneOntology &goDAG);
01320 
01321 
01324   bool hasAnnotatedGene(string func, string type) const
01325     {
01326       BioFunctionTypesToGenes::const_iterator itr = functionsAnnotatedGenes.find(type);
01327       if (functionsAnnotatedGenes.end() == itr)
01328         // this type has no function.
01329         return(false);
01330       BioFunctionToGenes::const_iterator gitr = itr->second.find(func);
01331       if (itr->second.end() == gitr)
01332         // the function does not annotate any gene.
01333         return(false);
01334       // check if there indeed are any genes stored at gitr.
01335       if (gitr->second.empty())
01336         return(false);
01337       return(true);
01338     }
01339   
01341   bool hasAnnotatedGene(const BioFunction &function) const
01342     {
01343       return(hasAnnotatedGene(function.getId(), function.getCategory()));
01344     }
01345   
01346   
01347   // return true if gene1 and gene2 have the given function.
01348   bool haveSameFunction(string gene1, string gene2, const BioFunction &function);
01349   // return true if gene1 and gene2 are both annotated with at least
01350   // one of the functions in the vector functionVector.
01351   bool haveSameFunction(string gene1, string gene2,
01352                                        const vector< BioFunction >& functionVector);
01353   template < class InputIterator >
01354   bool haveSameFunction(string gene1, string gene2,
01355                                        InputIterator first, InputIterator last);
01356   // return true if gene1 and gene2 have the same function of the given type.
01357   bool haveSameFunction(string gene1, string gene2, string type);
01358   // return true if gene1 and gene2 have the same function (of any type).
01359   bool haveSameFunction(string gene1, string gene2);
01360 
01361   // return true if the annotation of gene1 overlapps with function.
01362   bool haveOverlappingFunction(string gene1, string function, string type);
01363 
01364 
01365   
01381   //
01382   // WARNING: this method works correctly only with the multi_index
01383   // structure, since the multi_index stores whether an annotation is
01384   // the most specific annotation or not.
01385   bool isAnnotatedWithFunction(string gene, string func, string type, bool beforeTruePathRule = false);
01386   
01387  
01391   bool isAnnotatedWithFunctionExperimentally(string gene, string func, string type);
01392 
01399   bool isUnknownFunction(string func, string type) const
01400     {
01401       map< string, string >::const_iterator itr = GO_UNKNOWN_FUNCTIONS.find(type);
01402       return((GO_UNKNOWN_FUNCTIONS.end() != itr)
01403              && (func == itr->second));
01404     }
01405 
01410   //void keepAnnotationsForFunctions(set< string > &functionSet);
01411 
01415   void keepAnnotationsForGenes(set< string > &geneSet);
01416 
01417   
01419   unsigned int numAnnotations(string category = "");
01420   
01424   // use to be numNonAnnotations.
01425   unsigned int numUnknownAnnotations(string category = "");
01426   
01433   unsigned int numAnnotatedGenes(string funcType, string function)
01434     {
01435       return(functionsAnnotatedGenes[funcType][function].size());
01436 //    return(_allAnnotations.get< annotation_function_index >().count(function));
01437     }
01438 
01440   unsigned int numAnnotatedGenes(string funcType = "")
01441   {
01442     if ("" != funcType)
01443       return(functionTypeAnnotatedGenes[funcType].size());
01444     set< string > funcTypes = functionTypes();
01445     set< string >::iterator ftItr;
01446     unsigned int sum = 0;
01447     for (ftItr = funcTypes.begin(); ftItr != funcTypes.end(); ftItr++)
01448       sum += functionTypeAnnotatedGenes[*ftItr].size();
01449       return(sum);
01450   }
01451   
01453   unsigned int numAnnotatingFunctions(string gene, string funcType)
01454   {
01455     return(geneAnnotatedFunctions[gene][funcType].size());
01456   }
01457 
01465   unsigned int numAnnotatingFunctions(string funcType = "")
01466     {
01467       if ("" != funcType)
01468         return(functionsAnnotatedGenes[funcType].size());
01469       set< string > funcTypes = functionTypes();
01470       set< string >::iterator ftItr;
01471       unsigned int sum = 0;
01472       for (ftItr = funcTypes.begin(); ftItr != funcTypes.end(); ftItr++)
01473         sum += numAnnotatingFunctions(*ftItr);
01474       return(sum);
01475     }
01476   
01478   unsigned int numNonAnnotatedGenes(string funcType)
01479   {
01480 #ifdef DEBUG
01481     cout << "function type is " << funcType
01482          << ", unknown function is " << GO_UNKNOWN_FUNCTIONS[funcType]
01483          << ", #non-annotated genes is "
01484          << functionsNonAnnotatedGenes[funcType][GO_UNKNOWN_FUNCTIONS[funcType]].size()
01485          << " or " << functionTypeNonAnnotatedGenes[funcType].size()
01486          << endl;
01487     
01488 #endif    
01489     return(functionTypeNonAnnotatedGenes[funcType].size());
01490   }
01491 
01495   unsigned int numTotalNonAnnotatedGenes(string function, string funcType)
01496   {
01497     return(numNonAnnotatedGenes(funcType) + 
01498            numNonAnnotatedGenes(funcType, function));
01499   }
01500 
01501   // return only how many genes are non-annotated wrt function.
01502   unsigned int numNonAnnotatedGenes(string funcType, string function)
01503   {
01504     return(functionsNonAnnotatedGenes[funcType][function].size());
01505   }
01506   
01508   // was numNonAnnotatingFunctions.
01509   unsigned int numUnknownAnnotatingFunctions(string gene, string funcType)
01510   {
01511     return(geneNonAnnotatedFunctions[gene][funcType].size());
01512   }
01513 
01514 
01515   // a gene is negative wrt function if it is annotated in funcType
01516   // (this number will/should not include genes that are not
01517   // annotated by any function in funcType) and does not annotate
01518   // function and is also not non-annotated wrt function.
01519   unsigned int numNegativeGenes(string funcType, string function)
01520   {
01521     return(numAnnotatedGenes(funcType) - 
01522            numAnnotatedGenes(funcType, function) -
01523            numNonAnnotatedGenes(funcType, function)
01524            );
01525   }
01526 
01528   void print(string fileName);
01529 
01534   void printWithName(string fileName);
01535 
01538   void printNumAnnotationsPerFunction(ostream &numFunStream);
01539   
01561   void read(string fileName, string dataType = "unweighted",
01562             const map< string, set< string > > *nodeAliases = NULL,
01563             bool mostSpecificAnnotations = false,
01564             const set< string > *evidenceCodesToIgnore = NULL);
01565 
01566 
01581   void read(string fileName, unsigned int keySize);
01582   
01583 
01602   void readGO(string fileName, unsigned int altSymbolCol=0);
01603 
01610   void readEvidenceCodeWeights(string ecwFile)
01611     {
01612       _evidenceCodes.readEvidenceCodeWeights(ecwFile);
01613       _evidenceCodes.checkEvidenceCodeWeights(_evidenceCodeCountsInAllAnnotations, cerr);
01614     }
01615 
01630   void subtract(MyAnnotations &other, MyAnnotations &difference);
01631 
01632 
01635   void storeGODAG(GeneOntology &go)
01636     {
01637       goDAG = &go;
01638     }
01639 
01640   void readOverlappingFunctions(string fileName);
01641   
01642 private:
01643 
01644   // remove genes and functions with no annotations.
01645   void _cleanUp();
01646   
01653   unsigned int _copyAnnotatedGenes(const BioFunction &source, const BioFunction &target);
01654 
01675   unsigned int _copyAnnotatedGenesToNonAnnotated(const BioFunction &source, const BioFunction &target,
01676                                                  bool force = false,
01677                                                  MyAnnotations *original = NULL);
01678   
01679 
01680   
01687   unsigned int _copyNonAnnotatedGenes(const BioFunction &source, const BioFunction &target);
01688 
01689 // helper function for MyAnnotations::isAnnotatedWithFunction() that
01690 // uses the multi-level map.
01691 bool _isAnnotatedWithFunctionMap(string gene, string func, string type);
01692 
01693 // helper function for MyAnnotations::isAnnotatedWithFunction() that
01694 // uses the multi-index. Read the documentation for MyAnnotations::isAnnotatedWithFunction to see the role of beforeTruePathRule.
01695 bool _isAnnotatedWithFunctionMultiIndex(string gene, string func, string type, bool beforeTruePathRule = false);
01696 
01697   map< string, vector< string > > _functionsWithOverlappingAnnotations;
01698 
01699 private:
01700 
01701   static void _initialise();
01702   void _compareFunctionTypes(string funcType1, string funcType2);
01703   bool _check();
01704 
01705 };
01706 
01707   
01708   
01709   
01710 
01711 #endif // _ANNOTATIONS_H 
 All Classes Functions Variables Typedefs Friends