Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
reporter.h
00001 /**************************************************************************
00002  * Copyright (c) 2001-2011 T. M. Murali                                   *
00003  * Copyright (c) 2011 Phillip Whisenhunt                                  *
00004  * Copyright (c) 2011 David Badger                                        *
00005  * Copyright (c) 2010 Jacqueline Addesa                                   *
00006  *                                                                        *
00007  * This file is part of Biorithm.                                         *
00008  *                                                                        *
00009  * Biorithm is free software: you can redistribute it and/or modify       *
00010  * it under the terms of the GNU General Public License as published by   *
00011  * the Free Software Foundation, either version 3 of the License, or      *
00012  * (at your option) any later version.                                    *
00013  *                                                                        *
00014  * Biorithm is distributed in the hope that it will be useful,            *
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00017  * GNU General Public License for more details.                           *
00018  *                                                                        *
00019  * You should have received a copy of the GNU General Public License      *
00020  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00021  *                                                                        *
00022  **************************************************************************/
00023 
00031 #ifndef _REPORTER_H
00032 #define _REPORTER_H
00033 
00034 #include <map>
00035 #include <string>
00036 #include <fstream>
00037 #include <iostream>
00038 #include <sys/stat.h>
00039 
00040 
00041 using namespace std;
00042 
00043 #include "boost/multi_index_container.hpp"
00044 #include "boost/multi_index/member.hpp"
00045 #include "boost/multi_index/ordered_index.hpp"
00046 #include <boost/multi_index/composite_key.hpp>
00047 
00048 using boost::multi_index_container;
00049 using namespace boost::multi_index;
00050   
00051 #include "biofunction.h"
00052 #include "graph-global.h" //for MyNT
00053 #include "old-annotations.h"
00054 
00055 #include "histogram.h"
00056 
00057 // i want to get rid of the CVResult typedef and everything that depends on it.
00058 #undef USE_CVRESULT
00059 
// Classification of a single cross-validation outcome.
// The enumerators are numbered consecutively starting at 1.
enum MyGainCVResult
{
  TRUE_POSITIVE = 1,
  FALSE_POSITIVE = 2,
  TRUE_NEGATIVE = 3,
  FALSE_NEGATIVE = 4
};
00061 
00067 struct DetailedCVResult
00068 {
00069 //private:
00070 public:
00071   string function;
00072   string gene;
00073   MyGainAnnotationType correctState;
00074   MyGainAnnotationType predictedState;
00075   // confidence in the correct state. this is used by sinksource with evidence code weights.
00076   MyNT correctStateConfidence;
00077   MyNT predictionConfidence;
00078   string algorithm;
00079 public:
00081   DetailedCVResult()
00082       : function(), gene(), correctState(), predictedState(), correctStateConfidence(1), predictionConfidence(0), algorithm()
00083     {}
00084   // Constructor that takes values for all fields.
00085   DetailedCVResult(string g, string f, MyGainAnnotationType cs, 
00086                    MyGainAnnotationType ps, MyNT csConf, MyNT conf, string algo)
00087       : function(f), gene(g), correctState(cs), predictedState(ps), correctStateConfidence(csConf), predictionConfidence(conf), algorithm(algo)
00088     {}
00089   
00090   string getGene() const
00091     {
00092       return(gene);
00093     }
00094   
00095   MyGainAnnotationType getCorrectState() const
00096     {
00097       return(correctState);
00098     }
00099   
00100   MyGainAnnotationType getPredictedState() const
00101     {
00102       return(predictedState);
00103     }
00104   MyNT getPredictionConfidence() const
00105     {
00106       return(predictionConfidence);
00107     }
00108   
00109   
00110   MyGainCVResult computeResult() const
00111     {
00112       if (ANNOTATED_STATE == correctState)
00113         {
00114           // +1 -> 0 -> +1 is a TP.
00115           if (ANNOTATED_STATE == predictedState)
00116             return(TRUE_POSITIVE);
00117           // +1 -> 0 -> 0 (-1) is a FN.
00118           if (HYPOTHETICAL_STATE == predictedState)
00119             return(FALSE_NEGATIVE);
00120           // +1 -> 0 -> -1 is a FN.
00121           if (NOT_ANNOTATED_STATE == predictedState)
00122             return(FALSE_NEGATIVE);
00123         }
00124       else if (NOT_ANNOTATED_STATE == correctState)
00125         {
00126           // -1 -> 0 -> 1 is a FP.
00127           if (ANNOTATED_STATE == predictedState)
00128             return(FALSE_POSITIVE);
00129           // -1 -> 0 -> 0 (-1) is a TN.
00130           if (HYPOTHETICAL_STATE == predictedState)
00131             return(TRUE_NEGATIVE);
00132           // -1 -> 0 -> -1 is a TN.
00133           if (NOT_ANNOTATED_STATE == predictedState)
00134             return(TRUE_NEGATIVE);
00135         }
00136       cerr << "DetailedCVResult::computeResult() ERROR: correctState appears to be HYPOTHETICAL_STATE, which is a big no-no" << endl;
00137     }
00138   
00139 
00140 
00141 };
00142 
00143 
00144 // tell doxygen to skip
00148 struct detailedcvresult_gene_function_correctstate_predictedstate_confidence_algorithm_index {};
00149 struct detailedcvresult_algorithm_confidence_index {};
00154 typedef multi_index_container
00155 <
00156   DetailedCVResult,
00157   indexed_by
00158   <
00159     // the primary key is made up of gene, function, correct state,
00160     // predicted state, confidence, and algorithm.
00164     ordered_unique
00165     <
00166       tag< detailedcvresult_gene_function_correctstate_predictedstate_confidence_algorithm_index >,
00167       composite_key
00168       <
00169         DetailedCVResult,
00170         member< DetailedCVResult, std::string, &DetailedCVResult::gene >,
00171         member< DetailedCVResult, std::string, &DetailedCVResult::function >,
00172         member< DetailedCVResult, MyGainAnnotationType, &DetailedCVResult::correctState >,
00173         member< DetailedCVResult, MyGainAnnotationType, &DetailedCVResult::predictedState >,
00174         member< DetailedCVResult, MyNT, &DetailedCVResult::predictionConfidence >,
00175         member< DetailedCVResult, std::string, &DetailedCVResult::algorithm >
00176         >,
00177       // specify how to order each field of the primary key.
00178       composite_key_compare
00179       <
00180         std::less< std::string >,
00181         std::less< std::string >,
00182         std::less< MyGainAnnotationType >,
00183         std::less< MyGainAnnotationType >,
00184         std::greater< MyNT >,
00185         std::less< std::string >
00186         >
00187       >,
00188     // index by algorithm and confidence.
00189     ordered_non_unique
00190     <
00191       tag< detailedcvresult_algorithm_confidence_index >,
00192       composite_key
00193       <
00194         DetailedCVResult,
00195         member< DetailedCVResult, std::string, &DetailedCVResult::algorithm >,
00196         member< DetailedCVResult, MyNT, &DetailedCVResult::predictionConfidence >
00197         >,
00198       composite_key_compare
00199       <
00200         std::less< std::string >,
00201         // sort in decreasing order of prediction confidence.
00202         std::greater< MyNT >
00203         >
00204       >
00205     >
00206   > SetOfDetailedCVResults;
00207 
00208 
00209 
00210 
00211 #ifdef USE_CVRESULT
00212 struct CVResult {
00213 //private:
00214 public:
00215   int TP, TN, FP, FN;
00216   MyNT fscore, precision, recall;
00217   // correct initialisation is 0, otherwise operator+ will not work.
00218 
00219 public:
00220   CVResult() { TP=TN=FP=FN=0;fscore=precision=recall=0.0; };
00221   CVResult(int tp, int tn, int fp, int fn)
00222     {
00223       TP=tp;TN=tn;FP=fp;FN=fn;
00224       calculate();
00225     };
00226   CVResult(const CVResult &other)
00227     {
00228       TP= other.TP;
00229       TN= other.TN;
00230       FP= other.FP;
00231       FN= other.FN;
00232       fscore = other.fscore;
00233       precision = other.precision;
00234       recall = other.recall;
00235     };
00236 
00237   MyNT getFscore() const
00238     {
00239       return(fscore);
00240     }
00241 
00242   MyNT getPrecision() const
00243     {
00244       return(precision);
00245     }
00246 
00247   MyNT getRecall() const
00248     {
00249       return(recall);
00250     }
00251   
00252   void calculate()
00253     {
00254       precision = (0!=TP+FP) ? (((1.0)*TP)/((1.0)*(TP+FP))) : 0.0;
00255       recall = (0!=TP+FN) ? (((1.0)*TP)/((1.0)*(TP+FN))) : 0.0;
00256       fscore = (0 !=precision+recall) ? ((2.0*precision*recall)/(precision+recall)) : 0.0;
00257     }
00258   // add two results. just do a field-wise addition.
00259   CVResult operator+(CVResult &rhs)
00260     {
00261       CVResult result(*this);
00262       result.TP += rhs.TP;
00263       result.TN += rhs.TN;
00264       result.FP += rhs.FP;
00265       result.FN += rhs.FN;      
00266       result.calculate();
00267       return(result);
00268     }
00269   
00270   // add rhs to *this. just do a field-wise addition.
00271   void operator+=(CVResult &rhs)
00272     {
00273       TP += rhs.TP;
00274       TN += rhs.TN;
00275       FP += rhs.FP;
00276       FN += rhs.FN;      
00277       calculate();
00278     }  
00279 };
00280 #endif // USE_CVRESULT
00281 
00282 struct PredictionDetails {
00283   MyNT energy, prob, input, threshold;
00284   PredictionDetails(MyNT p, MyNT i, MyNT t, MyNT e = 0)
00285     { prob=p;input=i,threshold=t; energy = e; };
00286         PredictionDetails()
00287     { prob=input=threshold=-2.0; energy = 0;};
00288 };
00289 
00290 
00291 struct Prediction
00292 {
00293 //  friend typename SetOfPredictions;
00294 public:
00295   // these need to be public for SetOfPredictions.
00296   string _gene;
00297   string _function;
00298   string _algorithm;
00299   MyNT _confidence;
00300   MyNT _cutBasedConfidence;
00301   // the rank of this prediction in the list of all predictions.
00302   unsigned int _rankOverall;
00303   // the rank of this prediction in the list of predictions for this function.
00304   unsigned int _rankFunction;
00305   
00306 public:
00307   Prediction(string g, string f, string a, MyNT c = 0)
00308       : _gene(g), _function(f), _algorithm(a), _confidence(c), _cutBasedConfidence(0),
00309         _rankOverall(0), _rankFunction(0)
00310     {}
00311 
00312   void setCutBasedConfidence(MyNT c)
00313     {
00314       _cutBasedConfidence = c;
00315     }
00316   MyNT getCutBasedConfidence() const
00317     {
00318       return(_cutBasedConfidence);
00319     }
00320   
00321 };
00322 
00323 // tell doxygen to skip
00327 struct prediction_gene_function_algorithm_index {};
00328 struct prediction_gene_algorithm_index {};
00329 struct prediction_gene_index {};
00330 struct prediction_function_index {};
00331 struct prediction_algorithm_index {};
00332 
00333 struct prediction_algorithm_confidence_index {};
00334 
00335 struct prediction_algorithm_function_confidence_index {};
00340 typedef multi_index_container<
00341   Prediction,
00342   indexed_by<
00343 
00344     // define multi-column primary key.
00345     ordered_unique<
00346       tag< prediction_gene_function_algorithm_index >,
00347       composite_key<
00348       Prediction,
00349       member< Prediction, std::string, &Prediction::_gene>,
00350       member< Prediction, std::string, &Prediction::_function >,
00351       member< Prediction, std::string, &Prediction::_algorithm >
00352       >,
00353       composite_key_compare<
00354         std::less<std::string>, // genes sorted by default
00355         std::less<std::string>, // functions sorted by default
00356         std::less<std::string>   // algorithms sorted by default
00357         >
00358       >,
00359     // there can be many functions for an algorithm-gene pair.
00360     ordered_non_unique<
00361       tag< prediction_gene_algorithm_index >,
00362       composite_key<
00363       Prediction,
00364       member< Prediction, std::string, &Prediction::_algorithm >,
00365       member< Prediction, std::string, &Prediction::_gene>
00366     >,
00367       composite_key_compare<
00368       std::less<std::string>,   // algorithms sorted by default
00369       std::less<std::string> // genes sorted by default
00370     >
00371     >,
00372     
00373     // sort on algorithm, function, and confidence
00374     ordered_non_unique<
00375       tag< prediction_algorithm_function_confidence_index >,
00376       composite_key<
00377       Prediction,
00378       member< Prediction, std::string, &Prediction::_algorithm >,
00379       member< Prediction, std::string, &Prediction::_function >,
00380       member< Prediction, MyNT, &Prediction::_confidence >
00381       >,
00382       composite_key_compare<
00383         std::less<std::string>, // algorithms sorted by default
00384         std::less<std::string>, // functions sorted by default
00385         std::greater<MyNT>   // confidence sorted by decreasing
00386         >
00387       >,
00388 
00389   // sort by less<string> on _gene
00390   ordered_non_unique< tag< prediction_gene_index >, member< Prediction ,std::string, &Prediction::_gene > >,
00391 
00392   // sort on _function
00393   ordered_non_unique< tag< prediction_function_index >, member< Prediction ,std::string, &Prediction::_function > >,
00394     
00395   // sort on _algorithm
00396     ordered_non_unique< tag< prediction_algorithm_index >, member< Prediction ,std::string, &Prediction::_algorithm > >,
00397   
00398   // sort on algorithm followed by confidence
00399   ordered_non_unique<
00400     tag< prediction_algorithm_confidence_index >,
00401     composite_key<
00402       Prediction,
00403       member< Prediction, std::string, &Prediction::_algorithm >,
00404       member< Prediction, MyNT, &Prediction::_confidence>
00405     >,
00406       composite_key_compare<
00407       std::less<std::string>,   // algorithms sorted by default
00408       std::greater< MyNT > // sort confidence in decreasing order.
00409     >
00410     >// ,
00411 
00412 //   // sort on algorithm followed by overall rank
00413 //   ordered_non_unique<
00414 //     tag< prediction_algorithm_rank_overall_index >,
00415 //     composite_key<
00416 //       Prediction,
00417 //       member< Prediction, std::string, &Prediction::_algorithm >,
00418 //       member< Prediction, unsigned int, &Prediction::_rankOverall>
00419 //     >
00420 //     >,
00421     
00422 //   // sort on algorithm followed by function followed by within-function rank
00423 //   ordered_non_unique<
00424 //     tag< prediction_algorithm_rank_function_index >,
00425 //     composite_key<
00426 //       Prediction,
00427 //       member< Prediction, std::string, &Prediction::_algorithm >,
00428 //       member< Prediction, std::string, &Prediction::_function >,
00429 //       member< Prediction, unsigned int, &Prediction::_rankFunction>
00430 //     >
00431 //     >
00432 
00433     >
00434   
00435   > SetOfPredictions;
00436 
00437 
00438 #ifdef USE_CVRESULT
00439 // the first level of these maps store the name of the algorithm. the
00440 // second level the name of the function. the third level the
00441 // threshold.
00442 typedef map< MyNT, CVResult > MyCVResultsPerFunction;
00443 typedef map< string, MyCVResultsPerFunction > MyCVResultsPerAlgo;
00444 typedef map< string, MyCVResultsPerAlgo > MyCVResults; 
00445 #endif //  USE_CVRESULT
00446 
00447 // each string is a gene.
00448 typedef map< string, DetailedCVResult > MyDetailedCVResultsPerThreshold;
00449 typedef map< MyNT, MyDetailedCVResultsPerThreshold > MyDetailedCVResultsPerFunction;
00450 //typedef map< string, DetailedCVResult > MyDetailedCVResultsPerFunction;
00451 typedef map< string, MyDetailedCVResultsPerFunction > MyDetailedCVResultsPerAlgo;
00452 typedef map< string, MyDetailedCVResultsPerAlgo > MyDetailedCVResults;
00453 //typedef map< string, map< string, map< MyNT, vector< DetailedCVResult > > > > MyDetailedCVResults;
00454 
00455 // PerFunction because each such map stores the results for a function.
00456 typedef map< string, PredictionDetails> MyPredictionResultsPerFunction;
00457 typedef map< string, MyPredictionResultsPerFunction > MyPredictionResultsPerAlgo;
00458 typedef map< string, MyPredictionResultsPerAlgo > MyPredictionResults;
00459 typedef map< string, map< string, map< MyNT, int> > > MyPredictionCountsPerFunctionAndThreshold;
00460 
// Relationship categories used when comparing two functions.
// COMPARE_FUNCTION_INVALID is -1; the remaining enumerators count up from 0.
enum MyGainCompareFunctionType
{
  COMPARE_FUNCTION_INVALID = -1,
  COMPARE_FUNCTION_IDENTICAL = 0,
  COMPARE_FUNCTION_DESCENDANT = 1,
  COMPARE_FUNCTION_ANCESTOR = 2,
  COMPARE_FUNCTION_RELATIVE = 3
};
00466 
00478 struct MyGainDAGDistanceType
00479 {
00480 private:
00481   int _up;
00482   int _down;
00483   MyGainCompareFunctionType _whichType;
00484 
00485 public:
00486   MyGainDAGDistanceType(int u = -1, int d = -1)
00487       : _up(u), _down(d), _whichType(computeDistanceType())
00488     {}
00489   virtual ~MyGainDAGDistanceType()
00490     {}
00491 
00493   MyGainCompareFunctionType computeDistanceType() const;
00494   
00495   unsigned int getDistance() const
00496     {
00497       return(max(0, _up) + max(0, _down));
00498     }
00499 
00500   MyGainCompareFunctionType getDistanceType() const
00501     {
00502       return(_whichType);
00503     }
00504 
00511   bool operator<(const MyGainDAGDistanceType &other) const;
00512 
00513 
00515   void print(ostream &ostr) const
00516     {
00517       ostr << _up << "\t" << _down;
00518     }
00519 };
00520   
00521 inline ostream &operator<<(ostream &ostr, const MyGainDAGDistanceType &distance)
00522 {
00523   distance.print(ostr);
00524   return(ostr);
00525 }
00526 
00527 
00533 struct PredictionEvaluation //: public Prediction // commented out because of issues with multi_index in versions before Boost 1.34
00534 {
00535 public:
00536   string _gene;
00537   string _function;
00538   string _algorithm;
00539   MyNT _confidence;
00540 
00541   // the rank of this prediction in the list of all predictions.
00542   unsigned int _rankOverall;
00543   // the rank of this prediction in the list of predictions for this function.
00544   unsigned int _rankFunction;
00545 
00546   string _closestFunction;
00547   string _closestExperimentallyAnnotatedFunction;
00548   // distance to closest function. if both _distanceUp and
00549   // _distanceDown are 0, the closest function is the predicted
00550   // function. if only _distanceUp is <= 0, _closestFunction is a
00551   // descendant of the predicted function, so the prediction is
00552   // verified. if only _distanceDown is <= 0, _closestFunction is an
00553   // ancestor of the predictd function. if both are positive, the
00554   // predicted function is a relative of _closestFunction. if both are
00555   // negative, something is wrong.
00556   MyGainDAGDistanceType _distance;
00557   MyGainDAGDistanceType _distanceToExperimentallyAnnotatedFunction;
00558   
00559   PredictionEvaluation(string g = "", string f = "", string a = "", MyNT c = 0)
00560   //      : Prediction(g, f, a, c), 
00561       : _gene(g), _function(f), _algorithm(a), _confidence(c),
00562         _rankOverall(0), _rankFunction(0),
00563         _closestFunction(), _closestExperimentallyAnnotatedFunction(), _distance(),
00564         _distanceToExperimentallyAnnotatedFunction()
00565     {}
00566   
00568   bool isClosestFunctionInitialised() const
00569     {
00570       return(COMPARE_FUNCTION_INVALID != _distance.getDistanceType());
00571     }
00572 
00574   bool isClosestExperimentallyAnnotatedFunctionInitialised() const
00575     {
00576       return(COMPARE_FUNCTION_INVALID !=
00577              _distanceToExperimentallyAnnotatedFunction.getDistanceType());
00578     }
00579 
00581   bool isVerified() const
00582     {
00583       return(isClosestFunctionInitialised()
00584              && ((COMPARE_FUNCTION_DESCENDANT == _distance.getDistanceType())
00585                  || (COMPARE_FUNCTION_IDENTICAL == _distance.getDistanceType())));
00586     }
00589   bool isExperimentallyVerified() const
00590     {
00591       return(isClosestExperimentallyAnnotatedFunctionInitialised()
00592              && ((COMPARE_FUNCTION_DESCENDANT ==
00593                   _distanceToExperimentallyAnnotatedFunction.getDistanceType())
00594                  || (COMPARE_FUNCTION_IDENTICAL ==
00595                      _distanceToExperimentallyAnnotatedFunction.getDistanceType())));
00596     }
00597   
00599 
00615   void updateClosestFunction(string f, int up, int down, bool experimentalEC = false);
00616   
00617 
00618 };
00619 
00620 // tell doxygen to skip
00624 struct evaluation_gene_function_algorithm_index {};
00625 struct evaluation_gene_algorithm_index {};
00626 struct evaluation_gene_index {};
00627 struct evaluation_function_index {};
00628 struct evaluation_algorithm_index {};
00629 
00630 struct evaluation_algorithm_confidence_index {};
00631 struct evaluation_algorithm_rank_overall_index {};
00632 struct evaluation_algorithm_rank_function_index {};
00637 typedef multi_index_container<
00638   PredictionEvaluation,
00639   indexed_by<
00640     // define multi-column primary key.
00641     ordered_unique<
00642       tag< evaluation_gene_function_algorithm_index >,
00643       composite_key<
00644       PredictionEvaluation,
00645       member< PredictionEvaluation, string, &PredictionEvaluation::_gene>,
00646       member< PredictionEvaluation, string, &PredictionEvaluation::_function >,
00647       member< PredictionEvaluation, string, &PredictionEvaluation::_algorithm >
00648       >,
00649       composite_key_compare<
00650       std::less<string>, // genes sorted by default
00651       std::less<string>, // functions sorted by default
00652       std::less<string>   // algorithms sorted by default
00653       >
00654     >,
00655     // there can be many functions for an algorithm-gene pair.
00656     ordered_non_unique<
00657       tag< evaluation_gene_algorithm_index >,
00658     composite_key<
00659       PredictionEvaluation,
00660         member< PredictionEvaluation, string, &PredictionEvaluation::_algorithm >,
00661         member< PredictionEvaluation, string, &PredictionEvaluation::_gene>
00662     >,
00663     composite_key_compare<
00664         std::less<string>,   // algorithms sorted by default
00665         std::less<string> // genes sorted by default
00666   >
00667 >,
00668     
00669 
00670     
00671   // sort by less<string> on _gene
00672   ordered_non_unique< tag< evaluation_gene_index >,
00673                       member< PredictionEvaluation, string, &PredictionEvaluation::_gene > >,
00674 
00675   // sort on _function
00676   ordered_non_unique< tag< evaluation_function_index >,
00677                       member< PredictionEvaluation, string, &PredictionEvaluation::_function > >,
00678     
00679   // sort on _algorithm
00680   ordered_non_unique< tag< evaluation_algorithm_index >,
00681                       member< PredictionEvaluation, string, &PredictionEvaluation::_algorithm > >,
00682 
00683   // sort on algorithm followed by confidence
00684   ordered_non_unique<
00685     tag< evaluation_algorithm_confidence_index >,
00686     composite_key<
00687       PredictionEvaluation,
00688       member< PredictionEvaluation, std::string, &PredictionEvaluation::_algorithm >,
00689       member< PredictionEvaluation, MyNT, &PredictionEvaluation::_confidence>
00690     >,
00691       composite_key_compare<
00692       std::less<std::string>,   // algorithms sorted by default
00693       std::greater< MyNT > // sort confidence in decreasing order.
00694     >
00695     >,
00696 
00697   // sort on algorithm followed by overall rank
00698   ordered_non_unique<
00699     tag< evaluation_algorithm_rank_overall_index >,
00700     composite_key<
00701       PredictionEvaluation,
00702       member< PredictionEvaluation, std::string, &PredictionEvaluation::_algorithm >,
00703       member< PredictionEvaluation, unsigned int, &PredictionEvaluation::_rankOverall>
00704     >
00705     >,
00706     
00707   // sort on algorithm followed by function followed by within-function rank
00708   ordered_non_unique<
00709     tag< evaluation_algorithm_rank_function_index >,
00710     composite_key<
00711       PredictionEvaluation,
00712       member< PredictionEvaluation, std::string, &PredictionEvaluation::_algorithm >,
00713       member< PredictionEvaluation, std::string, &PredictionEvaluation::_function >,
00714       member< PredictionEvaluation, unsigned int, &PredictionEvaluation::_rankFunction>
00715     >
00716     >
00717 
00718 
00719     
00720     > 
00721   > SetOfEvaluations;
00722 
00723 
00728 struct PredictionEvaluationSummary
00729 {
00730 public:
00731   string _algorithm;
00732   // the total number of predictions.
00733   unsigned int _numTotalPredictions;
00734 
00736   // statistics on what can be verified.
00738   
00739   // _numPredictionsForVerifiableGenes counts the number of predictions
00740   // for verifiable genes. some of the predictions may themselves not
00741   // be verifiable since the gene does not have a new annotation in
00742   // the same category as the predicted function. 
00743   // _numVerifiablePredictions counts the #of verifiable predictions.
00744   unsigned int _numPredictionsForVerifiableGenes;
00745   unsigned int _numVerifiablePredictions;
00746   // numTotalMSPs is just the total number of MSPs. 
00747   // numMSPsForVerifiableGenes and numVerifiableMSPs are the analogues
00748   // for MSPs for numPredictionsForVerifiableGenes and
00749   // numVerifiablePredictions.
00750   unsigned int _numTotalMSPs, _numMSPsForVerifiableGenes, _numVerifiableMSPs;
00751 
00752   // the next few quantities give an idea of the hierarchical
00753   // consistency of the predictions. they count the #predictions/#MSPs
00754   // for various categories of predictions.
00755   //
00756   // _numAverageMSPs = _numTotalPredictions/_numTotalMSPs.
00757   //
00758   // _numAverageMSPsForVerifiableGenes = _numPredictionsForVerifiableGenes/_numMSPsForVerifiableGenes.
00759   //
00760   // _numAverageVerifiableMSPs = _numVerifiablePredictions/_numVerifiableMSPs
00761 
00762   MyNT _numAverageMSPs, _numAverageMSPsForVerifiableGenes, _numAverageVerifiableMSPs;
00763 
00764   unsigned int _numVerifiableGenes;
00765 
00766 
00768   // statistics on what is verified.
00770   
00771   
00772   // key is the distance to the verifying new annotating function. 
00773   // value is the #MSPs with this distance.
00774   map< unsigned int, unsigned int > _distanceToNumVerifiedMSPs;
00775   map< unsigned int, unsigned int > _distanceToNumExperimentallyVerifiedMSPs;
00776   // total of the values in the _distanceToNumVerifiedMSPs map.
00777   unsigned int _numVerifiedMSPs, _numExperimentallyVerifiedMSPs;
00778   // the next map stores an entry at "index" d for each gene such that
00779   // the closest new annotation over all verified predicted functions
00780   // for that gene is d.
00781   map< unsigned int, unsigned int > _distanceToNumVerifiedGenes;
00782   map< unsigned int, unsigned int > _distanceToNumExperimentallyVerifiedGenes;
00783   set< string > _verifiedGenes, _experimentallyVerifiedGenes;
00784   // the number of genes in _verifiedGenes, also the total of the
00785   // values in the _distanceToNumVerifiedGenes map.
00786   unsigned int _numVerifiedGenes, _numExperimentallyVerifiedGenes;
00787 
00789   // histograms of confidence values.
00791   
00792   // the histogram of confidence values for verified MSPs.
00793   MyHistogram _verifiedMSPsConfidenceHistogram, _experimentallyVerifiedMSPsConfidenceHistogram;
00794   // the histogram of confidence values for unverified MSPs for genes
00795   // with at least one verified MSP.
00796   MyHistogram _verifiedGeneUnverifiedMSPsConfidenceHistogram,
00797     _experimentallyVerifiedGeneUnverifiedMSPsConfidenceHistogram;
00798   // the histogram of confidence values for MSPs for genes
00799   // without any verified MSP.
00800   MyHistogram _unverifiedGeneMSPsConfidenceHistogram;
00801   // the histogram of confidence values for MSPs for genes
00802   // without any experimentally verified MSP.
00803   MyHistogram _experimentallyUnverifiedGeneMSPsConfidenceHistogram;
00804   
00805 
00807   // recalls/precisions for ROC curves over all functions.
00809   MyHistogram _allConfidenceHistogram;
00810   MyHistogram _correctConfidenceHistogram;
00811   MyHistogram _precisionHistogram;
00812   MyHistogram _recallHistogram;
00813   MyHistogram _oneMinusRecallHistogram;
00814   
00815   // per function curves.
00816   map< string, MyHistogram > _perFunctionAllConfidenceHistograms;
00817   map< string, MyHistogram > _perFunctionCorrectConfidenceHistograms;
00818   map< string, unsigned int >_perFunctionNumNewAnnotations;
00819   map< string, MyHistogram > _perFunctionPrecisionHistograms;
00820   map< string, MyHistogram > _perFunctionRecallHistograms;
00821 //  map< BioFunction, MyHistogram > _perFunctionPrecisionHistogram;
00822 
00823   MyHistogram _verifiedMSPsRankHistogram, _experimentallyVerifiedMSPsRankHistogram;
00824   MyHistogram _verifiedGeneUnverifiedMSPsRankHistogram,
00825     _experimentallyVerifiedGeneUnverifiedMSPsRankHistogram;
00826   MyHistogram _unverifiedGeneMSPsRankHistogram;
00827   MyHistogram _experimentallyUnverifiedGeneMSPsRankHistogram;
00828   MyHistogram _allRankHistogram;
00829   MyHistogram _correctRankHistogram;
00830   map< string, MyHistogram > _perFunctionAllRankHistograms;
00831   map< string, MyHistogram > _perFunctionCorrectRankHistograms;
00832 
00833 public:
00834   
00835 };
00836   
00837 
00838 // struct evaluation_gene_function_algorithm_index {};
00839 // struct evaluation_gene_algorithm_index {};
00840 // struct evaluation_gene_index {};
00841 // struct evaluation_function_index {};
00842 struct evaluation_summaries_algorithm_index {};
00843 
00844 typedef multi_index_container<
00845   PredictionEvaluationSummary,
00846   indexed_by<
00847   // sort on _algorithm
00848   ordered_unique< tag< evaluation_summaries_algorithm_index >,
00849                       member< PredictionEvaluationSummary, string, &PredictionEvaluationSummary::_algorithm > >
00850     > 
00851   > SetOfEvaluationSummaries;
00852 
00853 class Reporter  // Holds predictions, evaluations, evaluation summaries and CV results, and declares the routines that add, read, evaluate and print them.
00854 {
00855 private:
00856   // Containers of everything recorded so far (element types are declared earlier in this header).
00857   SetOfPredictions _allPredictions;
00858   SetOfEvaluations _allEvaluations;
00859   SetOfEvaluationSummaries _allEvaluationSummaries;  // keyed uniquely by _algorithm
00860   
00861   SetOfDetailedCVResults _allDetailedCVResults;
00862   // NOTE(review): how the stores below relate to the SetOf* containers above is not visible in this header.
00863   MyPredictionResults predictionResults;
00864 #ifdef USE_CVRESULT
00865   MyCVResults cvResults;  // only present when USE_CVRESULT is defined
00866 #endif // USE_CVRESULT
00867   MyDetailedCVResults detailedCvResults;
00868   MyPredictionCountsPerFunctionAndThreshold predictionCounts;
00869   string outputDirectoryName;  // set by the two-argument constructor
00870   string commandLine;          // set by the two-argument constructor
00871   string _experimentName;      // set by setExperimentName()
00872   
00873   set< string > _geneUniverse;
00874   
00875 public:
00876   // Default constructor: every member is default-constructed.
00877   Reporter()
00878       : _allPredictions(), _allEvaluations(), _allEvaluationSummaries(),
00879         _allDetailedCVResults(), predictionResults(),
00880 #ifdef USE_CVRESULT
00881         cvResults(),
00882 #endif // USE_CVRESULT
00883         detailedCvResults(),  predictionCounts(),
00884         outputDirectoryName(),  commandLine(), _experimentName(),
00885         _geneUniverse()
00886     {}
00887   // Construct a reporter that stores the output directory name and the command line; all other members are default-constructed.
00888   Reporter(string outDir, string cmdLine)
00889       : _allPredictions(), _allEvaluations(), _allEvaluationSummaries(),
00890         _allDetailedCVResults(), predictionResults(),
00891 #ifdef USE_CVRESULT
00892         cvResults(),
00893 #endif // USE_CVRESULT
00894         detailedCvResults(),  predictionCounts(),
00895         outputDirectoryName(outDir),  commandLine(cmdLine), _experimentName(),
00896         _geneUniverse()
00897     {
00898           // outputDirectoryName and commandLine are initialised directly in the
00899           // initialiser list above, so there is nothing left to assign here.
00900     }
00901   // Virtual destructor: lets objects of derived classes be destroyed through a Reporter pointer.
00902   virtual ~Reporter()
00903     {}
00904   // Add a CV result for one function at one threshold; `algorithm` defaults to "Hopfield".
00907   void addCV(string function, MyGainCVResult result, MyNT threshold,
00908              string algorithm = "Hopfield");
00909 
00912   // void addCV(string function, int TP, int TN, int FP, int FN, MyNT threshold,
00913   //            string algorithm = "Hopfield");
00914   // Per-gene overload: takes the correct and predicted annotation states together with their confidences.
00942   void addCV(string gene, string function, MyGainAnnotationType correctState, 
00943              MyGainAnnotationType predictedState, MyNT correctStateConfidence, MyNT predictionConfidence,
00944              string algorithm = "Hopfield");
00945   
00946   // Add one prediction for (gene, function); `algorithm` defaults to "Hopfield".
00947   void addPrediction(string gene, string function, MyNT prob, MyNT input, MyNT threshold,
00948                      string algorithm = "Hopfield");
00949 
00950   // Variant taking a single confidence value; NOTE(review): the "cut-based" semantics are defined out of line.
00953   void addPredictionCutBasedConfidence(string gene, string function,
00954                                        MyNT confidence,
00955                                        MyNT threshold,
00956                                        string algorithm = "Hopfield");
00957 
00958   // `ostr` and `tprViolations` are non-const references, presumably outputs -- confirm against the definition.
00977   void checkTruePathRuleForPredictions(string algorithm,
00978                                        MyAnnotations &annotations,
00979                                        const GeneOntology &go,
00980                                        ostream &ostr,
00981                                        map< string, set < string > > &tprViolations);
00982 
00983   // Compare two algorithms by name; takes an output stream.
00997   void comparePredictions(string algo1, string algo2, ostream &ostr);
00998   
00999   
01018   void computePredictionRanks(string algo, MyAnnotations &differentAnnotations,
01019                               GeneOntology &go);
01020   // Evaluate the predictions of `algo` given the current and the new annotations.
01070   void evaluatePredictions(string algo, MyAnnotations &currentAnnotations,
01071                            MyAnnotations &newAnnotations, GeneOntology &go);
01072 
01090 
01091   void evaluatePredictionsForROCCurvesUsingRanks(string algo, MyAnnotations &differentAnnotations,
01092                                        GeneOntology &go);
01093   
01094   // The set is a non-const reference, presumably filled in by the call -- confirm.
01096   void getAlgorithms(set< string > &algorithms);
01097  
01100   void getGenesWithPredictions(string algo, set< string > &genes);
01101   
01102 /*  /// \param[in] algo1, the name of the first algorithm to compare.
01106   */
01107   
01119   /*  /// \note You only have to invoke the method once for each pair of
01122   */
01123   void printComparisonPredictionEvaluationROCCurves(const set< string > &algorithms, string outputDir,
01124                                                     MyAnnotations &annotations, const GeneOntology &go);
01125   void printComparisonPredictionEvaluationAUCScatterPlots(const set< string > &algorithms, string outputDir, MyAnnotations &annotations);
01126   // Printing routines: `flush` defaults to true; `functionToPrint` defaults to NULL where present.
01127   void printDetailedCVResults(ostream &dcvfstr, bool flush = true);
01128 
01130   void printCVResults(ostream &cvfstr, bool flush = true, const BioFunction *functionToPrint = NULL);
01131   
01132   
01134   void printPredictions(ostream &predfstr, int numPredictionsToPrintPerFunction, bool flush = true, const BioFunction *functionToPrint = NULL);
01135 
01137   void printPredictionEvaluationROCCurves(ostream &ostr);
01138   // `go` is optional and defaults to NULL.
01148   void printPredictionEvaluations(ostream &ostr, GeneOntology *go = NULL);
01149 
01151   void printPredictionEvaluationSummary(ostream &ostr);
01152   // NOTE(review): takes the destination set as a parameter rather than naming _geneUniverse; whether the member is also updated is not visible here.
01161   void readGeneUniverse(string guFile, set< string > &universe);
01162 
01163   // `onlyFunctions` defaults to NULL and `convertFunction` to the empty string.
01176   void readPredictions(string predFile, set< string > *onlyFunctions = NULL,
01177                        string convertFunction = "");
01178   // Same defaults as readPredictions(), but `onlyFunctions` points to a const set here.
01191   void readDetailedCVResults(string predFile,
01192                              const set< string > *onlyFunctions = NULL,
01193                              string convertFunction = "");
01194   // Store `dataset` as the experiment name.
01201   void setExperimentName(string dataset)
01202     {
01203       _experimentName = dataset;
01204     }
01205     
01206 #ifdef USE_CVRESULT
01207   void printGroupedCVResults(ostream &cvfstr, const map< string, map< unsigned int, set< GOFunction * > > > &groupedFunctions);
01208 #endif // USE_CVRESULT
01209   // `functions` defaults to an empty set and `extra` to the empty string; note that the set is taken by value.
01236   void printCurveDataFromCV(ostream &out, GeneOntology &go, set< BioFunction > functions = set< BioFunction >(), string extra = "");
01237   // Same defaults as printCurveDataFromCV(), with an additional annotations argument.
01250   void printECWeightedCurveDataFromCV(ostream &out, GeneOntology &go, MyAnnotations &annotations, set< BioFunction > functions = set< BioFunction >(), string extra = "");
01251 
01252   // TODO is this a better way to go than the "flush" option in print methods?
01253   void clear();
01254 
01255 
01256 private:
01257 
01258   // NOTE(review): "Evalation" in the name below is a misspelling of "Evaluation"; renaming needs a matching change to the out-of-line definition.
01259   // fill in a number of details for algoSummary. this process started
01260   // at the end of Reporter::evaluatePredictions().
01261   void _computeAlgorithmEvalationSummary(string algo,
01262                                          PredictionEvaluationSummary &algoSummary);
01263   
01264   // compute various confidence related histograms.
01265   void _computeConfidenceDistributions(string algo,
01266                                        PredictionEvaluationSummary &summary);
01267   // presumably the rank-histogram counterpart of _computeConfidenceDistributions() -- confirm against the definition.
01268   void _computeRankDistributions(string algo,
01269                                        PredictionEvaluationSummary &summary);
01270 };
01271 
01272 
01273 
01274 #endif // _REPORTER_H
 All Classes Functions Variables Typedefs Friends