Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
gain.h
00001 /**************************************************************************
00002  * Copyright (c) 2001-2012 T. M. Murali                                   *
00003  * Copyright (c) 2006-2007, 2010-2012 David Badger                        *
00004  * Copyright (c) 2011-2012 Christopher L. Poirel                          *
00005  * Copyright (c) 2011 Phillip Whisenhunt                                  *
00006  * Copyright (c) 2010 Jacqueline Addesa                                   *
00007  *                                                                        *
00008  * This file is part of Biorithm.                                         *
00009  *                                                                        *
00010  * Biorithm is free software: you can redistribute it and/or modify       *
00011  * it under the terms of the GNU General Public License as published by   *
00012  * the Free Software Foundation, either version 3 of the License, or      *
00013  * (at your option) any later version.                                    *
00014  *                                                                        *
00015  * Biorithm is distributed in the hope that it will be useful,            *
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00018  * GNU General Public License for more details.                           *
00019  *                                                                        *
00020  * You should have received a copy of the GNU General Public License      *
00021  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00022  *                                                                        *
00023  **************************************************************************/
00024 
00032 // Purpose: class declaration for MyGainGraph. This class exists
00033 // solely to implement a gain network algorithm on a graph.
00034 
00035 
00036 #ifndef _GAIN_H
00037 #define _GAIN_H
00038 
00039 #include <map>
00040 
00041 #include "constants.h"
00042 // Test for GCC >= 4.3. from http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html. See definition of GCC_VERSION macro in libutil/constants.h
00043 #if GCC_VERSION > 40300
00044 // see http://gcc.gnu.org/gcc-4.3/changes.html
00045 #include <tr1/unordered_set>
00046 #else
00047 #include <ext/hash_map>
00048 #endif
00049 
00050 
00051 #include "boost/filesystem/path.hpp"
00052 #include "boost/filesystem/operations.hpp" // includes boost/filesystem/path.hpp
00053 #include "boost/filesystem/fstream.hpp"    // ditto
00054 
00055 using namespace std;
00056 
00057 #include "old-annotations.h"
00058 #include "GO.h"
00059 #include "gain-opts.h"
00060 #include "gain-state.h"
00061 #include "graph.h"
00062 #include "point.h"
00063 #include "reporter.h"
00064 
00065 typedef MyGainAnnotationType MyGainOrfState;
00066 
// When the state of a node is updated while the gain network is run.
// GAIN_UPDATE_IMMEDIATE is the only policy declared here.
enum MyGainUpdateType
{
  GAIN_UPDATE_IMMEDIATE
};
00069 
// Cross-validation fold information.
//
// The outermost map is keyed by GO id. The vector stored for each GO id has
// one entry per fold. Each fold entry is a vector of protein lists; a
// protein list names the proteins that should be censored.
//
// An earlier version of this type (kept below for reference) used a map
// keyed by the strings "annotated" and "negative" in place of the per-fold
// vector, so the index into the per-fold vector presumably selects between
// those two classes -- TODO confirm the index convention against the code
// that fills this structure.
//
// typedef map< string, vector< map< string, vector< string > > > > MyGainFoldInfo;
typedef map< string, vector< vector< vector< string > > > > MyGainFoldInfo;
00080 
00081 // the innermost map maps from the MyGainAnnotationType
00082 // (ANNOTATED_STATE and NOT_ANNOTATED_STATE) to the set of proteins
00083 // that should be censored one by one
00084 //
00085 // the outermost map has keys that are the goids.
00086 typedef map< string, map< MyGainAnnotationType, set< string > > > MyGainLOOCVInfo;
00087 
00088 // various parameters controlling the Gain net.
00089 struct MyGainParams
00090 {
00091 private:
00092 
00093   // all the options.
00094   gengetopt_args_info _options;
00095 
00096   // i slowly want to make all variables private.
00097 
00098 
00099   // --experiment-name. the name of the gene expression experiment.
00100   string _experimentName;
00101 
00102   // -f, --functions-file
00103   string _annotationsFile;
00104 
00105   // -g, --go-file
00106   string _goFile;
00107 
00108   // -G, --group-functions-method, the method by which to group functions.
00109   string _groupFunctionsMethodString;
00110   bool _groupFunctionsByDepth;
00111   bool _groupFunctionsByParent;
00112 
00113 
00114 //  map< GOFunction*, MyHistogram > _functionEdgeProbabilities;
00115   map< BioFunction, MyHistogram > _functionEdgeProbabilities;
00116 
00117   // --hipr-directory
00118   string _hiprDirectory;
00119 
00120 
00121   // -i, --interactions-file. there can be multiple interaction files.
00122   vector< string > _flnFiles;
00123 
00124   //  --integrate. type of data integration.
00125   string _dataIntegrationType;
00126 
00127   // --maximum-go-depth.
00128   unsigned int _maximumGoDepth;
00129 
00130   // --minimum-go-depth.
00131   unsigned int _minimumGoDepth;
00132 
00133   // --maximum-annotated-genes.
00134   unsigned int _maximumAnnotatedGenes;
00135 
00136   // --minimum-annotated-genes.
00137   unsigned int _minimumAnnotatedGenes;
00138 
00139   // --num-rounds
00140   unsigned int _numRounds;
00141 
00142 
00143   // --number-runs
00144   unsigned int _numRuns;
00145 
00146   // --only-category
00147   set< string > _onlyFunctionCategories;
00148 
00149   // --only-functions
00150   string _onlyFunctionsFile;
00151 
00152   // -- original-annotations. are the GO annotations transitively closed? no, if this variable is true.
00153   bool _originalAnnotations;
00154 
00155   // --output-directory
00157   string _outputDirectory;
00158 
00160   string _outputFilePrefix;
00161 
00162   // --ova
00164   set< string > _ovaAlgorithms;
00165 
00166   // --ovn
00168   set< string > _ovnAlgorithms;
00169 
00170   // --predictions-file
00171   string _predictionsFile;
00172 
00173 
00174   // SVM related options.
00175   string _libSVMDirectory, _SVMLightDirectory;
00176   string _libSVMTrainOptions, _libSVMTestOptions,
00177     _SVMLightTrainOptions,_SVMLightTestOptions;
00178 
00179 
00180   // --treewidth.
00181   bool _computeTreewidth;
00182 
00183   // --validate
00184   string _validationAnnotationsFile;
00185 
00186 
00187 public:
00188   // streams.
00189 
00190   // print CV results here.
00191   ofstream cvStream;
00192   ofstream ecwCVStream;
00193   // print detailed CV results here.
00194   ofstream detailedCVStream;
00195   // the weights of the edges in an FLN.
00196   ofstream edgeWeightsStream;
00197 
00198   // universe of genes for which i can make predictions. this file is
00199   // useful in eval-gain.
00200   ofstream geneUniverseStream;
00201 
00202   // CV results per depth/cutoff/etc.
00203   ofstream groupedCVStream;
00204   ofstream groupedEcwCVStream;
00205 
00206   // invocation information.
00207   ofstream invocationStream;
00208 
00209   // generic information.
00210   ofstream logStream;
00211   // print predictions here.
00212   ofstream predictionsStream;
00213 
00214   // where are the propagation diagrams?
00215   ofstream propagationDiagramsStream;
00216 
00217   // results of the sanity check.
00218   ofstream sanityCheckStream;
00219 
00220   // various stats about predictions.
00221   ofstream statsStream;
00222 
00223   // a convenience variable to simplify the code for opening and
00224   // closing all the above streams.
00225   map< string, ofstream* > outputStreams;
00226   // the name of the file to dump the invocation to. I need the file
00227   // name rather than the ofstream because of the way
00228   // cmdline_parser_file_save() works.
00229   string _invocationFile;
00230 
00231 
00232 
00233   // if false, the state of a node be ANNOTATED_STATE or
00234   // NOT_ANNOTATED_STATE. if true, HYPOTHETICAL_STATE is also ok.
00235   bool allowZeroStates;
00236   // value that determines that state a node. if the input to the node
00237   // is < gainThreshold, the node is in NOT_ANNOTATED_STATE. if
00238   // the input is > gainThreshold, the node is in ANNOTATED_STATE.
00239   // if the input = gainThreshold, then the node could be in
00240   // ANNOTATED_STATE or HYPOTHETICAL_STATE. the computeState()
00241   // function says what i do.
00242   MyNT gainThreshold;
00243   // did the user provide this on the command line?
00244   bool gainThresholdUserInput;
00245   // did the user provide this on the command line?
00246   bool gainThresholdRangeUserInput;
00247 
00248   // name of the file containing gene expression data.
00249   string geneExpressionFile;
00250 
00251   // stuff to ignore.
00252   set< string > ignoredThings, ignoredEvidenceCodes;
00253 
00254 
00255 
00256   // use GE correlations directly as edge weights.
00257   bool justUseCorrelations;
00258 
00259 
00260   // should i update a node as soon as i realise its state has changed
00261   // or wait till i have gone over all the iterations and pick the
00262   // best node or a random one to change the state of.
00263   MyGainUpdateType updateType;
00264   // divide the input to a node by its degree.
00265   bool useDegree;
00266   // what maximum distance should be a node be to be considered a neighbour?
00267   unsigned int localRuleDistance;
00268 
00269   // compute p-values for the functional assignments. currently (apr
00270   // 23, 2004), do so by random assignments.
00271   bool computePvalues;
00272 
00273 
00274   // check to see if there is propagation in the gain net. print
00275   // some output to indicate so.
00276   bool checkPropagation;
00277   // it is useful to store whether i am cross-validating or not. for
00278   // example, i use this info to control some printing in
00279   // MyGainGraph::runGainNet().
00280   bool crossValidate;
00281 
00282   // file that contains cross-validation info.
00283   string crossValidationFile;
00284   // variables to store info in crossValidationFile.
00285 
00286   // am i doing leave-one-out cross-validation? true iff fracCrossValidate is 0.
00287   bool leaveOneOutCrossValidate;
00288 
00289 
00290   // fraction of nodes to cross validate for each function. this
00291   // parameter is set when i do not want to do leave-one-out
00292   // cross-validation.
00293 //  float fracCrossValidate;
00294 
00295   // the number of nodes to cross validate for each function.
00296 //  unsigned
00297   int foldCrossValidate;
00298   MyGainFoldInfo foldInfo;
00299   MyGainLOOCVInfo loocvInfo;
00300 
00301 
00302   // not based on a command-line parameter. this variable is a sneaky way of passing the frequency of the "current" function from one method to another.
00303   MyNT functionFrequency;
00304 
00306   bool printAllStates;
00307 
00308   // allows the state of +1 nodes to change.
00309   bool unclampPositives;
00310   // allows the state of -1 nodes to change.
00311   bool unclampNegatives;
00312 
00313   // allows the frequency of the function to influence the weight of a +1 or -1 node
00314   bool useStateWeights;
00315 
00316 
00317   // creates files for visualisation.
00318   bool visualisePredictions;
00319   bool visualiseCrossValidation;
00320   bool visualiseCut;
00321 
00322   // forces GAIN to do only predictions or only cross-validation
00323   bool onlyPredictions;
00324   bool onlyCrossVal;
00325 
00326   bool weightEdgeTypesCutoff;
00327   string cutoffsFile;
00328   bool weightEdgeTypesDepth;
00329   bool weightEdgeTypesJaccard;
00330 
00331   bool groupEdgeTypes;
00332   string edgeTypeGroupsFile;
00333   bool useCustomRNGSeed;
00334   int customRNGSeed;
00335 
00336   string edgeWeightingScheme;
00337 
00338 public:
00340   MyGainParams();
00341 
00343   virtual ~MyGainParams()
00344     {}
00345 
00347   string getAnnotationsFile() const
00348     {
00349       return(_annotationsFile);
00350     }
00351 
00352   // --no-true-path-rule-downward
00355   bool getApplyTruePathRuleDownward() const
00356     {
00357       // the option is true if the true path rule should NOT be applied downward.
00358       return(!_options.no_true_path_rule_downward_given);
00359     }
00360 
00362   string getGOFile() const
00363     {
00364       return(_goFile);
00365     }
00366 
00368   vector< string > getFLNFiles() const
00369     {
00370       return(_flnFiles);
00371     }
00372 
00373 
00374 
00375 
00377   string getDataIntegrationType() const
00378     {
00379       return(_dataIntegrationType);
00380     }
00381 
00382 
00385   unsigned int getMaximumGoDepth() const
00386     {
00387       return(_maximumGoDepth);
00388     }
00389 
00392   unsigned int getMinimumGoDepth() const
00393     {
00394       return(_minimumGoDepth);
00395     }
00396 
00399   unsigned int getMaximumAnnotatedGenes() const
00400     {
00401       return(_maximumAnnotatedGenes);
00402     }
00403 
00406   unsigned int getMinimumAnnotatedGenes() const
00407     {
00408       return(_minimumAnnotatedGenes);
00409     }
00410 
00419   bool getDoNotReduce() const
00420     {
00421       return(runAnyOneVersusNoneAlgorithm() || _options.no_reduce_flag);
00422     }
00423 
00424 
00426   unsigned int getNumRounds() const
00427     {
00428       return(_numRounds);
00429     }
00430 
00433   unsigned int getNumRuns() const
00434     {
00435       return(_numRuns);
00436     }
00437 
00438 
00440   bool getAnnotationsAreOriginal() const
00441     {
00442       return(_originalAnnotations);
00443     }
00444 
00445 
00447   string getExperimentName() const
00448     {
00449       return(_experimentName);
00450     }
00451 
00452   // -G, --group-functions-method.
00453   void setGroupFunctionsMethod(string method)
00454     {
00455       _groupFunctionsMethodString = method;
00456       _groupFunctionsByParent = false;
00457       _groupFunctionsByDepth = false;
00458       if ("depth" == method)
00459         {
00460           _groupFunctionsByDepth = true;
00461         }
00462       else if ("parent" == method)
00463         {
00464           _groupFunctionsByParent = true;
00465         }
00466       else
00467         {
00468           cerr << "Unknown method \"" << method << "\" for grouping functions. Using the default method of grouping by depth." << endl;
00469           _groupFunctionsByDepth = true;
00470           _groupFunctionsMethodString = "depth";
00471         }
00472     }
00473 
00474   bool getGroupFunctionsMethodIsParent() const
00475     {
00476       return(_groupFunctionsByParent);
00477     }
00478   bool getGroupFunctionsMethodIsDepth() const
00479     {
00480       return(_groupFunctionsByDepth);
00481     }
00482 
00483   string getGroupFunctionsMethod() const
00484     {
00485       return(_groupFunctionsMethodString);
00486     }
00487 
00490   //  void setFunctionEdgeProbabilities(const map< GOFunction*, MyHistogram > &hists)
00491   void setFunctionEdgeProbabilities(const map< BioFunction, MyHistogram > &hists)
00492     {
00493       _functionEdgeProbabilities = hists;
00494     }
00495 
00496   // --hipr-directory
00497 
00498   // --min-confidence
00500   MyNT getMinimumConfidence() const
00501     {
00502       return(_options.min_confidence_arg);
00503     }
00504 
00507   MyNT getOneVersusNoneSinkSourceArtificialEdgeWeight() const
00508     {
00509       return(_options.ovn_sinksource_edge_weight_arg);
00510     }
00511 
00512 
00513   // --only-category
00514 
00521   bool processCategory(string cat) const
00522     {
00523       return((0 == _onlyFunctionCategories.size())
00524              || (_onlyFunctionCategories.end() != _onlyFunctionCategories.find(cat)));
00525     }
00526 
00527   // --output-directory
00528 
00530   string getOutputDirectory() const
00531     {
00532       return(_outputDirectory);
00533     }
00534   
00535   string getOutputFileName(string name, string dir = "") const;
00536 
00538   const set< string > *getEvidenceCodesToIgnore() const
00539     {
00540       return(&ignoredEvidenceCodes);
00541     }
00542 
00546   void printInvocation();
00547 
00549   bool getPrintDetailedCVResults() const
00550     {
00551       return(_options.detailed_cross_validation_results_given);
00552     }
00553 
00555   bool getRunGraphviz() const
00556     {
00557       return(!_options.no_graphviz_given);
00558     }
00559 
00563   bool getPerformSanityCheck() const
00564     {
00565       return(_options.sanity_check_given);
00566     }
00567 
00568 
00569 
00570 
00573   string getVisualiseParamsFile() const
00574     {
00575       if (_options.visualise_params_file_given)
00576         return(_options.visualise_params_file_arg);
00577       // the default is to assume a file called "params.txt" in the current directory.
00578       return("params.txt");
00579     }
00580 
00582   int getNumPredictionsToPrintPerFunction() const
00583     {
00584       return(_options.num_print_predictions_arg);
00585     }
00586 
00588   bool runAnyOneVersusNoneAlgorithm() const;
00589 
00593   bool runOneVersusAllAlgorithm(string algorithm) const;
00594 
00598   bool runOneVersusNoneAlgorithm(string algorithm) const;
00599 
00600 
00603   string getPredictionsFile() const
00604     {
00605       return(_predictionsFile);
00606     }
00607 
00608 
00611   string getValidationAnnotationsFile() const
00612     {
00613       return(_validationAnnotationsFile);
00614 
00615     }
00616 
00617   // SVM related options.
00618 
00620   string getLibSVMDirectory() const
00621     {
00622       return(_libSVMDirectory);
00623     }
00624 
00626   string getSVMLightDirectory() const
00627     {
00628       return(_SVMLightDirectory);
00629     }
00630 
00632   string getLibSVMTrainOptions() const
00633     {
00634       return(_libSVMTrainOptions);
00635     }
00636 
00638   string getLibSVMTestOptions() const
00639     {
00640       return(_libSVMTestOptions);
00641     }
00642 
00644   string getSVMLightTrainOptions() const
00645     {
00646       return(_SVMLightTrainOptions);
00647     }
00648 
00650   string getSVMLightTestOptions() const
00651     {
00652       return(_SVMLightTestOptions);
00653     }
00654 
00655   // --treewidth.
00657   bool getComputeTreewidth() const
00658     {
00659       return(_computeTreewidth);
00660     }
00661 
00662   // --weight-evidence-codes
00664   bool getWeightEvidenceCodes() const
00665     {
00666       return(NULL != _options.weight_evidence_codes_arg);
00667     }
00668 
00670   string getEvidenceCodeWeightsFile() const
00671     {
00672       return(_options.weight_evidence_codes_arg);
00673     }
00674 
00679   void setParameters(gengetopt_args_info &gainOptions);
00680 
00681 
00682 private:
00683   void _openOutputFiles();
00684 
00685 };
00686 
00687 
// An ORF in the gain network. At present the struct declares no data
// members, only a print method.
//
// Historical note on the states of an ORF (no such members are declared
// in this struct any more):
//   - realState is the *real* state of the orf. it never changes.
//   - initialState is the state at the beginning of runGainNet(). it
//     equals realState for every protein except those being
//     cross-validated, for which it is HYPOTHETICAL_STATE.
//   - currentState is the current state in runGainNet().
//   - previousState is the value currentState had before it last changed
//     (or the value of initialState).
struct MyGainOrf
{
  // Print this ORF to ostr. Defined in the implementation file.
  void print(ostream &ostr) const;
};
00709 
00710 typedef  map< MyNodeId, MyGainStateInfo< MyGainState< MyGainTriStateType > > > MyNodeStatesType;
00711 
00712 // tell doxygen to skip
00716 struct annotationtype_category_group_interactiontype_weight_index{};
00720 template< typename GroupType > struct SetOfEdgeTypeWeights
00721 {
00722         public:
00723 
00724                 struct MyEdgeTypeWeight
00725                 {
00726                         public:
00727 
00728                                 string annotationType; // description of annotation set used to calculate weight
00729                                 string category;
00730                                 GroupType group; // e.g. depth/cutoff value
00731                                 string interactionType;
00732                                 MyNT weight;
00733 
00734                                 MyEdgeTypeWeight(string annotationType, string category, GroupType group, string interactionType, MyNT weight)
00735                                         : annotationType(annotationType), category(category), group(group), interactionType(interactionType), weight(weight) {}
00736                 };
00737 
00738                 typedef multi_index_container
00739                 <
00740                         MyEdgeTypeWeight,
00741                         indexed_by
00742                         <
00743                                 ordered_unique
00744                                 <
00745                                         tag< annotationtype_category_group_interactiontype_weight_index >,
00746                                         composite_key
00747                                         <
00748                                                 MyEdgeTypeWeight,
00749                                                 member< MyEdgeTypeWeight, string, &MyEdgeTypeWeight::annotationType >,
00750                                                 member< MyEdgeTypeWeight, string, &MyEdgeTypeWeight::category >,
00751                                                 member< MyEdgeTypeWeight, GroupType, &MyEdgeTypeWeight::group >,
00752                                                 member< MyEdgeTypeWeight, string, &MyEdgeTypeWeight::interactionType >,
00753                                                 member< MyEdgeTypeWeight, MyNT, &MyEdgeTypeWeight::weight >
00754                                         >
00755                                 >
00756                         >
00757                 > type;
00758 };
00759 
00760 // tell doxygen to skip
00764 struct annotationtype_category_group_nodeid_index{};
00768 template< typename GroupType > struct SetOfGroupedNodes
00769 {
00770         public:
00771 
00772                 struct MyGroupedNode
00773                 {
00774                         public:
00775 
00776                                 string annotationType; // description of annotation set used to calculate weight
00777                                 string category;
00778                                 GroupType group; // e.g. depth/cutoff value
00779                                 string nodeId;
00780 
00781                                 MyGroupedNode(string annotationType, string category, GroupType group, string nodeId)
00782                                         : annotationType(annotationType), category(category), group(group), nodeId(nodeId) {}
00783                 };
00784 
00785                 typedef multi_index_container
00786                 <
00787                         MyGroupedNode,
00788                         indexed_by
00789                         <
00790                                 ordered_unique
00791                                 <
00792                                         tag< annotationtype_category_group_nodeid_index >,
00793                                         composite_key
00794                                         <
00795                                                 MyGroupedNode,
00796                                                 member< MyGroupedNode, string, &MyGroupedNode::annotationType >,
00797                                                 member< MyGroupedNode, string, &MyGroupedNode::category >,
00798                                                 member< MyGroupedNode, GroupType, &MyGroupedNode::group >,
00799                                                 member< MyGroupedNode, string, &MyGroupedNode::nodeId >
00800                                         >
00801                                 >
00802                         >
00803                 > type;
00804 };
00805 
00806 
00807 
00808 
00809 
00810 class MyGainGraph : public MyGraph
00811 {
00812 public:
00813   // typedefs. i guess typedefs are not inherited.
00814 //   typedef MyGraph::MyConstNodeIterator MyConstNodeIterator;
00815 //   typedef MyGraph::MyConstEdgeIterator MyConstEdgeIterator;
00816 
00817 
00818   // i don't think i need this right now, at least not until MyGainNode is fully defined.
00819 //  typedef MyGainNode Node;
00820 
00821   MyGainGraph()
00822       : MyGraph(), useDegree(false), _minThreshold(0), _maxThreshold(0), _threshold(0),
00823         _calculated_threshold(false), _reporter()
00824     {}
00825 
00826   MyGainGraph(string infile, string type, string outputDir, string commandLine, MyNT minEdgeWeight = 0)
00827       : MyGraph(infile, type, minEdgeWeight),
00828         useDegree(false), _minThreshold(0), _maxThreshold(0), _threshold(0),
00829         _calculated_threshold(false), _reporter(outputDir, commandLine)
00830     {}
00831 
00832 
00833   virtual ~MyGainGraph()
00834     {}
00835 
00836   // set and get for whether i should use degree in calculating
00837   // energies or not.
00838   void setUseDegree(bool ud)
00839     {
00840       useDegree = ud;
00841          setNodeWeights();
00842     }
00843 
00844   void resetThreshold() {
00845           this->_calculated_threshold=false;
00846   }
00847 
00848   // set the weights of the nodes based on their degrees.
00849   void setNodeWeights();
00850 
00851 //  void assignInitialStates(MyProteinFunctionType &proteinFunctions,
00852 
00853   // for each state, count the #nodes in *this annotated with that state.
00854 
00855   // params is not const because the propagationDiagramsStream in it will get output.
00856 
00857   // print the neighbourhood of a node. states cannot be const because i
00858   // need to access states using the [] operator.
00859 //  void printNeighbours(ostream &ostr,  MyNodeStatesType &states, const MyNode &node);
00860   // not using const MyNode& node because of problems with MyNode::ConstIncidentEdgeIterator
00861 
00862   // can't be const because i add info to reporter stored in *this.
00863   unsigned int printStates(ofstream &ostr,  const MyNodeStatesType &states,
00864                    string currentFunc, string algorithm = "Hopfield",
00865                    bool printAll = false);
00866 
00898 
00899   void reduce2(ostream &hopstr, map< MyNodeId, MyGainTriStateType > &nodeStates,
00900                const MyGainParams &params, MyGainGraph &reducedGraph,
00901                MyNodeIdSet &unpredictableNodes);
00902 
00903   // i know that all nodes in subgraph are either in state "state" or
00904   // HYPOTHETICAL_STATE. annotate all the corresponding nodes in *this
00905   // in HYPOTHETICAL_STATE with state "state."
00906 
00907   // perform some simple checks to see what the final states of the
00908   // some of the nodes will be.
00909 
00910 //  template< typename InputFunction >
00911 
00912 
00913   // just set edge weights based on correlations in the gene expression data set.
00914   void setEdgeWeightsCorrelations(ostream &fstr, const MyPointSet &geData);
00915 
00916 
00917   // compute dense functions, i.e., functions whose annotated genes
00918   // induce dense subgraphs in the invocant.
00919   void computeDenseFunctions(ostream *fstr,
00920                              MyAnnotations &annotations,
00921                              GeneOntology &go,
00922                              map< string, MyGraph > &denseFunctionSubgraphs);
00923 
00924   // convert edge weights to probabilities based on p-value of each
00925   // edge's correlation in the histogram of all pairs of
00926   // correlations. return the mapping from edge weights to
00927   // probabilities in a histogram.
00928   void computeProbabilitiesFromEdgeWeights(ostream &fstr, const MyPointSet &geData,
00929                                            string experimentName,
00930                                            MyHistogram  &edgeWeightsToProbabilities);
00931 
00932   // for each group of functions with a depth, compute mapping from
00933   // correlations to probabilities of shared function.
00934   void computeEdgeProbabilitiesPerDepth(
00935     MyGainParams &params,
00936     const map< unsigned int, set< GOFunction * > > &functionsByDepth,
00937     MyAnnotations &annotations,
00938 //    map< GOFunction*, MyHistogram > &probabilities
00939     map< BioFunction, MyHistogram > &probabilities
00940     );
00941 
00942 
00943   // For each group of functions with the same parent in the GO DAG,
00944   // convert edge weights to probabilities based on which edges have
00945   // both endpoints sharing a function. return the mapping from
00946   // edge weights to probabilities in a histogram.
00947   void computeEdgeProbabilitiesPerParent(MyGainParams &params, GeneOntology &go,
00948                                          MyAnnotations &annotations,
00949 //                                         map< GOFunction*, MyHistogram > &probs);
00950                                          map< BioFunction, MyHistogram > &probs);
00951 
00952   // convert edge weights to probabilities based on which edges have
00953   // both endpoints sharing a function in functionVector. return the
00954   // mapping from edge weights to probabilities in a histogram.
00955   void computeProbabilitiesFromEdgeWeights(
00956     ostream &fstr,
00957     // no const because of MyAnnotations::haveSameFunction()
00958     //const
00959     MyAnnotations &annotations,
00960     const vector< BioFunction > *functionVector,
00961     MyHistogram  &edgeWeightsToProbabilities
00962     );
00963 
00964   // set edge weights based on mapping from correlations to
00965   // probabilities in edgeWeightsToProbabilities.
00966   void setEdgeWeights(ostream &fstr,
00967                       const MyHistogram  &edgeWeightsToProbabilities
00968                       );
00969 
00970 
00971   // compute the input to a node from its neighbours. the input is the
00972   // sum of weighted states of the node's neighbours.
00973   MyNT computeInput(string nodeId, MyNodeStatesType &nodeStates) const;
00974   // compute the maximum possible input to a node from its neighbours,
00975   // by assuming that each neighbour is annotated and the edge weight
00976   // is 1.
00977   // compute the maximum input to a node from its neighbours, by
00978   // assuming that each hypothetical neighbour is in ANNOTATED_STATE.
00979   // compute the minimum input to a node from its neighbours. just the
00980   // opposite of computeMaximumInput()
00981   // compute the output of a node from its neighbours. the output is the
00982   // sum of weighted states of the node's neighbours.
00983   // for each orf, compute the probability with which it is in the state.
00984 
00985 
00986   MyNT computeEnergy(MyNodeStatesType &nodeStates);
00987   MyNT computeChangeInEnergy(MyNodeStatesType &nodeStates,
00988                              string changedNodeId);
00989 
00990   // print the file in DIMACS format. see
00991   // ftp://dimacs.rutgers.edu/pub/netflow/general-info/specs.tex for
00992   // the specifications.
00993   //
00994   // not making the method const because of a problem with MyNode::ConstEdgeIterator.
00995   void printDimacs(string filename, MyNodeStatesType &nodeStates, string format);// const;
00996   void runMinCut(string outfile, MyNodeStatesType &nodeStates, const MyGainParams &params);
00997 //  void explainCrossValidationResults(MyNodeStatesType &states);
00998 
00999 
01000   // i dont' think this method is used anywhere.
01001 // methods related to reporting results.
01002   // void printResults(MyGainParams &gainParams) {
01003   //   if (!gainParams.onlyPredictions)
01004   //     // don't print (or overwrite) CV results if i only computed
01005   //     // predictions.
01006   //     {
01007   //       _reporter.printDetailedCVResults(gainParams.detailedCVStream);
01008 #ifdef USE_CVRESULT
01009   //       _reporter.printCVResults(gainParams.cvStream);
01010 #endif // USE_CVRESULT
01011   //     }
01012 
01013   //   if (!gainParams.onlyCrossVal)
01014   //     // don't print (or overwrite) predictions if i only computed CV
01015   //     // results.
01016   //     _reporter.printPredictions(gainParams.predictionsStream);
01017   // }
01018 
01019 
01020 
01021   // void addCV(string function, int tp, int tn, int fp, int fn,MyNT threshold,
01022   //            string algo) {
01023   //   _reporter.addCV(function,tp,tn,fp,fn,threshold, algo);
01024   // }
01025 
01026   // detailed CV info for a single gene/function pair
01027 
01028         void computeEdgeTypeWeightsByCutoffFromSharedFunctionRatio(MyAnnotations &annotations, GeneOntology &go, MyGainParams &gainParams);
01029         void computeEdgeTypeWeightsByDepthFromSharedFunctionRatio(MyAnnotations &annotations, GeneOntology &go, MyGainParams &gainParams);
01030         void computeEdgeTypeWeightsJaccard(ostream &logStream,  MyAnnotations &annotations);
01031 
01032         void assignEdgeWeightFromDepth(const MyGainParams &gainParams, string annotationType, string category, unsigned int depth, MyEdge &edge);
01033 
01034 private:
01035 
01036   void _parseDimacsOutput(string filename, set< MyNodeId > &sinkSideNodes) const;
01037   void _setStates(MyNodeStatesType &nodeStates,
01038                   const set< MyNodeId > &sinkSideNodes) const;
01039 
01040 
01041 private:
01042 
01043   // divide inputs and outputs by node degrees.
01044   bool useDegree;
01045   MyNT _minThreshold, _maxThreshold, _threshold;
01046   bool _calculated_threshold;
01047 
01048   Reporter _reporter;
01049 
01050   SetOfEdgeTypeWeights< unsigned int >::type _depthEdgeTypeWeights;
01051   SetOfGroupedNodes< unsigned int >::type _depthGroupedNodes;
01052 
01053 };
01054 
01055 
01056 
01057 #endif // _GAIN_H
 All Classes Functions Variables Typedefs Friends