00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00032
00033
00034
00035
00036 #ifndef _GAIN_H
00037 #define _GAIN_H
00038
00039 #include <map>
00040
00041 #include "constants.h"
00042
00043 #if GCC_VERSION > 40300
00044
00045 #include <tr1/unordered_set>
00046 #else
00047 #include <ext/hash_map>
00048 #endif
00049
00050
00051 #include "boost/filesystem/path.hpp"
00052 #include "boost/filesystem/operations.hpp"
00053 #include "boost/filesystem/fstream.hpp"
00054
00055 using namespace std;
00056
00057 #include "old-annotations.h"
00058 #include "GO.h"
00059 #include "gain-opts.h"
00060 #include "gain-state.h"
00061 #include "graph.h"
00062 #include "point.h"
00063 #include "reporter.h"
00064
00065 typedef MyGainAnnotationType MyGainOrfState;
00066
/// Update strategies selectable through MyGainParams::updateType.
/// Only one strategy is declared in this header.
enum MyGainUpdateType
{
    GAIN_UPDATE_IMMEDIATE = 0  ///< Sole enumerator; value 0 (same as the implicit default).
};
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
/// Fold bookkeeping for k-fold cross validation: a string key mapped to
/// three nested levels of vectors whose innermost elements are strings.
/// NOTE(review): the meaning of the key and of each nesting level
/// (presumably run / fold / member ids) is not visible here -- confirm
/// against the code that fills MyGainParams::foldInfo.
typedef std::map< std::string,
                  std::vector< std::vector< std::vector< std::string > > > >
    MyGainFoldInfo;
00080
00081
00082
00083
00084
00085
00086 typedef map< string, map< MyGainAnnotationType, set< string > > > MyGainLOOCVInfo;
00087
00088
00089 struct MyGainParams
00090 {
00091 private:
00092
00093
00094 gengetopt_args_info _options;
00095
00096
00097
00098
00099
00100 string _experimentName;
00101
00102
00103 string _annotationsFile;
00104
00105
00106 string _goFile;
00107
00108
00109 string _groupFunctionsMethodString;
00110 bool _groupFunctionsByDepth;
00111 bool _groupFunctionsByParent;
00112
00113
00114
00115 map< BioFunction, MyHistogram > _functionEdgeProbabilities;
00116
00117
00118 string _hiprDirectory;
00119
00120
00121
00122 vector< string > _flnFiles;
00123
00124
00125 string _dataIntegrationType;
00126
00127
00128 unsigned int _maximumGoDepth;
00129
00130
00131 unsigned int _minimumGoDepth;
00132
00133
00134 unsigned int _maximumAnnotatedGenes;
00135
00136
00137 unsigned int _minimumAnnotatedGenes;
00138
00139
00140 unsigned int _numRounds;
00141
00142
00143
00144 unsigned int _numRuns;
00145
00146
00147 set< string > _onlyFunctionCategories;
00148
00149
00150 string _onlyFunctionsFile;
00151
00152
00153 bool _originalAnnotations;
00154
00155
00157 string _outputDirectory;
00158
00160 string _outputFilePrefix;
00161
00162
00164 set< string > _ovaAlgorithms;
00165
00166
00168 set< string > _ovnAlgorithms;
00169
00170
00171 string _predictionsFile;
00172
00173
00174
00175 string _libSVMDirectory, _SVMLightDirectory;
00176 string _libSVMTrainOptions, _libSVMTestOptions,
00177 _SVMLightTrainOptions,_SVMLightTestOptions;
00178
00179
00180
00181 bool _computeTreewidth;
00182
00183
00184 string _validationAnnotationsFile;
00185
00186
00187 public:
00188
00189
00190
00191 ofstream cvStream;
00192 ofstream ecwCVStream;
00193
00194 ofstream detailedCVStream;
00195
00196 ofstream edgeWeightsStream;
00197
00198
00199
00200 ofstream geneUniverseStream;
00201
00202
00203 ofstream groupedCVStream;
00204 ofstream groupedEcwCVStream;
00205
00206
00207 ofstream invocationStream;
00208
00209
00210 ofstream logStream;
00211
00212 ofstream predictionsStream;
00213
00214
00215 ofstream propagationDiagramsStream;
00216
00217
00218 ofstream sanityCheckStream;
00219
00220
00221 ofstream statsStream;
00222
00223
00224
00225 map< string, ofstream* > outputStreams;
00226
00227
00228
00229 string _invocationFile;
00230
00231
00232
00233
00234
00235 bool allowZeroStates;
00236
00237
00238
00239
00240
00241
00242 MyNT gainThreshold;
00243
00244 bool gainThresholdUserInput;
00245
00246 bool gainThresholdRangeUserInput;
00247
00248
00249 string geneExpressionFile;
00250
00251
00252 set< string > ignoredThings, ignoredEvidenceCodes;
00253
00254
00255
00256
00257 bool justUseCorrelations;
00258
00259
00260
00261
00262
00263 MyGainUpdateType updateType;
00264
00265 bool useDegree;
00266
00267 unsigned int localRuleDistance;
00268
00269
00270
00271 bool computePvalues;
00272
00273
00274
00275
00276 bool checkPropagation;
00277
00278
00279
00280 bool crossValidate;
00281
00282
00283 string crossValidationFile;
00284
00285
00286
00287 bool leaveOneOutCrossValidate;
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297 int foldCrossValidate;
00298 MyGainFoldInfo foldInfo;
00299 MyGainLOOCVInfo loocvInfo;
00300
00301
00302
00303 MyNT functionFrequency;
00304
00306 bool printAllStates;
00307
00308
00309 bool unclampPositives;
00310
00311 bool unclampNegatives;
00312
00313
00314 bool useStateWeights;
00315
00316
00317
00318 bool visualisePredictions;
00319 bool visualiseCrossValidation;
00320 bool visualiseCut;
00321
00322
00323 bool onlyPredictions;
00324 bool onlyCrossVal;
00325
00326 bool weightEdgeTypesCutoff;
00327 string cutoffsFile;
00328 bool weightEdgeTypesDepth;
00329 bool weightEdgeTypesJaccard;
00330
00331 bool groupEdgeTypes;
00332 string edgeTypeGroupsFile;
00333 bool useCustomRNGSeed;
00334 int customRNGSeed;
00335
00336 string edgeWeightingScheme;
00337
00338 public:
00340 MyGainParams();
00341
00343 virtual ~MyGainParams()
00344 {}
00345
00347 string getAnnotationsFile() const
00348 {
00349 return(_annotationsFile);
00350 }
00351
00352
00355 bool getApplyTruePathRuleDownward() const
00356 {
00357
00358 return(!_options.no_true_path_rule_downward_given);
00359 }
00360
00362 string getGOFile() const
00363 {
00364 return(_goFile);
00365 }
00366
00368 vector< string > getFLNFiles() const
00369 {
00370 return(_flnFiles);
00371 }
00372
00373
00374
00375
00377 string getDataIntegrationType() const
00378 {
00379 return(_dataIntegrationType);
00380 }
00381
00382
00385 unsigned int getMaximumGoDepth() const
00386 {
00387 return(_maximumGoDepth);
00388 }
00389
00392 unsigned int getMinimumGoDepth() const
00393 {
00394 return(_minimumGoDepth);
00395 }
00396
00399 unsigned int getMaximumAnnotatedGenes() const
00400 {
00401 return(_maximumAnnotatedGenes);
00402 }
00403
00406 unsigned int getMinimumAnnotatedGenes() const
00407 {
00408 return(_minimumAnnotatedGenes);
00409 }
00410
00419 bool getDoNotReduce() const
00420 {
00421 return(runAnyOneVersusNoneAlgorithm() || _options.no_reduce_flag);
00422 }
00423
00424
00426 unsigned int getNumRounds() const
00427 {
00428 return(_numRounds);
00429 }
00430
00433 unsigned int getNumRuns() const
00434 {
00435 return(_numRuns);
00436 }
00437
00438
00440 bool getAnnotationsAreOriginal() const
00441 {
00442 return(_originalAnnotations);
00443 }
00444
00445
00447 string getExperimentName() const
00448 {
00449 return(_experimentName);
00450 }
00451
00452
00453 void setGroupFunctionsMethod(string method)
00454 {
00455 _groupFunctionsMethodString = method;
00456 _groupFunctionsByParent = false;
00457 _groupFunctionsByDepth = false;
00458 if ("depth" == method)
00459 {
00460 _groupFunctionsByDepth = true;
00461 }
00462 else if ("parent" == method)
00463 {
00464 _groupFunctionsByParent = true;
00465 }
00466 else
00467 {
00468 cerr << "Unknown method \"" << method << "\" for grouping functions. Using the default method of grouping by depth." << endl;
00469 _groupFunctionsByDepth = true;
00470 _groupFunctionsMethodString = "depth";
00471 }
00472 }
00473
00474 bool getGroupFunctionsMethodIsParent() const
00475 {
00476 return(_groupFunctionsByParent);
00477 }
00478 bool getGroupFunctionsMethodIsDepth() const
00479 {
00480 return(_groupFunctionsByDepth);
00481 }
00482
00483 string getGroupFunctionsMethod() const
00484 {
00485 return(_groupFunctionsMethodString);
00486 }
00487
00490
00491 void setFunctionEdgeProbabilities(const map< BioFunction, MyHistogram > &hists)
00492 {
00493 _functionEdgeProbabilities = hists;
00494 }
00495
00496
00497
00498
00500 MyNT getMinimumConfidence() const
00501 {
00502 return(_options.min_confidence_arg);
00503 }
00504
00507 MyNT getOneVersusNoneSinkSourceArtificialEdgeWeight() const
00508 {
00509 return(_options.ovn_sinksource_edge_weight_arg);
00510 }
00511
00512
00513
00514
00521 bool processCategory(string cat) const
00522 {
00523 return((0 == _onlyFunctionCategories.size())
00524 || (_onlyFunctionCategories.end() != _onlyFunctionCategories.find(cat)));
00525 }
00526
00527
00528
00530 string getOutputDirectory() const
00531 {
00532 return(_outputDirectory);
00533 }
00534
00535 string getOutputFileName(string name, string dir = "") const;
00536
00538 const set< string > *getEvidenceCodesToIgnore() const
00539 {
00540 return(&ignoredEvidenceCodes);
00541 }
00542
00546 void printInvocation();
00547
00549 bool getPrintDetailedCVResults() const
00550 {
00551 return(_options.detailed_cross_validation_results_given);
00552 }
00553
00555 bool getRunGraphviz() const
00556 {
00557 return(!_options.no_graphviz_given);
00558 }
00559
00563 bool getPerformSanityCheck() const
00564 {
00565 return(_options.sanity_check_given);
00566 }
00567
00568
00569
00570
00573 string getVisualiseParamsFile() const
00574 {
00575 if (_options.visualise_params_file_given)
00576 return(_options.visualise_params_file_arg);
00577
00578 return("params.txt");
00579 }
00580
00582 int getNumPredictionsToPrintPerFunction() const
00583 {
00584 return(_options.num_print_predictions_arg);
00585 }
00586
00588 bool runAnyOneVersusNoneAlgorithm() const;
00589
00593 bool runOneVersusAllAlgorithm(string algorithm) const;
00594
00598 bool runOneVersusNoneAlgorithm(string algorithm) const;
00599
00600
00603 string getPredictionsFile() const
00604 {
00605 return(_predictionsFile);
00606 }
00607
00608
00611 string getValidationAnnotationsFile() const
00612 {
00613 return(_validationAnnotationsFile);
00614
00615 }
00616
00617
00618
00620 string getLibSVMDirectory() const
00621 {
00622 return(_libSVMDirectory);
00623 }
00624
00626 string getSVMLightDirectory() const
00627 {
00628 return(_SVMLightDirectory);
00629 }
00630
00632 string getLibSVMTrainOptions() const
00633 {
00634 return(_libSVMTrainOptions);
00635 }
00636
00638 string getLibSVMTestOptions() const
00639 {
00640 return(_libSVMTestOptions);
00641 }
00642
00644 string getSVMLightTrainOptions() const
00645 {
00646 return(_SVMLightTrainOptions);
00647 }
00648
00650 string getSVMLightTestOptions() const
00651 {
00652 return(_SVMLightTestOptions);
00653 }
00654
00655
00657 bool getComputeTreewidth() const
00658 {
00659 return(_computeTreewidth);
00660 }
00661
00662
00664 bool getWeightEvidenceCodes() const
00665 {
00666 return(NULL != _options.weight_evidence_codes_arg);
00667 }
00668
00670 string getEvidenceCodeWeightsFile() const
00671 {
00672 return(_options.weight_evidence_codes_arg);
00673 }
00674
00679 void setParameters(gengetopt_args_info &gainOptions);
00680
00681
00682 private:
00683 void _openOutputFiles();
00684
00685 };
00686
00687
/// Stateless type whose only declared member is print().
/// It holds no data members in this header.
struct MyGainOrf
{
    /// Write a representation of this object to @p ostr.
    /// Defined out of line; the output format is not visible here.
    void print(std::ostream &ostr) const;
};
00709
00710 typedef map< MyNodeId, MyGainStateInfo< MyGainState< MyGainTriStateType > > > MyNodeStatesType;
00711
00712
/// Empty tag type naming the composite index of
/// SetOfEdgeTypeWeights<...>::type.
struct annotationtype_category_group_interactiontype_weight_index
{
};
00720 template< typename GroupType > struct SetOfEdgeTypeWeights
00721 {
00722 public:
00723
00724 struct MyEdgeTypeWeight
00725 {
00726 public:
00727
00728 string annotationType;
00729 string category;
00730 GroupType group;
00731 string interactionType;
00732 MyNT weight;
00733
00734 MyEdgeTypeWeight(string annotationType, string category, GroupType group, string interactionType, MyNT weight)
00735 : annotationType(annotationType), category(category), group(group), interactionType(interactionType), weight(weight) {}
00736 };
00737
00738 typedef multi_index_container
00739 <
00740 MyEdgeTypeWeight,
00741 indexed_by
00742 <
00743 ordered_unique
00744 <
00745 tag< annotationtype_category_group_interactiontype_weight_index >,
00746 composite_key
00747 <
00748 MyEdgeTypeWeight,
00749 member< MyEdgeTypeWeight, string, &MyEdgeTypeWeight::annotationType >,
00750 member< MyEdgeTypeWeight, string, &MyEdgeTypeWeight::category >,
00751 member< MyEdgeTypeWeight, GroupType, &MyEdgeTypeWeight::group >,
00752 member< MyEdgeTypeWeight, string, &MyEdgeTypeWeight::interactionType >,
00753 member< MyEdgeTypeWeight, MyNT, &MyEdgeTypeWeight::weight >
00754 >
00755 >
00756 >
00757 > type;
00758 };
00759
00760
/// Empty tag type naming the composite index of
/// SetOfGroupedNodes<...>::type.
struct annotationtype_category_group_nodeid_index
{
};
00768 template< typename GroupType > struct SetOfGroupedNodes
00769 {
00770 public:
00771
00772 struct MyGroupedNode
00773 {
00774 public:
00775
00776 string annotationType;
00777 string category;
00778 GroupType group;
00779 string nodeId;
00780
00781 MyGroupedNode(string annotationType, string category, GroupType group, string nodeId)
00782 : annotationType(annotationType), category(category), group(group), nodeId(nodeId) {}
00783 };
00784
00785 typedef multi_index_container
00786 <
00787 MyGroupedNode,
00788 indexed_by
00789 <
00790 ordered_unique
00791 <
00792 tag< annotationtype_category_group_nodeid_index >,
00793 composite_key
00794 <
00795 MyGroupedNode,
00796 member< MyGroupedNode, string, &MyGroupedNode::annotationType >,
00797 member< MyGroupedNode, string, &MyGroupedNode::category >,
00798 member< MyGroupedNode, GroupType, &MyGroupedNode::group >,
00799 member< MyGroupedNode, string, &MyGroupedNode::nodeId >
00800 >
00801 >
00802 >
00803 > type;
00804 };
00805
00806
00807
00808
00809
00810 class MyGainGraph : public MyGraph
00811 {
00812 public:
00813
00814
00815
00816
00817
00818
00819
00820
00821 MyGainGraph()
00822 : MyGraph(), useDegree(false), _minThreshold(0), _maxThreshold(0), _threshold(0),
00823 _calculated_threshold(false), _reporter()
00824 {}
00825
00826 MyGainGraph(string infile, string type, string outputDir, string commandLine, MyNT minEdgeWeight = 0)
00827 : MyGraph(infile, type, minEdgeWeight),
00828 useDegree(false), _minThreshold(0), _maxThreshold(0), _threshold(0),
00829 _calculated_threshold(false), _reporter(outputDir, commandLine)
00830 {}
00831
00832
00833 virtual ~MyGainGraph()
00834 {}
00835
00836
00837
00838 void setUseDegree(bool ud)
00839 {
00840 useDegree = ud;
00841 setNodeWeights();
00842 }
00843
00844 void resetThreshold() {
00845 this->_calculated_threshold=false;
00846 }
00847
00848
00849 void setNodeWeights();
00850
00851
00852
00853
00854
00855
00856
00857
00858
00859
00860
00861
00862
00863 unsigned int printStates(ofstream &ostr, const MyNodeStatesType &states,
00864 string currentFunc, string algorithm = "Hopfield",
00865 bool printAll = false);
00866
00898
00899 void reduce2(ostream &hopstr, map< MyNodeId, MyGainTriStateType > &nodeStates,
00900 const MyGainParams ¶ms, MyGainGraph &reducedGraph,
00901 MyNodeIdSet &unpredictableNodes);
00902
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913
00914 void setEdgeWeightsCorrelations(ostream &fstr, const MyPointSet &geData);
00915
00916
00917
00918
00919 void computeDenseFunctions(ostream *fstr,
00920 MyAnnotations &annotations,
00921 GeneOntology &go,
00922 map< string, MyGraph > &denseFunctionSubgraphs);
00923
00924
00925
00926
00927
00928 void computeProbabilitiesFromEdgeWeights(ostream &fstr, const MyPointSet &geData,
00929 string experimentName,
00930 MyHistogram &edgeWeightsToProbabilities);
00931
00932
00933
00934 void computeEdgeProbabilitiesPerDepth(
00935 MyGainParams ¶ms,
00936 const map< unsigned int, set< GOFunction * > > &functionsByDepth,
00937 MyAnnotations &annotations,
00938
00939 map< BioFunction, MyHistogram > &probabilities
00940 );
00941
00942
00943
00944
00945
00946
00947 void computeEdgeProbabilitiesPerParent(MyGainParams ¶ms, GeneOntology &go,
00948 MyAnnotations &annotations,
00949
00950 map< BioFunction, MyHistogram > &probs);
00951
00952
00953
00954
00955 void computeProbabilitiesFromEdgeWeights(
00956 ostream &fstr,
00957
00958
00959 MyAnnotations &annotations,
00960 const vector< BioFunction > *functionVector,
00961 MyHistogram &edgeWeightsToProbabilities
00962 );
00963
00964
00965
00966 void setEdgeWeights(ostream &fstr,
00967 const MyHistogram &edgeWeightsToProbabilities
00968 );
00969
00970
00971
00972
00973 MyNT computeInput(string nodeId, MyNodeStatesType &nodeStates) const;
00974
00975
00976
00977
00978
00979
00980
00981
00982
00983
00984
00985
00986 MyNT computeEnergy(MyNodeStatesType &nodeStates);
00987 MyNT computeChangeInEnergy(MyNodeStatesType &nodeStates,
00988 string changedNodeId);
00989
00990
00991
00992
00993
00994
00995 void printDimacs(string filename, MyNodeStatesType &nodeStates, string format);
00996 void runMinCut(string outfile, MyNodeStatesType &nodeStates, const MyGainParams ¶ms);
00997
00998
00999
01000
01001
01002
01003
01004
01005
01006
01007
01008 #ifdef USE_CVRESULT
01009
01010 #endif // USE_CVRESULT
01011
01012
01013
01014
01015
01016
01017
01018
01019
01020
01021
01022
01023
01024
01025
01026
01027
01028 void computeEdgeTypeWeightsByCutoffFromSharedFunctionRatio(MyAnnotations &annotations, GeneOntology &go, MyGainParams &gainParams);
01029 void computeEdgeTypeWeightsByDepthFromSharedFunctionRatio(MyAnnotations &annotations, GeneOntology &go, MyGainParams &gainParams);
01030 void computeEdgeTypeWeightsJaccard(ostream &logStream, MyAnnotations &annotations);
01031
01032 void assignEdgeWeightFromDepth(const MyGainParams &gainParams, string annotationType, string category, unsigned int depth, MyEdge &edge);
01033
01034 private:
01035
01036 void _parseDimacsOutput(string filename, set< MyNodeId > &sinkSideNodes) const;
01037 void _setStates(MyNodeStatesType &nodeStates,
01038 const set< MyNodeId > &sinkSideNodes) const;
01039
01040
01041 private:
01042
01043
01044 bool useDegree;
01045 MyNT _minThreshold, _maxThreshold, _threshold;
01046 bool _calculated_threshold;
01047
01048 Reporter _reporter;
01049
01050 SetOfEdgeTypeWeights< unsigned int >::type _depthEdgeTypeWeights;
01051 SetOfGroupedNodes< unsigned int >::type _depthGroupedNodes;
01052
01053 };
01054
01055
01056
01057 #endif // _GAIN_H