Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
xmotif.h
00001 /**************************************************************************
00002  * Copyright (c) 2005-2011 T. M. Murali                                   *
00003  * Copyright (c) 2008-2011 Naveed Massjouni                               *
00004  * Copyright (c) 2004 Greg Grothaus                                       *
00005  *                                                                        *
00006  * This file is part of Biorithm.                                         *
00007  *                                                                        *
00008  * Biorithm is free software: you can redistribute it and/or modify       *
00009  * it under the terms of the GNU General Public License as published by   *
00010  * the Free Software Foundation, either version 3 of the License, or      *
00011  * (at your option) any later version.                                    *
00012  *                                                                        *
00013  * Biorithm is distributed in the hope that it will be useful,            *
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00016  * GNU General Public License for more details.                           *
00017  *                                                                        *
00018  * You should have received a copy of the GNU General Public License      *
00019  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00020  *                                                                        *
00021  **************************************************************************/
00022 
00023 #include<string>
00024 #include<iostream>
00025 #include<fstream>
00026 #include<vector>
00027 #include<map>
00028 #include<sstream>
00029 #include<cstdlib>
00030 #include<ctime>
00031 #include<set>
00032 #include<cmath>
00033 #include<cfloat>
00034 #include<algorithm>
00035 
00036 
00037 // just for defining MyNT.
00038 #include "enrichment.h"
00039 // for ltqnorm().
00040 #include "ltqnorm.h"
00041 
00042 
00043 #include "motifDataset.h"
00044 #include "itemset.h"
00045 
00046 using namespace std;
00047 
00048 #ifndef XMOTIFDATASETGREG
00049 #define XMOTIFDATASETGREG
00050 
00051 enum XmotifDiscriminantMethod { DISCRIMINANT_ALL_SAMPLES = 1, DISCRIMINANT_ONE_CLASS_SAMPLES,
00052                                 //ALL_SAMPLES_WEIGHTED, 
00053                                 DISCRIMINANT_ONE_CLASS_SAMPLES_WEIGHTED };
00054 
00055 enum XmotifSampleExpansionMethod { ADD_SAMPLE_MATCH_GENE_RANGE = 1, ADD_SAMPLE_MATCH_GENE_RANGE_ONE_CLASS,
00056                              ADD_SAMPLE_SEARCH, ADD_SAMPLE_SEARCH_ONE_CLASS,
00057                                    ADD_SAMPLE_SEARCH_ONLY_ADD, ADD_SAMPLE_SEARCH_ONLY_ADD_ONE_CLASS,
00058 ADD_SAMPLE_NONE};
00059 
00060 enum XmotifGeneSelectionMethod { GENE_SELECTION_MAX = 1, GENE_SELECTION_STOUFFER,
00061                                  // used when i already know the genes
00062                                  // (e.g., when i read pre-computed
00063                                  // xMotifs from a file) and all i
00064                                  // want to do is compute stats for
00065                                  // the genes. THIS OPTION SHOULD
00066                                  // NEVER BE SET FROM THE COMMAND
00067                                  // LINE.
00068                                  GENE_SELECTION_TRIVIAL};
00069                              
00070 enum XmotifPValueMethod {PVALUE_RANK, PVALUE_TTEST};
00071 
00073 struct XMotifMethods
00074 {
00075 private:
00076   XmotifDiscriminantMethod _discSelectionMethod;
00077   XmotifGeneSelectionMethod _geneSelectionMethod;
00078   XmotifSampleExpansionMethod _sampleExpansionMethod;
00079   XmotifPValueMethod _pValueMethod;
00080 public:
00082   XMotifMethods(XmotifDiscriminantMethod _dsm, XmotifGeneSelectionMethod _gsm,
00083                 XmotifSampleExpansionMethod _asm, XmotifPValueMethod _pvm)
00084       : _discSelectionMethod(_dsm), _geneSelectionMethod(_gsm),
00085             _sampleExpansionMethod(_asm), _pValueMethod(_pvm)
00086     {}
00087   
00089   virtual ~XMotifMethods()
00090     {}
00091 
00093   XmotifDiscriminantMethod getDiscriminantSelectionMethod() const
00094     {
00095       return(_discSelectionMethod);
00096     }
00097   XmotifSampleExpansionMethod getSampleExpansionMethod() const
00098     {
00099       return(_sampleExpansionMethod);
00100     }
00101   XmotifGeneSelectionMethod getGeneSelectionMethod() const
00102     {
00103       return(_geneSelectionMethod);
00104     }
00105   XmotifPValueMethod getPValueMethod() const
00106     {
00107       return(_pValueMethod);
00108     }
00109 };
00110 
00111   
00112 
00116 class xmotif
00117 {
00118 private:
00119   XMotifMethods methods;
00120 
00121   struct _ProbeInterval
00122   {
00123   private:
00124     int probeIndex;
00125     double low;
00126     double high;
00127     double pval;
00128     double zscore;
00129     
00130   public:
00131     _ProbeInterval(int i, double l, double h, double p)
00132         : probeIndex(i), low(l), high(h), pval(p)
00133       {
00134         zscore = ltqnorm(1 - p);
00135       }
00136 
00137     double getHigh() const
00138       {
00139         return(high);
00140       }
00141     double getLow() const
00142       {
00143         return(low);
00144       }
00145     int getProbeIndex() const
00146       {
00147         return(probeIndex);
00148       }
00149     
00150     double getPValue() const
00151       {
00152         return(pval);
00153       }
00154 
00155     double getZScore() const
00156       {
00157         return(zscore);
00158       }
00159     
00160     
00161   };
00162   
00163   struct compareProbeIntervals
00164     : public binary_function< const xmotif::_ProbeInterval&, const xmotif::_ProbeInterval&, bool> 
00165   {
00166     bool operator()(const _ProbeInterval& __x, const _ProbeInterval& __y) const
00167       {
00168         return(__x.getZScore() > __y.getZScore());
00169       }
00170   };
00171 
00173         double pCutoff;
00174 
00175         vector<vector<int> > prevVal, nextVal, prevCountVal, nextCountVal;
00176         vector<int> numData;    //number of data points for each gene
00177         
00178         vector<vector<vector<double> > > countTables;
00179         double countPVal(int,int,int);
00180         double countPVal2(int,int,int);
00181 
00182         // Calculates p value using t test method.
00183         double pValueTTest(const set<int> &discriminatingSet,
00184                 const set<int> &otherSet, int probeIndex);
00185 
00186         void calculateCountTables();
00187         void calculateFactTables();
00188         void calculateWeightsTables();
00189         vector<double> logFactTable;
00190         double logProbeSize;
00191 
00192   // Use gsm to decide which method to use for computing matching probes.
00193         ItemsetRange computeMatchingProbes(const set<int> &discriminatingSet,
00194                 const set<int> &otherSet);
00195 
00196 
00197   ItemsetRange computeMatchingProbesKS(set<int>& D);
00198 
00199   // Compute the probes that match a discriminant by computing for
00200   // each probe the p-value of the interval spanned in that probe by
00201   // the samples in the discriminant and keeping only probes where the
00202   // p-value is less than a threshold. 
00203   ItemsetRange computeMatchingProbesCount(const set<int> &discriminatingSet,
00204     const set<int> &otherSet);
00205 
00206   // Compute the probes that match a discriminant by computing for
00207   // each probe by computing per-probe p-values as in
00208   // computeMatchingProbesCount and then using the subset of probes
00209   // that maximise Stouffer's Z.
00210   ItemsetRange computeMatchingProbesStouffersZ(
00211         const set<int>& discriminatingSet, const set<int>& otherSet);
00212 
00213   // Given a discriminant and the probes that match that discriminant,
00214   // simply compute per-probe p-values. This method is useful for
00215   // filling up pre-computed xMofits that you read from a file, when
00216   // these xMotifs only have row and column information.
00217   ItemsetRange computeMatchingProbesTrivial(set<int>& D, set< int > &probes);
00218 
00219   bool relaxRange;
00220 
00221 
00222   /* compute the range of probe j in the discriminant D. return false
00223      if the discriminant is empty or has other bad properties. return
00224      the indices and actual endpoints of the range in minIndex,
00225      maxIndex, minExpression, and maxExpression. return the number of
00226      points in the discriminant that are in the range in numPoints and
00227      the total number of points in the dataset that are within the
00228      range in totalCount. */
00229   bool computeRangeInDiscriminant(int j, const set<int> &D,
00230                                   int &minIndex, int &maxIndex,
00231                                   double &minExpression, double &maxExpression,
00232                                   int &numPoints, int &totalCount);
00233   
00234 
00235         set<int> randomSubset(set<int>&,int);
00236         set<int> randomSubset(vector<int>, int);
00237         set<int> randomSubset(vector<int>);
00238         set<int> randomSubset(vector<int>, vector<double>, double);
00239         set<int> randomSubset(int,int);
00240         set<int> randomSubset(int);
00241         int randInt(int,int);
00242         double randDouble(double,double);
00243 
00244         // Returns the discriminating set in discriminatingSet.
00245         // otherSet will contain indexes of all samples in all classes
00246         // except chosenClass.
00247         void getDiscriminatingSet(int discriminantMethod, int &chosenClass,
00248                 set<int> &discriminatingSet, set<int> &otherSet);
00249 
00250         map<set<int>,bool> alreadyFoundDiscriminant;
00251         vector<vector<double> > classWeights,nextWeights;
00252         vector<double> normalize(vector<double>);
00253         vector<double> updateWeights(vector<double>,set<int>);
00254         
00255         bool expandSamples(set<int> &discriminatingSet, set<int> &otherSet,
00256                 ItemsetRange &, int);
00257         bool expandSamplesMatchRange(set<int> &, ItemsetRange &, int, bool);
00258         bool expandSamplesGradientDescent(set<int> &discriminatingSet,
00259                 set<int> &otherSet, ItemsetRange &, int, bool);
00260         bool expandSamplesGradientDescentKeepOriginal(set<int> &, set<int> &,
00261                 ItemsetRange &, int, bool);
00262         itemset buildItemset(set<int>&, ItemsetRange&, int);
00263 
00265         motifDataset* myData;
00266         set<itemset> motifs;
00267   // true if i read pre-computed motifs from a file.
00268   bool _motifsRead;
00269   
00270 public:
00277         xmotif(motifDataset* in, const XMotifMethods& methods);
00278 
00281   void changePval(double);
00282 
00289   void computeMotifs(int ns, void (*processFunc)(const itemset&));
00290 
00291 
00293   bool getMotifsRead() const
00294     {
00295       return(_motifsRead);
00296     }
00297   
00301   void printMotifs(ostream* out);
00302 
00304   void readMotifs(string infile);
00305   
00309   set<itemset> getMotifs();
00310 };
00311 
00312 #endif
 All Classes Functions Variables Typedefs Friends