Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
apriori.h
00001 /**************************************************************************
00002  * Copyright (c) 2004-2011 T. M. Murali                                   *
00003  * Copyright (c) 2004 Greg Grothaus                                       *
00004  *                                                                        *
00005  * This file is part of Biorithm.                                         *
00006  *                                                                        *
00007  * Biorithm is free software: you can redistribute it and/or modify       *
00008  * it under the terms of the GNU General Public License as published by   *
00009  * the Free Software Foundation, either version 3 of the License, or      *
00010  * (at your option) any later version.                                    *
00011  *                                                                        *
00012  * Biorithm is distributed in the hope that it will be useful,            *
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00015  * GNU General Public License for more details.                           *
00016  *                                                                        *
00017  * You should have received a copy of the GNU General Public License      *
00018  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00019  *                                                                        *
00020  **************************************************************************/
00021 
00028 #ifndef APRIORI
00029 #define APRIORI
00030 
00031 #include<iostream>
00032 #include<fstream>
00033 #include<map>
00034 #include<string>
00035 #include<sstream>
00036 #include<vector>
00037 #include<algorithm>
00038 
00039 
00040 using namespace std;
00041 
00042 #include "dag.h"
00043 #include "itemset.h"
00044 #include "histogram.h"
00045 
00046 // start at 1 since  0 will correspond to the union of all edge types
00047 // (LATTICE_GENERIC_EDGE). LATTICE_COMPLEMENT is used by
00048 // AprioriWithComplement to indicate that the parent contains rows
00049 // that are complements of the rows in the child.
00050 enum AprioriLatticeEdgeType { LATTICE_NO_EDGE = -1, LATTICE_GENERIC_EDGE = 0, LATTICE_EDGE = 1, LATTICE_COMPLEMENT_EDGE = 2 };
00051 
00052 typedef DAGNode< Itemset > LatticeNode;
00053 typedef DAG< LatticeNode > ItemsetLattice;
00054 
00055 // defined in the appropriate Makefile.am now.
00056 //#define APRIORI_RANDOM_CONSIDER_COMPLEMENTS 0
00057 
00058 #ifdef APRIORI_RANDOM_CONSIDER_COMPLEMENTS
00059 // what is the type of the key for random distributions of itemset row sizes.
00060 //typedef unsigned int ItemsetRandomRowDistKeyType;
00061 typedef pair< unsigned int, unsigned int > ItemsetRandomRowDistKeyType;
00062 #else
00063 typedef unsigned int ItemsetRandomRowDistKeyType;
00064 #endif
00065 
00072 class Apriori
00073 {
00074 protected:
00075 
00076   //minimum number of rows/cols before region is an itemset
00077   unsigned int minrows; 
00078   unsigned int mincols;
00079   
00080   bool transposed;      //whether or not the dataset was transposed
00081 
00082   vector<vector<unsigned int> > data;
00083   map<unsigned int,string> columnNames,rowNames;        //stores the column and row names mapped from indices
00084 
00085   // variables to keep track of multiple data sets.
00086   
00087   unsigned int _numDatasets;
00088   // which data set does a row belong to? 
00089   vector< unsigned int > _rowIndexToDatasetIndex;
00090   
00091   
00092   vector<Itemset> itemsets;
00093   bool _itemsetsComputed;
00094   
00095   ItemsetLattice _closedLattice;
00096   ItemsetLattice _reducedLattice;
00097   bool _latticeComputed;
00098 
00099 protected:
00100   // compute all the rows to select the random set from.
00101   virtual void _computeAllRows(vector< unsigned int > &allRows) const;
00102 
00103   // computed a random set of rows in data. see comment in
00104   // Apriori::computeRandomDistribution() to see why this method
00105   // returns an unsigned int.
00106   virtual unsigned int _computeRandomRows(const vector< unsigned int > &allRows,
00107                                   vector< unsigned int > &randomRows) const;
00108 
00109   // are the rows i and j in the same dataset?
00110   virtual bool _areRowsInSameDataset(unsigned int i, unsigned int j) const
00111     {
00112       return(_rowIndexToDatasetIndex[i] == _rowIndexToDatasetIndex[j]);
00113     }
00114   
00115 private:
00116   //converts a string to an integer
00117   int atoi(string in);
00118 
00119   // count the number of ones in a row.
00120   unsigned int _getNumOnes(unsigned int index) const;
00121   
00125   void finalizeItemset(Itemset &itm, unsigned int index);
00126   
00127   
00130   void finalizeItemsets();
00131 
00132   // return true iff itemset contains rows from at least two datasets.
00133   // if there is only one dataset, always return true.
00134   bool _isStraddler(Itemset &itemset);
00135   
00136   // after setting minimums, remove all columns with < minrows ones
00137   // and all rows with < mincols columns.
00138   void _resizeData();
00139   
00146   // returns the transpose of a matrix, alters the internal variables
00147   // to correspond to the new transpose
00148   void transpose();
00149 
00150   
00151 public:
00152   Apriori()
00153       : minrows(1), mincols(1), transposed(false), data(),
00154         columnNames(), rowNames(), _numDatasets(0), _rowIndexToDatasetIndex(),
00155         itemsets(), _itemsetsComputed(false),
00156         _closedLattice(), _reducedLattice(), _latticeComputed(false)
00157     {}
00158 
00165 
00167   Apriori(string filename);
00168 
00169   //expects a vector of vectors as input, as well as maps specifying the row (major) vector names
00170   //and the column (minor) vector names.  Example: The vector matrix[i] corresponds to majornames[i]
00171   Apriori(vector<vector<unsigned int> > &matrix,
00172           map<unsigned int,string> &majorNames,
00173           map<unsigned int,string> &minorNames);
00174 
00176   virtual ~Apriori()
00177     {}
00178   
00181   virtual bool checkIfClosed(Itemset &itemset) const;
00182   
00183 
00184   
00186   virtual void computeItemsetFromRows(vector< unsigned int > &rowIndices, Itemset &itemset);
00187   
00189   virtual void computeItemsetFromRows(Itemset &itemset);
00190 
00204   virtual void computeItemsets(const set< unsigned int > *rowsToAvoidInSeed = NULL,
00205                        const set< unsigned int > *rowsToAvoidCompletely = NULL);
00206 
00213   virtual void computeItemsetsLowRAM(const set< unsigned int > *rowsToAvoidInSeed = NULL,
00214                        const set< unsigned int > *rowsToAvoidCompletely = NULL);
00223   virtual AprioriLatticeEdgeType computeLatticeEdgeType(const Itemset &itm1, const Itemset &itm2) const;
00224   
00225 
00230   virtual void computeLattice();
00231 
00234   virtual void computeRandomDistribution(ostream &ostrm, unsigned int numTries,
00235                                  map< ItemsetRandomRowDistKeyType, MyHistogram >&rowDists,
00236                                  map< unsigned int, MyHistogram >&columnDists,
00237                                  MyHistogram &sizeDist);
00238   
00239 
00244   bool containsColumn(Itemset &itemset, unsigned int index) const;
00245   
00246 
00251   bool containsRow(Itemset &itemset, unsigned int index) const;
00252 
00260   virtual void getItemsets(vector<Itemset> &itemsets, bool deleteItemsets = false);
00261 
00270   virtual void getLattice(ItemsetLattice &lattice, bool deleteLattice = false);
00271   
00279   virtual void getClosedLattice(ItemsetLattice &closedLattice, bool deleteLattice = false);
00280 
00282   virtual unsigned int getNumItemsets() const
00283     {
00284       return(itemsets.size());
00285     }
00286 
00288   virtual unsigned int getNumLatticeEdges() const
00289     {
00290       return(_reducedLattice.getNumEdges());
00291     }
00292 
00294   virtual void printItemsets(ostream& ostr) const;
00295   
00303   virtual void printItemsets(string outputFile) const;
00304 
00306   virtual void printItemsetsGraph(ostream& ostr) const;
00307 
00312   virtual void printItemsetsGraph(string outputFile) const;
00313 
00315   virtual void printItemsetStatistics(ostream& ostr) const;
00316   
00318   void readFile(string filename);
00319 
00332   virtual void setItemsets(const vector< Itemset > &isets);
00333   
00334   //sets the minimum number of rows/columns in a itemset before it is returned
00335   //default is 1, meaning that single array elements can be returned
00336   virtual unsigned int setMinimums(unsigned int rows, unsigned int columns)
00337     {
00338       minrows=rows; mincols=columns;
00339       _resizeData();
00340     }
00341   
00342 };
00343 
00344 inline void _computeComplement(vector< unsigned int > &binaryVector)
00345 {
00346   for (unsigned int i = 0; i < binaryVector.size(); i++)
00347     binaryVector[i] = 1 - binaryVector[i];
00348 }
00349 
00350 
00366 class AprioriWithComplement : public Apriori
00367 {
00368 private:
00369   // _complementRows[i] stores the index of the complement of row i in data.
00370   vector< unsigned int > _originalRowsToComplementRows;
00371   // _complementRowsToOriginalRows stores the set of rows that are
00372   // complements and which original row corresponds to each
00373   // complement.
00374   map< unsigned int, unsigned int > _complementRowsToOriginalRows;
00375   // _complementRowsSet just stores the set of rows that are complements.
00376   set< unsigned int > _complementRowsSet;
00377   
00378   
00379 protected:
00380   // compute all the rows to select the random set from.
00381   virtual void _computeAllRows(vector< unsigned int > &allRows) const;
00382   
00383   
00384   // computed a random set of rows in data. in this class, i ensure
00385   // that no rows and its complement appear in randomRows and that at
00386   // least one row is not a complement.
00387   //
00388   // see comment in Apriori::computeRandomDistribution() to see why
00389   // this method returns an unsigned int.
00390 
00391   virtual unsigned int _computeRandomRows(const vector< unsigned int > &allRows,
00392                                   vector< unsigned int > &randomRows) const;
00393 
00394   // return true iff index corresponds to a complemented row.
00395   virtual bool _isComplementRow(unsigned int index) const
00396     {
00397       return(_complementRowsToOriginalRows.end() != _complementRowsToOriginalRows.find(index));
00398     }
00399   // assumes that _isComplementRow(index) returned true. return the
00400   // original row corresponding to index.
00401   virtual unsigned int _getOriginalRow(unsigned int index) const
00402     {
00403       return(_complementRowsToOriginalRows.find(index)->second);
00404     }
00405   
00406 public:
00407   
00408   AprioriWithComplement()
00409       : Apriori(), _originalRowsToComplementRows(), _complementRowsToOriginalRows()
00410     {}
00411 
00413 
00419 
00421                        //  Apriori(string filename);
00422 
00423   //expects a vector of vectors as input, as well as maps specifying the row (major) vector names
00424   //and the column (minor) vector names.  Example: The vector matrix[i] corresponds to majornames[i]
00425   AprioriWithComplement(vector<vector<unsigned int> > &matrix,
00426           map<unsigned int,string> &majorNames,
00427           map<unsigned int,string> &minorNames);
00428 
00430   virtual ~AprioriWithComplement()
00431     {}
00432   
00433   
00447   virtual void computeItemsets(const set< unsigned int > *rowsToAvoidInSeed = NULL,
00448                                const set< unsigned int > *rowsToAvoidCompletely = NULL);
00449 
00460 
00461   virtual AprioriLatticeEdgeType computeLatticeEdgeType(const Itemset &itm1, const Itemset &itm2) const;
00462 
00463   
00465   //  virtual void printItemsetsGraph(ostream& ostr) const;
00466 
00471   //  virtual void printItemsetsGraph(string outputFile) const;
00472 
00473 };
00474 
00475 #endif
00476 
00477 
00478 
00479 
00480 
00481 
 All Classes Functions Variables Typedefs Friends