00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #ifndef APRIORI
00029 #define APRIORI
00030
00031 #include<iostream>
00032 #include<fstream>
00033 #include<map>
00034 #include<string>
00035 #include<sstream>
00036 #include<vector>
00037 #include<algorithm>
00038
00039
00040 using namespace std;
00041
00042 #include "dag.h"
00043 #include "itemset.h"
00044 #include "histogram.h"
00045
00046
00047
00048
00049
00050 enum AprioriLatticeEdgeType { LATTICE_NO_EDGE = -1, LATTICE_GENERIC_EDGE = 0, LATTICE_EDGE = 1, LATTICE_COMPLEMENT_EDGE = 2 };
00051
00052 typedef DAGNode< Itemset > LatticeNode;
00053 typedef DAG< LatticeNode > ItemsetLattice;
00054
00055
00056
00057
00058 #ifdef APRIORI_RANDOM_CONSIDER_COMPLEMENTS
00059
00060
00061 typedef pair< unsigned int, unsigned int > ItemsetRandomRowDistKeyType;
00062 #else
00063 typedef unsigned int ItemsetRandomRowDistKeyType;
00064 #endif
00065
00072 class Apriori
00073 {
00074 protected:
00075
00076
00077 unsigned int minrows;
00078 unsigned int mincols;
00079
00080 bool transposed;
00081
00082 vector<vector<unsigned int> > data;
00083 map<unsigned int,string> columnNames,rowNames;
00084
00085
00086
00087 unsigned int _numDatasets;
00088
00089 vector< unsigned int > _rowIndexToDatasetIndex;
00090
00091
00092 vector<Itemset> itemsets;
00093 bool _itemsetsComputed;
00094
00095 ItemsetLattice _closedLattice;
00096 ItemsetLattice _reducedLattice;
00097 bool _latticeComputed;
00098
00099 protected:
00100
00101 virtual void _computeAllRows(vector< unsigned int > &allRows) const;
00102
00103
00104
00105
00106 virtual unsigned int _computeRandomRows(const vector< unsigned int > &allRows,
00107 vector< unsigned int > &randomRows) const;
00108
00109
00110 virtual bool _areRowsInSameDataset(unsigned int i, unsigned int j) const
00111 {
00112 return(_rowIndexToDatasetIndex[i] == _rowIndexToDatasetIndex[j]);
00113 }
00114
00115 private:
00116
00117 int atoi(string in);
00118
00119
00120 unsigned int _getNumOnes(unsigned int index) const;
00121
00125 void finalizeItemset(Itemset &itm, unsigned int index);
00126
00127
00130 void finalizeItemsets();
00131
00132
00133
00134 bool _isStraddler(Itemset &itemset);
00135
00136
00137
00138 void _resizeData();
00139
00146
00147
00148 void transpose();
00149
00150
00151 public:
00152 Apriori()
00153 : minrows(1), mincols(1), transposed(false), data(),
00154 columnNames(), rowNames(), _numDatasets(0), _rowIndexToDatasetIndex(),
00155 itemsets(), _itemsetsComputed(false),
00156 _closedLattice(), _reducedLattice(), _latticeComputed(false)
00157 {}
00158
00165
00167 Apriori(string filename);
00168
00169
00170
00171 Apriori(vector<vector<unsigned int> > &matrix,
00172 map<unsigned int,string> &majorNames,
00173 map<unsigned int,string> &minorNames);
00174
00176 virtual ~Apriori()
00177 {}
00178
00181 virtual bool checkIfClosed(Itemset &itemset) const;
00182
00183
00184
00186 virtual void computeItemsetFromRows(vector< unsigned int > &rowIndices, Itemset &itemset);
00187
00189 virtual void computeItemsetFromRows(Itemset &itemset);
00190
00204 virtual void computeItemsets(const set< unsigned int > *rowsToAvoidInSeed = NULL,
00205 const set< unsigned int > *rowsToAvoidCompletely = NULL);
00206
00213 virtual void computeItemsetsLowRAM(const set< unsigned int > *rowsToAvoidInSeed = NULL,
00214 const set< unsigned int > *rowsToAvoidCompletely = NULL);
00223 virtual AprioriLatticeEdgeType computeLatticeEdgeType(const Itemset &itm1, const Itemset &itm2) const;
00224
00225
00230 virtual void computeLattice();
00231
00234 virtual void computeRandomDistribution(ostream &ostrm, unsigned int numTries,
00235 map< ItemsetRandomRowDistKeyType, MyHistogram >&rowDists,
00236 map< unsigned int, MyHistogram >&columnDists,
00237 MyHistogram &sizeDist);
00238
00239
00244 bool containsColumn(Itemset &itemset, unsigned int index) const;
00245
00246
00251 bool containsRow(Itemset &itemset, unsigned int index) const;
00252
00260 virtual void getItemsets(vector<Itemset> &itemsets, bool deleteItemsets = false);
00261
00270 virtual void getLattice(ItemsetLattice &lattice, bool deleteLattice = false);
00271
00279 virtual void getClosedLattice(ItemsetLattice &closedLattice, bool deleteLattice = false);
00280
00282 virtual unsigned int getNumItemsets() const
00283 {
00284 return(itemsets.size());
00285 }
00286
00288 virtual unsigned int getNumLatticeEdges() const
00289 {
00290 return(_reducedLattice.getNumEdges());
00291 }
00292
00294 virtual void printItemsets(ostream& ostr) const;
00295
00303 virtual void printItemsets(string outputFile) const;
00304
00306 virtual void printItemsetsGraph(ostream& ostr) const;
00307
00312 virtual void printItemsetsGraph(string outputFile) const;
00313
00315 virtual void printItemsetStatistics(ostream& ostr) const;
00316
00318 void readFile(string filename);
00319
00332 virtual void setItemsets(const vector< Itemset > &isets);
00333
00334
00335
00336 virtual unsigned int setMinimums(unsigned int rows, unsigned int columns)
00337 {
00338 minrows=rows; mincols=columns;
00339 _resizeData();
00340 }
00341
00342 };
00343
00344 inline void _computeComplement(vector< unsigned int > &binaryVector)
00345 {
00346 for (unsigned int i = 0; i < binaryVector.size(); i++)
00347 binaryVector[i] = 1 - binaryVector[i];
00348 }
00349
00350
00366 class AprioriWithComplement : public Apriori
00367 {
00368 private:
00369
00370 vector< unsigned int > _originalRowsToComplementRows;
00371
00372
00373
00374 map< unsigned int, unsigned int > _complementRowsToOriginalRows;
00375
00376 set< unsigned int > _complementRowsSet;
00377
00378
00379 protected:
00380
00381 virtual void _computeAllRows(vector< unsigned int > &allRows) const;
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391 virtual unsigned int _computeRandomRows(const vector< unsigned int > &allRows,
00392 vector< unsigned int > &randomRows) const;
00393
00394
00395 virtual bool _isComplementRow(unsigned int index) const
00396 {
00397 return(_complementRowsToOriginalRows.end() != _complementRowsToOriginalRows.find(index));
00398 }
00399
00400
00401 virtual unsigned int _getOriginalRow(unsigned int index) const
00402 {
00403 return(_complementRowsToOriginalRows.find(index)->second);
00404 }
00405
00406 public:
00407
00408 AprioriWithComplement()
00409 : Apriori(), _originalRowsToComplementRows(), _complementRowsToOriginalRows()
00410 {}
00411
00413
00419
00421
00422
00423
00424
00425 AprioriWithComplement(vector<vector<unsigned int> > &matrix,
00426 map<unsigned int,string> &majorNames,
00427 map<unsigned int,string> &minorNames);
00428
00430 virtual ~AprioriWithComplement()
00431 {}
00432
00433
00447 virtual void computeItemsets(const set< unsigned int > *rowsToAvoidInSeed = NULL,
00448 const set< unsigned int > *rowsToAvoidCompletely = NULL);
00449
00460
00461 virtual AprioriLatticeEdgeType computeLatticeEdgeType(const Itemset &itm1, const Itemset &itm2) const;
00462
00463
00465
00466
00471
00472
00473 };
00474
00475 #endif
00476
00477
00478
00479
00480
00481