Biorithm  1.1
 All Classes Functions Variables Typedefs Friends
kmeans.h
00001 /**************************************************************************
00002  * Copyright (c) 2002-2011 T. M. Murali                                   *
00003  *                                                                        *
00004  * This file is part of Biorithm.                                         *
00005  *                                                                        *
00006  * Biorithm is free software: you can redistribute it and/or modify       *
00007  * it under the terms of the GNU General Public License as published by   *
00008  * the Free Software Foundation, either version 3 of the License, or      *
00009  * (at your option) any later version.                                    *
00010  *                                                                        *
00011  * Biorithm is distributed in the hope that it will be useful,            *
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of         *
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
00014  * GNU General Public License for more details.                           *
00015  *                                                                        *
00016  * You should have received a copy of the GNU General Public License      *
00017  * along with Biorithm.  If not, see <http://www.gnu.org/licenses/>.      *
00018  *                                                                        *
00019  **************************************************************************/
00020 
00021 #ifndef _KMEANS_H
00022 #define _KMEANS_H
00023 
00024 #include "global.h"
00025 #include "math.h"
00026 
00027 struct MyKMeansInfo
00028 {
00029 public:
00030   MyKMeansInfo()
00031       : isSet(false), cost(), firstIndex()
00032     {}
00033   
00034   bool isSet;
00035   MyNT cost;
00036   unsigned int firstIndex;
00037 };
00038 
00039 inline ostream& operator<<(ostream &ostr, const MyKMeansInfo &info)
00040 {
00041   ostr << info.cost << " ";
00042   return(ostr);
00043 }
00044 
00045 typedef pointer_to_binary_function< MyNT, MyNT, MyNT > MyDistanceFunction;
00046 
00047 struct distance_Euclidean: public binary_function< MyNT, MyNT, MyNT > 
00048 {
00049   MyNT operator()(MyNT __x, MyNT __y) const
00050     {
00051       return(fabs(__x - __y));
00052     }
00053 };
00054 
00055 struct distance_squared_Euclidean: public binary_function< MyNT, MyNT, MyNT > 
00056 {
00057   MyNT operator()(MyNT __x, MyNT __y) const
00058     {
00059       return((__x - __y)*(__x - __y));
00060     }
00061 };
00062 
00063 
00064 
00065 // class to store info on a cluster in MyKMeansAlgo
00066 template < class MyPointType >
00067 class MyKMeansClusterType
00068 {
00069 //  typename int MyPointType;
00070   
00071 public:
00072   void add(MyPointType &point)
00073     {
00074       _points.insert(point);
00075     }
00076   
00077   void add(const vector< MyPointType > &points);
00078   
00079   void remove(MyPointType &point)
00080     {
00081       _points.erase(point);
00082     }
00083   
00084   void remove(const vector< MyPointType > &points);
00085 
00086   // compute the mean.
00087   void mean()
00088     {
00089          _mean = 0;
00090          typename _MyPtsType::const_iterator itr;
00091       for (itr = _points.begin(); itr != _points.end(); itr++)
00092         _mean += *itr;
00093 
00094 //       _mean = _points[0];
00095 //       for (unsigned int i = 1; i < _points.size(); i++)
00096 //         _mean += _points[i];
00097       _mean = _mean/_points.size();
00098     }
00099 
00100   void variance()
00101     {
00102       _variance = 0;
00103          typename _MyPtsType::const_iterator itr;
00104       for (itr = _points.begin(); itr != _points.end(); itr++)
00105         _variance += (*itr - _mean)*(*itr - _mean);
00106 //       _variance = (_points[0] - _mean)*(_points[0] - _mean);
00107 //       for (unsigned int i = 1; i < _points.size(); i++)
00108 //         _variance += _points[i];
00109       _variance = _variance/_points.size();
00110 
00111     }
00112 
00113   
00114   MyNT distance(MyPointType point)
00115   {
00116     return(::distance_Euclidean(mean(), point))();
00117   }
00118   
00119 private:
00120   MyPointType _mean;
00121   MyPointType _variance;
00122   
00123   typedef set< MyPointType > _MyPtsType;
00124   set< MyPointType > _points;
00125   
00126 };
00127 
00128 
00129 
00130 // stuff that a kmeans algorithm should support.
00131 template < class MyKMeansPointType >
00132 class MyKMeansAlgo
00133 {
00134 //  friend class MyKMeansClusterType< class MyKMeansPointType >;
00135   
00136 
00137   
00138   // should this typename be a template parameter?
00139 //  typename vector< MyNT > MyKMeansPointType;
00140 //  typename MyNT MyKMeansPointType;
00141   
00142 public:
00143 
00144   // default or constructor for a given # clusters.
00145   MyKMeansAlgo(unsigned int num = 0)
00146       : _points(), _numClusters(num), _clusters(_numClusters)
00147     {}
00148   
00149   void setPoints(vector< MyKMeansPointType > &points)
00150     {
00151       _points = vector< MyKMeansPointType >(points.size());
00152       for (unsigned int i = 0; i < points.size(); i++)
00153         _points[i] = points[i];
00154 
00155      }
00156   
00157   void setNumClusters(unsigned int num)
00158     {
00159       _numClusters = num;
00160     }
00161   
00162   unsigned int getNumClusters()
00163     {
00164       return(numClusters);
00165     }
00166 
00167   // initiliase. randomly assign points to clusters.
00168   void init()
00169     {
00170       // static just in case i call init() tonnes of times. recreating
00171       // MyRand<> is very slow.
00172       static MyRand< int > generator;
00173       int index;
00174       for (unsigned int i = 0; i < _points.size(); i++)
00175         {
00176           index = generator(_numClusters - 1);
00177 //          _clusters[index].add(i);
00178           _clusters[index].add(_points[i]);
00179           _whichCluster[i] = index;
00180         }
00181       for (unsigned int j = 0; j < _numClusters; j++)
00182         {
00183           _clusters[j].mean();
00184           _clusters[j].variance();
00185         }
00186       
00187     }
00188   
00189   // compute the mean of each cluster
00190   void estep()
00191     {
00192       for (unsigned int i = 0; i < _numClusters; i++)
00193         {
00194           _clusters[i].mean();
00195           _clusters[i].variance();
00196         }
00197     }
00198   
00199   // redistribute the points to the best cluster. return true iff some
00200   // point changes cluster.
00201   bool mstep()
00202     {
00203       unsigned int bestCluster;
00204       MyNT bestDist, dist;
00205       bool pointMoved = false;
00206       for (unsigned int i = 0; i < _points.size(); i++)
00207         {
00208           bestCluster = 0;
00209           bestDist = _clusters[bestCluster].distance(_points[i]);;
00210           for (unsigned int j = 1; j < _numClusters; j++)
00211             {
00212               dist = _clusters[j].distance(_points[i]);
00213               if (dist < bestDist)
00214                 {
00215                   bestCluster = j;
00216                   bestDist = dist;;
00217                 }
00218             }
00219           // don't do anything if bestCluster is the current cluster.
00220           if (bestCluster == _whichCluster[i])
00221             continue;
00222           pointMoved = true;
00223           // move point to the right cluster.
00224 //          _clusters[_whichCluster[i]].remove(i);
00225 //          _clusters[bestCluster].add(i);
00226           _clusters[_whichCluster[i]].remove(_points[i]);
00227           _clusters[bestCluster].add(_points[i]);
00228           _whichCluster[i] = bestCluster;
00229         }
00230       return(pointMoved);
00231     }
00232   
00233   
00234   // the actual function. returns the score.
00235 //   MyNT operator()
00236 //     {}
00237   
00238   
00239 
00240 private:
00241   // points to cluster.
00242   vector< MyKMeansPointType > _points;
00243   
00244   unsigned int _numClusters;
00245 
00246   // how do i represent the clusters.
00247 
00248   // i want to store only an index in each cluster.
00249 //  vector< MyKMeansClusterType< unsigned int > > _clusters;
00250 
00251     vector< MyKMeansClusterType< MyKMeansPointType > > _clusters;
00252 
00253   // for each point, store cluster it belongs to.
00254   vector< unsigned int > _whichCluster;
00255   
00256 };
00257 
00258 
00259 
00260 
00261 #endif // _KMEANS_H 
 All Classes Functions Variables Typedefs Friends