kmeans.cpp
//=============================================================================================================
//*************************************************************************************************************
//=============================================================================================================
// INCLUDES
//=============================================================================================================

#include "kmeans.h"


//*************************************************************************************************************
//=============================================================================================================
// STL INCLUDES
//=============================================================================================================

#include <math.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <time.h>


//*************************************************************************************************************
//=============================================================================================================
// Qt INCLUDES
//=============================================================================================================

#include <QDebug>


//*************************************************************************************************************
//=============================================================================================================
// USED NAMESPACES
//=============================================================================================================

using namespace UTILSLIB;


//*************************************************************************************************************
//=============================================================================================================
// DEFINE MEMBER METHODS
//=============================================================================================================

KMeans::KMeans(QString distance, QString start, qint32 replicates, QString emptyact, bool online, qint32 maxit)
: m_sDistance(distance)
, m_sStart(start)
, m_iReps(replicates)
, m_sEmptyact(emptyact)
, m_iMaxit(maxit)
, m_bOnline(online)
{
    // Use at least one replicate
    if (m_iReps < 1)
        m_iReps = 1;
}
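
//*************************************************************************************************************
// Example usage (illustrative sketch only, not part of the original source; the data and the chosen
// parameter values are made up, only the interface defined in this file is assumed):
//
//     MatrixXd data = MatrixXd::Random(100, 3);        // 100 points in 3 dimensions
//     KMeans clustering("sqeuclidean", "sample", 5);   // 5 replicates of the batch/online procedure
//
//     VectorXi idx;    // cluster index per point (n x 1)
//     MatrixXd ctrs;   // cluster centroids (k x p)
//     VectorXd sumD;   // within-cluster sums of point-to-centroid distances (k x 1)
//     MatrixXd D;      // point-to-centroid distances (n x k)
//
//     if (clustering.calculate(data, 3, idx, ctrs, sumD, D))
//         std::cout << "Centroids:\n" << ctrs << std::endl;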


//*************************************************************************************************************

bool KMeans::calculate(MatrixXd X, qint32 kClusters, VectorXi& idx, MatrixXd& C, VectorXd& sumD, MatrixXd& D)
{
    if (kClusters < 1)
        return false;

    // Init random generator
    srand(time(NULL));

    // n points in p dimensional space
    k = kClusters;
    n = X.rows();
    p = X.cols();

    if(m_sDistance.compare("cosine") == 0)
    {
//        Xnorm = sqrt(sum(X.^2, 2));
//        if any(min(Xnorm) <= eps(max(Xnorm)))
//            error(['Some points have small relative magnitudes, making them ', ...
//                   'effectively zero.\nEither remove those points, or choose a ', ...
//                   'distance other than ''cosine''.']);
//        end
//        X = X ./ Xnorm(:,ones(1,p));
    }
    else if(m_sDistance.compare("correlation") == 0)
    {
        X.array() -= (X.rowwise().sum().array() / (double)p).replicate(1,p);    // X - repmat(mean(X,2),1,p)
        MatrixXd Xnorm = (X.array().pow(2).rowwise().sum()).sqrt();             // sqrt(sum(X.^2, 2))
//        if any(min(Xnorm) <= eps(max(Xnorm)))
//            error(['Some points have small relative standard deviations, making them ', ...
//                   'effectively constant.\nEither remove those points, or choose a ', ...
//                   'distance other than ''correlation''.']);
//        end
        X.array() /= Xnorm.replicate(1,p).array();
    }
//    else if(m_sDistance.compare("hamming") == 0)
//    {
//        if ~all(ismember(X(:),[0 1]))
//            error(message('NonbinaryDataForHamm'));
//        end
//    }
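
    // Note on the preprocessing above: for the "correlation" distance every row of X is centered and then
    // divided by its Euclidean norm, so each point becomes a zero-mean, unit-norm vector. The cosine-type
    // distance computed later on these transformed rows is therefore equivalent to one minus the Pearson
    // correlation of the original rows. The "cosine" branch would normalize the rows in the same way
    // (without centering), but that part is still commented out above.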

    // Start
    RowVectorXd Xmins;
    RowVectorXd Xmaxs;
    if (m_sStart.compare("uniform") == 0)
    {
        if (m_sDistance.compare("hamming") == 0)
        {
            printf("Error: Uniform start is not supported for the Hamming distance.\n");
            return false;
        }
        Xmins = X.colwise().minCoeff();
        Xmaxs = X.colwise().maxCoeff();
    }

    //
    // Done with input argument processing, begin clustering
    //
    if (m_bOnline)
    {
        Del = MatrixXd(n,k);
        Del.fill(std::numeric_limits<double>::quiet_NaN());    // reassignment criterion
    }
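
    // Del(i,j) is the reassignment criterion used by the online phase: the potential change in the total
    // sum of distances from adding point i to cluster j (or, in the column of its current cluster, from
    // removing it). It is only allocated when the online update phase is enabled.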

    double totsumDBest = std::numeric_limits<double>::max();
    emptyErrCnt = 0;

    VectorXi idxBest;
    MatrixXd Cbest;
    VectorXd sumDBest;
    MatrixXd Dbest;

    for(qint32 rep = 0; rep < m_iReps; ++rep)
    {
        if (m_sStart.compare("uniform") == 0)
        {
            C = MatrixXd::Zero(k,p);
            for(qint32 i = 0; i < k; ++i)
                for(qint32 j = 0; j < p; ++j)
                    C(i,j) = unifrnd(Xmins[j], Xmaxs[j]);
            // For 'cosine' and 'correlation', these are uniform inside a subset
            // of the unit hypersphere. Still need to center them for
            // 'correlation'. (Re)normalization for 'cosine'/'correlation' is
            // done at each iteration.
            if (m_sDistance.compare("correlation") == 0)
                C.array() -= (C.array().rowwise().sum()/p).replicate(1, p).array();
        }
        else if (m_sStart.compare("sample") == 0)
        {
            C = MatrixXd::Zero(k,p);
            for(qint32 i = 0; i < k; ++i)
                C.block(i,0,1,p) = X.block(rand() % n, 0, 1, p);
            // DEBUG
//            C.block(0,0,1,p) = X.block(2, 0, 1, p);
//            C.block(1,0,1,p) = X.block(7, 0, 1, p);
//            C.block(2,0,1,p) = X.block(17, 0, 1, p);
        }
//        else if (start.compare("cluster") == 0)
//        {
//            Xsubset = X(randsample(n,floor(.1*n)),:);
//            [dum, C] = kmeans(Xsubset, k, varargin{:}, 'start','sample', 'replicates',1);
//        }
//        else if (start.compare("numeric") == 0)
//        {
//            C = CC(:,:,rep);
//        }

        // Compute the distance from every point to each cluster centroid and the
        // initial assignment of points to clusters
        D = distfun(X, C);//, 0);
        idx = VectorXi::Zero(D.rows());
        d = VectorXd::Zero(D.rows());

        for(qint32 i = 0; i < D.rows(); ++i)
            d[i] = D.row(i).minCoeff(&idx[i]);

        m = VectorXi::Zero(k);
        for(qint32 i = 0; i < k; ++i)
            for (qint32 j = 0; j < idx.rows(); ++j)
                if(idx[j] == i)
                    ++m[i];

        try // catch empty cluster errors and move on to next rep
        {
            // Begin phase one: batch reassignments
            bool converged = batchUpdate(X, C, idx);

            // Begin phase two: single reassignments
            if (m_bOnline)
                converged = onlineUpdate(X, C, idx);

            if (!converged)
                printf("Failed to converge during replicate %d.\n", rep);

            // Calculate cluster-wise sums of distances
            VectorXi nonempties = VectorXi::Zero(m.rows());
            quint32 count = 0;
            for(qint32 i = 0; i < m.rows(); ++i)
            {
                if(m[i] > 0)
                {
                    nonempties[i] = 1;
                    ++count;
                }
            }
            MatrixXd C_tmp(count,C.cols());
            count = 0;
            for(qint32 i = 0; i < nonempties.rows(); ++i)
            {
                if(nonempties[i])
                {
                    C_tmp.row(count) = C.row(i);
                    ++count;
                }
            }

            MatrixXd D_tmp = distfun(X, C_tmp);//, iter);
            count = 0;
            for(qint32 i = 0; i < nonempties.rows(); ++i)
            {
                if(nonempties[i])
                {
                    D.col(i) = D_tmp.col(count);
                    C.row(i) = C_tmp.row(count);
                    ++count;
                }
            }

            d = VectorXd::Zero(n);
            for(qint32 i = 0; i < n; ++i)
                d[i] += D.array()(idx[i]*n+i);    // column-major index

            sumD = VectorXd::Zero(k);
            for(qint32 i = 0; i < k; ++i)
                for (qint32 j = 0; j < idx.rows(); ++j)
                    if(idx[j] == i)
                        sumD[i] += d[j];

            totsumD = sumD.array().sum();

//            printf("%d iterations, total sum of distances = %f\n", iter, totsumD);

            // Save the best solution so far
            if (totsumD < totsumDBest)
            {
                totsumDBest = totsumD;
                idxBest = idx;
                Cbest = C;
                sumDBest = sumD;
                Dbest = D;
            }
        }
        catch (int e)
        {
            if(e == 0)
            {
                // If an empty cluster error occurred in one of multiple replicates, catch
                // it, warn, and move on to the next replicate. Error only when all replicates
                // fail. Rethrow any other kind of error.
                if (m_iReps == 1)
                    return false;
                else
                {
                    emptyErrCnt = emptyErrCnt + 1;
//                    printf("Replicate %d terminated: empty cluster created at iteration %d.\n", rep, iter);
                    if (emptyErrCnt == m_iReps)
                    {
//                        error(message('EmptyClusterAllReps'));
                        return false;
                    }
                }
            }
        } // catch

    } // replicates

    // Return the best solution
    idx = idxBest;
    C = Cbest;
    sumD = sumDBest;
    D = Dbest;

//    if hadNaNs
//        idx = statinsertnan(wasnan, idx);
//    end
    return true;
}


//*************************************************************************************************************

bool KMeans::batchUpdate(const MatrixXd& X, MatrixXd& C, VectorXi& idx)
{
    // Every point moved, every cluster will need an update
    qint32 i = 0;
    VectorXi moved(n);
    for(i = 0; i < n; ++i)
        moved[i] = i;

    VectorXi changed(k);
    for(i = 0; i < k; ++i)
        changed[i] = i;

    previdx = VectorXi::Zero(n);

    prevtotsumD = std::numeric_limits<double>::max();    // max double

    MatrixXd D = MatrixXd::Zero(X.rows(), k);

    //
    // Begin phase one: batch reassignments
    //
    iter = 0;
    bool converged = false;
    while(true)
    {
        ++iter;

        // Calculate the new cluster centroids and counts, and update the
        // distance from every point to those new cluster centroids
        MatrixXd C_new;
        VectorXi m_new;
        KMeans::gcentroids(X, idx, changed, C_new, m_new);
        MatrixXd D_new = distfun(X, C_new);//, iter);

        for(qint32 i = 0; i < changed.rows(); ++i)
        {
            C.row(changed[i]) = C_new.row(i);
            D.col(changed[i]) = D_new.col(i);
            m[changed[i]] = m_new[i];
        }

        // Deal with clusters that have just lost all their members
        VectorXi empties = VectorXi::Zero(changed.rows());
        for(qint32 i = 0; i < changed.rows(); ++i)
            if(m(changed[i]) == 0)    // empties = changed(m(changed) == 0)
                empties[i] = 1;

        if (empties.sum() > 0)
        {
            if (m_sEmptyact.compare("error") == 0)
            {
                return converged;
//                throw 0;
            }
            else if (m_sEmptyact.compare("drop") == 0)
            {
//                // Remove the empty cluster from any further processing
//                D(:,empties) = NaN;
//                changed = changed(m(changed) > 0);
//                warning('Empty cluster created at iteration %d during replicate %d.', iter, rep);
            }
            else if (m_sEmptyact.compare("singleton") == 0)
            {
//                warning('Empty cluster created at iteration %d during replicate %d.', iter, rep);

//                for i = empties
//                    d = D((idx-1)*n + (1:n)'); // use newly updated distances

//                    % Find the point furthest away from its current cluster.
//                    % Take that point out of its cluster and use it to create
//                    % a new singleton cluster to replace the empty one.
//                    [dlarge, lonely] = max(d);
//                    from = idx(lonely); % taking from this cluster
//                    if m(from) < 2
//                        % In the very unusual event that the cluster had only
//                        % one member, pick any other non-singleton point.
//                        from = find(m>1,1,'first');
//                        lonely = find(idx==from,1,'first');
//                    end
//                    C(i,:) = X(lonely,:);
//                    m(i) = 1;
//                    idx(lonely) = i;
//                    D(:,i) = distfun(X, C(i,:), distance, iter);

//                    % Update clusters from which points are taken
//                    [C(from,:), m(from)] = gcentroids(X, idx, from, distance);
//                    D(:,from) = distfun(X, C(from,:), distance, iter);
//                    changed = unique([changed from]);
//                end
            }
        }

        // Compute the total sum of distances for the current configuration.
        totsumD = 0;
        for(qint32 i = 0; i < n; ++i)
            totsumD += D.array()(idx[i]*n+i);    // column-major index

        // Test for a cycle: if objective is not decreased, back out
        // the last step and move on to the single update phase
        if(prevtotsumD <= totsumD)
        {
            idx = previdx;
            MatrixXd C_new;
            VectorXi m_new;
            gcentroids(X, idx, changed, C_new, m_new);
            C.block(0,0,k,C.cols()) = C_new;
            m.block(0,0,k,1) = m_new;
            --iter;
            break;
        }

//        printf("%6d\t%6d\t%8d\t%12g\n",iter,1,moved.rows(),totsumD);
        if (iter >= m_iMaxit)
            break;

        // Determine closest cluster for each point and reassign points to clusters
        previdx = idx;
        prevtotsumD = totsumD;

        VectorXi nidx(D.rows());
        for(qint32 i = 0; i < D.rows(); ++i)
            d[i] = D.row(i).minCoeff(&nidx[i]);

        // Determine which points moved
        VectorXi moved = VectorXi::Zero(nidx.rows());
        qint32 count = 0;
        for(qint32 i = 0; i < nidx.rows(); ++i)
        {
            if(nidx[i] != previdx[i])
            {
                moved[count] = i;
                ++count;
            }
        }
        moved.conservativeResize(count);

        if (moved.rows() > 0)
        {
            // Resolve ties in favor of not moving
            VectorXi moved_new = VectorXi::Zero(moved.rows());
            count = 0;
            for(qint32 i = 0; i < moved.rows(); ++i)
            {
                if(D.array()(previdx[moved[i]] * n + moved[i]) > d[moved[i]])
                {
                    moved_new[count] = moved[i];
                    ++count;
                }
            }
            moved_new.conservativeResize(count);
            moved = moved_new;
        }

        if (moved.rows() == 0)
        {
            converged = true;
            break;
        }

        for(qint32 i = 0; i < moved.rows(); ++i)
            if(moved[i] >= 0)
                idx[ moved[i] ] = nidx[ moved[i] ];

        // Find clusters that gained or lost members
        std::vector<int> tmp;
        for(qint32 i = 0; i < moved.rows(); ++i)
            tmp.push_back(idx[moved[i]]);
        for(qint32 i = 0; i < moved.rows(); ++i)
            tmp.push_back(previdx[moved[i]]);

        std::sort(tmp.begin(),tmp.end());

        std::vector<int>::iterator it;
        it = std::unique(tmp.begin(),tmp.end());
        tmp.resize( it - tmp.begin() );

        changed.conservativeResize(tmp.size());

        for(quint32 i = 0; i < tmp.size(); ++i)
            changed[i] = tmp[i];
    } // phase one
    return converged;
} // batchUpdate
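
// In summary, each batch pass recomputes the centroids of the clusters whose membership changed, updates the
// point-to-centroid distances, reassigns every point to its nearest centroid (resolving ties in favor of not
// moving), and stops when no point moves, when the total sum of distances stops decreasing, or when m_iMaxit
// iterations are reached.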


//*************************************************************************************************************

bool KMeans::onlineUpdate(const MatrixXd& X, MatrixXd& C, VectorXi& idx)
{
    // Initialize some cluster information prior to phase two
    MatrixXd Xmid1;
    MatrixXd Xmid2;
    if (m_sDistance.compare("cityblock") == 0)
    {
        Xmid1 = MatrixXd::Zero(k,p);
        Xmid2 = MatrixXd::Zero(k,p);
        for(qint32 i = 0; i < k; ++i)
        {
            if (m[i] > 0)
            {
                // Separate out sorted coords for points in i'th cluster,
                // and save values above and below median, component-wise
                MatrixXd Xsorted(m[i],p);
                qint32 c = 0;
                for(qint32 j = 0; j < idx.rows(); ++j)
                {
                    if(idx[j] == i)
                    {
                        Xsorted.row(c) = X.row(j);
                        ++c;
                    }
                }
                for(qint32 j = 0; j < Xsorted.cols(); ++j)
                    std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());

                qint32 nn = floor(0.5*m[i])-1;
                if ((m[i] % 2) == 0)
                {
                    Xmid1.row(i) = Xsorted.row(nn);
                    Xmid2.row(i) = Xsorted.row(nn+1);
                }
                else if (m[i] > 1)
                {
                    Xmid1.row(i) = Xsorted.row(nn);
                    Xmid2.row(i) = Xsorted.row(nn+2);
                }
                else
                {
                    Xmid1.row(i) = Xsorted.row(0);
                    Xmid2.row(i) = Xsorted.row(0);
                }
            }
        }
    }
    else if (m_sDistance.compare("hamming") == 0)
    {
//        Xsum = zeros(k,p);
//        for i = 1:k
//            if m(i) > 0
//                % Sum coords for points in i'th cluster, component-wise
//                Xsum(i,:) = sum(X(idx==i,:), 1);
//            end
//        end
    }

    //
    // Begin phase two: single reassignments
    //
    VectorXi changed = VectorXi(m.rows());
    qint32 count = 0;
    for(qint32 i = 0; i < m.rows(); ++i)
    {
        if(m[i] > 0)
        {
            changed[count] = i;
            ++count;
        }
    }
    changed.conservativeResize(count);

    qint32 lastmoved = 0;
    qint32 nummoved = 0;
    qint32 iter1 = iter;
    bool converged = false;
    while (iter < m_iMaxit)
    {
        // Calculate distances to each cluster from each point, and the
        // potential change in total sum of errors for adding or removing
        // each point from each cluster. Clusters that have not changed
        // membership need not be updated.
        //
        // Singleton clusters are a special case for the sum of dists
        // calculation. Removing their only point is never best, so the
        // reassignment criterion had better guarantee that a singleton
        // point will stay in its own cluster. Happily, we get
        // Del(i,idx(i)) == 0 automatically for them.

        if (m_sDistance.compare("sqeuclidean") == 0)
        {
            for(qint32 j = 0; j < changed.rows(); ++j)
            {
                qint32 i = changed[j];
                VectorXi mbrs = VectorXi::Zero(idx.rows());
                for(qint32 l = 0; l < idx.rows(); ++l)
                    if(idx[l] == i)
                        mbrs[l] = 1;

                VectorXi sgn = 1 - 2 * mbrs.array();    // -1 for members, 1 for nonmembers

                if (m[i] == 1)
                    for(qint32 l = 0; l < mbrs.rows(); ++l)
                        if(mbrs[l])
                            sgn[l] = 0;    // prevent divide-by-zero for singleton mbrs

                Del.col(i) = ((double)m[i] / ((double)m[i] + sgn.cast<double>().array()));

                Del.col(i).array() *= (X - C.row(i).replicate(n,1)).array().pow(2).rowwise().sum().array();
            }
        }
        else if (m_sDistance.compare("cityblock") == 0)
        {
            for(qint32 j = 0; j < changed.rows(); ++j)
            {
                qint32 i = changed[j];
                if (m(i) % 2 == 0)    // this will never catch singleton clusters
                {
                    MatrixXd ldist = Xmid1.row(i).replicate(n,1) - X;
                    MatrixXd rdist = X - Xmid2.row(i).replicate(n,1);
                    VectorXd mbrs = VectorXd::Zero(idx.rows());

                    for(qint32 l = 0; l < idx.rows(); ++l)
                        if(idx[l] == i)
                            mbrs[l] = 1;
                    MatrixXd sgn = ((-2*mbrs).array() + 1).replicate(1, p);    // -1 for members, 1 for nonmembers
                    rdist = sgn.array()*rdist.array();
                    ldist = sgn.array()*ldist.array();

                    for(qint32 l = 0; l < idx.rows(); ++l)
                    {
                        double sum = 0;
                        for(qint32 h = 0; h < rdist.cols(); ++h)
                            sum += rdist(l,h) > ldist(l,h) ? (rdist(l,h) < 0 ? 0 : rdist(l,h)) : (ldist(l,h) < 0 ? 0 : ldist(l,h));
                        Del(l,i) = sum;
                    }
                }
                else
                    Del.col(i) = ((X - C.row(i).replicate(n,1)).array().abs()).rowwise().sum();
            }
        }
        else if (m_sDistance.compare("cosine") == 0 || m_sDistance.compare("correlation") == 0)
        {
            // The points are normalized, centroids are not, so normalize them
            MatrixXd normC = C.array().pow(2).rowwise().sum().sqrt();
//            if any(normC < eps(class(normC))) % small relative to unit-length data points
//                error('Zero cluster centroid created at iteration %d during replicate %d.',iter, rep);
//            end
            // This can be done without a loop, but the loop saves memory allocations
            MatrixXd XCi;
            qint32 i;
            for(qint32 j = 0; j < changed.rows(); ++j)
            {
                i = changed[j];
                XCi = X * C.row(i).transpose();

                VectorXi mbrs = VectorXi::Zero(idx.rows());
                for(qint32 l = 0; l < idx.rows(); ++l)
                    if(idx[l] == i)
                        mbrs[l] = 1;

                VectorXi sgn = 1 - 2 * mbrs.array();    // -1 for members, 1 for nonmembers

                double A = (double)m[i] * normC(i,0);
                double B = pow(((double)m[i] * normC(i,0)),2);

                Del.col(i) = 1 + sgn.cast<double>().array()*
                        (A - (B + 2 * sgn.cast<double>().array() * m[i] * XCi.array() + 1).sqrt());

//                std::cout << "Del.col(i)\n" << Del.col(i) << std::endl;

//                Del(:,i) = 1 + sgn .*...
//                        (m(i).*normC(i) - sqrt((m(i).*normC(i)).^2 + 2.*sgn.*m(i).*XCi + 1));
            }
        }
        else if (m_sDistance.compare("hamming") == 0)
        {
//            for i = changed
//                if mod(m(i),2) == 0 % this will never catch singleton clusters
//                    % coords with an unequal number of 0s and 1s have a
//                    % different contribution than coords with an equal
//                    % number
//                    unequal01 = find(2*Xsum(i,:) ~= m(i));
//                    numequal01 = p - length(unequal01);
//                    mbrs = (idx == i);
//                    Di = abs(X(:,unequal01) - C(repmat(i,n,1),unequal01));
//                    Del(:,i) = (sum(Di, 2) + mbrs*numequal01) / p;
//                else
//                    Del(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
//                end
//            end
        }

        // Determine best possible move, if any, for each point. Next we
        // will pick one from those that actually did move.
        previdx = idx;
        prevtotsumD = totsumD;

        VectorXi nidx = VectorXi::Zero(Del.rows());
        VectorXd minDel = VectorXd::Zero(Del.rows());

        for(qint32 i = 0; i < Del.rows(); ++i)
            minDel[i] = Del.row(i).minCoeff(&nidx[i]);

        VectorXi moved = VectorXi::Zero(previdx.rows());
        qint32 count = 0;
        for(qint32 i = 0; i < moved.rows(); ++i)
        {
            if(previdx[i] != nidx[i])
            {
                moved[count] = i;
                ++count;
            }
        }
        moved.conservativeResize(count);

        if (moved.sum() > 0)
        {
            // Resolve ties in favor of not moving
            VectorXi moved_new = VectorXi::Zero(moved.rows());
            count = 0;
            for(qint32 i = 0; i < moved.rows(); ++i)
            {
                if ( Del.array()(previdx[moved(i)]*n + moved(i)) > minDel(moved(i)))
                {
                    moved_new[count] = moved[i];
                    ++count;
                }
            }
            moved_new.conservativeResize(count);
            moved = moved_new;
        }

        if (moved.rows() <= 0)
        {
            // Count an iteration if phase 2 did nothing at all, or if we're
            // in the middle of a pass through all the points
            if ((iter == iter1) || nummoved > 0)
            {
                ++iter;

//                printf("%6d\t%6d\t%8d\t%12g\n",iter,2,nummoved,totsumD);
            }
            converged = true;
            break;
        }

        // Pick the next move in cyclic order
        VectorXi moved_new(moved.rows());
        for(qint32 i = 0; i < moved.rows(); ++i)
            moved_new[i] = ((moved[i] - lastmoved) % n) + lastmoved;

        moved[0] = moved_new.minCoeff() % n;//+1
        moved.conservativeResize(1);

        // If we've gone once through all the points, that's an iteration
        if (moved[0] <= lastmoved)
        {
            ++iter;
//            printf("%6d\t%6d\t%8d\t%12g\n",iter,2,nummoved,totsumD);
            if(iter >= m_iMaxit)
                break;
            nummoved = 0;
        }
        ++nummoved;
        lastmoved = moved[0];

        qint32 oidx = idx(moved[0]);
        nidx[0] = nidx(moved[0]);
        nidx.conservativeResize(1);
        totsumD += Del(moved[0],nidx[0]) - Del(moved[0],oidx);

        // Update the cluster index vector, and the old and new cluster
        // counts and centroids
        idx[ moved[0] ] = nidx[0];
        m( nidx[0] ) = m( nidx[0] ) + 1;
        m( oidx ) = m( oidx ) - 1;

        if (m_sDistance.compare("sqeuclidean") == 0)
        {
            C.row(nidx[0]) = C.row(nidx[0]).array() + (X.row(moved[0]) - C.row(nidx[0])).array() / m[nidx[0]];
            C.row(oidx) = C.row(oidx).array() - (X.row(moved[0]) - C.row(oidx)).array() / m[oidx];
        }
        else if (m_sDistance.compare("cityblock") == 0)
        {
            VectorXi onidx(2);
            onidx << oidx, nidx[0];    //ToDo always right?

            qint32 i;
            for(qint32 h = 0; h < 2; ++h)
            {
                i = onidx[h];
                // Separate out sorted coords for points in each cluster.
                // New centroid is the coord median, save values above and
                // below median. All done component-wise.
                MatrixXd Xsorted(m[i],p);
                qint32 c = 0;
                for(qint32 j = 0; j < idx.rows(); ++j)
                {
                    if(idx[j] == i)
                    {
                        Xsorted.row(c) = X.row(j);
                        ++c;
                    }
                }
                for(qint32 j = 0; j < Xsorted.cols(); ++j)
                    std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());

                qint32 nn = floor(0.5*m[i])-1;
                if ((m[i] % 2) == 0)
                {
                    C.row(i) = 0.5 * (Xsorted.row(nn) + Xsorted.row(nn+1));
                    Xmid1.row(i) = Xsorted.row(nn);
                    Xmid2.row(i) = Xsorted.row(nn+1);
                }
                else
                {
                    C.row(i) = Xsorted.row(nn+1);
                    if (m(i) > 1)
                    {
                        Xmid1.row(i) = Xsorted.row(nn);
                        Xmid2.row(i) = Xsorted.row(nn+2);
                    }
                    else
                    {
                        Xmid1.row(i) = Xsorted.row(0);
                        Xmid2.row(i) = Xsorted.row(0);
                    }
                }
            }
        }
        else if (m_sDistance.compare("cosine") == 0 || m_sDistance.compare("correlation") == 0)
        {
            C.row(nidx[0]).array() += (X.row(moved[0]) - C.row(nidx[0])).array() / m[nidx[0]];
            C.row(oidx).array() -= (X.row(moved[0]) - C.row(oidx)).array() / m[oidx];    // the old cluster loses the moved point
        }
        else if (m_sDistance.compare("hamming") == 0)
        {
//            % Update summed coords for points in each cluster. New
//            % centroid is the coord median. All done component-wise.
//            Xsum(nidx,:) = Xsum(nidx,:) + X(moved,:);
//            Xsum(oidx,:) = Xsum(oidx,:) - X(moved,:);
//            C(nidx,:) = .5*sign(2*Xsum(nidx,:) - m(nidx)) + .5;
//            C(oidx,:) = .5*sign(2*Xsum(oidx,:) - m(oidx)) + .5;
        }

        VectorXi sorted_onidx(1+nidx.rows());
        sorted_onidx << oidx, nidx;
        std::sort(sorted_onidx.data(), sorted_onidx.data()+sorted_onidx.rows());
        changed = sorted_onidx;
    } // phase two

    return converged;

} // onlineUpdate
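
// In summary, the online phase performs single-point reassignments: it evaluates Del, the potential change in
// the total sum of distances for each candidate move, applies moves in cyclic order, updates the affected
// cluster counts and centroids incrementally, and counts one full pass through all points as one iteration.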


//*************************************************************************************************************
// DISTFUN Calculate point to cluster centroid distances.
MatrixXd KMeans::distfun(const MatrixXd& X, MatrixXd& C)//, qint32 iter)
{
    MatrixXd D = MatrixXd::Zero(n,C.rows());
    qint32 nclusts = C.rows();

    if (m_sDistance.compare("sqeuclidean") == 0)
    {
        for(qint32 i = 0; i < nclusts; ++i)
        {
            D.col(i) = (X.col(0).array() - C(i,0)).pow(2);

            for(qint32 j = 1; j < p; ++j)
                D.col(i) = D.col(i).array() + (X.col(j).array() - C(i,j)).pow(2);
        }
    }
    else if (m_sDistance.compare("cityblock") == 0)
    {
        for(qint32 i = 0; i < nclusts; ++i)
        {
            D.col(i) = (X.col(0).array() - C(i,0)).array().abs();
            for(qint32 j = 1; j < p; ++j)
            {
                D.col(i).array() += (X.col(j).array() - C(i,j)).array().abs();
            }
        }
    }
    else if (m_sDistance.compare("cosine") == 0 || m_sDistance.compare("correlation") == 0)
    {
        // The points are normalized, centroids are not, so normalize them
        MatrixXd normC = C.array().pow(2).rowwise().sum().sqrt();
//        if any(normC < eps(class(normC))) % small relative to unit-length data points
//            error('Zero cluster centroid created at iteration %d.',iter);
        for (qint32 i = 0; i < nclusts; ++i)
        {
            MatrixXd C_tmp = (C.row(i).array() / normC(i,0)).transpose();
            D.col(i).array() = 1.0 - (X * C_tmp).array();    // max(1 - X * (C(i,:)./normC(i))', 0)
            for(qint32 j = 0; j < D.rows(); ++j)
                if(D(j,i) < 0)
                    D(j,i) = 0;
        }
    }
//    case 'hamming'
//        for i = 1:nclusts
//            D(:,i) = abs(X(:,1) - C(i,1));
//            for j = 2:p
//                D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
//            end
//            D(:,i) = D(:,i) / p;
//            % D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
//        end
//    end
    return D;
} // distfun
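
// For reference, the point-to-centroid distances computed above are, for a point x and a centroid c:
//     'sqeuclidean':           sum_j (x_j - c_j)^2        (squared Euclidean distance)
//     'cityblock':             sum_j |x_j - c_j|          (L1 distance)
//     'cosine'/'correlation':  max(1 - x.c / ||c||, 0)    (the points were already normalized, and
//                                                          additionally row-centered for 'correlation')
// The 'hamming' variant exists only as commented-out MATLAB code.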


//*************************************************************************************************************
// GCENTROIDS Centroids and counts stratified by group.
void KMeans::gcentroids(const MatrixXd& X, const VectorXi& index, const VectorXi& clusts,
                        MatrixXd& centroids, VectorXi& counts)
{
    qint32 num = clusts.rows();
    centroids = MatrixXd::Zero(num,p);
    centroids.fill(std::numeric_limits<double>::quiet_NaN());
    counts = VectorXi::Zero(num);

    VectorXi members;

    qint32 c;

    for(qint32 i = 0; i < num; ++i)
    {
        c = 0;
        members = VectorXi::Zero(index.rows());
        for(qint32 j = 0; j < index.rows(); ++j)
        {
            if(index[j] == clusts[i])
            {
                members[c] = j;
                ++c;
            }
        }
        members.conservativeResize(c);
        if (c > 0)
        {
            counts[i] = c;
            if(m_sDistance.compare("sqeuclidean") == 0)
            {
                // Initialize
                if(members.rows() > 0)
                    centroids.row(i) = RowVectorXd::Zero(centroids.cols());

                for(qint32 j = 0; j < members.rows(); ++j)
                    centroids.row(i).array() += X.row(members[j]).array() / counts[i];
            }
            else if(m_sDistance.compare("cityblock") == 0)
            {
                // Separate out sorted coords for points in i'th cluster,
                // and use to compute a fast median, component-wise
                MatrixXd Xsorted(counts[i],p);
                c = 0;

                for(qint32 j = 0; j < index.rows(); ++j)
                {
                    if(index[j] == clusts[i])
                    {
                        Xsorted.row(c) = X.row(j);
                        ++c;
                    }
                }

                for(qint32 j = 0; j < Xsorted.cols(); ++j)
                    std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());

                qint32 nn = floor(0.5*(counts(i)))-1;
                if (counts[i] % 2 == 0)
                    centroids.row(i) = .5 * (Xsorted.row(nn) + Xsorted.row(nn+1));
                else
                    centroids.row(i) = Xsorted.row(nn+1);
            }
            else if(m_sDistance.compare("cosine") == 0 || m_sDistance.compare("correlation") == 0)
            {
                // Initialize, then accumulate the (still unnormalized) mean of the members
                centroids.row(i) = RowVectorXd::Zero(centroids.cols());
                for(qint32 j = 0; j < members.rows(); ++j)
                    centroids.row(i).array() += X.row(members[j]).array() / counts[i];    // unnormalized
            }
//            else if(m_sDistance.compare("hamming") == 0)
//            {
//                % Compute a fast median for binary data, component-wise
//                centroids(i,:) = .5*sign(2*sum(X(members,:), 1) - counts(i)) + .5;
//            }
        }
    }
} // gcentroids
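
// Note: the centroid of a non-empty cluster is the arithmetic mean of its members for 'sqeuclidean' and for
// 'cosine'/'correlation' (left unnormalized here; distfun normalizes it), and the component-wise median for
// 'cityblock'. Rows belonging to clusters that end up empty keep their NaN fill and a count of zero.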


//*************************************************************************************************************

double KMeans::unifrnd(double a, double b)
{
    if (a > b)
        return std::numeric_limits<double>::quiet_NaN();

    double a2 = a/2.0;
    double b2 = b/2.0;
    double mu = a2+b2;
    double sig = b2-a2;

    double r = mu + sig * (2.0 * (rand() % 1000)/1000 - 1.0);

    return r;
}
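
// Example (illustrative): unifrnd(0.0, 10.0) returns one of the 1000 equally spaced values
// 0.0, 0.01, ..., 9.99, i.e. a coarse pseudo-random draw from [a, b) based on rand() % 1000.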