77 KMeans::KMeans(QString distance, QString start, qint32 replicates, QString emptyact,
bool online, qint32 maxit)
78 : m_sDistance(distance)
81 , m_sEmptyact(emptyact)
93 bool KMeans::calculate( MatrixXd X, qint32 kClusters, VectorXi& idx, MatrixXd& C, VectorXd& sumD, MatrixXd& D)
106 if(m_sDistance.compare(
"cosine") == 0)
116 else if(m_sDistance.compare(
"correlation")==0)
118 X.array() -= (X.rowwise().sum().array() / (double)p).replicate(1,p);
119 MatrixXd Xnorm = (X.array().pow(2).rowwise().sum()).sqrt();
125 X.array() /= Xnorm.replicate(1,p).array();
137 if (m_sStart.compare(
"uniform") == 0)
139 if (m_sDistance.compare(
"hamming") == 0)
141 printf(
"Error: Uniform Start For Hamming\n");
144 Xmins = X.colwise().minCoeff();
145 Xmaxs = X.colwise().maxCoeff();
154 Del.fill(std::numeric_limits<double>::quiet_NaN());
157 double totsumDBest = std::numeric_limits<double>::max();
165 for(qint32 rep = 0; rep < m_iReps; ++rep)
167 if (m_sStart.compare(
"uniform") == 0)
169 C = MatrixXd::Zero(k,p);
170 for(qint32 i = 0; i < k; ++i)
171 for(qint32 j = 0; j < p; ++j)
172 C(i,j) = unifrnd(Xmins[j], Xmaxs[j]);
177 if (m_sDistance.compare(
"correlation") == 0)
178 C.array() -= (C.array().rowwise().sum()/p).replicate(1, p).array();
180 else if (m_sStart.compare(
"sample") == 0)
182 C = MatrixXd::Zero(k,p);
183 for(qint32 i = 0; i < k; ++i)
184 C.block(i,0,1,p) = X.block(rand() % n, 0, 1, p);
203 idx = VectorXi::Zero(D.rows());
204 d = VectorXd::Zero(D.rows());
206 for(qint32 i = 0; i < D.rows(); ++i)
207 d[i] = D.row(i).minCoeff(&idx[i]);
209 m = VectorXi::Zero(k);
210 for(qint32 i = 0; i < k; ++i)
211 for (qint32 j = 0; j < idx.rows(); ++j)
218 bool converged = batchUpdate(X, C, idx);
222 converged = onlineUpdate(X, C, idx);
225 printf(
"Failed To Converge during replicate %d\n", rep);
228 VectorXi nonempties = VectorXi::Zero(m.rows());
230 for(qint32 i = 0; i < m.rows(); ++i)
238 MatrixXd C_tmp(count,C.cols());
240 for(qint32 i = 0; i < nonempties.rows(); ++i)
244 C_tmp.row(count) = C.row(i);
249 MatrixXd D_tmp = distfun(X, C_tmp);
251 for(qint32 i = 0; i < nonempties.rows(); ++i)
255 D.col(i) = D_tmp.col(count);
256 C.row(i) = C_tmp.row(count);
261 d = VectorXd::Zero(n);
262 for(qint32 i = 0; i < n; ++i)
263 d[i] += D.array()(idx[i]*n+i);
265 sumD = VectorXd::Zero(k);
266 for(qint32 i = 0; i < k; ++i)
267 for (qint32 j = 0; j < idx.rows(); ++j)
271 totsumD = sumD.array().sum();
276 if (totsumD < totsumDBest)
278 totsumDBest = totsumD;
296 emptyErrCnt = emptyErrCnt + 1;
298 if (emptyErrCnt == m_iReps)
325 bool KMeans::batchUpdate(
const MatrixXd& X, MatrixXd& C, VectorXi& idx)
330 for(i = 0; i < n; ++i)
334 for(i = 0; i < k; ++i)
337 previdx = VectorXi::Zero(n);
339 prevtotsumD = std::numeric_limits<double>::max();
342 MatrixXd D = MatrixXd::Zero(X.rows(), k);
348 bool converged =
false;
357 KMeans::gcentroids(X, idx, changed, C_new, m_new);
358 MatrixXd D_new = distfun(X, C_new);
360 for(qint32 i = 0; i < changed.rows(); ++i)
362 C.row(changed[i]) = C_new.row(i);
363 D.col(changed[i]) = D_new.col(i);
364 m[changed[i]] = m_new[i];
368 VectorXi empties = VectorXi::Zero(changed.rows());
369 for(qint32 i = 0; i < changed.rows(); ++i)
373 if (empties.sum() > 0)
375 if (m_sEmptyact.compare(
"error") == 0)
380 else if (m_sEmptyact.compare(
"drop") == 0)
387 else if (m_sEmptyact.compare(
"singleton") == 0)
420 for(qint32 i = 0; i < n; ++i)
421 totsumD += D.array()(idx[i]*n+i);
424 if(prevtotsumD <= totsumD)
429 gcentroids(X, idx, changed, C_new, m_new);
430 C.block(0,0,k,C.cols()) = C_new;
431 m.block(0,0,k,1) = m_new;
437 if (iter >= m_iMaxit)
442 prevtotsumD = totsumD;
444 VectorXi nidx(D.rows());
445 for(qint32 i = 0; i < D.rows(); ++i)
446 d[i] = D.row(i).minCoeff(&nidx[i]);
449 VectorXi moved = VectorXi::Zero(nidx.rows());
451 for(qint32 i = 0; i < nidx.rows(); ++i)
453 if(nidx[i] != previdx[i])
459 moved.conservativeResize(count);
461 if (moved.rows() > 0)
464 VectorXi moved_new = VectorXi::Zero(moved.rows());
466 for(qint32 i = 0; i < moved.rows(); ++i)
468 if(D.array()(previdx[moved[i]] * n + moved[i]) > d[moved[i]])
470 moved_new[count] = moved[i];
474 moved_new.conservativeResize(count);
478 if (moved.rows() == 0)
484 for(qint32 i = 0; i < moved.rows(); ++i)
486 idx[ moved[i] ] = nidx[ moved[i] ];
489 std::vector<int> tmp;
490 for(qint32 i = 0; i < moved.rows(); ++i)
491 tmp.push_back(idx[moved[i]]);
492 for(qint32 i = 0; i < moved.rows(); ++i)
493 tmp.push_back(previdx[moved[i]]);
495 std::sort(tmp.begin(),tmp.end());
497 std::vector<int>::iterator it;
498 it = std::unique(tmp.begin(),tmp.end());
499 tmp.resize( it - tmp.begin() );
501 changed.conservativeResize(tmp.size());
503 for(quint32 i = 0; i < tmp.size(); ++i)
513 bool KMeans::onlineUpdate(
const MatrixXd& X, MatrixXd& C, VectorXi& idx)
518 if (m_sDistance.compare(
"cityblock") == 0)
520 Xmid1 = MatrixXd::Zero(k,p);
521 Xmid2 = MatrixXd::Zero(k,p);
522 for(qint32 i = 0; i < k; ++i)
528 MatrixXd Xsorted(m[i],p);
530 for(qint32 j = 0; j < idx.rows(); ++j)
534 Xsorted.row(c) = X.row(j);
538 for(qint32 j = 0; j < Xsorted.cols(); ++j)
539 std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());
541 qint32 nn = floor(0.5*m[i])-1;
544 Xmid1.row(i) = Xsorted.row(nn);
545 Xmid2.row(i) = Xsorted.row(nn+1);
549 Xmid1.row(i) = Xsorted.row(nn);
550 Xmid2.row(i) = Xsorted.row(nn+2);
554 Xmid1.row(i) = Xsorted.row(0);
555 Xmid2.row(i) = Xsorted.row(0);
560 else if (m_sDistance.compare(
"hamming") == 0)
574 VectorXi changed = VectorXi(m.rows());
576 for(qint32 i = 0; i < m.rows(); ++i)
584 changed.conservativeResize(count);
586 qint32 lastmoved = 0;
589 bool converged =
false;
590 while (iter < m_iMaxit)
603 if (m_sDistance.compare(
"sqeuclidean") == 0)
605 for(qint32 j = 0; j < changed.rows(); ++j)
607 qint32 i = changed[j];
608 VectorXi mbrs = VectorXi::Zero(idx.rows());
609 for(qint32 l = 0; l < idx.rows(); ++l)
613 VectorXi sgn = 1 - 2 * mbrs.array();
616 for(qint32 l = 0; l < mbrs.rows(); ++l)
620 Del.col(i) = ((double)m[i] / ((
double)m[i] + sgn.cast<
double>().array()));
622 Del.col(i).array() *= (X - C.row(i).replicate(n,1)).array().pow(2).rowwise().sum().array();
625 else if (m_sDistance.compare(
"cityblock") == 0)
627 for(qint32 j = 0; j < changed.rows(); ++j)
629 qint32 i = changed[j];
632 MatrixXd ldist = Xmid1.row(i).replicate(n,1) - X;
633 MatrixXd rdist = X - Xmid2.row(i).replicate(n,1);
634 VectorXd mbrs = VectorXd::Zero(idx.rows());
636 for(qint32 l = 0; l < idx.rows(); ++l)
639 MatrixXd sgn = ((-2*mbrs).array() + 1).replicate(1, p);
640 rdist = sgn.array()*rdist.array(); ldist = sgn.array()*ldist.array();
642 for(qint32 l = 0; l < idx.rows(); ++l)
645 for(qint32 h = 0; h < rdist.cols(); ++h)
646 sum += rdist(l,h) > ldist(l,h) ? rdist(l,h) < 0 ? 0 : rdist(l,h) : ldist(l,h) < 0 ? 0 : ldist(l,h);
651 Del.col(i) = ((X - C.row(i).replicate(n,1)).array().abs()).rowwise().sum();
654 else if (m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
657 MatrixXd normC = C.array().pow(2).rowwise().sum().sqrt();
664 for(qint32 j = 0; j < changed.rows(); ++j)
667 XCi = X * C.row(i).transpose();
669 VectorXi mbrs = VectorXi::Zero(idx.rows());
670 for(qint32 l = 0; l < idx.rows(); ++l)
674 VectorXi sgn = 1 - 2 * mbrs.array();
677 double A = (double)m[i] * normC(i,0);
678 double B = pow(((
double)m[i] * normC(i,0)),2);
680 Del.col(i) = 1 + sgn.cast<
double>().array()*
681 (A - (B + 2 * sgn.cast<
double>().array() * m[i] * XCi.array() + 1).sqrt());
683 std::cout <<
"Del.col(i)\n" << Del.col(i) << std::endl;
691 else if (m_sDistance.compare(
"hamming") == 0)
712 prevtotsumD = totsumD;
714 VectorXi nidx = VectorXi::Zero(Del.rows());
715 VectorXd minDel = VectorXd::Zero(Del.rows());
717 for(qint32 i = 0; i < Del.rows(); ++i)
718 minDel[i] = Del.row(i).minCoeff(&nidx[i]);
720 VectorXi moved = VectorXi::Zero(previdx.rows());
722 for(qint32 i = 0; i < moved.rows(); ++i)
724 if(previdx[i] != nidx[i])
730 moved.conservativeResize(count);
735 VectorXi moved_new = VectorXi::Zero(moved.rows());
737 for(qint32 i = 0; i < moved.rows(); ++i)
739 if ( Del.array()(previdx[moved(i)]*n + moved(i)) > minDel(moved(i)))
741 moved_new[count] = moved[i];
745 moved_new.conservativeResize(count);
749 if (moved.rows() <= 0)
753 if ((iter == iter1) || nummoved > 0)
764 VectorXi moved_new(moved.rows());
765 for(qint32 i = 0; i < moved.rows(); ++i)
766 moved_new[i] = ((moved[i] - lastmoved) % n) + lastmoved;
768 moved[0] = moved_new.minCoeff() % n;
769 moved.conservativeResize(1);
772 if (moved[0] <= lastmoved)
781 lastmoved = moved[0];
783 qint32 oidx = idx(moved[0]);
784 nidx[0] = nidx(moved[0]);
785 nidx.conservativeResize(1);
786 totsumD += Del(moved[0],nidx[0]) - Del(moved[0],oidx);
790 idx[ moved[0] ] = nidx[0];
791 m( nidx[0] ) = m( nidx[0] ) + 1;
792 m( oidx ) = m( oidx ) - 1;
795 if (m_sDistance.compare(
"sqeuclidean") == 0)
797 C.row(nidx[0]) = C.row(nidx[0]).array() + (X.row(moved[0]) - C.row(nidx[0])).array() / m[nidx[0]];
798 C.row(oidx) = C.row(oidx).array() - (X.row(moved[0]) - C.row(oidx)).array() / m[oidx];
800 else if (m_sDistance.compare(
"cityblock") == 0)
803 onidx << oidx, nidx[0];
806 for(qint32 h = 0; h < 2; ++h)
812 MatrixXd Xsorted(m[i],p);
814 for(qint32 j = 0; j < idx.rows(); ++j)
818 Xsorted.row(c) = X.row(j);
822 for(qint32 j = 0; j < Xsorted.cols(); ++j)
823 std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());
826 qint32 nn = floor(0.5*m[i])-1;
829 C.row(i) = 0.5 * (Xsorted.row(nn) + Xsorted.row(nn+1));
830 Xmid1.row(i) = Xsorted.row(nn);
831 Xmid2.row(i) = Xsorted.row(nn+1);
835 C.row(i) = Xsorted.row(nn+1);
838 Xmid1.row(i) = Xsorted.row(nn);
839 Xmid2.row(i) = Xsorted.row(nn+2);
843 Xmid1.row(i) = Xsorted.row(0);
844 Xmid2.row(i) = Xsorted.row(0);
849 else if (m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
851 C.row(nidx[0]).array() += (X.row(moved[0]) - C.row(nidx[0])).array() / m[nidx[0]];
852 C.row(oidx).array() += (X.row(moved[0]) - C.row(oidx)).array() / m[oidx];
854 else if (m_sDistance.compare(
"hamming") == 0)
864 VectorXi sorted_onidx(1+nidx.rows());
865 sorted_onidx << oidx, nidx;
866 std::sort(sorted_onidx.data(), sorted_onidx.data()+sorted_onidx.rows());
867 changed = sorted_onidx;
877 MatrixXd KMeans::distfun(
const MatrixXd& X, MatrixXd& C)
879 MatrixXd D = MatrixXd::Zero(n,C.rows());
880 qint32 nclusts = C.rows();
882 if (m_sDistance.compare(
"sqeuclidean") == 0)
884 for(qint32 i = 0; i < nclusts; ++i)
886 D.col(i) = (X.col(0).array() - C(i,0)).pow(2);
888 for(qint32 j = 1; j < p; ++j)
889 D.col(i) = D.col(i).array() + (X.col(j).array() - C(i,j)).pow(2);
892 else if (m_sDistance.compare(
"cityblock") == 0)
894 for(qint32 i = 0; i < nclusts; ++i)
896 D.col(i) = (X.col(0).array() - C(i,0)).array().abs();
897 for(qint32 j = 1; j < p; ++j)
899 D.col(i).array() += (X.col(j).array() - C(i,j)).array().abs();
903 else if (m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
906 MatrixXd normC = C.array().pow(2).rowwise().sum().sqrt();
909 for (qint32 i = 0; i < nclusts; ++i)
911 MatrixXd C_tmp = (C.row(i).array() / normC(i,0)).transpose();
912 D.col(i) = X * C_tmp;
913 for(qint32 j = 0; j < D.rows(); ++j)
934 void KMeans::gcentroids(
const MatrixXd& X,
const VectorXi& index,
const VectorXi& clusts,
935 MatrixXd& centroids, VectorXi& counts)
937 qint32 num = clusts.rows();
938 centroids = MatrixXd::Zero(num,p);
939 centroids.fill(std::numeric_limits<double>::quiet_NaN());
940 counts = VectorXi::Zero(num);
946 for(qint32 i = 0; i < num; ++i)
949 members = VectorXi::Zero(index.rows());
950 for(qint32 j = 0; j < index.rows(); ++j)
952 if(index[j] == clusts[i])
958 members.conservativeResize(c);
962 if(m_sDistance.compare(
"sqeuclidean") == 0)
965 if(members.rows() > 0)
966 centroids.row(i) = RowVectorXd::Zero(centroids.cols());
968 for(qint32 j = 0; j < members.rows(); ++j)
969 centroids.row(i).array() += X.row(members[j]).array() / counts[i];
971 else if(m_sDistance.compare(
"cityblock") == 0)
975 MatrixXd Xsorted(counts[i],p);
978 for(qint32 j = 0; j < index.rows(); ++j)
980 if(index[j] == clusts[i])
982 Xsorted.row(c) = X.row(j);
987 for(qint32 j = 0; j < Xsorted.cols(); ++j)
988 std::sort(Xsorted.col(j).data(),Xsorted.col(j).data()+Xsorted.rows());
990 qint32 nn = floor(0.5*(counts(i)))-1;
991 if (counts[i] % 2 == 0)
992 centroids.row(i) = .5 * (Xsorted.row(nn) + Xsorted.row(nn+1));
994 centroids.row(i) = Xsorted.row(nn+1);
996 else if(m_sDistance.compare(
"cosine") == 0 || m_sDistance.compare(
"correlation") == 0)
998 for(qint32 j = 0; j < members.rows(); ++j)
999 centroids.row(i).array() += X.row(members[j]).array() / counts[i];
1013 double KMeans::unifrnd(
double a,
double b)
1016 return std::numeric_limits<double>::quiet_NaN();
1023 double r = mu + sig * (2.0* (rand() % 1000)/1000 -1.0);
bool calculate(MatrixXd X, qint32 kClusters, VectorXi &idx, MatrixXd &C, VectorXd &sumD, MatrixXd &D)
KMeans(QString distance=QString("sqeuclidean"), QString start=QString("sample"), qint32 replicates=1, QString emptyact=QString("error"), bool online=true, qint32 maxit=100)
KMeans class declaration.