misc.kmeans.cc

#include <iostream>
#include <stack>
#include <cmath>
#include <vector>
#include <ctime>
using namespace std;

#define iota(i,n,b,s) for(int i=int(b);i!=int((b)+(s)*(n));i+=(s))
#define range(i,n,m) iota(i,(((n)>(m))?((n)-(m)+1):((m)-(n)+1)),(n),((n)>(m)?-1:1))
#define rep(i,n) iota(i,(n),0,1)

#define INF (1e9)
#define EPS (1e-9)
#define cons(a,b) (make_pair(a,b))
#define car(a) (a.first)
#define cdr(a) (a.second)
#define cadr(a) (car(cdr(a)))
#define cddr(a) (cdr(cdr(a)))
#define all(a) a.begin(), a.end()
#define trace(var) cerr<<">>> "<<#var<<" = "<<var<<endl;

template<class S, class T>
ostream& operator<<(ostream& os, pair<S,T> p) {
  os << '(' << car(p) << ", " << cdr(p) << ')';
  return os;
}

template<class T>
ostream& operator<<(ostream& os, vector<T> v) {
  os << v[0];
  for (int i=1, len=v.size(); i<len; ++i) os << ' ' << v[i];
  return os;
}

typedef double Real;
typedef pair<Real, Real> P;

/*
 * k-means
 *
 * return vector<int>ids;
 * ids[i] is the cluster ID of ps[i]
 */

/* vector operator */
P operator+(P&a, P&b) {
    return cons(car(a) + car(b), cdr(a) + cdr(b));
}
P operator-(P&a) {
    return cons(-car(a), -cdr(a));
}

P operator-(P&a, P&b) {
    return cons(car(a) - car(b), cdr(a) - cdr(b));
}

Real Euclidean(P&a, P&b) {
  P p = a - b;
  return sqrt(pow(car(p), 2) + pow(cdr(p), 2));
}

vector<int>
kmeans(int cluster_N, vector<P>&ps, int iterator_N = 20) {
  int len = ps.size();
  vector<int> ids(len);

  vector<P> ms(cluster_N);
  rep (i, cluster_N) ms[i] = ps[i * len / cluster_N];

  rep (i, iterator_N) {
    cerr << "iterate: " << i << endl;
    vector<P> sum(cluster_N, P(0, 0)); // sum[j] は id j のクラスタの和
    vector<int> cod(cluster_N, 0); // id j のクラスタの濃度

    // ps[j] と ms[n] の比較
    rep (j, len) {
      int id = -1;
      Real min_d = INF;
      rep (n, cluster_N) {
        Real d = Euclidean(ps[j], ms[n]);
        if (min_d > d) {
          min_d = d;
          id = n;
        }
      }
      ids[j] = id;
      sum[id] = sum[id] + ps[j];
      cod[id]++;
    }

    // update
    rep (n, cluster_N) {
      if (cod[n] == 0) {
        // fuck
        ms[n] = ps[rand() % len];
      } else {
        // means
        P total = sum[n];
        ms[n] = P(car(total) / cod[n], cdr(total) / cod[n]);
      }
    }
  }

  return ids;
}

const int M = 10;
Real gaussian_gen() {
  const int mu = 10;
  Real x = 0;
  rep (j, M) x += Real((rand() % (mu * 2)) - mu);
  x /= M;
  return x;
}

int main() {
  srand(time(NULL));

  /*
   * Point ps[j] has class ts[j]
   */
  vector<P> ps; // seq of point
  vector<int> ts; // seq of class

  const int N = 100;
  rep (i, N) {
    int klass = rand() % 3;
    Real x = gaussian_gen();
    Real y = gaussian_gen();

    // give klass by its mean
    switch (klass) {
      case 0:
        break;
      case 1:
        x += 10;
        y += 10;
        break;
      case 2:
        x += 5;
        y += 5;
        break;
    }
    ps.push_back(P(x, y));
    ts.push_back(klass);
  }

  auto result = kmeans(3, ps);

  /*
   * IDは違うので、単純に比較はできない。
   */
  float acc = 0;
  for (int id0 = 0; id0 < 3; ++id0) {
    for (int id1 = 0; id1 < 3; ++id1) {
      if (id0 == id1) continue;
      for (int id2 = 0; id2 < 3; ++id2) {
        if (id0 == id2) continue;
        if (id1 == id2) continue;
        int n = 0;
        rep (i, N) {
          if (ts[i] == 0 && result[i] == id0) ++n;
          else if (ts[i] == 1 && result[i] == id1) ++n;
          else if (ts[i] == 2 && result[i] == id2) ++n;
        }
        acc = max<float>(acc, float(n) / float(N));
      }
    }
  }
  cerr << "Accuracy: " << (acc * 100) << "%" << endl;

  return 0;
}