Source code for yatel.cluster.kmeans

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# "THE WISKEY-WARE LICENSE":
# <utn_kdd@googlegroups.com> wrote this file. As long as you retain this notice
# you can do whatever you want with this stuff. If we meet some day, and you
# think this stuff is worth it, you can buy us a WISKEY in return.


# =============================================================================
# DOCS
# =============================================================================

"""The Yatel kmeans algorithm clusters a network's environments, using as
dimensions the haplotypes which exists in each environment or arbitrary values
computed over them.

For more information about kmeans:

- `Scipy Doc <http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.kmeans.html>`_
- `KMeans in wikipedia <http://en.wikipedia.org/wiki/K-means_clustering>`_

"""

# =============================================================================
# IMPORTS
# =============================================================================

import numpy as np
from scipy.cluster import vq

from yatel import db


# =============================================================================
# KMEANS
# =============================================================================

[docs]def kmeans(nw, envs, k_or_guess, whiten=False, coordc=None, *args, **kwargs): """Performs k-means on a set of all environments defined by `fact_attrs` of a network. Parameters ---------- nw : :py:class:`yatel.db.YatelNetwork` Network source of environments to classify. envs : iterable of :py:class:`yatel.dom.Environments` or dicts Represents all the environments to be clustered. k_or_guess : int or ndarray The number of centroids to generate. A code is assigned to each centroid, which is also the row index of the centroid in the code_book matrix generated. The initial k centroids are chosen by randomly selecting observations from the observation matrix. Alternatively, passing a k by N array specifies the initial k centroids. whiten : bool execute ``scipy.cluster.vq.whiten`` function over the observation array before executing subjacent *scipy kmeans*. coordc : None or callable If `coordc` is ``None`` generates use `hap_in_env_coords` function. Otherwise ``coordc`` must be a callable with 2 arguments: - ``nw`` network source of environments to classify. - ``env`` the environment to calculate the coordinates and must return an array of coordinates for the given network environment. args : arguments for scipy kmeans kwargs : keywords arguments for scipy kmeans Returns ------- coodebook : an array kxn of k centroids A k by N array of k centroids. The i’th centroid codebook[i] is represented with the code i. The centroids and codes generated represent the lowest distortion seen, not necessarily the globally minimal distortion. distortion : the value of the distortion The distortion between the observations passed and the centroids generated. Examples -------- >>> from yatel import nw >>> from yatel.cluster import kmeans >>> nw = db.YatelNetwork('memory', mode=db.MODE_WRITE) >>> nw.add_elements([dom.Haplotype(1), dom.Haplotype(2), dom.Haplotype(3)]) >>> nw.add_elements([dom.Fact(1, att0=True, att1=4), ... dom.Fact(2, att0=False), ... dom.Fact(2, att0=True, att2="foo")]) >>> nw.add_elements([dom.Edge(12, 1, 2), ... dom.Edge(34, 2, 3), ... dom.Edge(1.25, 3, 1)]) >>> nw.confirm_changes() >>> kmeans.kmeans(nw, nw.enviroments(["att0", "att2"]), 2) (array([[1, 0, 0], [0, 1, 0]]), 0.0, (({u'att0': True, u'att2': None},), ({u'att0': False, u'att2': None}, {u'att0': True, u'att2': u'foo'}))) >>> calc = lambda nw, env: [stats.average(nw, env), stats.std(nw, env)] >>> kmeans.kmeans(nw, ["att0", "att2"], 2, coordc=calc) (array([[ 23. , 11. ], [ 6.625, 5.375]]), 0.0) """ obs = nw2obs(nw, envs, coordc=coordc) codebook, distortion = vq.kmeans(obs=obs, k_or_guess=k_or_guess, *args, **kwargs) return codebook, distortion # ============================================================================= # SUPPORT # =============================================================================
[docs]def hap_in_env_coords(nw, env): """Generates the coordinates for the kmeans algorithm with the existences of haplotypes in the environment. Parameters ---------- nw : :py:class:`yatel.db.YatelNetwork` env : a collection of dict or :py:class:`yatel.dom.Enviroment` Returns ------- array : arrays of arrays The returned coordinates has M elements (M is the number of haplotypes in the network) with same order of ``yatel.db.YatelNetwork.haplotypes_ids`` function with 2 posible values: - **0** if the haplotype doesn´t exist in the environment. - **0** if the haplotype exist in the environment. """ haps_id = [hap.hap_id for hap in nw.haplotypes()] ehid = [hap.hap_id for hap in nw.haplotypes_by_environment(env=env)] haps_id.sort() ehid.sort() return [int(hid in ehid) for hid in haps_id]
[docs]def nw2obs(nw, envs, whiten=False, coordc=None): """Converts any given environments defined by ``fact_attrs`` of a network to an observation matrix to cluster with subjacent *scipy kmeans* Parameters ---------- nw : :py:class:`yatel.db.YatelNetwork` Network source of environments to classify. envs : iterable of :py:class:`yatel.dom.Enviroment` or dicts Represent all the environment to be clustered. whiten : bool execute `scipy.cluster.vq.whiten` function over the observation array before executing subjacent *scipy kmeans*. coordc : None or callable If coordc is ``None`` generates use `hap_in_env_coords` function. Otherwise `coordc` must be a callable with 2 arguments: - `nw` network source of environments to classify. - `env` the environment to calculate the coordinates and must return an array of coordinates for the given network environment. Returns ------- obs : a vector of envs Each I'th row of the M by N array is an observation vector of the I'th environment of ``envs``. Examples -------- >>> from yatel import nw >>> from yatel.cluster import kmeans >>> nw = db.YatelNetwork('memory', mode=db.MODE_WRITE) >>> nw.add_elements([dom.Haplotype(1), dom.Haplotype(2), dom.Haplotype(3)]) >>> nw.add_elements([dom.Fact(1, att0=True, att1=4), ... dom.Fact(2, att0=False), ... dom.Fact(2, att0=True, att2="foo")]) >>> nw.add_elements([dom.Edge(12, 1, 2), ... dom.Edge(34, 2, 3), ... dom.Edge(1.25, 3, 1)]) >>> nw.confirm_changes() >>> kmeans.nw2obs(nw, nw.enviroments(["att0", "att2"])) array([[1, 0, 0], [0, 1, 0], [0, 1, 0]]) """ if not isinstance(nw, db.YatelNetwork): msg = "nw must be 'yatel.db.YatelNetwork' instance" raise TypeError(msg) coordc = hap_in_env_coords if coordc is None else coordc mtx = [] for env in envs: row = coordc(nw, env) mtx.append(row) obs = np.array(mtx) if whiten: obs = vq.whiten(obs) return obs # ============================================================================= # MAIN # =============================================================================
if __name__ == "__main__": print(__doc__)