"""Objects related to graph clustering"""
__license__ = """
Copyright (C) 2006-2007 Gabor Csardi <csardi@rmki.kfki.hu>,
Tamas Nepusz <ntamas@rmki.kfki.hu>
MTA RMKI, Konkoly-Thege Miklos st. 29-33, Budapest 1121, Hungary
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
"""
from statistics import Histogram
from copy import copy, deepcopy
from StringIO import StringIO
class Clustering(object):
"""Class representing a clustering of an arbitrary ordered set.
This is now used as a base for L{VertexClustering}, but it might be
useful for other purposes as well.
Members of an individual cluster can be accessed by the C{[]} operator:
>>> cl = Clustering([0,0,0,0,1,1,1,2,2,2,2])
>>> cl[0]
[0, 1, 2, 3]
The membership vector can be accessed by the C{membership} property:
>>> cl.membership
[0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
The number of clusters can be retrieved by the C{len} function:
>>> len(cl)
3
"""
def __init__(self, membership, params = {}):
"""Constructor.
@param membership: the membership list -- that is, the cluster
index in which each element of the set belongs to.
@param params: additional parameters to be stored in this
object's dictionary."""
self._membership = list(membership)
self.__dict__.update(params)
def __getitem__(self, idx):
"""Returns the members of the specified cluster.
@param idx: the index of the cluster
@return: the members of the specified cluster as a list
"""
return [i for i,e in enumerate(self._membership) if e==idx]
def __len__(self):
"""Returns the number of clusters.
@note: the result is calculated by taking the difference between
the minimum and maximum elements of the membership vector and adding
it to 1. So if there are empty clusters, they are still counted in the
number of clusters -- except when the clusters with the largest or smallest
indices are empty!
@return: the number of clusters
"""
if len(self._membership) == 0: return 0
return max(self._membership) - min(self._membership) + 1
def _get_membership(self): return copy(self._membership)
membership = property(_get_membership, doc = "The membership vector (read only)")
def size(self, idx):
"""Returns the size of a given cluster.
@param idx: the cluster in which we are interested.
"""
return len([1 for x in self._membership if x == idx])
def sizes(self, *args):
"""Returns the size of given clusters.
@param idxs: the cluster indices in which we are interested. If C{None},
defaults to all clusters
"""
idxs = args
if len(idxs) == 0: idxs = None
counts = [0] * len(self)
m = min(self._membership)
for x in self._membership: counts[x+m] += 1
if idxs is None: return counts
result = []
for idx in idxs: result.append(counts[idx-m])
return result
def size_histogram(self, bin_width = 1):
"""Returns the histogram of cluster sizes.
@param bin_width: the bin width of the histogram
@return: a L{Histogram} object
"""
return Histogram(bin_width, self.sizes())
class VertexClustering(Clustering):
"""The clustering of the vertex set of a graph.
This class extends L{Clustering} by linking it to a specific L{Graph} object
and by optionally storing the modularity score of the clustering.
@note: since this class is linked to a L{Graph}, destroying the graph by the
C{del} operator does not free the memory occupied by the graph if there
exists a L{VertexClustering} that references the L{Graph}.
"""
def __init__(self, graph, membership = None, modularity = None, params = {}):
"""Creates a clustering object for a given graph.
@param graph: the graph that will be associated to the clustering
@param membership: the membership list. The length of the list must
be equal to the number of vertices in the graph. If C{None}, every
vertex is assumed to belong to the same cluster.
@param modularity: the modularity score of the clustering. If C{None},
it will be calculated.
@param params: additional parameters to be stored in this object.
"""
self._graph = graph
if membership is None:
Clustering.__init__(self, [0]*graph.vcount(), params)
else:
if len(membership) != graph.vcount():
raise ValueError, "membership list is too short"
Clustering.__init__(self, membership, params)
if modularity is None:
self._q = graph.modularity(membership)
else:
self._q = modularity
def _get_modularity(self): return self._q
modularity = property(_get_modularity, doc = "The modularity score")
q = modularity
def _get_graph(self): return self._graph
modularity = property(_get_graph, doc = "The graph belonging to this object")
def recalculate_modularity(self):
"""Recalculates the stored modularity value.
This method must be called before querying the modularity score of the
clustering through the class member C{modularity} or C{q} if the
graph has been modified (edges have been added or removed) since the
creation of the L{VertexClustering} object.
@return: the new modularity score
"""
self._q = self._graph.modularity(self._membership)
return self._q
def subgraph(self, idx):
"""Get the subgraph belonging to a given cluster.
@param idx: the cluster index
@return: a copy of the subgraph
@precondition: the vertex set of the graph hasn't been modified since
the moment the clustering was constructed.
"""
return self._graph.subgraph(self[idx])
def giant(self):
"""Returns the giant community of the clustered graph.
The giant component a community for which no larger community exists.
@note: there can be multiple giant communities, this method will return
the copy of an arbitrary one if there are multiple giant communities.
@return: a copy of the giant community.
@precondition: the vertex set of the graph hasn't been modified since
the moment the clustering was constructed.
"""
ss = self.sizes()
max_size = max(ss)
return self.subgraph(ss.index(max_size))
class Dendrogram(Clustering):
"""The hierarchical clustering (dendrogram) of some dataset.
A hierarchical clustering means that we know not only the way the
elements are separated into groups, but also the exact history of
how individual elements were joined into larger subgroups.
This class internally represents the hierarchy by a matrix with n rows
and 2 columns -- or more precisely, a list of lists of size 2. This is
exactly the same as the original format used by C{igraph}'s C core.
The M{i}th row of the matrix contains the indices of the two clusters
being joined in time step M{i}. The joint group will be represented by
the ID M{n+i}, with M{i} starting from one. The ID of the joint group
will be referenced in the upcoming steps instead of any of its individual
members. So, IDs less than or equal to M{n} (where M{n} is the number of
rows in the matrix) mean the original members of the dataset (with ID
from 0 to M{n}), while IDs up from M{n+1} mean joint groups. As an
example, take a look at the dendrogram and the internal representation of
a given clustering of five nodes::
0 -+
|
1 -+-+
|
2 ---+-+ <====> [[0, 1], [4, 5], [2, 3], [6, 7]]
|
3 -+ |
| |
4 -+---+---
"""
def __init__(self, merges):
"""Creates a hierarchical clustering.
@param merges: the merge history either in matrix or tuple format"""
Clustering.__init__(self, [0]*(len(merges)+1))
self._merges = [tuple(pair) for pair in merges]
self._nmerges = len(self._merges)
self._n = max(self._merges[-1])-self._nmerges+2
def _convert_matrix_to_tuple_repr(merges, n=None):
if n is None: n = len(merges)+1
t = range(n)
idxs = range(n)
for rowidx, row in enumerate(merges):
i, j = row
try:
idxi, idxj = idxs[i], idxs[j]
t[idxi] = (t[idxi], t[idxj])
t[idxj] = None
except IndexError:
raise ValueError, "malformed matrix, subgroup referenced before being created in step %d" % rowidx
idxs.append(j)
return [x for x in t if x is not None]
def _traverse_inorder(self):
"""Conducts an inorder traversal of the merge tree.
The inorder traversal returns the nodes on the last level in the order
they should be drawn so that no edges cross each other.
@return: the result of the inorder traversal in a list."""
stack = [self._merges[-1]]
result = []
while len(stack)>0:
last = stack[-1]
if len(last) == 0:
stack.pop()
continue
elif len(last) == 1: # Right child
stack[-1] = ()
last = last[0]
else: # Left child
stack[-1] = (last[1],)
last = last[0]
if last < self._n: # This will be a regular node
result.append(last)
else: # This is a merge node, proceed towards left
stack.append(self._merges[last-self._n])
return result
def __str__(self):
return "Dendrogram, %d elements, %d merges" % (self._n, self._nmerges)
def summary(self):
"""Draws the dendrogram of the hierarchical clustering in a string"""
out = StringIO()
print >>out, str(self)
if self._n == 0: return out.getvalue()
print >>out
positions = [None] * self._n
inorder = self._traverse_inorder()
distance = 2
level_distance = 2
nextp = 0
for idx, element in enumerate(inorder):
positions[element] = nextp
inorder[idx] = str(element)
nextp += max(distance, len(inorder[idx])+1)
width = max(positions)+1
# Print the nodes on the lowest level
print >>out, (" "*(distance-1)).join(inorder)
midx = 0
max_community_idx = self._n
while midx < self._nmerges:
s = [" "]*width
for p in positions:
if p >= 0: s[p] = "|"
for i in xrange(level_distance-1): print >>out, "".join(s) # Print the lines
cidx_incr = 0
while midx < self._nmerges:
v1, v2 = self._merges[midx]
if v1 >= max_community_idx or v2 >= max_community_idx: break
midx += 1
p1 = positions[v1]
p2 = positions[v2]
positions[v1] = -1
positions[v2] = -1
positions.append((p1+p2)/2)
s[p1:(p2+1)] = "+%s+" % ("-" * (p2-p1-1))
cidx_incr += 1
max_community_idx += cidx_incr
print >>out, "".join(s)
return out.getvalue()
def _get_merges(self): return copy(self._merges)
merges = property(_get_merges, doc = "The performed merges in matrix format")
class VertexDendrogram(VertexClustering, Dendrogram):
"""The dendrogram resulting from the hierarchical clustering of the
vertex set of a graph."""
def __init__(self, graph, merges, membership = None, modularity = None, params = {}):
"""Creates a dendrogram object for a given graph.
@param graph: the graph that will be associated to the clustering
@param merges: the merges performed given in matrix form.
@param membership: the membership list. The length of the list must
be equal to the number of vertices in the graph. If C{None}, the
dendrogram will be cut at the level where the modularity is maximized
and the membership list will represent this state.
@param modularity: the modularity score of the clustering on each
level of the dendrogram starting from the fully decomposed state.
If C{None}, it will be calculated.
@param params: additional parameters to be stored in this object.
"""
if modularity is None:
# TODO: this is a fairly simple way to calculate the modularity
ms = range(graph.vcount())
communities = range(graph.vcount())
modularity = []
n = graph.vcount()
modularity.append(graph.modularity(ms))
for c1, c2 in merges:
try:
cidx1 = communities[c1]
cidx2 = communities[c2]
except IndexError:
raise ValueError, "invalid merge matrix, referencing nonexisting community"
if cidx1 == -1 or cidx2 == -1:
raise ValueError, "invalid merge matrix, referencing already joined community"
for idx, m in enumerate(ms):
if m == cidx2: ms[idx] = cidx1
communities.append(communities[c1])
communities[c1] = -1
communities[c2] = -1
modularity.append(graph.modularity(ms))
if membership is None:
maxmod = max(modularity)
maxidx = modularity.index(maxmod)
membership = range(graph.vcount())
communities = range(graph.vcount())
midx = 0
while maxidx>0:
maxidx -= 1
c1, c2 = merges[midx]
midx += 1
try:
cidx1 = communities[c1]
cidx2 = communities[c2]
except IndexError:
raise ValueError, "invalid merge matrix, referencing nonexisting community"
if cidx1 == -1 or cidx2 == -1:
raise ValueError, "invalid merge matrix, referencing already joined community"
for idx, m in enumerate(membership):
if m == cidx2: membership[idx] = cidx1
communities.append(communities[c1])
communities[c1] = -1
communities[c2] = -1
recoding = {}
n=0
for idx, m in enumerate(membership):
try:
v = recoding[m]
except KeyError:
recoding[m], v = n, n
n += 1
membership[idx] = v
Dendrogram.__init__(self, merges)
VertexClustering.__init__(self, graph, membership, None, params)
def cut(self, n):
"""Cuts the dendrogram at a given level.
@param n: the desired number of clusters. Merges are replayed from the
beginning until the membership vector has exactly M{n} distinct elements
or until there are no more recorded merges, whichever happens first.
@return: the membership vector
"""
num_elts = self._graph.vcount()
membership = range(self._graph.vcount())
communities = range(self._graph.vcount())
midx = 0
while num_elts>n:
num_elts -= 1
c1, c2 = self._merges[midx]
midx += 1
try:
cidx1 = communities[c1]
cidx2 = communities[c2]
except IndexError:
raise ValueError, "invalid merge matrix, referencing nonexisting community in row %d" % idx
if cidx1 == -1 or cidx2 == -1:
raise ValueError, "invalid merge matrix, referencing already joined community in row %d" % idx
for idx, m in enumerate(membership):
if m == cidx2: membership[idx] = cidx1
communities.append(communities[c1])
communities[c1] = -1
communities[c2] = -1
recoding = {}
n=0
for idx, m in enumerate(membership):
try:
v = recoding[m]
except KeyError:
recoding[m], v = n, n
n += 1
membership[idx] = v
self._membership = membership
return copy(membership)
syntax highlighted by Code2HTML, v. 0.9.1