Spectral Clustering is a method to cluster data, where the clusters are non-convex. Having read the chapter (14.5.3 page 544) about spectral clustering in the recently released second edition of Elements of Statistical Learning II, i tried to reproduce figure 14.29 using only python and Scipy/ Matplotlib.
Well, the r-version still looks better, but it's quite possible to get similar results in python. Here's the figure:
The final clustering via kmeans is not very stable. Sometimes it works and sometimes it doesn't. Maybe you have to run kmeans multiple times and choose the best run, but since this is done for demonstration purposes only, it's not a big deal.
Here's the full python-source:
#!/usr/bin/python
import sys
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import eig
from scipy.cluster.vq import kmeans2
from scipy.sparse.linalg import eigen
from scipy.spatial.kdtree import KDTree
def get_noise(stddev=0.25, numpoints=150):
# 2d gaussian random noise
x = np.random.normal(0, stddev, numpoints)
y = np.random.normal(0, stddev, numpoints)
return np.column_stack((x, y))
def get_circle(center=(0.0, 0.0), r=1.0, numpoints=150):
# use polar coordinates to get uniformly distributed points
step = np.pi * 2.0 / numpoints
t = np.arange(0, np.pi * 2.0, step)
x = center[0] + r * np.cos(t)
y = center[1] + r * np.sin(t)
return np.column_stack((x, y))
def radial_kernel(c=1.5):
def inner(a, b):
d = a - b
return np.exp((-1 * (np.sqrt(np.dot(d, d.conj()))**2)) / c)
return inner
def circle_samples():
circles = []
for radius in (1.0, 2.8, 5.0):
circles.append(get_circle(r=radius) + get_noise())
return np.vstack(circles)
def mutual_knn(points, n=10, distance=radial_kernel()):
knn = {}
kt = KDTree(points)
for i, point in enumerate(points):
# cannot use euclidean distance directly
for neighbour in kt.query(point, n + 1)[1]:
if i != neighbour:
knn.setdefault(i, []).append(
(distance(point, points[neighbour]), neighbour))
return knn
def get_distance_matrix(knn):
n = len(knn)
W = np.zeros((n, n))
for point, nearest_neighbours in knn.iteritems():
for distance, neighbour in nearest_neighbours:
W[point][neighbour] = distance
return W
def rename_clusters(idx):
# so that first cluster has index 0
num = -1
seen = {}
newidx = []
for id in idx:
if id not in seen:
num += 1
seen[id] = num
newidx.append(seen[id])
return np.array(newidx)
def cluster_points(L):
# sparse eigen is a little bit faster than eig
#evals, evcts = eigen(L, k=15, which="SM")
evals, evcts = eig(L)
evals, evcts = evals.real, evcts.real
edict = dict(zip(evals, evcts.transpose()))
evals = sorted(edict.keys())
# second and third smallest eigenvalue + vector
Y = np.array([edict[k] for k in evals[1:3]]).transpose()
res, idx = kmeans2(Y, 3, minit='random')
return evals[:15], Y, rename_clusters(idx)
def change_tick_fontsize(ax, size):
for tl in ax.get_xticklabels():
tl.set_fontsize(size)
for tl in ax.get_yticklabels():
tl.set_fontsize(size)
def get_colormap():
# map cluster label to color (0, 1, 2) -> (orange, blue, green)
from matplotlib.colors import ListedColormap
orange = (0.918, 0.545, 0.0)
blue = (0.169, 0.651, 0.914)
green = (0.0, 0.58, 0.365)
return ListedColormap([orange, blue, green])
def plot_circles(ax, points, idx, colormap):
plt.scatter(points[:,0], points[:,1], s=10, c=idx, cmap=colormap,
alpha=0.9, facecolors="none")
plt.xlabel("x1", fontsize=8)
plt.ylabel("x2", fontsize=8)
change_tick_fontsize(ax, 8 )
plt.ylim(-6, 6)
plt.xlim(-6, 6)
def plot_eigenvalues(ax, evals):
plt.scatter(np.arange(0, len(evals)), evals,
c=(0.0, 0.58, 0.365), linewidth=0)
plt.xlabel("Number", fontsize=8)
plt.ylabel("Eigenvalue", fontsize=8)
plt.axhline(0, ls="--", c="k")
change_tick_fontsize(ax, 8 )
def plot_eigenvectors(ax, Y, idx, colormap):
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.axes_grid import make_axes_locatable
divider = make_axes_locatable(ax)
ax2 = divider.new_vertical(size="100%", pad=0.05)
fig1 = ax.get_figure()
fig1.add_axes(ax2)
ax2.set_title("Eigenvectors", fontsize=10)
ax2.scatter(np.arange(0, len(Y)), Y[:,0], s=10, c=idx, cmap=colormap,
alpha=0.9, facecolors="none")
ax2.axhline(0, ls="--", c="k")
ax2.yaxis.set_major_locator(MaxNLocator(4))
ax.yaxis.set_major_locator(MaxNLocator(4))
ax.axhline(0, ls="--", c="k")
ax.scatter(np.arange(0, len(Y)), Y[:,1], s=10, c=idx, cmap=colormap,
alpha=0.9, facecolors="none")
ax.set_xlabel("index", fontsize=8)
ax2.set_ylabel("2nd Smallest", fontsize=8)
ax.set_ylabel("3nd Smallest", fontsize=8)
change_tick_fontsize(ax, 8 )
change_tick_fontsize(ax2, 8 )
for tl in ax2.get_xticklabels():
tl.set_visible(False)
def plot_spec_clustering(ax, Y, idx, colormap):
plt.title("Spectral Clustering", fontsize=10)
plt.scatter(Y[:,0], Y[:,1], c=idx, cmap=colormap, s=10, alpha=0.9,
facecolors="none")
plt.xlabel("Second Smallest Eigenvector", fontsize=8)
plt.ylabel("Third Smallest Eigenvector", fontsize=8)
change_tick_fontsize(ax, 8 )
def plot_figure(points, evals, Y, idx):
colormap = get_colormap()
fig = plt.figure(figsize=(6, 5.5))
fig.subplots_adjust(wspace=0.4, hspace=0.3)
ax = fig.add_subplot(2, 2, 1)
plot_circles(ax, points, idx, colormap)
ax = fig.add_subplot(2, 2, 2)
plot_eigenvalues(ax, evals)
ax = fig.add_subplot(2, 2, 3)
plot_eigenvectors(ax, Y, idx, colormap)
ax = fig.add_subplot(2, 2, 4)
plot_spec_clustering(ax, Y, idx, colormap)
plt.show()
def main(args):
points = circle_samples()
knn_points = mutual_knn(points)
W = get_distance_matrix(knn_points)
G = np.diag([sum(Wi) for Wi in W])
# unnormalized graph Laplacian
L = G - W
evals, Y, idx = cluster_points(L)
plot_figure(points, evals, Y, idx)
if __name__ == "__main__":
main(sys.argv)
The plotting part is quite messy and i had to learn some ugly details about the matplotlib-api. If it's possible to do this in a simpler way, please let me know. With the mac os x binary, the source should work out of the box. On Ubuntu, i had to recompile matplotlib, which is another thing that you shouldn't do if you can avoid it (major pain in the ass).
When you view text files in less, sometimes you can simply
triple-click on a line and it just gets marked as a whole.
But sometimes, only the first viewable line on your terminal
gets marked and the rest of the real line is omitted. Well,
for me this is often quite frustrating, because i usually
have to copy&paste large sql-statements which won't fit on
one visible line. After playing around with this obstacle, i figured
out that if you only use page-down to browse in a file, every triple
click will mark the whole line. If you use page-up, then only the
visible line gets marked... if you use page-down again, everything
works like before. Hmm, really strange - maybe it's a bug, a feature,
or both... can't decide...
binary classification
The goal in binary classification is
to take an input vector  and to assign it to one of two discrete classes  where  . Given some training sample of vectors each labeled -1 or 1, the aim is to learn a discriminant function, which predicts the correct label for previously unseen vectors. Filtering spam is probably the most studied example of binary classification in practice. A linear discriminant function is easily obtained by taking a linear function of the input vectors:
Vector w is called the weight vector and b is the bias. The sign of  determines the class of an input vector  .
example python source
To make things even more simple, let's assume that the vectors in the training sample have only two dimensions and are more or less linear seperable, so that we can use a simple line to distinguish between both classes. Let's see how far we can get with a few lines of python and pylab/matplotlib. Well, ok here's a small script to generate a random line and make it visible:
from random import random
from math import sqrt
from pylab import plot, show, axis
def dot(x, y):
# return inner product for x and y
return sum(i * j for i, j in zip(x, y))
def normalize(x):
# return unit length version of x in euclidean space
norm = 1.0 / sqrt(sum(i**2 for i in x))
return [i * norm for i in x]
def get_random_line(p0, width, height):
# given p0, return a random line through p0 in normal form wx - b = 0
# get a random point from the square-area around p0 - lower left
# corner is (0, 0) upper right corner is (width, height)
p1 = (random() * width, random() * height)
# direction vector
v = (p0[0] - p1[0], p0[1] - p1[1])
# w is orthogonal to v and has unit length
w = normalize((v[1], -v[0]))
# wx - b = 0 is true for every point lying on the line
# so bias -b is simply the inner product of w and p1
b = -dot(w, p1)
return w, b
def plot_line(w, b, width, style):
y = []
n1, n2 = w
# convert wx + b = 0 to y : f(x) = ax + b
f = lambda x: -((n1 / n2) * x) - (b / n2)
for x in range(0, width + 1):
y.append(f(x))
plot(y, style)
def main():
p0 = (5, 5)
maxwidth, maxheight = 2 * p0[0], 3 * p0[1]
w, b = get_random_line(p0, maxwidth, maxheight)
margin = 0.5
plot_line(w, b, maxwidth, "r-")
axis([0.0, maxwidth, 0.0, maxheight])
show()
if __name__ == '__main__':
main()
We start with an arbitrary point p0 ((5, 5) in our case), draw a rectangle around it with the origin at the lower left corner and a multiple of p0 in the upper right, choose a second point p1 inside this rectangle randomly and build the line based upon those two points. Ok, the script fails, if the resulting line is parallel to the y-axis, but this is, ahem, unlikely  .
Running the script should show something like this:
The normal form  is used because it makes the calculation of distances between the line and arbitrary points easy. It's also used by the classifier showed later on, so we have to be able to plot lines of this type anyway.
training sample
Now, let's generate some training vectors (points). The points are generated randomly - if the distance between a generated point and the line is smaller than the margin, the point is disgarded. Depending on which side of the line a point lies, it is assigned the label 1 or -1. If the label is 1, the point is blue, otherwise it's red. To make this more interesting, we also add some noise:
def get_points(w, b, width, height, numpoints=500, mindist=1.0):
points = []
for i in range(numpoints):
x = (random() * width, random() * height)
dist = dot(w, x) + b
if abs(dist) > mindist:
points.append([(-1.0, 1.0)[dist > 0.0], x])
# add some noise
if abs(dist) < 2 * mindist and random() < 0.2:
points[-1][0] = points[-1][0] * -1.0
return points
def get_dim(x, dim):
return [i[dim] for i in x]
def plot_points(points):
red, blue = [], []
for label, p in points:
if label > 0:
blue.append(p)
else:
red.append(p)
plot(get_dim(blue, 0), get_dim(blue, 1), "b.")
plot(get_dim(red, 0), get_dim(red, 1), "r.")
def main():
p0 = (5, 5)
maxwidth, maxheight = 2 * p0[0], 3 * p0[1]
w, b = get_random_line(p0, maxwidth, maxheight)
margin = 0.5
plot_line(w, b, maxwidth, "r-")
points = get_points(w, b, maxwidth, maxheight, mindist=margin)
plot_points(points)
axis([.0, maxwidth, 0.0, maxheight])
show()
The resulting image should look like this:
Nice.
mlpy - learning a linear discriminant function
Now that we're able to generate random training data, let's test it  ! So we need a classifier which learns the decision boundary between the two classes. A really neat framework for building such classifiers is mlpy. Adding just one more function does the trick:
def mlpy(points):
from numpy import array
from mlpy import svm
labels, training = [], []
for label, p in points:
labels.append(int(label))
training.append(p)
mysvm = svm(maxloops=1000)
mysvm.compute(array(training), array(labels))
return mysvm._svm__w.tolist(), -mysvm._svm__b
def main():
p0 = (5, 5)
maxwidth, maxheight = 2 * p0[0], 3 * p0[1]
w, b = get_random_line(p0, maxwidth, maxheight)
margin = 0.5
plot_line(w, b, maxwidth, "r-")
plot_line(w, b + margin, maxwidth, "b--")
plot_line(w, b - margin, maxwidth, "b--")
points = get_points(w, b, maxwidth, maxheight, mindist=margin)
plot_points(points)
mw, mb = mlpy(points)
plot_line(mw, mb, maxwidth, "m-")
axis([.0, maxwidth, 0.0, maxheight])
show()
There are two additional lines to denote the margin, and the resulting hypothesis is showed as a magenta line. It's not that far away from the original line, which generated the test data:
Mlpy is used to train a linear Support Vector Machine which tries to maximize the margin between the two different classes. The resulting classifier is just a line in the normal form of  .
Today i wrote a small python-script to collect data from my various machines and backup them with rdiff-backup. Just in case someone finds it useful, here's the source:
#!/usr/bin/python
import os
import rdiff_backup.Main
from rdiff_backup.Main import Main as backup
BACKUP_ROOT = "/mnt/backup/"
def get_jobs(config):
src_prefix = ""
target_prefix = "%s%s" % (BACKUP_ROOT,
config.get("target", "localhost"))
if "remote" in config:
src_prefix = "%s@%s::" % (config.get("user", "root"),
config["remote"])
target_prefix = "%s%s" % (BACKUP_ROOT, config["remote"])
for item in config["entries"]:
job = []
for (option, value) in item["options"]:
job.append(option)
job.append(value)
job.append("%s%s" % (src_prefix, item["path"]))
target_dir = "%s%s" % (target_prefix, item["path"])
if not os.path.exists(target_dir):
os.makedirs(target_dir)
job.append(target_dir)
yield job
configs = [
{"remote": "foobar.com",
"user": "root",
"entries": [
{"options": [],
"path": "/etc",
},
{"options": [
("--exclude", "**src"),
("--exclude", "**svn*"),
("--exclude", "**csv"),
],
"path": "/home/user",
},
],
},
{"entries": [
{"options": [],
"path": "/etc",
},
{"options": [
("--include", "**src/google_appengine"),
("--exclude", "**src"),
("--exclude", "**mp3"),
("--exclude", "**iso"),
],
"path": "/home/user",
},
],
},
]
for config in configs:
for job in get_jobs(config):
print "rdiff-backup %s" % " ".join(job)
reload(rdiff_backup.Main)
backup(job)
Hmm, the config section is really messy, but the order of options is relevant - dunno how to make it simpler...
To avoid getting asked for your ssh passphrase over and over again, you could use screen and ssh-agent:
screen -S backup ssh-agent bash
ssh-add
time ./backup.py
 Tjo, auch bei dem Vortrag von Nachbar Julian war für mich nicht so schrecklich viel neu. Daß die Regierungen die Drittweltstaaten und die Konzerne die Arbeitnehmer zum eigenen Vorteil auspressen und dagegen etwas unternommen werden müsse, mag ja sein. Aber eigentlich ist das ja nur die halbe Wahrheit. Irgendwie kommt es mir bei dieser Debatte immer so vor, als würde die chancenreiche, fortschrittliche Richtung, in die sich die "Wissensgesellschaft" eben auch bewegen kann, bagatellisiert oder bewußt ignoriert. Hmm, wenn ich mal Zeit habe, muß ich dazu nochmal was posten...
 Einen sehr schönen Vortrag habe ich heute zu Tor (next generation onion router) gesehen. Tor ist überhaupt ein sehr schickes Projekt und wird anscheinend allmählich so richtig erfolgreich. Schade nur, daß Tor nicht wirklich gegen starke Angreifer (Trafficanalyse) schützt. Dummerweise scheint es aber eher nichttrivial zu sein, Echtzeitanforderungen mit Sicherheit zu verbinden.
In den USA scheinen die übrigens die vielen Spammer-Netze mit tausenden aufgemachter Zombiwindosen zu einem Umdenken seitens der Bedarfsträger geführt zu haben. Man geht dort wohl mittlerweile nicht mehr automatisch davon aus, daß der Betreiber eines Rechners unmittelbar schuld ist, wenn Spam o.ä. über seine Kiste verteilt wurde. D.h. man hat auch als Betreiber eines Anonymisierungsdienstes eine realistische Chance zu erklären, was man da macht, und daß das alles vollkommen legal ist, bevor einem von den Cops die Bude ausgeräumt wird. Hoffen wir mal, daß sich das auch beizeiten in .de rumspricht...
 Der zweite Vortrag beschäftige sich mit "Peak Oil" und den Problemen, die eine Zivilisation so bekommt, wenn sie festgestellt, daß es plötzlich kein Öl mehr gibt. Da waren ein paar schöne Beispiele dabei - so haben Anfang des letzten Jahrhunderts die zur Landwirtschaft nötigen Pferde einen Großteil der Ernte weggefuttert. Wenn man dann ausrechnet, wieviel Pferde man bräuchte, um bei der gestiegenen Bevölkerungsanzahl ausreichend Nahrungsmittel zu produzieren, wird schnell klar, daß man dahin nicht zurückkann. Überhaupt korrellieren die Zunahme der Ölförderung und die Zunahme der Weltbevölkerung auf erschreckende Weise. Hoffentlich ist das mal keine Kausalität, denn eine der beiden Kurven wird unweigerlich einen heftigen Abschwung nach unten erleben - ist nur eine Frage der Zeit. Andererseits hatte der Vortrag deutliche Längen - naja, ging so.
|
 |
Kommentare