Thursday, August 10, 2017

Python Basic Cluster Analysis

import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage
from sklearn import preprocessing

Reading the data and setting the car models as the index

print('Reading the original data mtcar...')
car = pd.read_csv(r'F:\Novus\ClusterAnalysis\mtcar.txt', 
                  sep='\t', 
                  header=0, 
                  low_memory=True, 
                  encoding='latin-1', 
                  error_bad_lines=False)

car.set_index('car', inplace=True)
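
Assuming the file loaded cleanly and this is the classic mtcars layout (mpg, cyl, disp, hp, and so on), a quick look at the shape, first rows, and dtypes is a cheap sanity check before scaling:

# hedged sanity check: confirm the car models became the index
# and that the remaining columns are numeric
print(car.shape)
print(car.head())
print(car.dtypes)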

Scaling all the attributes to the (0, 1) range...

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
scaled_col = []
ori_col = car.columns.tolist()
for col in ori_col:
    new_col = 'scaled_'+col
    scaled_col.append(new_col)
    car[new_col] = scaler.fit_transform(car[[col]])

car_scaled = car[scaled_col].copy() #creating the scaled/transformed data
car = car[ori_col].copy()           #putting the original data back
del scaled_col, ori_col
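
For reference, the same (0, 1) scaling can be done without the column loop; a minimal sketch that should produce an equivalent frame (the variable name car_scaled_alt is my own):

# hedged alternative: scale every column in one call instead of looping
car_scaled_alt = pd.DataFrame(scaler.fit_transform(car),
                              columns=['scaled_' + c for c in car.columns],
                              index=car.index)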

Hierarchical clustering: creating the dendrogram and cutting the tree

# complete-linkage hierarchical clustering on Euclidean distances
tree = linkage(car_scaled, method='complete', metric='euclidean')

hierarchy.set_link_color_palette(['#800080', '#FF00FF', '#000080', '#0000FF', '#008080', '#008000', '#800000'])
plt.figure(figsize=(15,10))
dn = hierarchy.dendrogram(tree, 
                          orientation='top', 
                          color_threshold=1,
                          leaf_font_size=11,
                          leaf_rotation=90,
                          labels=car.index.tolist())
plt.show()
hierarchy.set_link_color_palette(None)  # reset to default after use

# cut the tree into 5 clusters; store the labels on the original frame so that
# car_scaled keeps only the scaled features for the KMeans step below
car['Group_Hierarchical'] = hierarchy.cut_tree(tree, n_clusters=5).flatten()
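
cut_tree is one way to pull flat cluster labels out of the linkage; scipy's fcluster does the same job and is worth knowing. A minimal sketch (note that fcluster numbers clusters from 1, while cut_tree starts at 0):

# hedged alternative to cut_tree: flat labels via fcluster
from scipy.cluster.hierarchy import fcluster
labels_hier = fcluster(tree, t=5, criterion='maxclust')  # values 1..5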

KMeans Clustering

#determine k range
k_range = range(1, len(car_scaled))

# fit a KMeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(car_scaled) for k in k_range]

#pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

#calculate the Euclidean distance from each point to each cluster center
k_euclid = [cdist(car_scaled, cent, 'euclidean') for cent in centroids]
dist = [np.min(ke, axis=1) for ke in k_euclid]

#total within cluster sum of squares
wcss = [sum(d**2) for d in dist]

#total sum of squares
tss = sum(pdist(car_scaled)**2)/car_scaled.shape[0]

#the between cluster sum of squares, expressed as a share of the total
bss = tss - wcss
pct_var = bss / tss * 100
plt.figure(figsize=(12,7))
plt.plot(k_range, pct_var,
         linewidth=2.0,
#         linestyle="None", 
         marker="o", 
         color="blue")
plt.xlabel('# clusters')
plt.ylabel('% of variance explained')
plt.title('variance explained vs # clusters')
plt.annotate('optimal number of clusters', xy=(5, pct_var[4]), xycoords='data', xytext=(7.5, pct_var[4]-5),
            arrowprops=dict(facecolor='black', shrink=0.05),
            horizontalalignment='left', verticalalignment='top')
plt.grid(True)
plt.show()
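
Since bss is defined as tss - wcss, the curve above is simply the share of the total sum of squares explained by the clustering. The within-cluster sum of squares is also exposed directly by each fitted model as its inertia_ attribute, so the same curve can be rebuilt without the cdist step; a minimal sketch:

# hedged alternative: sklearn already stores WCSS as inertia_
wcss_alt = [model.inertia_ for model in k_means_var]
pct_var_alt = (tss - np.array(wcss_alt)) / tss * 100   # should match pct_var above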
clf = KMeans(n_clusters=5)
clf.fit(car_scaled)

centroids = clf.cluster_centers_
labels = clf.labels_
car['Group_KMeans'] = labels   # attach the KMeans labels alongside the hierarchical ones

#print(car)
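
One hedged way to check that k = 5 is a sensible choice is the silhouette score from sklearn.metrics, which approaches 1 when clusters are tight and well separated:

# hedged check of cluster quality on the scaled features
from sklearn.metrics import silhouette_score
print('silhouette score (k=5):', silhouette_score(car_scaled, labels))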

crosstab = pd.crosstab(car['Group_KMeans'], car['Group_Hierarchical'])
print(crosstab)
Group_Hierarchical  0  1  2   3  4
Group_KMeans                      
0                   0  0  7   0  0
1                   4  0  0   0  0
2                   0  0  0  12  0
3                   0  0  0   0  2
4                   0  7  0   0  0
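
In the table above each KMeans cluster lines up with exactly one hierarchical cluster, so the two methods arrive at essentially the same grouping of the 32 cars. To express that agreement as a single number, the adjusted Rand index can be used (1.0 means the partitions are identical up to relabeling); a minimal sketch:

# hedged measure of agreement between the two cluster solutions
from sklearn.metrics import adjusted_rand_score
print('adjusted Rand index:',
      adjusted_rand_score(car['Group_KMeans'], car['Group_Hierarchical']))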
