Sunday, August 27, 2017

Use Python to find specific files in one location and copy them to another

import os
import shutil
from fnmatch import fnmatch


root = 'H:\\'     #source drive to search (trailing backslash so the drive root is walked, not the drive's current directory)
dst_dir = 'G:\\'  #destination directory
pattern = "*.tif"
i = 1
for path, subdirs, files in os.walk(root):
  for name in files:
    if fnmatch(name, pattern):
      current_file = os.path.join(path, name)
      shutil.copy(current_file, dst_dir)                                  #copy the matching file to the destination
      dst_file = os.path.join(dst_dir, name)
      dst_new_file_name = os.path.join(dst_dir, "["+ str(i) +"] " + name) #prefix a running counter so duplicate names don't collide
      os.rename(dst_file, dst_new_file_name)
      print(os.path.join(path, name))
      i += 1
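
If the numbering is only there to avoid name collisions, the copy and rename can be collapsed into one step. A minimal sketch, assuming the same root and dst_dir as above; shutil.copy2 also preserves file timestamps:

import os
import shutil
from fnmatch import fnmatch

root = 'H:\\'     #assumed source, as above
dst_dir = 'G:\\'  #assumed destination, as above
pattern = "*.tif"

i = 1
for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            #copy straight to the final numbered name; no separate rename needed
            shutil.copy2(os.path.join(path, name),
                         os.path.join(dst_dir, "[" + str(i) + "] " + name))
            i += 1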

Python: split a large file into multiple files

def file_split(filehandler, delimiter='\t', 
               row_limit=25, 
               output_name_template='output_%s.txt', 
               output_path='F:\\Novus\\Decision Tree\\Data\\', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)   #read the source file using the csv module
    current_piece = 1                                       #integer identifier of the current output piece
    current_out_path = ''.join([output_path, output_name_template % current_piece]) #full path of the output file
    
    of = open(current_out_path, 'w', newline='')                #open the first output file (newline='' avoids blank rows on Windows)
    current_out_writer = csv.writer(of, delimiter=delimiter)    #csv writer for the current output file
    current_limit = row_limit                                   #row count at which the next piece starts
    
    if keep_headers:                            #check whether the header option is True
        headers = next(reader)
        current_out_writer.writerow(headers)    #if True, write the header into the output file
        
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            of.close()
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = ''.join([output_path, output_name_template % current_piece])
            of = open(current_out_path, 'w', newline='')
            current_out_writer = csv.writer(of, delimiter=delimiter)  #row limit reached, so start a new output file
            
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)
    of.close()
        
               
with open(r'F:\Novus\Decision Tree\Data\iris_data.txt', 'r') as fh:   #with-block closes the source file automatically
    file_split(fh, row_limit=100,
               output_name_template='iris_split_%s.txt',
               output_path='F:\\Novus\\Decision Tree\\Data\\')
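
An alternative sketch with pandas' chunked reader, assuming pandas is installed and the file parses cleanly as tab-delimited (chunksize plays the role of row_limit):

import pandas as pd

reader = pd.read_csv(r'F:\Novus\Decision Tree\Data\iris_data.txt', sep='\t', chunksize=100)
for piece, chunk in enumerate(reader, start=1):
    #each chunk is a regular DataFrame; the header is rewritten for every piece by default
    chunk.to_csv('F:\\Novus\\Decision Tree\\Data\\iris_split_%s.txt' % piece,
                 sep='\t', index=False)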

Thursday, August 24, 2017

Python dask dataframe for large data

The data that I am using here fits in memory, but dask will work even when the data is larger than memory.
import dask.dataframe as dd

df = dd.read_csv(r'F:\Novus\Decision Tree\Data\iris_data_copy.txt', 
                sep='\t', header=0, encoding='latin-1', blocksize=100**4)  #blocksize in bytes: 100**4 = 100 MB per partition
print(df.npartitions)  #number of partitions the file was split into

df_summary = df.groupby(['Species']).mean()
print(df_summary.compute())
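
Everything else stays lazy too; nothing heavy happens until .compute() is called. For example, counting rows per species on the same df:

#row counts per species; the work only runs when .compute() is called
species_counts = df.groupby('Species').size()
print(species_counts.compute())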

Friday, August 18, 2017

Update a Python list - why changing one element changed them all

I was trying to update a list that contains a few lists. When I updated one element of an inner list, it seemed to update every inner list, but only when the list was created the first way below. The reason is that [[0,0]]*3 does not create three separate inner lists: it creates three references to the same inner list, so a change made through one reference shows up in all of them. The literal [[0,0],[0,0],[0,0]] builds three independent lists, which is why the second example behaves as expected.
example_list = [[0,0]]*3
print(example_list)

example_list[0][0] = 1
print(example_list)
print('')

example_list = [[0,0],[0,0],[0,0]]
print(example_list)

example_list[0][0] = 1
print(example_list)
[[0, 0], [0, 0], [0, 0]]
[[1, 0], [1, 0], [1, 0]]

[[0, 0], [0, 0], [0, 0]]
[[1, 0], [0, 0], [0, 0]]
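
The usual fix is to build each inner list independently, for example with a list comprehension. A quick sketch, using id() to show that the repeated form really is one object three times:

example_list = [[0, 0] for _ in range(3)]   #each iteration builds a fresh inner list
example_list[0][0] = 1
print(example_list)                         #[[1, 0], [0, 0], [0, 0]]

shared = [[0, 0]] * 3
print(id(shared[0]) == id(shared[1]))       #True: both slots point at the same inner list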

Wednesday, August 16, 2017

Python Recursive Function

Recursion is a way of programming in which a function calls itself one or more times in its body, usually returning the return value of that call. A function defined this way is called a recursive function. 

Termination condition:
A recursive function has to terminate to be usable in a program. It terminates if every recursive call shrinks the problem and moves it towards a base case. A base case is a case that can be solved without further recursion. If the base case is never reached, the recursion runs forever. 

Example: 
4! = 4 * 3!
3! = 3 * 2!
2! = 2 * 1!
1! = 1
 
Replacing the calculated values gives us the following expression 
4! = 4 * 3 * 2 * 1 

Generally we can say: Recursion in computer science is a method where the solution to a problem is based on solving smaller instances of the same problem. 
#Recursive function to calculate the cumulative sum of a list
def cum_sum(l):
    if len(l) <= 1: return l[0]         #base case: a single element is its own sum
    else: return l[0] + cum_sum(l[1:])  #recursive case: first element plus the sum of the rest

#Recursive function to calculate the factorial of a number
def fact(number):
    if number <= 1: return 1                 #base case (also covers 0! = 1)
    else: return number * fact(number - 1)   #recursive case
    
print(cum_sum([1,2,3,4,5,6,7,8,9]))
print(fact(3))

#45
#6
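
Related to the termination condition above: in Python a runaway recursion does not actually loop forever; the interpreter stops it at the recursion limit. A small sketch of what happens when the base case is never reached:

import sys

print(sys.getrecursionlimit())   #default limit is usually 1000

def no_base_case(n):
    return no_base_case(n + 1)   #never shrinks towards a base case

try:
    no_base_case(0)
except RecursionError:           #raised once the call stack hits the limit
    print('maximum recursion depth exceeded')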

Monday, August 14, 2017

Python Grouping Data Elements (itertools groupby)

#PYTHON GROUPING DATA ELEMENTS (ITERTOOLS GROUPBY)

import itertools

#make an iterator that returns consecutive keys and groups from the iterable (groupby expects its input to be sorted)
list_01 = [100, 50, 50, 50, 50, 50, 60, 60, 60, 80, 80, 70, 70, 70, 70, 70, 70]

keys = []
groups = []
sorted_list_01 = sorted(list_01)
for k, g in itertools.groupby(sorted_list_01):
    keys.append(k)
    #make g a list
    groups.append(list(g))
    
print(keys, "==>", groups)

#make dict of key and group

dict_list_01 = dict(zip(keys, groups))
print(dict_list_01)

#another example
list_02 = 'AAAAAAACCCDDEEEEFF'
sorted_list_02 = sorted(list_02)
dict_list_02 = dict(zip([k for k, g in itertools.groupby(sorted_list_02)], 
                        [list(g) for k, g in itertools.groupby(sorted_list_02)]))
print(dict_list_02)

dict_list_02_len = dict(zip([k for k, g in itertools.groupby(sorted_list_02)], 
                            [len(list(g)) for k, g in itertools.groupby(sorted_list_02)]))
print(dict_list_02_len)
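
The same dictionaries can also be built in a single pass with a dict comprehension, and collections.Counter covers the length case directly; a short sketch reusing sorted_list_02 from above:

from collections import Counter

dict_list_02_onepass = {k: list(g) for k, g in itertools.groupby(sorted_list_02)}  #one groupby pass
print(dict_list_02_onepass)

print(Counter(list_02))   #counts per element, no sorting needed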

Friday, August 11, 2017

Python pickle

In this post I am going to talk about pickle. It is used for serializing and de-serializing a Python object structure: any object in Python can be pickled so that it can be saved to disk. Pickle "serializes" the object before writing it to file, converting a Python object (list, dict, etc.) into a character stream that contains all the information necessary to reconstruct the object in another Python script.

Example of pickle creation

##Import pickle module
import pickle
##Let's create an example dict object
example_dict = {'Sarbadal': ['Manager', '5 years', 'Data Science', 'Photography'], 
                'AJ':       ['Lead', '4 years', 'Data Science', 'Video Game'], 
                'Shobhit':  ['Sr. Analyst', '1 year', 'Data Science', 'Python'], 
                'Abhishek': ['Analyst', '2 years', 'Data Science', 'Painting']}
##Creating the pickle
filename = r'F:\Python\Pickle Objects\pickle_example_dict'
file = open(filename, 'wb')
pickle.dump(example_dict, file)
file.close()                     #close so everything is flushed to disk before reading back
##Loading the pickle
file = open(filename, 'rb')
new_dict = pickle.load(file)
file.close()

##Checking the loaded object
print(type(new_dict))
print(new_dict['AJ'])

['Lead', '4 years', 'Data Science', 'Video Game']
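
The same round trip also works in memory: pickle.dumps/pickle.loads serialize to and from a bytes object instead of a file. One caution worth remembering: only unpickle data you trust, because loading a pickle can execute arbitrary code.

##In-memory round trip
byte_stream = pickle.dumps(new_dict)   #serialize the dict to a bytes object
restored = pickle.loads(byte_stream)   #rebuild the object from the bytes
print(restored == new_dict)            #True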

Thursday, August 10, 2017

Permutation Function with Repetition

#Recursive helper: str_a is the string built so far, perm_a accumulates the finished strings
def perm(l=None, n=None, str_a=None, perm_a=None):
    if len(str_a) == n:
        return [str_a] + perm_a              #base case: the string has reached length n
    else:
        new_perm_a = perm_a
        for c in l:                          #extend the current string with every element of l
            new_perm_a = perm(l=l, n=n, str_a=str_a + c, perm_a=new_perm_a)
        return new_perm_a

#All length-n strings over the elements of l (permutations with repetition)
def permutations(l=None, n=None):
    str_a, perm_a = '', []
    result = perm(l=l, n=n, str_a=str_a, perm_a=perm_a)
    return result

lst = permutations(l=['a', 'b', 'c', 'd'], n=3)
print(lst)
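
For comparison, the standard library already covers permutations with repetition: itertools.product with repeat=n generates the same 4**3 = 64 strings.

import itertools

lst_product = [''.join(p) for p in itertools.product(['a', 'b', 'c', 'd'], repeat=3)]
print(len(lst_product))                      #64, i.e. 4**3
print(sorted(lst) == sorted(lst_product))    #True: same strings as the recursive version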

Python Basic Cluster Analysis

import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage
from sklearn import preprocessing

Reading the data and setting the car models as the index

print('Reading the original data mtcar...')
car = pd.read_csv(r'F:\Novus\ClusterAnalysis\mtcar.txt', 
                  sep='\t', 
                  header=0, 
                  low_memory=True, 
                  encoding='latin-1', 
                  error_bad_lines=False)

car.set_index('car', inplace=True)

Scaling all the attributes to (0,1)...

scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
scaled_col = []
ori_col = car.columns.tolist()
for col in ori_col:
    new_col = 'scaled_'+col
    scaled_col.append(new_col)
    car[new_col] = scaler.fit_transform(car[[col]])

car_scaled = car[scaled_col].copy() #creating the scaled/transformed data
car = car[ori_col].copy()           #putting the original data back
del scaled_col, ori_col
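
As a side note, the same scaling can be done in one shot instead of column by column; a sketch, assuming the scaled copy should keep the car index:

#one-shot alternative: scale every column at once (column order follows car.columns)
car_scaled_alt = pd.DataFrame(scaler.fit_transform(car), 
                              columns=['scaled_' + c for c in car.columns], 
                              index=car.index)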

Create dendrogram

tree = linkage(car_scaled, method='complete', metric='euclidean')
hierarchy.to_tree(tree, rd=False)

hierarchy.set_link_color_palette(['#800080', '#FF00FF', '#000080', '#0000FF', '#008080', '#008000', '#800000'])
plt.figure(figsize=(15,10))
dn = hierarchy.dendrogram(tree, 
                          orientation='top', 
                          color_threshold=1,
                          leaf_font_size=11,
                          leaf_rotation=90,
                          labels = car.index.tolist())
plt.show()
hierarchy.set_link_color_palette(None)  # reset to default after use
car_scaled['Group_Hierarchical'] = hierarchy.cut_tree(tree, n_clusters=5, height=None).flatten()  #cut_tree returns an (n,1) array, so flatten it to 1-D
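
An equivalent way to get the same flat assignment straight from the linkage matrix is hierarchy.fcluster with criterion='maxclust' (note its labels start at 1 rather than 0):

from scipy.cluster.hierarchy import fcluster

flat_labels = fcluster(tree, t=5, criterion='maxclust')   #flat cluster labels 1..5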

KMeans Clustering

#cluster on the scaled features only, excluding the hierarchical group label added above
X = car_scaled.drop('Group_Hierarchical', axis=1)

#determine k range
k_range = range(1, len(X))

#fit the KMeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(X) for k in k_range]

#pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

#calculate the Euclidean distance from each point to each cluster center
k_euclid = [cdist(X, cent, 'euclidean') for cent in centroids]
dist = [np.min(ke, axis=1) for ke in k_euclid]

#total within cluster sum of squares
wcss = [sum(d**2) for d in dist]

#total sum of squares
tss = sum(pdist(X)**2)/X.shape[0]

#the between cluster sum of squares
bss = tss - wcss
plt.figure(figsize=(12,7))
plt.plot(k_range, bss,
         linewidth=2.0,
#         linestyle="None", 
         marker="o", 
         color="blue")
plt.xlabel('# clusters')
plt.ylabel('% of variance explained')
plt.title('variance explained vs # cluster')
plt.annotate('optimal number of clusters', xy=(5, bss[4]), xycoords='data', xytext=(7.5, bss[4]-5),
            arrowprops=dict(facecolor='black', shrink=0.05),
            horizontalalignment='left', verticalalignment='top')
plt.grid(True)
plt.show()
clf = KMeans(n_clusters=5)
clf.fit(X)

centroids = clf.cluster_centers_
labels = clf.labels_
car_scaled['Group_KMeans'] = labels

#print(car_scaled)

crosstab = pd.crosstab(car_scaled['Group_KMeans'], car_scaled['Group_Hierarchical'])
print(crosstab)
Group_Hierarchical  0  1  2   3  4
Group_KMeans                      
0                   0  0  7   0  0
1                   4  0  0   0  0
2                   0  0  0  12  0
3                   0  0  0   0  2
4                   0  7  0   0  0
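
One extra sanity check on k=5 that is not part of the analysis above but pairs naturally with the elbow plot: the mean silhouette score from sklearn (values closer to 1 mean tighter, better-separated clusters):

from sklearn.metrics import silhouette_score

print(silhouette_score(X, labels, metric='euclidean'))   #mean silhouette for the final KMeans labels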