Sunday, August 27, 2017
Use Python to find specific files and copy them from one location to another location
import os
import shutil
from fnmatch import fnmatch

root = r'H:'        # source drive to search
dst_dir = r'G:'     # destination directory
pattern = "*.tif"
i = 1
for path, subdirs, files in os.walk(root):
    for name in files:
        if fnmatch(name, pattern):
            current_file = os.path.join(path, name)
            shutil.copy(current_file, dst_dir)
            # Prefix the copied file with a running index to avoid name clashes
            dst_file = os.path.join(dst_dir, name)
            dst_new_file_name = os.path.join(dst_dir, "[" + str(i) + "] " + name)
            os.rename(dst_file, dst_new_file_name)
            print(os.path.join(path, name))
            i += 1
Python Split a large file into multiple files
import csv

def file_split(filehandler, delimiter='\t', row_limit=25,
               output_name_template='output_%s.txt',
               output_path='F:\\Novus\\Decision Tree\\Data\\',
               keep_headers=True):
    # Read the source file with the csv module
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1  # running index of the current output file
    current_out_path = ''.join([output_path, output_name_template % current_piece])
    of = open(current_out_path, 'w', newline='')  # newline='' avoids blank rows on Windows
    current_out_writer = csv.writer(of, delimiter=delimiter)
    current_limit = row_limit  # row count at which to roll over to the next file
    if keep_headers:
        # Copy the header row into every output file
        headers = next(reader)
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            # Row limit reached: close the current file and open the next piece
            of.close()
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = ''.join([output_path, output_name_template % current_piece])
            of = open(current_out_path, 'w', newline='')
            current_out_writer = csv.writer(of, delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)
    of.close()

file_split(open(r'F:\Novus\Decision Tree\Data\iris_data.txt', 'r'),
           row_limit=100,
           output_name_template='iris_split_%s.txt',
           output_path='F:\\Novus\\Decision Tree\\Data\\')
Thursday, August 24, 2017
Python dask dataframe for large data
The data that I am using here fits in memory, but dask
will work even when the data is larger than memory.
import dask.dataframe as dd

# blocksize=100**4 (~100 MB) controls the size of each partition
df = dd.read_csv(r'F:\Novus\Decision Tree\Data\iris_data_copy.txt',
                 sep='\t',
                 header=0,
                 encoding='latin-1',
                 blocksize=100**4)
print(df.npartitions)

# dask is lazy: groupby builds a task graph, and compute() executes it
df_summary = df.groupby(['Species']).mean()
print(df_summary.compute())
Friday, August 18, 2017
Update a Python list - Don't know why this is the case.
I was trying to update a list which contains a few lists. When I try to update one element of a list within the list, it seems to update all the lists in the list. But it only happens when I create the list in the following way. I have no clue why this is the case.
example_list = [[0,0]]*3
print(example_list)
example_list[0][0] = 1
print(example_list)
print('')

example_list = [[0,0],[0,0],[0,0]]
print(example_list)
example_list[0][0] = 1
print(example_list)
[[0, 0], [0, 0], [0, 0]]
[[1, 0], [1, 0], [1, 0]]
[[0, 0], [0, 0], [0, 0]]
[[1, 0], [0, 0], [0, 0]]
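For reference (this explanation is my addition, not part of the original post): [[0,0]]*3 does not copy the inner list three times; it stores three references to the same inner list object, which is why updating one element shows up in all three slots. The literal form creates three distinct inner lists. A quick identity check makes this visible:

example_list = [[0, 0]] * 3
# All three slots point at the same inner list object
print(example_list[0] is example_list[1])  # True

example_list = [[0, 0], [0, 0], [0, 0]]
# Each literal creates a distinct inner list
print(example_list[0] is example_list[1])  # False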
Wednesday, August 16, 2017
Python Recursive Function
Recursion is a way of programming or coding a problem in which a function calls itself one or more times in its body. Usually, it returns the return value of this function call. If a function definition fulfils the condition of recursion, we call this function a recursive function.
Termination condition:
A recursive function has to terminate to be used in a program. A recursive function terminates if every recursive call downsizes the problem and moves it towards a base case. A base case is a case where the problem can be solved without further recursion. Recursion can lead to an infinite loop if the base case is never reached.
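Before the factorial example, here is a minimal sketch of a base case in code (my own illustration, not from the original post):

def countdown(n):
    # Base case: without this check the function would call itself forever
    # and Python would eventually raise RecursionError
    if n <= 0:
        return 'done'
    return countdown(n - 1)

print(countdown(5))  # done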
Example:
4! = 4 * 3!
3! = 3 * 2!
2! = 2 * 1
Replacing the calculated values gives us the following expression
4! = 4 * 3 * 2 * 1
Generally we can say: Recursion in computer science is a method where the solution to a problem is based on solving smaller instances of the same problem.
#Recursive function to calculate the cumulative sum of a list
def cum_sum(l=None):
    if len(l) <= 1:
        return l[0]
    else:
        return l[0] + cum_sum(l[1:])

#Recursive function to calculate the factorial of a number
def fact(number):
    if number == 1:
        return 1
    else:
        return number * fact(number - 1)

print(cum_sum([1, 2, 3, 4, 5, 6, 7, 8, 9]))  #45
print(fact(3))  #6
Monday, August 14, 2017
Python Grouping Data Elements (itertools groupby)
#PYTHON GROUPING DATA ELEMENTS (ITERTOOLS GROUPBY)
import itertools

#make an iterator that returns consecutive keys and groups from the iterable
list_01 = [100, 50, 50, 50, 50, 50, 60, 60, 60, 80, 80, 70, 70, 70, 70, 70, 70]
keys = []
groups = []
sorted_list_01 = sorted(list_01)
for k, g in itertools.groupby(sorted_list_01):
    keys.append(k)
    groups.append(list(g))  #make g a list
print(keys, "==>", groups)

#make a dict of key and group
dict_list_01 = dict(zip(keys, groups))
print(dict_list_01)

#another example
list_02 = 'AAAAAAACCCDDEEEEFF'
sorted_list_02 = sorted(list_02)
dict_list_02 = dict(zip([k for k, g in itertools.groupby(sorted_list_02)],
                        [list(g) for k, g in itertools.groupby(sorted_list_02)]))
print(dict_list_02)
dict_list_02_len = dict(zip([k for k, g in itertools.groupby(sorted_list_02)],
                            [len(list(g)) for k, g in itertools.groupby(sorted_list_02)]))
print(dict_list_02_len)
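As a side note (my addition, not part of the original post): when the goal is just the count of each element, as in the last example, collections.Counter from the standard library gives the same counts without sorting first, though the key order may differ:

from collections import Counter

list_02 = 'AAAAAAACCCDDEEEEFF'
print(dict(Counter(list_02)))  # {'A': 7, 'C': 3, 'D': 2, 'E': 4, 'F': 2}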
Friday, August 11, 2017
Python pickle
In this post I am going to talk about pickle. It is used for serializing and de-serializing a Python object structure. Any object in Python can be pickled so that it can be saved on disk. What pickle does is "serialize" the object before writing it to file. Pickling is a way to convert a Python object (list, dict, etc.) into a character stream. The idea is that this character stream contains all the information necessary to reconstruct the object in another Python script.
Example of pickle creation
##Import pickle module
import pickle
##Let's create an example dict object
example_dict = {'Sarbadal': ['Manager', '5 years', 'Data Science', 'Photography'],
                'AJ': ['Lead', '4 years', 'Data Science', 'Video Game'],
                'Shobhit': ['Sr. Analyst', '1 year', 'Data Science', 'Python'],
                'Abhishek': ['Analyst', '2 years', 'Data Science', 'Painting']}

##Creating the pickle file
filename = r'F:\Python\Pickle Objects\pickle_example_dict'
file = open(filename, 'wb')
pickle.dump(example_dict, file)
file.close()
##Loading the pickle file
file = open(filename, 'rb')
new_dict = pickle.load(file)
file.close()
##Checking the loaded object
print(type(new_dict))
print(new_dict['AJ'])
['Lead', '4 years', 'Data Science', 'Video Game']
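As a side note (my addition, not part of the original post), pickle can also serialize to and from a bytes object in memory, without touching the disk, using pickle.dumps and pickle.loads:

import pickle

data = {'a': [1, 2, 3], 'b': ('x', 'y')}
blob = pickle.dumps(data)      # serialize the object to a bytes object
restored = pickle.loads(blob)  # reconstruct the object from the bytes
print(restored == data)        # True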
Thursday, August 10, 2017
Permutation Function with Repetition
def perm(l=None, n=None, str_a=None, perm_a=None):
    # Once the accumulated string reaches length n, record it
    if len(str_a) == n:
        return [str_a] + perm_a
    else:
        new_perm_a = perm_a
        # Extend the current string by each character and recurse
        for c in l:
            new_perm_a = perm(l=l, n=n, str_a=str_a + c, perm_a=new_perm_a)
        return new_perm_a

def permutations(l=None, n=None):
    str_a, perm_a = '', []
    result = perm(l=l, n=n, str_a=str_a, perm_a=perm_a)
    return result

lst = permutations(l=['a', 'b', 'c', 'd'], n=3)
print(lst)
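For comparison (my addition, not part of the original post), the standard library's itertools.product generates the same permutations with repetition, since each of the n positions can independently take any of the characters:

import itertools

lst = [''.join(p) for p in itertools.product(['a', 'b', 'c', 'd'], repeat=3)]
print(len(lst))  # 64, i.e. 4**3
print(lst[:5])   # ['aaa', 'aab', 'aac', 'aad', 'aba']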
Python Basic Cluster Analysis
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage
from sklearn import preprocessing
Reading the data and setting the car models as the index
print('Reading the original data mtcar...')
car = pd.read_csv(r'F:\Novus\ClusterAnalysis\mtcar.txt',
                  sep='\t',
                  header=0,
                  low_memory=True,
                  encoding='latin-1',
                  error_bad_lines=False)
car.set_index('car', inplace=True)
Scaling all the attributes to (0,1)...
scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
scaled_col = []
ori_col = car.columns.tolist()
for col in ori_col:
    new_col = 'scaled_' + col
    scaled_col.append(new_col)
    car[new_col] = scaler.fit_transform(car[[col]])
car_scaled = car[scaled_col].copy() #creating the scaled/transformed data
car = car[ori_col].copy() #putting the original data back
del scaled_col, ori_col
Creating the dendrogram
tree = linkage(car_scaled, method='complete', metric='euclidean')
hierarchy.to_tree(tree, rd=False)
hierarchy.set_link_color_palette(['#800080', '#FF00FF', '#000080', '#0000FF', '#008080', '#008000', '#800000'])
plt.figure(figsize=(15,10))
dn = hierarchy.dendrogram(tree,
                          orientation='top',
                          color_threshold=1,
                          leaf_font_size=11,
                          leaf_rotation=90,
                          labels=car.index.tolist())
plt.show()
hierarchy.set_link_color_palette(None) # reset to default after use
car_scaled['Group_Hierarchical'] = hierarchy.cut_tree(tree, n_clusters=5, height=None).flatten()  #cut_tree returns an (n, 1) array
KMeans Clustering
#determine the k range
k_range = range(1, len(car_scaled))
#cluster on the scaled feature columns only, not the hierarchical group labels
features = car_scaled.drop('Group_Hierarchical', axis=1)
#fit the KMeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(features) for k in k_range]
#pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]
#calculate the Euclidean distance from each point to each cluster center
k_euclid = [cdist(features, cent, 'euclidean') for cent in centroids]
dist = [np.min(ke, axis=1) for ke in k_euclid]
#total within cluster sum of squares
wcss = [sum(d**2) for d in dist]
#total sum of squares
tss = sum(pdist(features)**2)/features.shape[0]
#the between cluster sum of squares (convert the wcss list to an array for element-wise subtraction)
bss = tss - np.array(wcss)
plt.figure(figsize=(12,7))
plt.plot(range(1, len(car_scaled)), bss,
         linewidth=2.0,
         # linestyle="None",
         marker="o",
         color="blue")
plt.xlabel('# clusters')
plt.ylabel('% of variance explained')
plt.title('variance explained vs # cluster')
plt.annotate('optimal number of clusters', xy=(5, bss[4]), xycoords='data', xytext=(7.5, bss[4]-5),
             arrowprops=dict(facecolor='black', shrink=0.05),
             horizontalalignment='left', verticalalignment='top')
plt.grid(True)
plt.show()
clf = KMeans(n_clusters=5)
clf.fit(features)
centroids = clf.cluster_centers_
labels = clf.labels_
car_scaled['Group_KMeans'] = labels
#print(car_scaled)
crosstab = pd.crosstab(car_scaled['Group_KMeans'], car_scaled['Group_Hierarchical'])
print(crosstab)
Group_Hierarchical 0 1 2 3 4
Group_KMeans
0 0 0 7 0 0
1 4 0 0 0 0
2 0 0 0 12 0
3 0 0 0 0 2
4 0 7 0 0 0