import pandas as pd import numpy as np def tiles(no_tiles=8, data=None, sum_cols=None, group_by_col=None, verbose=True, maxrecord=30): if group_by_col==None: dma_sales = pd.DataFrame({'sales': [data[sum_cols].sum()]}) else: dma_sales = pd.DataFrame({'sales': data.groupby(group_by_col)[sum_cols].sum()}) # dma_sales_ind = dma_sales.reset_index(inplace=False).copy() df = pd.DataFrame(data=None, columns=data.columns) df['zip_score'] = np.nan df['zip_intra_rank'] = np.nan #formatted printing.... if group_by_col==None: # setting the first line of summary print... str_len = 0 + 49 - len(' Tiles Creation Summary ') star_str = '='*int(str_len/2) str_to_print = ''.join((star_str,' Tiles Creation Summary ',star_str)) if verbose: print('') if verbose: print(str_to_print) sl_no = 0 else: # setting the first line of summary print... str_gap = max(len(str(group_by_col))+12, max([len(str(c)) for c in dma_sales.index.tolist()])+2) str_len = str_gap + 49 - len(' Tiles Creation Summary ') star_str = '='*int(str_len/2) str_to_print = ''.join((star_str,' Tiles Creation Summary ',star_str)) if verbose: print('') if verbose: print(str_to_print) sl_no = 0 for dma in dma_sales.index: sales = dma_sales.loc[dma, 'sales'].sum() if group_by_col==None: df_dma_tmp = data.reset_index(drop=True).copy() else: cond_dic = {} for i, col in enumerate(group_by_col): cond_dic[i] = data[col].unique().tolist() condition_dic = {} for i in range(len(group_by_col)): str_i = 'abc'+str(i) if len(group_by_col)>1: condition_dic[str_i] = data[group_by_col[i]]==dma[i] if len(group_by_col)==1: condition_dic[str_i] = data[group_by_col[i]]==dma c_list = ' & '.join(["condition_dic['"+x+"']" for x in condition_dic.keys()]) df_dma_tmp = data[eval(c_list)].reset_index(drop=True).copy() total_row = len(df_dma_tmp)-1 row = 0 zip_sales = 0 for zip_score in range(1, no_tiles + 1): left_over_sales = sales - zip_sales bucket_sales = left_over_sales/(no_tiles - (zip_score - 1)) row_sales = df_dma_tmp.loc[row, sum_cols].sum().copy() zip_intra_rank = 0 while ((row_sales <= bucket_sales) & (row <= total_row)) | ((zip_score == no_tiles) & (row <= total_row)): if row < total_row: row_sales += df_dma_tmp.loc[row+1, sum_cols].sum().copy() zip_sales += df_dma_tmp.loc[row+1, sum_cols].sum().copy() zip_intra_rank += 1 df_tmp = df_dma_tmp[df_dma_tmp.index == row].copy() df_tmp['zip_score'] = zip_score df_tmp['zip_intra_rank'] = zip_intra_rank df_tmp = pd.DataFrame(df_tmp) df_tmp.reset_index(drop=True, inplace=True) df = df.append(df_tmp, ignore_index=True) row += 1 #formatted printing.... if verbose: # variable add print if group_by_col==None: if sl_no==0: print('{:<6 span="">.format('score', 'left over value', 'bucket sales', width=0)) print('-'*int(0+48)) sl_no += 1 if sl_no <= int(maxrecord/1): print('{:<6 span="">.format(zip_score, round(left_over_sales,2), round(row_sales,2), width=0)) elif (sl_no > int(maxrecord/1)) and (sl_no <= int(maxrecord/1)+3): print('{:<6 span="">.format('...', '...', '...', width=0)) elif (sl_no == int(maxrecord/1)+4): print('printing has been stopped... max reached.') else: if sl_no==0: print('variable=={:<{width}} {:<6 span="">.format(str(group_by_col), 'score', 'left over value', 'bucket sales', width=str_gap-10)) print('-'*int(str_gap+48)) sl_no += 1 if sl_no <= int(maxrecord/1): print('{:<{width}} {:<6 span="">.format(str(dma), zip_score, round(left_over_sales,2), round(row_sales,2), width=str_gap)) elif (sl_no > int(maxrecord/1)) and (sl_no <= int(maxrecord/1)+3): print('{:<{width}} {:<6 span="">.format('...', '...', '...', '...', width=str_gap)) elif (sl_no == int(maxrecord/1)+4): print('printing has been stopped... max iteration reached.') if group_by_col==None: # setting the last line of summary print... str_len = 0 + 49 - len(' Tiles Creation Summary End ') star_str = '='*int(str_len/2) str_to_print = ''.join((star_str,' Tiles Creation Summary End ',star_str)) else: # setting the last line of summary print... str_len = str_gap + 49 - len(' Tiles Creation Summary End ') star_str = '='*int(str_len/2) str_to_print = ''.join((star_str,' Tiles Creation Summary End ',star_str)) if verbose: print(str_to_print) return df if __name__=='__main__': # if data in in excel file... from pandas import ExcelWriter data_path = r'F:\Python_Modules\TileData.xlsx' df_tile = pd.read_excel(open(data_path,'rb'), sheetname='Data') df=tiles(no_tiles=8, data=df_tile, sum_cols='Total_ZIP_Sales', group_by_col=['STORE_DMA'], verbose=True, maxrecord=60) # WRITE IN DATA FRAME TO 'ab analysis' SHEET............................................................................... excel_file_path = r'F:\Python_Modules\TileData_Output.xlsx' #.......................................................................................................................... writer = ExcelWriter(excel_file_path) df.to_excel(writer, sheet_name='tiles', startrow=1, header=False, index=False) workbook = writer.book worksheet = writer.sheets['tiles'] # Add a header format. header_format = workbook.add_format({'bold': True,'text_wrap': False,'valign': 'top','fg_color': '#D3D3D3','border': 0}) # Write the column headers with the defined format. for col_num, value in enumerate(df.columns.values): worksheet.write(0, col_num + 0, value, header_format) writer.save() #..........................................................................................................................6>6>6>6>6>6>
Wednesday, January 31, 2018
tiles logic and python code
Wednesday, January 24, 2018
Linear Regression / forward selection
import statsmodels.formula.api as smf def forward_selected(data, y, Intercept=True, verbose=True): """ Linear model designed by forward selection. Parameters: ----------- data : pandas DataFrame with all possible predictors and y y : string, name of y column in data Intercept : bool parameter. Whether we want Intercept in regression model or not verbose : bool parameter. if True the it will print out the steps of adding variables Returns: -------- model: an "optimal" fitted statsmodels linear model with an intercept selected by forward selection evaluated by adjusted R-squared """ step = 1 result = [] reg_df_tmp = data.copy() remaining = set(reg_df_tmp.columns) str_gap = max([len(c) for c in list(remaining)])+2 # setting the first line of summary print... str_len = str_gap + 22 - len(' Forward Selection Summary ') star_str = '='*int(str_len/2) str_toverbose_print = ''.join((star_str,' Forward Selection Summary ',star_str)) if verbose: print('') if verbose: print(str_to_print) remaining.remove(y) selected = [] current_score, best_new_score = 0.0, 0.0 sl_no = 0 while remaining and current_score == best_new_score: scores_with_candidates = [] for candidate in remaining: if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(selected + [candidate])) else: formula = "{} ~ {} + 0".format(y, ' + '.join(selected + [candidate])) score = smf.ols(formula, reg_df_tmp).fit().rsquared_adj scores_with_candidates.append((score, candidate)) scores_with_candidates.sort() best_new_score, best_candidate = scores_with_candidates.pop() if current_score <= best_new_score: remaining.remove(best_candidate) selected.append(best_candidate) current_score = best_new_score if verbose: # variable add print if sl_no==0: print('Sl.No. {:<{width}} Adj. R-squared'.format('Variable Added', width=str_gap)) print('-'*int(str_gap+22)) sl_no += 1 print('{:<6 span="" width="">.format(sl_no, best_candidate, best_new_score, width=str_gap)) # setting the last line of summary print... str_len = str_gap + 22 - len(' Forward Selection End ') star_str = '='*int(str_len/2) str_to_print = ''.join((star_str,' Forward Selection End ',star_str)) if verbose: print(str_to_print) if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(selected)) else: formula = "{} ~ {} + 0".format(y, ' + '.join(selected)) model = smf.ols(formula, reg_df_tmp).fit() result.append([selected, model.rsquared_adj, model]) step += 1 if verbose: print('') if verbose: print(model.summary()) del reg_df_tmp return [selected, model]6>
Linear Regression / stepwise selection
import pandas as pd import statsmodels.api as smf def stepwise_selection(data, y, Intercept=True, initial_list=[], threshold_in=0.01, threshold_out=0.05, verbose=True): """ Perform a forward-backward feature selection based on p-value from statsmodels.api.ols Arguments: X - pandas.DataFrame with candidate features and y y - list-like with the target initial_list - list of features to start with (column names of X) threshold_in - include a feature if its p-value < threshold_in threshold_out - exclude a feature if its p-value > threshold_out verbose - whether to print the sequence of inclusions and exclusions Returns: list of selected features Always set threshold_in < threshold_out to avoid infinite looping. """ X = data.columns.tolist() X.remove(y) str_gap = max([len(c) for c in X])+2 included = list(initial_list) x_vars = data.copy() while True: changed=False # forward step excluded = list(set(X)-set(included)) new_pval = pd.Series(index=excluded) for new_column in excluded: if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(included+[new_column])) else: formula = "{} ~ {} + 0".format(y, ' + '.join(included+[new_column])) model = smf.ols(formula, data=x_vars).fit() new_pval[new_column] = model.pvalues[new_column] best_pval = new_pval.min() if best_pval <= threshold_in: best_feature = new_pval.argmin() included.append(best_feature) changed=True if verbose: print('Add {:<{width}} with p-value {:.6}'.format(best_feature, best_pval, width=str_gap)) # backward step if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(included)) else: formula = "{} ~ {} + 0".format(y, ' + '.join(included)) model = smf.ols(formula, data=x_vars).fit() if Intercept==True: # use all coefs except intercept pvalues = model.pvalues.iloc[1:] else: pvalues = model.pvalues worst_pval = pvalues.max() # null if pvalues is empty if worst_pval > threshold_out: changed=True worst_feature = pvalues.argmax() included.remove(worst_feature) if verbose: print('Drop {:<{width}} with p-value {:.6}'.format(worst_feature, worst_pval, width=str_gap)) if not changed: break if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(included)) else: formula = "{} ~ {} + 0".format(y, ' + '.join(included)) model = smf.ols(formula, data=x_vars).fit() print('') print(model.summary()) return [included, model]
Subscribe to:
Posts (Atom)