Wednesday, January 31, 2018

tiles logic and python code

import pandas as pd
import numpy as np

def tiles(no_tiles=8, data=None, sum_cols=None, group_by_col=None, verbose=True, maxrecord=30):

    if group_by_col==None:
        dma_sales = pd.DataFrame({'sales': [data[sum_cols].sum()]})
    else:
        dma_sales = pd.DataFrame({'sales': data.groupby(group_by_col)[sum_cols].sum()})
    # dma_sales_ind = dma_sales.reset_index(inplace=False).copy()

    df = pd.DataFrame(data=None, columns=data.columns)
    df['zip_score'] = np.nan
    df['zip_intra_rank'] = np.nan

    #formatted printing....

    if group_by_col==None:
        # setting the first line of summary print...
        str_len = 0 + 49 - len(' Tiles Creation Summary ')
        star_str = '='*int(str_len/2)
        str_to_print = ''.join((star_str,' Tiles Creation Summary ',star_str))
        if verbose: print('')
        if verbose: print(str_to_print)
        sl_no = 0
    else:
        # setting the first line of summary print...
        str_gap = max(len(str(group_by_col))+12, max([len(str(c)) for c in dma_sales.index.tolist()])+2)
        str_len = str_gap + 49 - len(' Tiles Creation Summary ')
        star_str = '='*int(str_len/2)
        str_to_print = ''.join((star_str,' Tiles Creation Summary ',star_str))
        if verbose: print('')
        if verbose: print(str_to_print)
        sl_no = 0

    for dma in dma_sales.index:
        sales = dma_sales.loc[dma, 'sales'].sum()
        if group_by_col==None: 
            df_dma_tmp = data.reset_index(drop=True).copy()
        else: 
            cond_dic = {}
            for i, col in enumerate(group_by_col):
                cond_dic[i] = data[col].unique().tolist()

            condition_dic = {}
            for i in range(len(group_by_col)):
                str_i = 'abc'+str(i)
                if len(group_by_col)>1: condition_dic[str_i] = data[group_by_col[i]]==dma[i]
                if len(group_by_col)==1: condition_dic[str_i] = data[group_by_col[i]]==dma

                c_list = ' & '.join(["condition_dic['"+x+"']" for x in condition_dic.keys()])

            df_dma_tmp = data[eval(c_list)].reset_index(drop=True).copy()

        total_row = len(df_dma_tmp)-1
        row = 0

        zip_sales = 0

        for zip_score in range(1, no_tiles + 1):
            left_over_sales = sales - zip_sales
            bucket_sales = left_over_sales/(no_tiles - (zip_score - 1))
            row_sales = df_dma_tmp.loc[row, sum_cols].sum().copy()
            zip_intra_rank = 0

            while ((row_sales <= bucket_sales) & (row <= total_row)) | ((zip_score == no_tiles) & (row <= total_row)):
                if row < total_row:
                    row_sales += df_dma_tmp.loc[row+1, sum_cols].sum().copy()
                    zip_sales += df_dma_tmp.loc[row+1, sum_cols].sum().copy()

                zip_intra_rank += 1

                df_tmp = df_dma_tmp[df_dma_tmp.index == row].copy()
                df_tmp['zip_score'] = zip_score
                df_tmp['zip_intra_rank'] = zip_intra_rank
                df_tmp = pd.DataFrame(df_tmp)

                df_tmp.reset_index(drop=True, inplace=True)
                df = df.append(df_tmp, ignore_index=True)

                row += 1

            #formatted printing....
            if verbose: # variable add print 
                if group_by_col==None:
                    if sl_no==0:
                        print('{:<6 span="">.format('score', 'left over value', 'bucket sales', width=0))
                        print('-'*int(0+48))

                    sl_no += 1
                    if sl_no <= int(maxrecord/1):
                        print('{:<6 span="">.format(zip_score, round(left_over_sales,2), round(row_sales,2), width=0))
                    elif (sl_no > int(maxrecord/1)) and (sl_no <= int(maxrecord/1)+3):
                        print('{:<6 span="">.format('...', '...', '...', width=0))
                    elif (sl_no == int(maxrecord/1)+4):
                        print('printing has been stopped... max reached.')
                else: 
                    if sl_no==0:
                        print('variable=={:<{width}} {:<6 span="">.format(str(group_by_col), 'score', 'left over value', 'bucket sales', width=str_gap-10))
                        print('-'*int(str_gap+48))

                    sl_no += 1
                    if sl_no <= int(maxrecord/1):
                        print('{:<{width}} {:<6 span="">.format(str(dma), zip_score, round(left_over_sales,2), round(row_sales,2), width=str_gap))
                    elif (sl_no > int(maxrecord/1)) and (sl_no <= int(maxrecord/1)+3):
                        print('{:<{width}} {:<6 span="">.format('...', '...', '...', '...', width=str_gap))
                    elif (sl_no == int(maxrecord/1)+4):
                        print('printing has been stopped... max iteration reached.')

    if group_by_col==None:
        # setting the last line of summary print...
        str_len = 0 + 49 - len(' Tiles Creation Summary End ')
        star_str = '='*int(str_len/2)
        str_to_print = ''.join((star_str,' Tiles Creation Summary End ',star_str))
    else:
        # setting the last line of summary print...
        str_len = str_gap + 49 - len(' Tiles Creation Summary End ')
        star_str = '='*int(str_len/2)
        str_to_print = ''.join((star_str,' Tiles Creation Summary End ',star_str))

    if verbose: print(str_to_print)
    return df

if __name__=='__main__':
    # if data in in excel file...
    from pandas import ExcelWriter

    data_path = r'F:\Python_Modules\TileData.xlsx'
    df_tile = pd.read_excel(open(data_path,'rb'), sheetname='Data')
    df=tiles(no_tiles=8, data=df_tile, sum_cols='Total_ZIP_Sales', group_by_col=['STORE_DMA'], verbose=True, maxrecord=60)

    # WRITE IN DATA FRAME TO 'ab analysis' SHEET...............................................................................
    excel_file_path = r'F:\Python_Modules\TileData_Output.xlsx'
    #..........................................................................................................................
    writer = ExcelWriter(excel_file_path)
    df.to_excel(writer, sheet_name='tiles', startrow=1, header=False, index=False)
    workbook  = writer.book
    worksheet = writer.sheets['tiles']

    # Add a header format.
    header_format = workbook.add_format({'bold': True,'text_wrap': False,'valign': 'top','fg_color': '#D3D3D3','border': 0})
    # Write the column headers with the defined format.
    for col_num, value in enumerate(df.columns.values):
      worksheet.write(0, col_num + 0, value, header_format)
    writer.save()
    #..........................................................................................................................

Wednesday, January 24, 2018

Linear Regression / forward selection

import statsmodels.formula.api as smf

def forward_selected(data, y, Intercept=True, verbose=True):
    """
    Linear model designed by forward selection.
    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and y
    y : string, name of y column in data
    Intercept : bool parameter. Whether we want Intercept in regression model or not
    verbose : bool parameter. if True the it will print out the steps of adding variables

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """

    step = 1
    result = []
    reg_df_tmp = data.copy()   
    remaining = set(reg_df_tmp.columns)
    str_gap = max([len(c) for c in list(remaining)])+2
    
    # setting the first line of summary print...
    str_len = str_gap + 22 - len(' Forward Selection Summary ')
    star_str = '='*int(str_len/2)
    str_toverbose_print = ''.join((star_str,' Forward Selection Summary ',star_str))
    if verbose: print('')
    if verbose: print(str_to_print)

    remaining.remove(y)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    sl_no = 0

    while remaining and current_score == best_new_score:
        scores_with_candidates = []

        for candidate in remaining:
            if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(selected + [candidate]))
            else: formula = "{} ~ {} + 0".format(y, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, reg_df_tmp).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()

        if current_score <= best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score

            if verbose: # variable add print 
                if sl_no==0:
                    print('Sl.No. {:<{width}} Adj. R-squared'.format('Variable Added', width=str_gap))
                    print('-'*int(str_gap+22))
                
                sl_no += 1
                print('{:<6 span="" width="">.format(sl_no, best_candidate, best_new_score, width=str_gap))
    
    # setting the last line of summary print...
    str_len = str_gap + 22 - len(' Forward Selection End ')
    star_str = '='*int(str_len/2)
    str_to_print = ''.join((star_str,' Forward Selection End ',star_str))
    if verbose: print(str_to_print)

    if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(selected))
    else: formula = "{} ~ {} + 0".format(y, ' + '.join(selected))
    model = smf.ols(formula, reg_df_tmp).fit()
    result.append([selected, model.rsquared_adj, model])
    step += 1
    if verbose: print('')
    if verbose: print(model.summary())
    del reg_df_tmp
    return [selected, model]

Linear Regression / stepwise selection

import pandas as pd
import statsmodels.api as smf

def stepwise_selection(data, y, Intercept=True, initial_list=[], threshold_in=0.01, threshold_out=0.05, verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.ols
    Arguments:
        X - pandas.DataFrame with candidate features and y
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    """
    X = data.columns.tolist()
    X.remove(y)
    str_gap = max([len(c) for c in X])+2
    included = list(initial_list)
    x_vars = data.copy()

    while True:
        changed=False
        
        # forward step
        excluded = list(set(X)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(included+[new_column]))
            else: formula = "{} ~ {} + 0".format(y, ' + '.join(included+[new_column])) 
            model = smf.ols(formula, data=x_vars).fit()
            new_pval[new_column] = model.pvalues[new_column]

        best_pval = new_pval.min()
        if best_pval <= threshold_in:
            best_feature = new_pval.argmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:<{width}} with p-value {:.6}'.format(best_feature, best_pval, width=str_gap))

        # backward step
        if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(included))
        else: formula = "{} ~ {} + 0".format(y, ' + '.join(included))
        model = smf.ols(formula, data=x_vars).fit()

        if Intercept==True: 
            # use all coefs except intercept
            pvalues = model.pvalues.iloc[1:]
        else: pvalues = model.pvalues
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:<{width}} with p-value {:.6}'.format(worst_feature, worst_pval, width=str_gap))
        if not changed:
            break
    if Intercept==True: formula = "{} ~ {} + 1".format(y, ' + '.join(included))
    else: formula = "{} ~ {} + 0".format(y, ' + '.join(included))
    model = smf.ols(formula, data=x_vars).fit()
    print('')
    print(model.summary())
    return [included, model]