# Miscellaneous.py

from Packages import *

# local CSS
## load the custom CSS in the style folder
@st.cache_data
def local_css(file_name):
    """Inject the stylesheet stored in ``file_name`` into the Streamlit page.

    The file is read as text and its content is emitted through
    ``st.markdown`` wrapped in a ``<style>`` tag, with ``unsafe_allow_html``
    enabled so the markup is not escaped.
    """
    with open(file_name) as css_file:
        css_rules = css_file.read()
    st.markdown(f"<style>{css_rules}</style>", unsafe_allow_html=True)

# predict module
def prediction(NIRS_csv, qsep, qhdr, model):
    """Load a CSV table and return the model's predictions for it.

    Parameters
    ----------
    NIRS_csv : path or file-like object handed to ``read_csv``.
    qsep : column separator of the CSV.
    qhdr : when equal to ``'yes'`` the first column becomes the row index;
        any other value passes ``index_col=False`` to ``read_csv``.
    model : object exposing a ``predict`` method.

    Returns
    -------
    Whatever ``model.predict`` returns for the loaded table.
    """
    index_column = 0 if qhdr == 'yes' else False
    spectra = read_csv(NIRS_csv, sep=qsep, index_col=index_column)
    return model.predict(spectra)


@st.cache_data
def reg_plot(meas, pred, train_idx, test_idx):
    """Draw the measured-vs-predicted plot for the calibration and validation sets.

    For each set a ``LinearRegression`` of the predicted values on the
    measured values is fitted; its intercept and slope (rounded to two
    decimals) are shown in the legend. A black identity line spanning the
    range of the calibration measurements is added, and points whose absolute
    residual (measured - predicted) exceeds ``mean + 3 * std`` of their set's
    residuals are annotated with their label.

    Parameters
    ----------
    meas : sequence of two array-likes
        Measured values; ``meas[0]`` is the calibration set and ``meas[1]``
        the validation set.
    pred : sequence of two array-likes
        Predicted values, ordered like ``meas``.
    train_idx, test_idx : iterables
        Labels of the calibration / validation points, in the same order as
        the corresponding values.

    Returns
    -------
    The matplotlib figure holding the plot.

    Notes
    -----
    The inputs are not modified: all reshaping is done on local copies.
    """
    # Flattened local copies; the caller's ``meas``/``pred`` containers stay untouched.
    meas_flat = [np.array(m).reshape(-1) for m in meas]
    pred_flat = [np.array(p).reshape(-1) for p in pred]

    # Intercept and slope of "predicted ~ measured" for each set (legend text).
    intercepts, slopes = [], []
    for m, p in zip(meas_flat, pred_flat):
        fit = LinearRegression().fit(m.reshape(-1, 1), p.reshape(-1, 1))
        slopes.append(float(np.round(fit.coef_[0][0], 2)))
        intercepts.append(float(np.round(fit.intercept_[0], 2)))

    # Residuals (measured - predicted) of each set, used to flag outliers.
    residuals = [m - p for m, p in zip(meas_flat, pred_flat)]

    fig, ax = plt.subplots(figsize=(12, 4))
    sns.regplot(x=meas_flat[0], y=pred_flat[0], color="#2C6B6F",
                label=f'Cal (Predicted = {intercepts[0]} + {slopes[0]} x Measured)',
                scatter_kws={'edgecolor': 'black'}, ax=ax)
    sns.regplot(x=meas_flat[1], y=pred_flat[1], color='#d0f7be',
                label=f'Val (Predicted = {intercepts[1]} + {slopes[1]} x Measured)',
                scatter_kws={'edgecolor': 'black'}, ax=ax)

    # Identity line (perfect prediction) over the calibration range.
    low = np.min(meas_flat[0]) - 0.05
    high = np.max(meas_flat[0]) + 0.05
    ax.plot([low, high], [low, high], color='black')

    # NOTE(review): the rule compares |e| with mean(e) + 3*std(e) rather than
    # |e - mean(e)| with 3*std(e); kept as in the original - confirm intent.
    for labels, k in ((train_idx, 0), (test_idx, 1)):
        errors = residuals[k]
        # The threshold is constant per set, so compute it once, not per point.
        threshold = np.mean(errors) + 3 * np.std(errors)
        for i, txt in enumerate(labels):
            if np.abs(errors[i]) > threshold:
                ax.annotate(txt, (meas_flat[k][i], pred_flat[k][i]))

    ax.set_ylabel('Predicted values')
    ax.set_xlabel('Measured values')
    ax.legend()
    ax.margins(0)
    return fig

@st.cache_data
def resid_plot(meas, pred, train_idx, test_idx):
    """Draw the residuals-vs-predicted plot for the calibration and validation sets.

    Residuals are computed as measured - predicted on flattened copies of the
    inputs. The y-axis is made symmetric around zero (1.1 times the largest
    absolute residual), a dotted zero line is drawn, and points whose absolute
    residual exceeds ``mean + 3 * std`` of their set's residuals are annotated
    with their label.

    Parameters
    ----------
    meas : sequence of two array-likes
        Measured values; ``meas[0]`` is the calibration set and ``meas[1]``
        the validation set.
    pred : sequence of two array-likes
        Predicted values, ordered like ``meas``.
    train_idx, test_idx : iterables
        Labels of the calibration / validation points, in the same order as
        the corresponding values.

    Returns
    -------
    The matplotlib figure holding the plot.
    """
    # Flatten both sides first so (n,), (n, 1) and pandas inputs are all
    # subtracted element by element, as the measured-vs-predicted plot does.
    meas_flat = [np.array(meas[k]).reshape(-1) for k in range(2)]
    pred_flat = [np.array(pred[k]).reshape(-1) for k in range(2)]
    resid = [m - p for m, p in zip(meas_flat, pred_flat)]

    fig, ax = plt.subplots(figsize=(12, 4))
    sns.scatterplot(x=pred_flat[0], y=resid[0], color="#2C6B6F", label='Cal',
                    edgecolor="black", ax=ax)
    sns.scatterplot(x=pred_flat[1], y=resid[1], color="#d0f7be", label='Val',
                    edgecolor="black", ax=ax)

    ax.axhline(y=0, c='black', linestyle=':')
    # Symmetric y-range with a 10 % headroom around the largest residual.
    lim = np.max(np.abs(np.concatenate(resid))) * 1.1
    ax.set_ylim(-lim, lim)

    # NOTE(review): the rule compares |e| with mean(e) + 3*std(e) rather than
    # |e - mean(e)| with 3*std(e); kept as in the original - confirm intent.
    for labels, x_vals, errors in ((train_idx, pred_flat[0], resid[0]),
                                   (test_idx, pred_flat[1], resid[1])):
        # The threshold is constant per set, so compute it once, not per point.
        threshold = np.mean(errors) + 3 * np.std(errors)
        for i, txt in enumerate(labels):
            if np.abs(errors[i]) > threshold:
                ax.annotate(txt, (x_vals[i], errors[i]))

    ax.set_ylabel('Residuals')
    ax.set_xlabel('Predicted values')
    ax.legend()
    ax.margins(0)
    return fig


# Function that creates a download button - needs the path of the file to serve and the file name to store it under
def download_results(data, export_name):
    """Render a primary 'Download' button serving the file located at ``data``.

    ``data`` is a path that is opened in text mode; the open handle is given
    to ``st.download_button`` together with ``export_name`` as the file name.
    """
    with open(data) as source:
        st.download_button(
            label='Download',
            data=source,
            file_name=export_name,
            type='primary',
        )

@st.cache_data
def plot_spectra(specdf, xunits, yunits):
    """Plot every row of ``specdf`` as a line on a single wide figure.

    Parameters
    ----------
    specdf : DataFrame
        One row per spectrum; the column labels form the x-axis.
    xunits, yunits : str
        Axis labels.

    Returns
    -------
    The matplotlib figure holding the plot.

    Notes
    -----
    When the first column label is not a string the x-axis is inverted;
    with string labels it is left as plotted.
    """
    fig, ax = plt.subplots(figsize=(30, 7))
    specdf.T.plot(legend=False, ax=ax, color='#2474b4')

    # Non-string column labels: draw the axis in reverse order.
    if not isinstance(specdf.columns[0], str):
        ax.invert_xaxis()

    ax.set_xlabel(xunits, fontsize=30)
    ax.set_ylabel(yunits, fontsize=30)
    ax.margins(x=0)
    fig.tight_layout()
    return fig

@st.cache_data
def hist(y, y_train, y_test, target_name = 'y'):
    """Overlay the histograms (with KDE curves) of the full, Cal and Val targets.

    ``y``, ``y_train`` and ``y_test`` are drawn in that order on one axes;
    ``target_name`` labels the x-axis and prefixes the legend entries.
    Returns the matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(12, 3))

    # (values, colour, legend text) for each layer, drawn in this order.
    layers = (
        (y, "#004e9e", str(target_name)),
        (y_train, "#2C6B6F", str(target_name) + " (Cal)"),
        (y_test, "#d0f7be", str(target_name) + " (Val)"),
    )
    for values, shade, legend_text in layers:
        sns.histplot(values, color=shade, kde=True, label=legend_text, ax=ax, fill=True)

    ax.set_xlabel(str(target_name))
    plt.legend()
    plt.tight_layout()
    return fig


@st.cache_data
def pred_hist(pred):
    """Return a 12-bin histogram figure of ``pred`` with a light grid and no frame."""
    figure, axis = plt.subplots(1, 1, figsize=(15, 3), tight_layout=True)

    # Dash-dotted grey grid lines on both axes.
    axis.grid(color='grey', linestyle='-.', linewidth=0.5, alpha=0.6)

    # Hide the four spines framing the plot.
    for side in ('top', 'bottom', 'left', 'right'):
        axis.spines[side].set_visible(False)

    # No tick marks, plus some padding between each axis and its tick labels.
    for sub_axis, padding in ((axis.xaxis, 5), (axis.yaxis, 10)):
        sub_axis.set_ticks_position('none')
        sub_axis.set_tick_params(pad=padding)

    # Draw the histogram itself; the returned counts/bins/patches are not needed.
    axis.hist(pred, bins=12)
    return figure

@st.cache_data
def fig_export():
    """Placeholder with no implementation yet; does nothing and returns None."""
    return None


@st.cache_data(show_spinner=True)
def data_split(x, y):
    """Split ``x`` and ``y`` into training and test subsets.

    The row positions are obtained from ``train_test_split_idx`` called with
    the "kennard_stone" method, the "correlation" metric, a test size of 0.25
    and ``random_state=42``; both inputs are then sliced with ``.iloc``.

    Returns
    -------
    ``(X_train, X_test, y_train, y_test, train_index, test_index)``
    """
    split_options = {
        "method": "kennard_stone",
        "metric": "correlation",
        "test_size": 0.25,
        "random_state": 42,
    }
    train_index, test_index = train_test_split_idx(x, y=y, **split_options)

    # Predictors are wrapped in a DataFrame; targets are sliced as they are.
    X_train = DataFrame(x.iloc[train_index, :])
    y_train = y.iloc[train_index]
    X_test = DataFrame(x.iloc[test_index, :])
    y_test = y.iloc[test_index]
    return X_train, X_test, y_train, y_test, train_index, test_index

## descriptive stat
@st.cache_data(show_spinner=True)
def desc_stats(x):
    """Return a dict of descriptive statistics for ``x``.

    Keys, in order: 'N samples' (``x.shape[0]``), 'Min', 'Max', 'Mean',
    'Median', 'S' (``np.std(x)``), 'RSD' (``S * 100 / Mean``), 'Skew' and
    'Kurt' (``skew`` / ``kurtosis`` along axis 0 with ``bias=True``).
    """
    summary = {
        'N samples': x.shape[0],
        'Min': np.min(x),
        'Max': np.max(x),
        'Mean': np.mean(x),
        'Median': np.median(x),
        'S': np.std(x),
    }
    # Relative standard deviation, in percent of the mean.
    summary['RSD'] = summary['S'] * 100 / summary['Mean']
    summary['Skew'] = skew(x, axis=0, bias=True)
    summary['Kurt'] = kurtosis(x, axis=0, bias=True)
    return summary


def hash_data(data):
    """Return a hexadecimal xxHash32 fingerprint of ``data``.

    The value is first turned into bytes, then hashed with ``xxhash.xxh32``
    (a fast non-cryptographic hash - not MD5):

    * DataFrame / Series: ``to_string()``;
    * ndarray: ``np.array2string`` with a comma separator, never summarised;
    * list / tuple / int / float / str / bool: ``str(data)``;
    * dict: ``str`` of its sorted items, so insertion order does not matter;
    * bytes: hashed as they are.

    Parameters
    ----------
    data : one of the types listed above.

    Returns
    -------
    str
        Hexadecimal digest.

    Raises
    ------
    TypeError
        If ``data`` is of any other type.

    Notes
    -----
    NOTE(review): pandas and NumPy values are hashed through their printed
    representation, so differences the printout does not show (for example
    digits beyond the print precision) are presumably not reflected in the
    hash - confirm this is acceptable for the callers.
    """
    import xxhash

    if isinstance(data, (DataFrame, Series)):
        payload = data.to_string().encode()
    elif isinstance(data, np.ndarray):
        # A threshold above the element count prevents the "..." summary that
        # would make large arrays differing only in the middle hash alike.
        payload = np.array2string(data, separator=',', threshold=data.size + 1).encode()
    elif isinstance(data, (list, tuple)):
        payload = str(data).encode()
    elif isinstance(data, dict):
        # Sort the items so the hash does not depend on insertion order.
        payload = str(sorted(data.items())).encode()
    elif isinstance(data, (int, float, str, bool)):
        payload = str(data).encode()
    elif isinstance(data, bytes):
        # Hash the raw bytes: decoding with errors ignored would drop invalid
        # sequences and let different binary payloads collide.
        payload = data
    else:
        raise TypeError(f"Unsupported data type: {type(data)}")

    return xxhash.xxh32(payload).hexdigest()


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ style test

@st.cache_data
def background_img(change):
    """Set './images/img-sky.jpg' as the fixed, page-covering app background.

    The image is read in binary mode, base64-encoded and embedded as a data
    URL in a ``<style>`` rule targeting ``.stApp``, which is then injected
    with ``st.markdown``.

    ``change`` is not used in the body; presumably it only lets callers force
    a new cache entry - confirm against the call sites.
    """
    import base64

    with open('./images/img-sky.jpg', "rb") as picture:
        encoded_picture = base64.b64encode(picture.read()).decode('utf-8')

    # CSS rule embedding the encoded image as the background of the app.
    page_style = f"""
        <style>
        .stApp {{
            background-image: url("data:image/jpeg;base64,{encoded_picture}");
            background-size: cover;
            background-repeat: no-repeat;
            background-attachment: fixed;
        }}
        </style>
    """

    st.markdown(page_style, unsafe_allow_html=True)