# 2-model_creation.py

import streamlit
from Packages import *
# Page-wide settings: browser tab title, tab icon and wide layout.
# NOTE(review): this sits between the project imports, presumably because
# Streamlit wants set_page_config to run before any other st.* command —
# confirm that the imports below issue Streamlit commands before moving it.
st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
from Modules import *
from Class_Mod.DATA_HANDLING import *
from pandas.api.types import is_float_dtype

# Render the shared page header (helper provided by the project's star-imports).
add_header()

# Write the 'interface' entry back onto itself so the key always exists in
# session_state (it becomes None when it was never set).
# NOTE(review): presumably this also keeps a widget-backed value alive when
# switching pages — confirm against the page that sets 'interface'.
st.session_state["interface"] = st.session_state.get('interface')
# With the 'simple' interface, hide_pages is called for the "Predictions" page.
if st.session_state["interface"] == 'simple':
    hide_pages("Predictions")


    ####################################### page Design #######################################
# Static page skeleton. The containers created here are filled in further down
# the script, so their creation order fixes the visual order of the page.
st.header("Calibration Model Development", divider='blue')
st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra")
# First row: M1 receives the split/algorithm controls, M2 the data summary and
# model information, M3 the file inputs.
M1, M2, M3 = st.columns([2,3,2])
# Second row: M4 and M5 are created but not written to in this script.
M4, M5 = st.columns([6,2])
st.write("---")
st.header("Model Diagnosis", divider='blue')

# Diagnosis row: M7 holds the predicted-vs-measured plot, M8 the residuals plot.
M7, M8 = st.columns([2,2])
M7.write('Predicted vs Measured values')
M8.write('Residuals plot')
# Export row: M9 holds the model name input and export button; M10 is not
# written to in this script.
M9, M10 = st.columns([2,2])
M9.write("-- Save the model --")
            ######################################################################


# Regression algorithms offered in the selector. The rest of the script
# compares the selection against these entries by position, so the order must
# stay as is; the leading empty string stands for "nothing selected yet".
reg_algo = [
    "",
    "Full-PLSR",
    "Locally Weighted PLSR",
    "Interval-PLSR",
    "Full-PLSR-sklearn",
    "PrePLStester",
]

# Accepted input formats: index 0 is the CSV pair, index 1 the single .dx file.
files_format = ['.csv', '.dx']
file = M3.radio('select data file format:', options=files_format)

### Data
# Empty placeholders until a file has been loaded. They must be DataFrame
# *instances*: on the class itself `.empty` is a property object (always
# truthy) rather than a boolean, so emptiness checks would only work by luck.
spectra = pd.DataFrame()
y = pd.DataFrame()

# Data loading: fills `spectra` (predictors) and `y` (target) from either a
# pair of CSV files or a single .dx file, depending on the selected format.
# When nothing usable is loaded, both stay (or are reset to) empty DataFrames.

# load .csv file
if file == files_format[0]:
    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
    if xcal_csv:
        # Each detection helper runs once; its answer feeds both the label and
        # the preselected option. An answer that is not among the options
        # falls back to the first option instead of raising ValueError.
        sep_options = [";", ","]
        detected_sep = str(find_delimiter('data/'+xcal_csv.name))
        sepx = M3.radio("Select separator (X file) - _detected_: " + detected_sep,
                        options=sep_options,
                        index=sep_options.index(detected_sep) if detected_sep in sep_options else 0,
                        key=0)
        hdr_options = ["no", "yes"]
        detected_hdr = str(find_col_index('data/'+xcal_csv.name))
        hdrx = M3.radio("samples name (X file)? - _detected_: " + detected_hdr,
                        options=hdr_options,
                        index=hdr_options.index(detected_hdr) if detected_hdr in hdr_options else 0,
                        key=1)
        # index_col for the X file: the first column when it holds sample
        # names, otherwise False so pandas uses no column as index.
        col_x = 0 if hdrx == "yes" else False

    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
    if ycal_csv:
        sepy = M3.radio("separator (Y file): ", options=[";", ","], key=2)
        hdry = M3.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
        # Kept separate from col_x: sharing one variable made the Y-file answer
        # silently override the X-file one.
        col_y = 0 if hdry == "yes" else False

    if xcal_csv and ycal_csv:
        xfile = pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col_x, header=0)
        yfile = pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col_y)

        if yfile.shape[1]>0 and xfile.shape[1]>0 :
            spectra, meta_data = col_cat(xfile)
            y, idx = col_cat(yfile)
            # Several candidate targets: let the user pick one column.
            if y.shape[1]>1:
                yname = M3.selectbox('Select target', options=y.columns)
                y = y.loc[:,yname]
            else:
                y = y.iloc[:,0]

            spectra = pd.DataFrame(spectra).astype(float)
            if not meta_data.empty :
                st.write(meta_data)

            # Mismatched row counts: warn and reset both to empty so the
            # modelling section is skipped.
            if spectra.shape[0] != y.shape[0]:
                M3.warning('X and Y have different sample size')
                y = pd.DataFrame()
                spectra = pd.DataFrame()

        else:
            M1.warning('Tune decimal and separator parameters')


## Load .dx file
elif file == files_format[1]:
    data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
    if data_file:
        # read_dx takes a path, so the upload is copied to a temporary file.
        # The file is closed (hence flushed) before it is parsed, and removed
        # afterwards even if parsing fails.
        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
            tmp.write(data_file.read())
            tmp_path = tmp.name
        try:
            chem_data, spectra, meta_data, meta_data_st = read_dx(file =  tmp_path)
        finally:
            os.unlink(tmp_path)
        M3.success("The data have been loaded successfully", icon="✅")
        if chem_data.shape[1]>0:
            yname = M3.selectbox('Select target', options=chem_data.columns)
            # Keep only the samples whose selected target value is > 0.
            measured = chem_data.loc[:,yname] > 0
            y = chem_data.loc[:,yname].loc[measured]
            spectra = spectra.loc[measured]
        else:
            M3.warning('Warning: Chemical data are not included in your file !', icon="⚠️")

### split the data
# Everything below needs both the predictors and the target to be loaded.
if not (spectra.empty or y.empty):
    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
    # Kennard-Stone split on the correlation metric; 25% of the samples are
    # held out for testing.
    train_index, test_index = train_test_split_idx(
        spectra,
        y=y,
        method="kennard_stone",
        metric="correlation",
        test_size=0.25,
        random_state=rd_seed,
    )

    # Assign data to training and test sets
    X_train = pd.DataFrame(spectra.iloc[train_index, :])
    y_train = y.iloc[train_index]
    X_test = pd.DataFrame(spectra.iloc[test_index, :])
    y_test = y.iloc[test_index]

    # Descriptive statistics used in the summary table.
    def sk(x):
        """Skewness of x along axis 0, computed with bias=True."""
        return skew(x, axis=0, bias=True)

    def ku(x):
        """Kurtosis of x along axis 0, computed with bias=True."""
        return kurtosis(x, axis=0, bias=True)

    def cv(x):
        """Coefficient of variation of x, in percent."""
        return x.std()*100/x.mean()

    M2.write('Loaded data summary')
    M2.write(f'The loaded spectra consist of {spectra.shape[1]} wavelengths')

    # One row per subset (train, test, total), one column per statistic.
    datainf = pd.DataFrame()
    datainf['N samples'] = [X_train.shape[0], X_test.shape[0], spectra.shape[0]]
    for label, stat in (
        ('Mean', lambda values: values.mean()),
        ('SD', lambda values: values.std()),
        ('CV(%)', cv),
        ('Skewness', sk),
        ('Kurtosis', ku),
    ):
        datainf[label] = [stat(y_train), stat(y_test), stat(y)]
    datainf.index = ['Train', 'Test', 'Total']
    M2.write(datainf.round(3))

#######################################
    # Algorithm choice. Each branch builds `Reg`; every branch except the
    # locally weighted one also sets `reg_model`.
    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
    if regression_algo == reg_algo[1]:
        # Full-PLSR
        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
        reg_model = Reg.model_

    elif regression_algo == reg_algo[2]:
        # Locally Weighted PLSR runs in a separate interpreter
        # (Class_Mod/LWPLSR_Call.py). Data is exchanged through files in temp/:
        # the four arrays go out as CSV, the results are read back from JSON.
        # NOTE(review): this branch never sets `reg_model`, and the `Reg` built
        # here only carries `model` and `pred_data_` — confirm that the code
        # reading `Reg`/`reg_model` afterwards copes with that.
        arrays_out = {
            'x_train_np': X_train.to_numpy(),
            'y_train_np': y_train.to_numpy(),
            'x_test_np': X_test.to_numpy(),
            'y_test_np': y_test.to_numpy(),
        }
        temp_path = Path('temp/')
        csv_paths = [temp_path / (name + ".csv") for name in arrays_out]
        json_path = temp_path / "lwplsr_outputs.json"
        import subprocess
        subprocess_path = Path("Class_Mod/")
        try:
            for csv_path, array in zip(csv_paths, arrays_out.values()):
                np.savetxt(csv_path, array, delimiter=",")
            subprocess.run([f"{sys.executable}", subprocess_path / "LWPLSR_Call.py"])
            with open(json_path, "r") as outfile:
                Reg_json = json.load(outfile)
        finally:
            # Remove the exchange files even when the subprocess or the JSON
            # load failed, so nothing stale is left behind in temp/.
            for leftover in csv_paths + [json_path]:
                if leftover.exists():
                    os.unlink(leftover)

        pred = ['pred_data_train', 'pred_data_cv1', 'pred_data_cv2', 'pred_data_cv3', 'pred_data_test']
        # Lightweight stand-in object exposing the JSON content as attributes.
        Reg = type('obj', (object,), {'model' : pd.json_normalize(Reg_json['model']), 'pred_data_' : [pd.json_normalize(Reg_json[i]) for i in pred]})
        for i, name in enumerate(pred):
            frame = Reg.pred_data_[i].T.reset_index().drop(columns = ['index'])
            # Only the last entry is indexed by the test samples; all the
            # others are indexed by the training samples.
            frame.index = list(y_test.index) if name == 'pred_data_test' else list(y_train.index)
            Reg.pred_data_[i] = frame

    elif regression_algo == reg_algo[3]:
        # Interval-PLSR
        s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
        it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)

        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
        pro = M1.progress(0, text="The model is being created. Please wait!")
        # `rega` is read again at export time (rega[1] is written to CSV).
        rega = Reg.BandSelect(n_iter=it)
        pro.empty()
        M1.progress(100, text = "The model has successfully been  created!")
        time.sleep(1)
        reg_model = Reg.model_
        M2.write('-- Table of selected wavelengths --')
        M2.table(rega[0])

    elif regression_algo == reg_algo[4]:
        # Full-PLSR-sklearn
        Reg = PlsR(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)
        reg_model = Reg.model_

    elif regression_algo == reg_algo[5]:
        # PrePLStester
        Reg = PlsProcess(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test, scale = False, Kfold=3)
        Reg.tune(n_iter=500)
        reg_model = Reg.model_

        ################# Model analysis ############
    # Diagnostics run for every real algorithm choice; the empty first entry
    # of reg_algo means nothing has been selected.
    if regression_algo in reg_algo[1:]:
        # The first three prediction sets are paired below with the train,
        # train and test targets respectively.
        # NOTE(review): for "Locally Weighted PLSR" pred_data_ has five
        # entries, so index 2 is a CV fold rather than the test set — confirm.
        predictions = Reg.pred_data_
        yc, ycv, yt = predictions[0], predictions[1], predictions[2]

        # Show the selected hyper-parameters and persist them as JSON.
        M2.write('-- Spectral preprocessing info --')
        best_params = Reg.best_hyperparams
        M2.write(best_params)
        with open("data/params/Preprocessing.json", "w") as outfile:
            json.dump(best_params, outfile)

        M2.write("-- Performance metrics --")
        M2.dataframe(Reg.metrics_)

        # Diagnostic figures: predicted vs measured on the left, residuals on
        # the right.
        fit_figure = reg_plot(
            [y_train, y_train, y_test],
            [yc, ycv, yt],
            train_idx=train_index,
            test_idx=test_index,
        )
        M7.pyplot(fit_figure)
        residual_figure = resid_plot(
            [y_train, y_train, y_test],
            [yc, ycv, yt],
            train_idx=train_index,
            test_idx=test_index,
        )
        M8.pyplot(residual_figure)
            
            
            #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
        # Export: dumps `reg_model` to data/models/ under a name built from the
        # user-supplied label and the input file name(s). For Interval-PLSR
        # the sorted rega[1] is additionally written to a CSV file.
        # NOTE(review): `reg_model` must have been set by the training branch;
        # the "Locally Weighted PLSR" branch does not set it.
        model_name = M9.text_input('Give it a name')
        date_time = datetime.date.today().strftime('_%Y_%m_%d_')
        if M9.button('Export Model'):
            path = 'data/models/model_'
            if file == files_format[0]:
                # Part of each file name before its first dot. partition()
                # keeps the whole name when there is no dot, where slicing up
                # to find(".") would drop the last character.
                x_stem = xcal_csv.name.partition(".")[0]
                y_stem = ycal_csv.name.partition(".")[0]
                model_file = (path + model_name + date_time + '_created_on_' + x_stem
                              + '_and_' + y_stem + '_data_' + '.pkl')
                wavelengths_file = (path + model_name + date_time + '_on_' + x_stem
                                    + '_and_' + y_stem + '_data_' + 'Wavelengths_index.csv')
            else:
                # The format radio only offers two choices, so this is '.dx'.
                # NOTE(review): unlike the CSV case, the model file name has
                # no date stamp here; names are kept exactly as they were.
                dx_stem = data_file.name.partition(".")[0]
                model_file = path + model_name + '_on_' + dx_stem + '_data_' + '.pkl'
                wavelengths_file = (path + dx_stem + model_name + date_time + '_on_'
                                    + '_data_' + 'Wavelengths_index.csv')

            with open(model_file, 'wb') as f:
                joblib.dump(reg_model, f)

            if regression_algo == reg_algo[3]:
                rega[1].sort()
                pd.DataFrame(rega[1]).to_csv(wavelengths_file, sep = ';')

            # Single confirmation for every successful export. It used to be
            # shown twice for .dx + Interval-PLSR and not at all for the other
            # algorithms.
            st.write('Model Exported')
                        

                # create a report with information on the model
                ## see https://stackoverflow.com/a/59578663


        # In the 'simple' interface, offer a direct link to the prediction page.
        # The path uses a forward slash: a backslash is a path separator on
        # Windows only, so 'pages\\...' cannot be resolved on other systems.
        if st.session_state['interface'] == 'simple':
            st.page_link('pages/3-prediction.py', label = 'Keep on keepin\' on to predict your values !')

## Load .dx file