Skip to content
Snippets Groups Projects
2-model_creation.py 9.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • from Packages import *
    st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
    from Modules import *
    from Class_Mod.DATA_HANDLING import *
    
    DIANE's avatar
    DIANE committed
    
    
    
    # HTML for the "CEFE - CNRS" banner
    bandeau_html = """
    <div style="width: 100%; background-color: #4682B4; padding: 10px; margin-bottom: 10px;">
    
      <h1 style="text-align: center; color: white;">CEFE - CNRS / UM</h1>
    
    </div>
    """
    # Inject the banner's HTML code
    st.markdown(bandeau_html, unsafe_allow_html=True)
    
    DIANE's avatar
    DIANE committed
    
    
    # Re-publish the interface mode so the "interface" key always exists in the
    # session state, then hide the "Predictions" page in the simple interface.
    interface_mode = st.session_state.get('interface')
    st.session_state["interface"] = interface_mode
    if interface_mode == 'simple':
        hide_pages("Predictions")
    
    def nn(x):
        """Tell whether *x* holds a value, i.e. is anything other than ``None``."""
        if x is None:
            return False
        return True
    
    ########################################################################################
    
    DIANE's avatar
    DIANE committed
    # Entries offered by the algorithm selector. Index 0 is the empty choice,
    # which matches none of the algorithm branches further down.
    reg_algo = [
        "",
        "Full-PLSR",
        "Locally Weighted PLSR",
        "Interval-PLSR",
        "Full-PLSR-sklearn",
    ]
    
    DIANE's avatar
    DIANE committed
    # Page design: build the static skeleton of the page. The column containers
    # created here (M1, M2, M3, M7, M8, M9, ...) are filled in further down the
    # script, so the order of these calls fixes the on-screen layout.

    st.header("Calibration Model Development", divider='blue')

    st.write("Create a predictive model, then use it for predicting your target variable (chemical values) from NIRS spectra")

    # Top area: three equal-width columns, followed by a wide/narrow pair.
    M1, M2, M3 = st.columns([2,2,2])
    M4, M5 = st.columns([6,2])

    st.write("---")
    st.header("Model Diagnosis", divider='blue')


    # Diagnosis area: M7 and M8 later receive the regression and residual plots.
    M7, M8 = st.columns([2,2])

    M7.write('Predicted vs Measured values')
    M8.write('Residuals plot')
    # Last row: M9 later hosts the model name input and the export button.
    M9, M10 = st.columns([2,2])
    M9.write("-- Save the model --")
    
    
    
    DIANE's avatar
    DIANE committed
    # Supported input formats; the radio button in column M3 decides which
    # loading branch runs below.
    files_format = ['.csv', '.dx']
    file = M3.radio(
        'select data file format:',
        options=files_format,
    )
    
    DIANE's avatar
    DIANE committed
    ### Data
    # Placeholders used until the user has loaded something. They must be empty
    # DataFrame *instances* (not the DataFrame class itself) so that the later
    # `.empty` checks evaluate to real booleans.
    spectra = pd.DataFrame()
    y = pd.DataFrame()
    
    DIANE's avatar
    DIANE committed
    # load .csv file
    if file == files_format[0]:
        # CSV input: spectra (X) and chemical values (Y) are uploaded as two files.
        xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")

        if xcal_csv:
            # Run each detection helper once and reuse the result for both the
            # widget label and the preselected option.
            # NOTE(review): the helpers are given 'data/<uploaded file name>', i.e.
            # a path on disk rather than the uploaded buffer - confirm this is intended.
            detected_sep = str(find_delimiter('data/'+xcal_csv.name))
            detected_hdr = str(find_col_index('data/'+xcal_csv.name))
            sepx = M3.radio("Select separator (X file) - _detected_: " + detected_sep,
                                    options=[";", ","], index=[";", ","].index(detected_sep), key=0)
            hdrx = M3.radio("samples name (X file)? - _detected_: " + detected_hdr,
                                    options=["no", "yes"], index=["no", "yes"].index(detected_hdr), key=1)
            # Index column of the X file: first column when it holds sample names.
            col_x = 0 if hdrx == "yes" else False

        ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")

        if ycal_csv:
            sepy = M3.radio("separator (Y file): ", options=[";", ","], key=2)
            hdry = M3.radio("samples name (Y file)?: ", options=["no", "yes"], key=3)
            # Kept separate from col_x so the Y choice cannot override the X choice.
            col_y = 0 if hdry == "yes" else False

        if xcal_csv and ycal_csv:
            spectra, meta_data = col_cat(pd.read_csv(xcal_csv, decimal='.', sep=sepx, index_col=col_x, header=0))
            y, _ = col_cat(pd.read_csv(ycal_csv, decimal='.', sep=sepy, index_col=col_y))

            # Keep the first Y column as the target and force numeric dtypes.
            y = pd.DataFrame(y).astype(float).iloc[:,0]
            spectra = pd.DataFrame(spectra).astype(float)
            st.write(meta_data)

            # Only warn and discard the data when X and Y really disagree on the
            # number of samples.
            if spectra.shape[0] != y.shape[0]:
                M3.warning('The number of samples is different in X and Y')
                y = pd.DataFrame()
                spectra = pd.DataFrame()

    ## Load .dx file
    elif file == files_format[1]:
        data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
        if data_file:
            # read_dx is given a path, so spool the upload to a temporary file.
            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
                tmp.write(data_file.read())
                tmp_path = tmp.name
            # The handle is closed at this point, so every byte has been flushed to
            # disk before the file is parsed.
            try:
                chem_data, spectra, meta_data = read_dx(file =  tmp_path)
                M3.success("The data have been loaded successfully", icon="")
                yname = M3.selectbox('Select target', options=chem_data.columns)
                # Keep only the samples whose selected target value is > 0.
                measured = chem_data.loc[:,yname] > 0
                y = chem_data.loc[:,yname].loc[measured]
                spectra = spectra.loc[measured]
            finally:
                # Remove the temporary file even when parsing fails.
                os.unlink(tmp_path)
    
    ### split the data
    if not (spectra.empty or y.empty):
        rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
        # Kennard-Stone split with the correlation metric; 25% of the samples
        # are held out for testing.
        train_index, test_index = train_test_split_idx(
            spectra,
            y=y,
            method="kennard_stone",
            metric="correlation",
            test_size=0.25,
            random_state=rd_seed,
        )
        # Slice spectra and target into their training and test subsets.
        X_train = pd.DataFrame(spectra.iloc[train_index, :])
        y_train = y.iloc[train_index]
        X_test = pd.DataFrame(spectra.iloc[test_index, :])
        y_test = y.iloc[test_index]
    
    DIANE's avatar
    DIANE committed
        
        def sk(x):
            """Skewness of *x* along axis 0, computed with ``bias=True``."""
            return skew(x, axis=0, bias=True)

        def ku(x):
            """Kurtosis of *x* along axis 0, computed with ``bias=True``."""
            return kurtosis(x, axis=0, bias=True)

        def cv(x):
            """Coefficient of variation of *x*, in percent (std * 100 / mean)."""
            return x.std()*100/x.mean()

        M2.write('Loaded data summary')
        M2.write(f'The loaded spectra consist of {spectra.shape[1]} wavelengths')

        # One row per subset, in the same order as the index labels below.
        target_subsets = [y_train, y_test, y]
        datainf = pd.DataFrame(
            {
                'N samples': [X_train.shape[0], X_test.shape[0], spectra.shape[0]],
                'Mean': [subset.mean() for subset in target_subsets],
                'SD': [subset.std() for subset in target_subsets],
                'CV(%)': [cv(subset) for subset in target_subsets],
                'Skewness': [sk(subset) for subset in target_subsets],
                'Kurtosis': [ku(subset) for subset in target_subsets],
            },
            index=['Train', 'Test', 'Total'],
        )
        M2.write(datainf.round(3))
    
    DIANE's avatar
    DIANE committed
    
    
    DIANE's avatar
    DIANE committed
    #######################################
        # Let the user pick an algorithm, then fit the matching model wrapper.
        # Every branch binds both `Reg` (the fitted wrapper) and `reg_model`
        # (the object written to disk by the export step).
        regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
        if regression_algo == reg_algo[1]:
            # Full-PLSR
            Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
            reg_model = Reg.model_
        elif regression_algo == reg_algo[2]:
            # Locally Weighted PLSR
            Reg = LWPLSR(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)
            # Without this binding the export step fails with a NameError.
            # NOTE(review): assumes LWPLSR exposes `model_` like the other wrappers - confirm.
            reg_model = Reg.model_
        elif regression_algo == reg_algo[3]:
            # Interval-PLSR: the user tunes the interval count and the number of
            # search iterations before the band selection runs.
            s = M1.number_input(label='Enter the maximum number of intervals', min_value=1, max_value=6, value=3)
            it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
            Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
            pro = M1.progress(0, text="The model is being created. Please wait!")
            rega = Reg.BandSelect(n_iter=it)
            # Swap the "in progress" bar for a completed one and keep it visible briefly.
            pro.empty()
            M1.progress(100, text = "The model has successfully been  created!")
            time.sleep(1)
            reg_model = Reg.model_
            M2.write('-- Table of selected wavelengths --')
            M2.table(rega[0])
        elif regression_algo == reg_algo[4]:
            # Full-PLSR-sklearn
            Reg = PlsR(x_train = X_train, x_test = X_test, y_train = y_train, y_test = y_test)
            reg_model = Reg.model_
    
    
            ################# Model analysis ############
    
    DIANE's avatar
    DIANE committed
        if regression_algo in reg_algo[1:]:
            # Predictions exposed by the fitted wrapper; they are plotted against
            # y_train, y_train and y_test respectively.
            predictions = Reg.pred_data_
            yc, ycv, yt = predictions[0], predictions[1], predictions[2]

            M2.write("-- Performance metrics --")
            M2.dataframe(Reg.metrics_)

            # Diagnostic plots go into the two columns reserved for them.
            M7.pyplot(reg_plot([y_train, y_train, y_test], [yc, ycv, yt]))
            M8.pyplot(resid_plot([y_train, y_train, y_test], [yc, ycv, yt]))

            model_name = M9.text_input('Give it a name')
            # Date stamp (e.g. "_2024_01_31_") embedded in exported file names.
            date_time = datetime.date.today().strftime('_%Y_%m_%d_')
    
    DIANE's avatar
    DIANE committed
            if M9.button('Export Model'):
                # Dump the fitted model (and, for Interval-PLSR, the selected
                # wavelength indexes) under data/models/, then confirm the export.
                # NOTE(review): model_name is free user text that ends up in a file
                # path - consider restricting it to a safe character set.
                path = 'data/models/model_'
                os.makedirs(os.path.dirname(path), exist_ok=True)

                if file == files_format[0]:
                    # partition() keeps the text before the first dot and, unlike
                    # name[:name.find(".")], leaves a name without any dot intact.
                    x_stem = xcal_csv.name.partition(".")[0]
                    y_stem = ycal_csv.name.partition(".")[0]
                    model_file = (path + model_name + date_time + '_created_on_' + x_stem
                                  + '_and_' + y_stem + '_data_' + '.pkl')
                    wavelengths_file = (path + model_name + date_time + '_on_' + x_stem
                                        + '_and_' + y_stem + '_data_' + 'Wavelengths_index.csv')
                else:
                    # files_format[1]: the only other option offered by the radio button.
                    dx_stem = data_file.name.partition(".")[0]
                    model_file = path + model_name + '_on_' + dx_stem + '_data_' + '.pkl'
                    wavelengths_file = (path + dx_stem + model_name + date_time + '_on_'
                                        + '_data_' + 'Wavelengths_index.csv')

                with open(model_file, 'wb') as f:
                    joblib.dump(reg_model, f)

                if regression_algo == reg_algo[3]:
                    # Interval-PLSR additionally stores the sorted wavelength indexes.
                    rega[1].sort()
                    pd.DataFrame(rega[1]).to_csv(wavelengths_file, sep = ';')

                # Confirm exactly once, whatever the algorithm or input format.
                st.write('Model Exported')
    
    DIANE's avatar
    DIANE committed
                            
    
    
                    # create a report with information on the model
                    ## see https://stackoverflow.com/a/59578663
    
    DIANE's avatar
    DIANE committed
            if st.session_state['interface'] == 'simple':
                # Use a forward slash: it is accepted on every platform, whereas a
                # backslash is not a path separator on POSIX hosts.
                st.page_link('pages/3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
    
    
    ## Load .dx file