Commit 1b115462 authored by Nicolas Barthes

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	Class_Mod/UMAP_.py
#	Class_Mod/__init__.py
#	Modules.py
#	Packages.py
#	pages/1-samples_selection.py
Parents: f98b67f8 9c513852
@@ -31,17 +31,13 @@ def col_cat(data_import):
     if len(categorical_columns_list) > 0:
         categorical_data = pd.concat(categorical_columns_list, axis=1)
     if len(categorical_columns_list) == 0:
-        empty = ["" for x in range(len(data_import))]
-        categorical_columns_list.append(empty)
-        categorical_data = pd.DataFrame(categorical_columns_list).T
-        categorical_data.columns = ['no categories']
+        categorical_data = pd.DataFrame
     # Create numerical data matrix from the numerical columns list and fill na with the mean of the column
     numerical_data = pd.concat(numerical_columns_list, axis=1)
     numerical_data = numerical_data.apply(lambda x: x.fillna(x.mean())) #np.mean(x)))
-    # Scale the numerical data
-    scaler = StandardScaler()
-    scaled_values = scaler.fit_transform(numerical_data)
-    return numerical_data, categorical_data, scaled_values
+    return numerical_data, categorical_data

 def list_files(mypath, import_type):
...
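Note that in the new branch `categorical_data = pd.DataFrame` binds the class itself rather than an instance, so later `.empty` checks on it do not behave like checks on an empty frame; `pd.DataFrame()` was likely intended. A minimal, hypothetical re-implementation of the new contract (the dtype-based split is an assumption about col_cat's internals, which this hunk does not show):

import pandas as pd

# Hypothetical sketch of the refactored col_cat contract, for illustration only.
def col_cat_sketch(data_import: pd.DataFrame):
    # Split columns by dtype: object/category columns are treated as metadata.
    categorical = data_import.select_dtypes(include=['object', 'category'])
    numerical = data_import.select_dtypes(include='number')
    # Fill missing values with the column mean, as in the committed version.
    numerical = numerical.apply(lambda col: col.fillna(col.mean()))
    # An instantiated empty frame keeps downstream .empty checks working.
    if categorical.shape[1] == 0:
        categorical = pd.DataFrame()
    return numerical, categorical

# Usage sketch:
# spectra, meta_data = col_cat_sketch(pd.read_csv('data/file.csv', sep=';'))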
@@ -29,21 +29,21 @@ class DxRead:
             block_met = {   'name': block['title'],
                             'origin': block['origin'],
                             'date': block['date'],
-                            'time': block['time'],
-                            'spectrometer/data system': block['spectrometer/data system'],
-                            'instrumental parameters': block['instrumental parameters'],
-                            'xunits': block['xunits'],
-                            'yunits': block['yunits'],
-                            'xfactor': block['xfactor'],
-                            'yfactor': block['yfactor'],
-                            'firstx': block['firstx'],
-                            'lastx': block['lastx'],
-                            'firsty':block['firsty'],
-                            'miny': block['miny'],
-                            'maxy': block['maxy'],
-                            'npoints': block['npoints'],
+                            # 'time': block['time'],
+                            # 'spectrometer/data system': block['spectrometer/data system'],
+                            # 'instrumental parameters': block['instrumental parameters'],
+                            # 'xunits': block['xunits'],
+                            # 'yunits': block['yunits'],
+                            # 'xfactor': block['xfactor'],
+                            # 'yfactor': block['yfactor'],
+                            # 'firstx': block['firstx'],
+                            # 'lastx': block['lastx'],
+                            # 'firsty':block['firsty'],
+                            # 'miny': block['miny'],
+                            # 'maxy': block['maxy'],
+                            # 'npoints': block['npoints'],
                             'concentrations':block['concentrations'],
-                            'deltax':block['deltax']
+                            # 'deltax':block['deltax']
                             }
             self.__met[f'{i}'] = block_met
         self.metadata_ = pd.DataFrame(self.__met).T
@@ -87,8 +87,13 @@ class DxRead:
         return self.spectra

     @property
     def md_df_(self):
-        return self.metadata_
+        return self.metadata_.drop("concentrations", axis = 1)

     @property
     def chem_data_(self):
-        return self.chem_data
\ No newline at end of file
+        return self.chem_data
+
+@st.cache_data
+def read_dx(file):
+    M = DxRead(file)
+    return M.chem_data, M.specs_df_, M.md_df_
\ No newline at end of file
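The new `read_dx` helper wraps `DxRead` behind `st.cache_data`, so Streamlit reruns reuse the parsed result instead of re-parsing the .dx file, and `md_df_` now excludes the `concentrations` column. A usage sketch (the file path is a placeholder):

from Class_Mod import read_dx

# Placeholder path; any JCAMP-DX file with the expected blocks would do.
chem_data, spectra, md = read_dx(file='data/sample.dx')
print(spectra.shape)                    # rows = spectra, columns = wavelengths
print('concentrations' in md.columns)   # False: dropped by md_df_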
@@ -19,13 +19,6 @@ class Sk_Kmeans:
     def fit_optimal(self, nclusters):
         model = KMeans(n_clusters = nclusters, init = 'k-means++', random_state = 42)
         model.fit(self.x)
-        yp = model.predict(self.x)
-        num_colors = nclusters
-        colors = ['#' + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(num_colors)]
-        col = np.array(['#' + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(self.x.shape[0])])
-        for i in range(nclusters):
-            ss = np.where(yp==i)
-            col[ss] = colors[i]
-        return self.x, col
\ No newline at end of file
+        yp = model.predict(self.x)+1
+        clu = [f'cluster#{i}' for i in yp]
+        return self.x, clu
\ No newline at end of file
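Returning human-readable string labels instead of random hex colors lets Plotly treat the cluster variable as categorical and assign a discrete palette itself. A self-contained sketch of the new contract (synthetic data, plain scikit-learn):

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

x = pd.DataFrame(np.random.default_rng(0).normal(size=(20, 3)))
model = KMeans(n_clusters=3, init='k-means++', random_state=42, n_init=10)
model.fit(x)
labels = [f'cluster#{i}' for i in model.predict(x) + 1]  # 1-based cluster names
# Plotly colors string labels with a discrete palette automatically, e.g.:
# px.scatter_3d(x, x=0, y=1, z=2, color=labels)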
@@ -47,3 +47,18 @@ def resid_plot( meas, pred):
 def download_results(data, export_name):
     with open(data) as f:
         st.download_button('Download Results', f, export_name)
+
+@st.cache_resource
+def plot_spectra(df):
+    if isinstance(df.columns[0], str):
+        m = 0
+    else:
+        m = np.min(df.columns)
+    fig, ax = plt.subplots(figsize = (30,7))
+    df.T.plot(legend=False, ax = ax, color = 'blue')
+    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
+    ax.set_ylabel('Signal intensity', fontsize=18)
+    plt.margins(x = 0)
+    plt.annotate(text = f'The total number of spectra is {df.shape[0]}', xy =(m, np.max(df)), size=20, color = 'black', backgroundcolor='red')
+    return fig
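`plot_spectra` anchors its annotation at x = 0 when the columns are string wavelengths (csv import) and at the smallest wavelength when they are numeric (.dx import). One caveat: `np.max(df)` on a DataFrame returns a per-column Series on current pandas, so a scalar such as `df.max().max()` is the safer y-coordinate. A standalone variant with synthetic data illustrating that fix:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Synthetic stand-in for an imported spectra matrix (rows = samples, columns = wavelengths).
wl = np.arange(1000, 2500, 2)
df = pd.DataFrame(np.random.rand(5, wl.size), columns=wl)

m = 0 if isinstance(df.columns[0], str) else df.columns.min()  # x anchor for the annotation
fig, ax = plt.subplots(figsize=(30, 7))
df.T.plot(legend=False, ax=ax, color='blue')
ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
ax.set_ylabel('Signal intensity', fontsize=18)
ax.margins(x=0)
# df.max().max() gives a scalar peak value; np.max(df) would return a Series here.
ax.annotate(text=f'The total number of spectra is {df.shape[0]}',
            xy=(m, df.max().max()), size=20, color='black', backgroundcolor='red')
fig.savefig('spectra_preview.png')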
@@ -7,6 +7,6 @@ from .LWPLSR_ import model_LWPLSR
 from .Regression_metrics import metrics
 from .VarSel import TpeIpls
 from .Miscellaneous import resid_plot, reg_plot
-from .DxReader import DxRead
+from .DxReader import DxRead, read_dx
 from .HDBSCAN_Clustering import Hdbscan

-from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan
+from Class_Mod import LinearPCA, Umap, find_col_index, PinardPlsr, model_LWPLSR, list_files, metrics, TpeIpls, reg_plot, resid_plot, Sk_Kmeans, DxRead, Hdbscan, read_dx
 # find_col_index
-from Class_Mod.Miscellaneous import prediction, download_results
+from Class_Mod.Miscellaneous import prediction, download_results, plot_spectra
@@ -41,6 +41,7 @@ from PIL import Image
 import plotly.express as px
 import matplotlib.pyplot as plt
 import seaborn as sns
+import matplotlib

 ### Important Metrics
 from sklearn.metrics import pairwise_distances_argmin_min, adjusted_rand_score, adjusted_mutual_info_score
...
@@ -8,195 +8,175 @@ if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")

 ################################### Data Loading and Visualization ########################################
-container1 = st.container(border=True)
 col2, col1 = st.columns([3, 1])
 col1.header("Data Loading", divider='blue')
 col2.header("Spectral Data Visualization", divider='blue')
-container2 = st.container(border=True)
-container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
-scores, loadings, pc = st.columns([2, 3, 0.5])
-influence, hotelling, qexp = st.columns([2, 2, 1])
-with container1:
-    # loader for csv file containing NIRS spectra
-    sselectx_csv = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
-    if sselectx_csv is not None:
-        test = sselectx_csv.name[sselectx_csv.name.find('.'):]
-        if test== '.csv':
-            with col1:
-                # Select list for CSV delimiter
-                psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+sselectx_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+sselectx_csv.name))), key=9)
+## Preallocation of data structure
+spectra = pd.DataFrame
+meta_data = pd.DataFrame
+selected_samples = pd.DataFrame
+
+# loader for datafile
+data_file = col1.file_uploader("Load NIRS Data", type=["csv","dx"], help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns", key=5)
+
+if data_file:
+    # Retrieve the extension of the file
+    test = data_file.name[data_file.name.find('.'):]
+
+    ## Load .csv file
+    if test== '.csv':
+        with col1:
+            # Select list for CSV delimiter
+            psep = st.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+data_file.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+data_file.name))), key=9)
             # Select list for CSV header True / False
-                phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+sselectx_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+sselectx_csv.name))), key=31)
+            phdr = st.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+data_file.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+data_file.name))), key=31)
             if phdr == 'yes':
                 col = 0
             else:
                 col = False
-                data_import = pd.read_csv(sselectx_csv, sep=psep, index_col=col)
-                data_import, categorical_data, scaled_values = col_cat(data_import)
-                st.success("The data have been loaded successfully", icon="")
-                ## Visualize spectra
-                with col2:
-                    fig, ax = plt.subplots(figsize = (30,7))
-                    data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                    ax.set_ylabel('Signal', fontsize=18)
-                    plt.margins(x = 0)
-                    st.pyplot(fig)
-                    st.write("Summary")
-                    info = pd.DataFrame({'N':[data_import.shape[0]],
-                                         'Min': [np.min(data_import)],
-                                         'Max':[np.max(data_import)],}, index = ['Values']).T
-                    info.rename_axis('information')
-                    st.table(data=info)
-        elif test == '.dx':
-            # Create a temporary file to save the uploaded file
-            with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
-                tmp.write(sselectx_csv.read())
-                tmp_path = tmp.name
-                with col1:
-                    data = DxRead(path = tmp_path)
-                    data_import = data.specs_df_
-                    st.success("The data have been loaded successfully", icon="")
-                ## Visualize spectra
-                with col2:
-                    fig, ax = plt.subplots(figsize = (30,7))
-                    data_import.T.plot(legend=False, ax = ax, color = 'blue')
-                    ax.set_xlabel('Wavelength/Wavenumber', fontsize=18)
-                    ax.set_ylabel('Signal', fontsize=18)
-                    plt.margins(x = 0)
-                    st.pyplot(fig)
-                    st.write("Summary")
-                    info = pd.DataFrame({'N':[data_import.shape[0]],
-                                         'Min': [np.min(data_import)],
-                                         'Max':[np.max(data_import)],}, index = ['Values']).T
-                    info.rename_axis('information')
-                    st.table(data=info)
-                os.unlink(tmp_path)
+            imp = pd.read_csv(data_file, sep=psep, index_col=col)
+            spectra = col_cat(imp)[0]
+            meta_data = col_cat(imp)[1]
+            st.success("The data have been loaded successfully", icon="")
+
+    ## Load .dx file
+    elif test == '.dx':
+        # Create a temporary file to save the uploaded file
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+            with col1:
+                _, spectra, meta_data = read_dx(file = tmp_path)
+                st.success("The data have been loaded successfully", icon="")
+        os.unlink(tmp_path)
+
+## Visualize spectra
+if not spectra.empty:
+    with col2:
+        fig = plot_spectra(spectra)
+        st.pyplot(fig)

+######################################################################################
 ############################## Exploratory data analysis ###############################
-plot_type=['', 'PCA','UMAP', 'NMF']
-cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-with container2:
-    if sselectx_csv is not None:
-        plot_type=['', 'PCA','UMAP', 'NMF']
-        cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP']
-        with pc:
-            type_plot = st.selectbox("Dimensionality reduction techniques: ", options=plot_type, key=37)
-            type_cluster = st.selectbox("Clustering techniques: ", options=cluster_methods, key=38)
-        # compute UMAP - umap_maker in application_functions.py
-        if type_plot == 'PCA':
-            model = LinearPCA(data_import, Ncomp=5)
-        elif type_plot =='UMAP':
-            model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
-        if type_plot in ['PCA', 'UMAP']:
-            if type_plot in ['PCA']:
-                # add 2 select lists to choose which component to plot
-                axis1 = pc.selectbox("x-axis", options = model.scores_.columns, index=0)
-                axis2 = pc.selectbox("y-axis", options = model.scores_.columns, index=1)
-                axis3 = pc.selectbox("z-axis", options = model.scores_.columns, index=2)
-            elif type_plot in ['UMAP']:
-                axis1 = 0
-                axis2 = 1
-                axis3 = 2
-            if type_cluster == 'Kmeans':
-                scsc = pd.concat([model.scores_.loc[:,axis1], model.scores_.loc[:,axis2], model.scores_.loc[:,axis3]], axis = 1)
-                cl = Sk_Kmeans(scsc, max_clusters = 30)
-            elif type_cluster == 'HDBSCAN':
-                optimized_hdbscan = Hdbscan(model.scores_raw_)
-                labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
-            with scores:
-                t = model.scores_
-                if type_cluster in ['AP', 'Kmeans']:
-                    st.write('Scree plot')
-                    fig2 = px.scatter(cl.inertia_.T, y = 'inertia')
-                    st.plotly_chart(fig2)
-                    ncluster = st.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
-                    data, colors = cl.fit_optimal(nclusters=ncluster)
-                    #fig = px.scatter(data, x=axis1, y=axis2, color= colors)
-                    st.write('Scores plot')
-                    fig = px.scatter_3d(data, x=axis1, y=axis2, z = axis3, color=colors)
-                    fig.update_traces(marker=dict(size=4))
-                elif type_cluster in ['HDBSCAN']:
-                    st.write('plot HDBSCAN clustering')
-                    fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color=labels)
-                    fig.update_traces(marker=dict(size=4))
-                    # st.plotly_chart(fig_hdbscan)
-                    st.write('Optimal number of clusters = ' + str(len(set(labels))))
-                    st.write('DBCV score (-1 to 1 - higher is better) = ' + str(round(hdbscan_score,3)))
-                    st.write('Unclassified samples: ' + str(len(t[labels==-1])) + ' on ' + str(len(t)) + ' samples (' + str(round(len(t[labels==-1])/len(t)*100, 1)) + '%).')
-                else:
-                    if test == '.dx':
-                        filter = ['origin', 'date', 'time', 'spectrometer/data system']
-                        col = st.selectbox('filter', options= filter)
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3, color = data.md_df_[col])
-                        fig.update_traces(marker=dict(size=4))
-                    else:
-                        fig = px.scatter_3d(t, x=axis1, y=axis2, z = axis3 )
-                        fig.update_traces(marker=dict(size=4))
-                st.plotly_chart(fig)
-            if type_plot =='PCA':
-                with loadings:
-                    st.write('Loadings plot')
-                    p = model.loadings_
-                    pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
-                    df1 = pp.melt(id_vars="wl")
-                    fig = px.line(df1, x = 'wl', y = 'value', color='variable')
-                    fig.update_layout(
-                        legend=dict(x=1, y=0,
-                                    font=dict(
-                                        family="Courier", size=12, color="black"),
-                                    bordercolor="Black", borderwidth=2)
-                    )
-                    st.plotly_chart(fig, use_container_width = True)
-                with influence:
-                    st.write('Influence plot')
-                    ax1 = st.selectbox("Component", options=model.scores_.columns, index=3)
-                    leverage = model.leverage_
-                    residuals = model.residuals_
-                    fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-                with hotelling:
-                    st.write('T²-Hotelling vs Q residuals plot')
-                    hotelling = model.hotelling_
-                    ax2 = st.selectbox("Component", options=model.scores_.columns, index=4)
-                    hotelling = model.hotelling_
-                    fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
-                    st.plotly_chart(fig)
-        else:
-            st.markdown('Select a dimensionality reduction technique from the dropdown list')
+container2 = st.container(border=True)
+container2.header("Exploratory Data Analysis-Multivariable Data Analysis", divider='blue')
+scores, loadings, pc = st.columns([2, 3, 0.5])
+influence, hotelling, qexp = st.columns([2, 2, 1])
+
+dim_red_methods=['', 'PCA','UMAP', 'NMF'] # List of dimensionality reduction algos
+cluster_methods = ['', 'Kmeans','HDBSCAN', 'AP'] # List of clustering algos
+
+dr_model = None # dimensionality reduction model
+cl_model = None # clustering model
+
+# Dimensionality reduction
+t = pd.DataFrame # scores
+p = pd.DataFrame # loadings
+labels = []
+if not spectra.empty:
+    dim_red_method = pc.selectbox("Dimensionality reduction techniques: ", options = dim_red_methods, key = 37)
+    clus_method = pc.selectbox("Clustering techniques: ", options = cluster_methods, key = 38)
+    xc = standardize(spectra)
+
+    if dim_red_method == dim_red_methods[1]:
+        dr_model = LinearPCA(xc, Ncomp=5)
+    elif dim_red_method == dim_red_methods[2]:
+        dr_model = Umap(data_import = data_import, numerical_data = scaled_values, cat_data = categorical_data)
+
+    if dr_model:
+        axis1 = pc.selectbox("x-axis", options = dr_model.scores_.columns, index=0)
+        axis2 = pc.selectbox("y-axis", options = dr_model.scores_.columns, index=1)
+        axis3 = pc.selectbox("z-axis", options = dr_model.scores_.columns, index=2)
+        t = pd.concat([dr_model.scores_.loc[:,axis1], dr_model.scores_.loc[:,axis2], dr_model.scores_.loc[:,axis3]], axis = 1)
+
+# clustering
+if not t.empty:
+    tcr = standardize(t)
+    # Clustering
+    if clus_method == cluster_methods[1]:
+        ncluster = scores.number_input(min_value=2, max_value=30, value=3, label = 'Select the desired number of clusters')
+        cl_model = Sk_Kmeans(tcr, max_clusters = 30)
+        fig2 = px.scatter(cl_model.inertia_.T, y = 'inertia')
+        scores.plotly_chart(fig2)
+        data, labels = cl_model.fit_optimal(nclusters = ncluster)
+    elif clus_method == cluster_methods[2]:
+        optimized_hdbscan = Hdbscan(model.scores_raw_)
+        labels, hdbscan_score = optimized_hdbscan.HDBSCAN_scores_
+
+##### Plots
+## Scores
+if not t.empty:
+    with scores:
+        st.write('Scores plot')
+        # scores plot with clustering
+        if list(labels) and meta_data.empty:
+            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+        # scores plot with metadata
+        elif len(list(labels)) == 0 and not meta_data.empty:
+            filter = meta_data.columns[1:]
+            col = st.selectbox('Group by:', options= filter)
+            if col == 0:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+            else:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])) )
+        # color with scores and metadata
+        elif len(list(labels)) > 0 and not meta_data.empty:
+            if clus_method in cluster_methods[1:]:
+                filter = ['None', clus_method]
+                filter.extend(meta_data.columns[1:])
+            else:
+                filter = meta_data.columns[1:].insert(0,'None')
+            col = st.selectbox('Group by:', options= filter)
+            if col == "None":
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+            elif col == clus_method:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = labels)
+            else:
+                fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3, color = list(map(str.lower,meta_data[col])))
+        else:
+            fig = px.scatter_3d(tcr, x=axis1, y=axis2, z = axis3)
+        fig.update_traces(marker=dict(size=4))
+        st.plotly_chart(fig)
+
+if not spectra.empty:
+    if dim_red_method == dim_red_methods[1]:
+        with loadings:
+            st.write('Loadings plot')
+            p = dr_model.loadings_
+            pp = pd.concat([p, pd.DataFrame(np.arange(p.shape[0]), index=p.index, columns=['wl'])], axis =1)
+            df1 = pp.melt(id_vars="wl")
+            fig = px.line(df1, x = 'wl', y = 'value', color='variable')
+            fig.update_layout(legend=dict(x=1, y=0,font=dict(family="Courier", size=12, color="black"),
+                                          bordercolor="Black", borderwidth=2))
+            st.plotly_chart(fig, use_container_width = True)
+        with influence:
+            st.write('Influence plot')
+            ax1 = st.selectbox("Component", options=dr_model.scores_.columns, index=3)
+            leverage = dr_model.leverage_
+            residuals = dr_model.residuals_
+            fig = px.scatter(x=leverage[ax1], y=residuals[ax1], color = leverage[ax1]*residuals[ax1]).update_layout(xaxis_title="Leverage",yaxis_title="Residuals")
+            st.plotly_chart(fig)
+        with hotelling:
+            st.write('T²-Hotelling vs Q residuals plot')
+            hotelling = dr_model.hotelling_
+            ax2 = st.selectbox("Component", options=dr_model.scores_.columns, index=4)
+            hotelling = dr_model.hotelling_
+            fig = px.scatter(t, x=hotelling[ax2], y=residuals[ax2]).update_layout(xaxis_title="",yaxis_title="Residuals")
+            st.plotly_chart(fig)
\ No newline at end of file
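Two merge leftovers are visible in the new page: the UMAP branch still passes the old `data_import`/`scaled_values`/`categorical_data` names, and the HDBSCAN branch references `model` instead of `dr_model`, so only the PCA + Kmeans path is self-consistent here. For orientation, a condensed, self-contained sketch of that working path (scikit-learn/plotly stand-ins for the app's `standardize`, `LinearPCA` and `Sk_Kmeans` wrappers; data is synthetic):

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
spectra = pd.DataFrame(rng.normal(size=(60, 200)))        # rows = samples

xc = StandardScaler().fit_transform(spectra)              # stand-in for standardize()
scores = pd.DataFrame(PCA(n_components=5).fit_transform(xc),
                      columns=[f'PC{i+1}' for i in range(5)])
t = scores[['PC1', 'PC2', 'PC3']]                         # three user-chosen axes
tcr = pd.DataFrame(StandardScaler().fit_transform(t), columns=t.columns)

km = KMeans(n_clusters=3, init='k-means++', random_state=42, n_init=10).fit(tcr)
labels = [f'cluster#{i}' for i in km.predict(tcr) + 1]    # same naming as Sk_Kmeans

fig = px.scatter_3d(tcr, x='PC1', y='PC2', z='PC3', color=labels)
fig.update_traces(marker=dict(size=4))
fig.write_html('scores_plot.html')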
@@ -3,9 +3,12 @@ st.set_page_config(page_title="NIRS Utils", page_icon=":goat:", layout="wide")
 from Modules import *
 from Class_Mod.DATA_HANDLING import *

 st.session_state["interface"] = st.session_state.get('interface')
 if st.session_state["interface"] == 'simple':
     hide_pages("Predictions")

 def nn(x):
     return x is not None
 ########################################################################################
@@ -26,91 +29,135 @@ M9, M10 = st.columns([2,2])
 M9.write("-- Save the model --")

+files_format = ['.csv', '.dx']
+file = M3.radio('select data file format:', options = files_format)
+
-# CSV files loader
-xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
-ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
-
-if xcal_csv is not None and ycal_csv is not None:
+### Data
+spectra = pd.DataFrame
+y = pd.DataFrame
+
+# load .csv file
+if file == files_format[0]:
+    xcal_csv = M3.file_uploader("Select NIRS Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and lambdas as columns")
+    ycal_csv = M3.file_uploader("Select corresponding Chemical Data", type="csv", help=" :mushroom: select a csv matrix with samples as rows and chemical values as a column")
+    if xcal_csv and ycal_csv:
         # Select list for CSV delimiter
-    sep = M3.selectbox("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)), options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
+        sep = M3.radio("Select csv separator - _detected_: " + str(find_delimiter('data/'+xcal_csv.name)),
+                       options=[";", ","], index=[";", ","].index(str(find_delimiter('data/'+xcal_csv.name))), key=0)
         # Select list for CSV header True / False
-    hdr = M3.selectbox("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)), options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        hdr = M3.radio("indexes column in csv? - _detected_: " + str(find_col_index('data/'+xcal_csv.name)),
+                       options=["no", "yes"], index=["no", "yes"].index(str(find_col_index('data/'+xcal_csv.name))), key=1)
+        ###############
         if hdr == 'yes':
             col = 0
         else:
             col = False
-    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
-    x, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
-    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
-    train_index, test_index = train_test_split_idx(x, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
-    # Assign data to training and test sets
-    X_train, y_train, X_test, y_test = pd.DataFrame(x[train_index]), pd.DataFrame(y[train_index]), pd.DataFrame(x[test_index]), pd.DataFrame(y[test_index])
-    y_train = y_train.iloc[:,0]
-    y_test = y_test.iloc[:,0]
+        ###############
+        spectra, y = utils.load_csv(xcal_csv, ycal_csv, autoremove_na=True, sep=sep, x_hdr=0, y_hdr=0, x_index_col=col, y_index_col=col)
+        spectra = pd.DataFrame(spectra)
+        y = pd.DataFrame(y)
+
+## Load .dx file
+elif file == files_format[1]:
+    data_file = M3.file_uploader("Select Data", type=".dx", help=" :mushroom: select a dx file")
+    if data_file:
+        with NamedTemporaryFile(delete=False, suffix=".dx") as tmp:
+            tmp.write(data_file.read())
+            tmp_path = tmp.name
+            chem_data, spectra, meta_data = read_dx(file = tmp_path)
+            M3.success("The data have been loaded successfully", icon="")
+            yname = M3.selectbox('Select target', options=chem_data.columns)
+            spectra = spectra
+            y = chem_data.loc[:,yname]
+        os.unlink(tmp_path)
+
+### split the data
+if not spectra.empty and not y.empty:
+    rd_seed = M1.slider("Customize Train-test split", min_value=1, max_value=100, value=42, format="%i")
+    # Split data into training and test sets using the kennard_stone method and correlation metric, 25% of data is used for testing
+    train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone", metric="correlation", test_size=0.25, random_state=rd_seed)
+    # Assign data to training and test sets
+    X_train, y_train, X_test, y_test = pd.DataFrame(spectra.iloc[train_index,:]), pd.DataFrame(y.iloc[train_index]), pd.DataFrame(spectra.iloc[test_index,:]), pd.DataFrame(y.iloc[test_index])
+    y_train = y_train.iloc[:,0]
+    y_test = y_test.iloc[:,0]

-    ############################# Regression modelling ##########################################
-    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
-    if regression_algo == reg_algo[1]:
-        # Train model with model function from application_functions.py
-        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
-        reg_model = Reg.model_
-        #M2.dataframe(Pin.pred_data_)
-    elif regression_algo == reg_algo[2]:
-        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
-    elif regression_algo == reg_algo[3]:
-        s = M2.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
-        it = M2.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
-        progress_text = "The model is being created. Please wait."
-        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
-        pro = M1.progress(0, text="The model is being created. Please wait!")
-        rega = Reg.BandSelect(n_iter=it)
-        pro.empty()
-        M1.progress(100, text = "The model has successfully been created!")
-        time.sleep(1)
-        reg_model = Reg.model_
-        M2.table(rega[0])
+    #######################################
+    regression_algo = M1.selectbox("Choose the algorithm for regression", options=reg_algo, key = 12)
+    if regression_algo == reg_algo[1]:
+        # Train model with model function from application_functions.py
+        Reg = PinardPlsr(x_train = X_train, x_test = X_test,y_train = y_train, y_test = y_test)
+        reg_model = Reg.model_
+        #M2.dataframe(Pin.pred_data_)
+    elif regression_algo == reg_algo[2]:
+        reg_model = model_LWPLSR(xcal_csv, ycal_csv, sep, hdr)
+    elif regression_algo == reg_algo[3]:
+        s = M1.number_input(label='Enter the maximum number of intervalls', min_value=1, max_value=6, value=3)
+        it = M1.number_input(label='Enter the number of iterations', min_value=50, max_value=1000, value=100)
+        progress_text = "The model is being created. Please wait."
+        Reg = TpeIpls(x_train = X_train, x_test=X_test, y_train = y_train, y_test = y_test, scale = False, Kfold = 3, n_intervall = s)
+        pro = M1.progress(0, text="The model is being created. Please wait!")
+        rega = Reg.BandSelect(n_iter=it)
+        pro.empty()
+        M1.progress(100, text = "The model has successfully been created!")
+        time.sleep(1)
+        reg_model = Reg.model_
+        M2.write('-- Table of selected wavelengths --')
+        M2.table(rega[0])

     ################# Model analysis ############
     if regression_algo in reg_algo[1:]:
         yc = Reg.pred_data_[0]
         ycv = Reg.pred_data_[1]
         yt = Reg.pred_data_[2]
-        M1.write("-- Performance metrics --")
-        M1.dataframe(Reg.metrics_)
+        M2.write("-- Performance metrics --")
+        M2.dataframe(Reg.metrics_)
         M7.pyplot(reg_plot([y_train, y_train, y_test],[yc, ycv, yt]))
         M8.pyplot(resid_plot([y_train, y_train, y_test],[yc, ycv, yt]))
         #model_export = M1.selectbox("Choose way to export", options=["pickle", "joblib"], key=20)
         model_name = M9.text_input('Give it a name')
         if M9.button('Export Model'):
+            path = 'data/models/model_'
+            if file == files_format[0]:
                 #export_package = __import__(model_export)
-            with open('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
+                with open(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_' + '.pkl','wb') as f:
                     joblib.dump(reg_model, f)
-            if regression_algo == reg_algo[3]:
-                rega[1].sort()
-                pd.DataFrame(rega[1]).to_csv('data/models/model_' + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_''Wavelengths_index.csv', sep = ';')
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + xcal_csv.name + '_and_' + ycal_csv.name + '_data_'+'Wavelengths_index.csv', sep = ';')
+
+            elif file == files_format[1]:
+                #export_package = __import__(model_export)
+                with open(path + model_name + '_on_' + '_data_' + '.pkl','wb') as f:
+                    joblib.dump(reg_model, f)
+                if regression_algo == reg_algo[3]:
+                    rega[1].sort()
+                    pd.DataFrame(rega[1]).to_csv(path + model_name + '_on_' + '_data_'+'Wavelengths_index.csv', sep = ';')
-            st.write('Model Exported')
+            st.write('Model Exported')
+        if regression_algo == reg_algo[3]:
+            st.write('Model Exported')

         # create a report with information on the model
         ## see https://stackoverflow.com/a/59578663
-        #M4.pyplot(reg_plot(meas==(ycal_csv,ycal_csv,ycal_csv], pred=[ycal_csv,ycal_csv,ycal_csv]))

 if st.session_state['interface'] == 'simple':
     st.page_link('pages\\3-prediction.py', label = 'Keep on keepin\' on to predict your values !')
-## Load .dx file
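The split now runs on DataFrames for both file formats, which is why the indexing switched from positional `x[train_index]` to `.iloc`. A standalone sketch of the Kennard-Stone split step (synthetic data; the exact import path for pinard's `train_test_split_idx` is an assumption, the call signature is the one used above):

import numpy as np
import pandas as pd
from pinard.model_selection import train_test_split_idx  # import path is an assumption

rng = np.random.default_rng(0)
spectra = pd.DataFrame(rng.normal(size=(40, 100)))
y = pd.DataFrame(rng.normal(size=(40, 1)))

train_index, test_index = train_test_split_idx(spectra, y=y, method="kennard_stone",
                                               metric="correlation", test_size=0.25, random_state=42)
X_train = spectra.iloc[train_index, :]   # .iloc works for both the csv and dx frames,
X_test = spectra.iloc[test_index, :]     # unlike the old positional x[train_index]
y_train = y.iloc[train_index].iloc[:, 0]
y_test = y.iloc[test_index].iloc[:, 0]
print(len(X_train), len(X_test))         # 30 10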
@@ -47,7 +47,6 @@ if NIRS_csv:
     if st.button("Predict"):
         if s:
             result = model_loaded.predict(pred_data.iloc[:,idx])
         else:
             # use prediction function from application_functions.py to predict chemical values
...