import numpy as np
import pandas as pd
import feyn
from feyn.tools import split
from feyn.losses import binary_cross_entropy
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
ql = feyn.QLattice(qlattice="3c83b161", api_token="9ba5b8ae7a0b4206b23e413c741eda73")
from IPython.display import Image
from IPython.core.display import HTML
Image(url= "QSAR.png",width=800,height=800)
# Load the curated Estrogen data set
df = pd.read_csv("estrogen_indexes_nolabels.csv", sep=";")
# Published predictor:
# LogRBA = -43.75 * X2A + 0.04 * TIC1 - 2.67 * EEig02d + 79.92 * JGI10 + 2.6 * SPH - 7.12 * E1u + 4.78 * RTm_plus - 1.25 * nArOR + 15.83
def QDB_Predictor(X2A, TIC1, EEig02d, JGI10, SPH, E1u, RTm_plus, nArOR):
literature_prediction = -43.75 * X2A + 0.04 * TIC1 - 2.67 * EEig02d + 79.92 * JGI10 + 2.6 * SPH - 7.12 * E1u + 4.78 * RTm_plus - 1.25 * nArOR + 15.83
return literature_prediction.to_numpy()
descriptors = {
"X2A" : "Average connectivity index chi-2",
"TIC1" : "Total information index (neighborhood symmetry 1-order)",
"EEig02d" : "Eigenvalue 2 from edge adjacency matrix weighted dipole moments",
"JGI10": "Mean topological charge index of order 10",
"SPH" : "Spherosity index",
"E1u" : "The first component accessibility directional WHIM index/unweighted",
"nArOR" : "The number of aromatic ether groups",
"RTm_plus" : "R maximal index weighed by atomic masses"}
descriptors["X2A"]
'Average connectivity index chi-2'
def r_sqr(x,y): # calculate r^2 for two numpy arrays
x_minus_mean=x-x.mean()
y_minus_mean=y-y.mean()
cov_ = np.dot(x_minus_mean,y_minus_mean)/(len(x))
r = cov_/(x.std()*y.std())
rs=r*r
return(rs)
def PredPlot(x,y,title,xlabel,ylabel,color): # for plotting pred/exp plots
import matplotlib.pyplot as plt
# plot them predictions
#c = x**2 + y**2
fig,ax = plt.subplots()
ax.scatter(x, y, s=25, c=color, cmap=plt.cm.coolwarm, zorder=10)
rsq = "$r^2$ = "+str(r_sqr(x,y).round(2))
#ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')
npmin=np.min([ax.get_xlim(), ax.get_ylim()]) # min of both axes
npmax=np.max([ax.get_xlim(), ax.get_ylim()]) # max of both axes
lims = [
npmin, # min of both axes
npmax, # max of both axes
]
# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_title(title)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.text(npmin*0.8, npmax*0.6, rsq, style='italic',
bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10})
#fig.savefig('/Users/niels/Desktop/test.png', dpi=300)
return
def CombiPredPlot(x,y,x1,y1,title,xlabel,ylabel,color): # for plotting pred/exp plots
import matplotlib.pyplot as plt
# plot them predictions
#c = x**2 + y**2
fig,ax = plt.subplots()
ax.scatter(x, y, s=25, c=color, cmap=plt.cm.coolwarm, zorder=10)
ax.scatter(x1, y1, s=25, c='g', cmap=plt.cm.coolwarm, zorder=10)
npmin=np.min([ax.get_xlim(), ax.get_ylim()]) # min of both axes
npmax=np.max([ax.get_xlim(), ax.get_ylim()]) # max of both axes
lims = [
npmin, # min of both axes
npmax, # max of both axes
]
# now plot both limits against eachother
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.set_xlim(lims)
ax.set_ylim(lims)
ax.set_title(title)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
#ax.text(npmin*0.8, npmax*0.6, rsq, style='italic',
# bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10})
#fig.savefig('/Users/niels/Desktop/test.png', dpi=300)
return
random_seed = 552
#target variable
target = df.columns[-1]
#train, test = train_test_split(df, test_size=0.33, stratify=df[target], random_state=random_seed)
train = df[df.columns[:]][0:128]
test = df[df.columns[:]][128:151]
# Mean subtraction
#test[train.columns]-test[train.columns].mean()
# Training set: Predictions from QDB_predictor
#QDB_Predictor(train['X2A'],train['TIC1'],train['EEig02d'],train['JGI10'],train['SPH'],train['E1u'],train['RTm_plus'],train['nArOR'])
y_pred_train = QDB_Predictor(train['X2A'],train['TIC1'],train['EEig02d'],train['JGI10'],train['SPH'],train['E1u'],train['RTm_plus'],train['nArOR'])
y_exp_train = train['LogRBA'].to_numpy()
PredPlot(y_exp_train,y_pred_train,'Train set','Experimental LogRBA','Predicted LogRBA','r')
# Test set: Predictions from QDB_predictor
y_pred_test = QDB_Predictor(test['X2A'],test['TIC1'],test['EEig02d'],test['JGI10'],test['SPH'],test['E1u'],test['RTm_plus'],test['nArOR'])
y_exp_test = test['LogRBA'].to_numpy()
PredPlot(y_exp_test,y_pred_test,'Test set','Experimental LogRBA','Predicted LogRBA','g')
# Train + Test in same plot
CombiPredPlot(y_exp_train,y_pred_train,y_exp_test,y_pred_test,'Train & Test sets','Experimental LogRBA','Predicted LogRBA','r')
ql.reset(random_seed=random_seed)
#------max_depth=1
qgraph = ql.get_regressor(train.columns, output = target, max_depth=1)
#training loop
for _ in range(50):
qgraph.fit(train, threads=8)
ql.update(qgraph.best())
import sympy
qgraph[0].sympify()
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH1.graph')
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
#predictions = best_graph.predict(test[input_columns])
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
#------max_depth=2 ------------------------------------------------------------
qgraph = ql.get_regressor(train.columns, output = target, max_depth=2)
#training loop
for _ in range(50):
qgraph.fit(train, threads=8)
ql.update(qgraph.best())
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH2.graph')
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
#------max_depth=3 ------------------------------------------------------------
qgraph = ql.get_regressor(train.columns, output = target, max_depth=3)
#training loop
for _ in range(50):
qgraph.fit(train, threads=8)
ql.update(qgraph.best())
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH3.graph')
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
#------max_depth=4 ------------------------------------------------------------
qgraph = ql.get_regressor(train.columns, output = target, max_depth=4)
#training loop
for _ in range(50):
qgraph.fit(train, threads=8)
ql.update(qgraph.best())
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH4.graph')
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
# OK, so brute force did not work. Let's filter out some functions and restart...----
#qgraph = qgraph.filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#qgraph.fit(data)
qgraph = ql.get_regressor(train.columns, output = target, max_depth=1).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8)
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph = ql.get_regressor(train.columns, output = target, max_depth=2).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8)
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
# --- AIC criterion ---
qgraph = ql.get_regressor(train.columns, output = target, max_depth=2).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
# --- try AIC criterion, maxdepth =3
qgraph = ql.get_regressor(train.columns, output = target, max_depth=3).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_100621_MAXDEPTH3_NOGaussTanh.graph')
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].sympify()
% --- Exclude descriptor multiplications, maxdepth 3 ----
qgraph = ql.get_regressor(train.columns, output = target, max_depth=3).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh", "multiply"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
# --- Exclude descriptor multiplications, maxdepth 4 -----
qgraph = ql.get_regressor(train.columns, output = target, max_depth=4).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh", "multiply"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
# --- Re-introduce descriptor multiplications, maxdepth 4 -----
qgraph = ql.get_regressor(train.columns, output = target, max_depth=4).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
# --- Re-introduce descriptor multiplications, maxdepth 5 -----
qgraph = ql.get_regressor(train.columns, output = target, max_depth=5).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
# --- Re-remove descriptor multiplications, maxdepth 6 -----
qgraph = ql.get_regressor(train.columns, output = target, max_depth=6).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh", "multiply"]))
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
#training loop
for _ in range(50):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].sympify()
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_100621_AllAdd.graph')
# Does this improve with more iterations?
#training loop
for _ in range(100):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_100621_LOSS0p608.graph')
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].sympify()
# Almost there... try more iterations
#training loop
for _ in range(100):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)
qgraph[0].plot_goodness_of_fit(data=train)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
qgraph[0].plot_goodness_of_fit(data=test)
<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>
# --- Perhaps starting to get into overfitting ?? Anyways, try more iterations...
#training loop
for _ in range(100):
qgraph.fit(train, threads=8, criterion='aic')
ql.update(qgraph.best())
qgraph[0].plot_summary(train)
qgraph[0].plot_summary(test)