import numpy as np 
import pandas as pd
import feyn
from feyn.tools import split
from feyn.losses import binary_cross_entropy
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
ql = feyn.QLattice(qlattice="3c83b161", api_token="9ba5b8ae7a0b4206b23e413c741eda73")


from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "QSAR.png",width=800,height=800)


# Load the curated Estrogen data set
df = pd.read_csv("estrogen_indexes_nolabels.csv", sep=";")

# Published predictor:
# LogRBA = -43.75 * X2A + 0.04 * TIC1 - 2.67 * EEig02d + 79.92 * JGI10 + 2.6 * SPH - 7.12 * E1u + 4.78 * RTm_plus - 1.25 * nArOR + 15.83
def QDB_Predictor(X2A, TIC1, EEig02d, JGI10, SPH, E1u, RTm_plus, nArOR):
    literature_prediction = -43.75 * X2A + 0.04 * TIC1 - 2.67 * EEig02d + 79.92 * JGI10 + 2.6 * SPH - 7.12 * E1u + 4.78 * RTm_plus - 1.25 * nArOR + 15.83
    return literature_prediction.to_numpy()


descriptors = {
 "X2A" : "Average connectivity index chi-2",
 "TIC1" : "Total information index (neighborhood symmetry 1-order)",
 "EEig02d" : "Eigenvalue 2 from edge adjacency matrix weighted dipole moments",
 "JGI10": "Mean topological charge index of order 10",
 "SPH" : "Spherosity index",
 "E1u" : "The first component accessibility directional WHIM index/unweighted",
 "nArOR" : "The number of aromatic ether groups",
 "RTm_plus" : "R maximal index weighed by atomic masses"}
descriptors["X2A"]

'Average connectivity index chi-2'


def r_sqr(x,y): # calculate r^2 for two numpy arrays
    x_minus_mean=x-x.mean()
    y_minus_mean=y-y.mean()
    cov_ = np.dot(x_minus_mean,y_minus_mean)/(len(x))    
    r = cov_/(x.std()*y.std())
    rs=r*r
    return(rs)


def PredPlot(x,y,title,xlabel,ylabel,color): # for plotting pred/exp plots
    import matplotlib.pyplot as plt
    # plot them predictions
    #c = x**2 + y**2
    fig,ax = plt.subplots()
    ax.scatter(x, y, s=25, c=color, cmap=plt.cm.coolwarm, zorder=10)
    
    rsq = "$r^2$ = "+str(r_sqr(x,y).round(2))
    
    #ax.set_aspect(1.0/ax.get_data_ratio(), adjustable='box')
    npmin=np.min([ax.get_xlim(), ax.get_ylim()])  # min of both axes
    npmax=np.max([ax.get_xlim(), ax.get_ylim()])  # max of both axes
    lims = [
        npmin,  # min of both axes
        npmax,  # max of both axes
    ]
    # now plot both limits against eachother
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.text(npmin*0.8, npmax*0.6, rsq, style='italic',
        bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10})
    #fig.savefig('/Users/niels/Desktop/test.png', dpi=300)
    return


def CombiPredPlot(x,y,x1,y1,title,xlabel,ylabel,color): # for plotting pred/exp plots
    import matplotlib.pyplot as plt
    # plot them predictions
    #c = x**2 + y**2
    fig,ax = plt.subplots()
    ax.scatter(x, y, s=25, c=color, cmap=plt.cm.coolwarm, zorder=10)
    ax.scatter(x1, y1, s=25, c='g', cmap=plt.cm.coolwarm, zorder=10)    
    npmin=np.min([ax.get_xlim(), ax.get_ylim()])  # min of both axes
    npmax=np.max([ax.get_xlim(), ax.get_ylim()])  # max of both axes
    lims = [
        npmin,  # min of both axes
        npmax,  # max of both axes
    ]
    # now plot both limits against eachother
    ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    #ax.text(npmin*0.8, npmax*0.6, rsq, style='italic',
    #    bbox={'facecolor': 'grey', 'alpha': 0.5, 'pad': 10})
    #fig.savefig('/Users/niels/Desktop/test.png', dpi=300)
    return


random_seed = 552


#target variable
target = df.columns[-1]


#train, test = train_test_split(df, test_size=0.33, stratify=df[target], random_state=random_seed)
train = df[df.columns[:]][0:128]
test = df[df.columns[:]][128:151]


# Mean subtraction
#test[train.columns]-test[train.columns].mean()


# Training set: Predictions from QDB_predictor

#QDB_Predictor(train['X2A'],train['TIC1'],train['EEig02d'],train['JGI10'],train['SPH'],train['E1u'],train['RTm_plus'],train['nArOR'])
y_pred_train = QDB_Predictor(train['X2A'],train['TIC1'],train['EEig02d'],train['JGI10'],train['SPH'],train['E1u'],train['RTm_plus'],train['nArOR'])
y_exp_train = train['LogRBA'].to_numpy()
PredPlot(y_exp_train,y_pred_train,'Train set','Experimental LogRBA','Predicted LogRBA','r')

# Test set: Predictions from QDB_predictor
y_pred_test = QDB_Predictor(test['X2A'],test['TIC1'],test['EEig02d'],test['JGI10'],test['SPH'],test['E1u'],test['RTm_plus'],test['nArOR'])
y_exp_test = test['LogRBA'].to_numpy()
PredPlot(y_exp_test,y_pred_test,'Test set','Experimental LogRBA','Predicted LogRBA','g')

# Train + Test in same plot
CombiPredPlot(y_exp_train,y_pred_train,y_exp_test,y_pred_test,'Train & Test sets','Experimental LogRBA','Predicted LogRBA','r')


ql.reset(random_seed=random_seed)


#------max_depth=1


qgraph = ql.get_regressor(train.columns, output = target, max_depth=1)


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8)
    ql.update(qgraph.best())


import sympy
qgraph[0].sympify()


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH1.graph')


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)
#predictions = best_graph.predict(test[input_columns])

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


#------max_depth=2 ------------------------------------------------------------


qgraph = ql.get_regressor(train.columns, output = target, max_depth=2)


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8)
    ql.update(qgraph.best())


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH2.graph')


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


#------max_depth=3 ------------------------------------------------------------


qgraph = ql.get_regressor(train.columns, output = target, max_depth=3)


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8)
    ql.update(qgraph.best())


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH3.graph')


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


#------max_depth=4 ------------------------------------------------------------


qgraph = ql.get_regressor(train.columns, output = target, max_depth=4)


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8)
    ql.update(qgraph.best())


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_090621_MAXDEPTH4.graph')


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


# OK, so brute force did not work. Let's filter out some functions and restart...----


#qgraph = qgraph.filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))
#qgraph.fit(data)


qgraph = ql.get_regressor(train.columns, output = target, max_depth=1).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8)
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph = ql.get_regressor(train.columns, output = target, max_depth=2).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8)
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


# --- AIC criterion ---


qgraph = ql.get_regressor(train.columns, output = target, max_depth=2).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


# --- try AIC criterion, maxdepth =3


qgraph = ql.get_regressor(train.columns, output = target, max_depth=3).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_100621_MAXDEPTH3_NOGaussTanh.graph')


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].sympify()


% --- Exclude descriptor multiplications, maxdepth 3 ----


qgraph = ql.get_regressor(train.columns, output = target, max_depth=3).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh", "multiply"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


# --- Exclude descriptor multiplications, maxdepth 4 -----


qgraph = ql.get_regressor(train.columns, output = target, max_depth=4).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh", "multiply"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


# --- Re-introduce descriptor multiplications, maxdepth 4 -----


qgraph = ql.get_regressor(train.columns, output = target, max_depth=4).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


# --- Re-introduce descriptor multiplications, maxdepth 5 -----


qgraph = ql.get_regressor(train.columns, output = target, max_depth=5).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


# --- Re-remove descriptor multiplications, maxdepth 6 -----


qgraph = ql.get_regressor(train.columns, output = target, max_depth=6).filter(feyn.filters.ExcludeFunctions(["gaussian", "tanh", "multiply"]))


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


#training loop
for _ in range(50):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].sympify()


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_100621_AllAdd.graph')


# Does this improve with more iterations?


#training loop
for _ in range(100):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].save('c:/Users/models/_QSAR_/QSAR/graphs/qsar_estrogen_100621_LOSS0p608.graph')


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].sympify()


# Almost there... try more iterations


#training loop
for _ in range(100):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)


qgraph[0].plot_goodness_of_fit(data=train)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


qgraph[0].plot_goodness_of_fit(data=test)

<AxesSubplot:title={'center':'Actuals vs Prediction'}, xlabel='Actuals', ylabel='Predictions'>


# --- Perhaps starting to get into overfitting ?? Anyways, try more iterations...


#training loop
for _ in range(100):
    qgraph.fit(train, threads=8, criterion='aic')
    ql.update(qgraph.best())


qgraph[0].plot_summary(train)


qgraph[0].plot_summary(test)