
References:

  • Diabetes (Three Ensemble Models)

Load Data¶

In [17]:
"""
!pip install seaborn_qqplot
!pip install pycaret
!pip install scipy
!pip install shap
"""
Out[17]:
'\n!pip install seaborn_qqplot\n!pip install pycaret\n!pip install scipy\n!pip install shap\n'
In [18]:
from matplotlib import rcParams
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [19]:
# Load the dataset
df = pd.read_csv('diabetes.csv')

# rename the DiabetesPedigreeFunction column
df.rename(columns ={"DiabetesPedigreeFunction":"DPF"},inplace=True)
feature_names = [cname for cname in df.loc[:,:'Age'].columns]

Exploratory Data Analysis¶

In [20]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    768 non-null    int64  
 1   Glucose        768 non-null    int64  
 2   BloodPressure  768 non-null    int64  
 3   SkinThickness  768 non-null    int64  
 4   Insulin        768 non-null    int64  
 5   BMI            768 non-null    float64
 6   DPF            768 non-null    float64
 7   Age            768 non-null    int64  
 8   Outcome        768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [21]:
df.describe()
Out[21]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DPF Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [22]:
df.isna().sum()
Out[22]:
Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
DPF              0
Age              0
Outcome          0
dtype: int64

There are no missing values.

In [23]:
df.columns
Out[23]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DPF', 'Age', 'Outcome'],
      dtype='object')
In [24]:
print('Not Diabetic', round(df['Outcome'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Diabetic', round(df['Outcome'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
Not Diabetic 65.1 % of the dataset
Diabetic 34.9 % of the dataset
In [25]:
import seaborn as sns
import matplotlib.pyplot as plt

colors = ["#0101DF", "#DF0101"]

sns.countplot(x='Outcome', hue='Outcome', data=df, palette=colors, legend=False)
plt.title('Class Distributions \n (0: Not Diabetic || 1: Diabetic)', fontsize=14)
Out[25]:
Text(0.5, 1.0, 'Class Distributions \n (0: Not Diabetic || 1: Diabetic)')
[Figure: countplot of class distribution (0: Not Diabetic, 1: Diabetic)]

The classes are not heavily skewed. We have good representation of diabetic and non-diabetic people.

Distributions:

We can observe the distributions of the individual columns and determine how skewed they are.

In [26]:
df.hist(figsize=(12, 8))
plt.show()
[Figure: histograms of all features]

Data Statistics

In [27]:
def compute_stat(df):
  # Column names
  columns = ['Feature','Mean', 'Std', 'Skewness','Kurtosis']
  df_stats = pd.DataFrame(columns=columns)

  for column in df.columns:
      mean = df[column].mean()
      std = df[column].std()
      skew = df[column].skew()
      kurtosis = df[column].kurtosis()
      df_stats.loc[len(df_stats)] = [column, mean,std,skew,kurtosis]
      #print(column+':')
      #print('mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}, kurtosis: {3:.4f} '.format(mean, std, skew, kurtosis))
  return df_stats

df_stats = compute_stat(df)
print(df_stats)
         Feature        Mean         Std  Skewness  Kurtosis
0    Pregnancies    3.845052    3.369578  0.901674  0.159220
1        Glucose  120.894531   31.972618  0.173754  0.640780
2  BloodPressure   69.105469   19.355807 -1.843608  5.180157
3  SkinThickness   20.536458   15.952218  0.109372 -0.520072
4        Insulin   79.799479  115.244002  2.272251  7.214260
5            BMI   31.992578    7.884160 -0.428982  3.290443
6            DPF    0.471876    0.331329  1.919911  5.594954
7            Age   33.240885   11.760232  1.129597  0.643159
8        Outcome    0.348958    0.476951  0.635017 -1.600930

Skewness and Kurtosis in Data Distributions¶

Skewness and kurtosis are statistical measures that provide insights into the shape of a data distribution. Understanding their effects can guide data transformation decisions, especially for models assuming normally distributed errors.

Skewness¶

  • Definition: Skewness gauges the distribution's asymmetry. A distribution is:
    • Left-skewed if the value is negative.
    • Right-skewed if positive.
    • Symmetric if close to zero.

Impact of Skewness in Classification¶

In classification, skewness can refer to the distribution of feature values or the distribution of class labels. The implications of skewness in each context are:

1. Feature Skewness¶

  • Nature of Algorithms: Some algorithms, particularly parametric ones like logistic regression or linear discriminant analysis, assume specific distributions for feature values. Skewed distributions can violate these assumptions and potentially degrade model performance.

  • Scale and Distribution: Algorithms sensitive to feature scale and distribution, such as k-nearest neighbors (due to its distance-based nature) and neural networks (due to gradient descent dynamics), can be affected by skewed features. Transformations to address skewness can be beneficial in these cases.

  • Interpretability: For some algorithms like tree-based methods (e.g., decision trees, random forests), the skewness of features might not directly impact performance. However, reducing skewness can enhance feature interpretability and ensure consistent data preprocessing.

2. Class Label Skewness (Class Imbalance)¶

  • Performance Metrics: In scenarios with significant class imbalance, accuracy might not be a reliable metric. For example, with 95% of instances in Class A and 5% in Class B, a naive model predicting only Class A would have a 95% accuracy but might not be genuinely useful.

  • Model Bias: Models can become biased towards the majority class, leading to poor recall for the minority class. This bias is especially problematic in contexts like fraud detection, where detecting the minority class (frauds) is crucial.

  • Mitigation Strategies: Addressing class imbalance can involve techniques like oversampling the minority class, undersampling the majority class, generating synthetic data using methods like SMOTE, or adjusting class weights in the algorithm.

In conclusion, skewness can play a pivotal role in classification tasks, depending on the specific algorithm and context. Being cognizant of skewness and its potential impacts is essential for informed model development and evaluation.
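
As one concrete illustration of the mitigation strategies listed above, the following is a minimal sketch that re-weights classes instead of resampling. It uses synthetic data (not this notebook's dataset); with class_weight='balanced', scikit-learn makes errors on the rare class cost proportionally more during fitting. Oversampling with SMOTE (from the separate imbalanced-learn package) would be an alternative.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic, imbalanced toy data: roughly 90% class 0 and 10% class 1.
X_toy, y_toy = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)

# class_weight='balanced' weights each class inversely to its frequency,
# so the minority class contributes proportionally more to the loss.
clf = LogisticRegression(max_iter=500, class_weight='balanced').fit(X_toy, y_toy)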

Kurtosis¶

  • Definition: Kurtosis evaluates the "tailedness" of a distribution. Compared to a normal distribution (with a kurtosis of 3):

    • High kurtosis indicates heavier tails, implying more outliers.
    • Low kurtosis suggests lighter tails.
    • "Excess kurtosis" is often mentioned, calculated as kurtosis minus 3.
  • Impact: High kurtosis can be an issue for models assuming normally distributed errors due to the presence of more outliers.

Impact of Kurtosis in Classification¶

Kurtosis is a statistical measure that describes the "tailedness" of a distribution. In the context of classification, kurtosis can pertain to the distribution of feature values. Let's delve into the implications:

1. Feature Kurtosis¶

  • Heavy Tails & Outliers: High kurtosis indicates that the distribution has heavier tails than a normal distribution. This suggests a higher likelihood of outliers. Outliers can particularly influence algorithms sensitive to extreme values, like SVM with certain kernels or linear models.

  • Light Tails: Low kurtosis suggests the distribution has lighter tails than a normal distribution, implying fewer extreme values or outliers.

  • Algorithm Implications: While tree-based algorithms (e.g., decision trees, random forests) are generally robust to outliers, linear models and distance-based algorithms (like k-nearest neighbors) can be significantly influenced by them. High kurtosis might be a sign to investigate potential outliers or extreme values that could adversely affect such models.

  • Transformations: For distributions with high kurtosis and discernible outliers, transformations (like log, Box-Cox, or Yeo-Johnson) or robust scaling methods can help in mitigating the effect of outliers and make the data more amenable to certain algorithms.

2. Class Label Kurtosis¶

  • Relevance: Kurtosis typically applies to continuous data distributions. In classification, class labels are categorical, so kurtosis isn't directly applicable. However, the concept of having extreme values or rare categories can be analogously considered, leading to concerns of class imbalance.

In conclusion, while kurtosis itself might not be a primary concern in classification, the implications of high kurtosis—especially the presence of outliers—can be essential. Recognizing and addressing the effects of extreme feature values can be pivotal for the success of various classification algorithms.

Tolerable Value Range¶

  • Skewness:

    • Between -0.5 and 0.5: fairly symmetrical.
    • Between -1 and -0.5 (or 0.5 and 1): moderate skewness.
    • Outside -1 and 1: high skewness.
  • Kurtosis (Excess Kurtosis):

    • Close to 0: similar to a normal distribution.
    • Greater than 0: more outliers than a normal distribution.
    • Less than 0: fewer outliers than a normal distribution.

Caveats¶

  • While these ranges serve as guidelines, they aren't hard rules. Skewness and kurtosis's effect varies depending on the model and application.
  • Visualization tools like histograms, Q-Q plots, or P-P plots offer more insights into distribution.
  • Not all algorithms mandate normally distributed data. For instance, tree-based algorithms often don't.

Addressing Skewness and Kurtosis¶

  • Transformations like logarithmic, square root, Box-Cox, or Yeo-Johnson can mitigate skewness and kurtosis.
  • Detecting and handling outliers can particularly help when high kurtosis stems from extreme values.

Remember: While skewness and kurtosis offer valuable insights, actions should be dictated by the specific algorithm needs, problem context, and the model's empirical performance on validation data.
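
A minimal sketch of one such transformation, on illustrative synthetic data rather than this dataset: Yeo-Johnson, available through scikit-learn's PowerTransformer, handles zeros and negative values, unlike a plain log or Box-Cox transform.

import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(0)
x = rng.exponential(scale=2.0, size=(500, 1))   # strongly right-skewed sample

pt = PowerTransformer(method='yeo-johnson')     # also standardizes the output by default
x_t = pt.fit_transform(x)

print('skew before: {0:.2f}, after: {1:.2f}'.format(skew(x.ravel()), skew(x_t.ravel())))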

In [28]:
def kurtosis_warning(val):
  if -0.5 < val < 0.5:
    color = "green"
  elif val >= 0.5:
    color = "blue"
  else:
    color = "cyan"
  return f'color: {color}'

def skew_warning(val):
  if -0.5 < val < 0.5:
    color = "green"
  elif -1 <= val <= 1:
    color = "yellow"
  else:
    color = "red"
  return f'color: {color}'

df_styled = (df_stats.style.map(skew_warning, subset=['Skewness']).map(kurtosis_warning, subset=['Kurtosis']))
df_styled
Out[28]:
  Feature Mean Std Skewness Kurtosis
0 Pregnancies 3.845052 3.369578 0.901674 0.159220
1 Glucose 120.894531 31.972618 0.173754 0.640780
2 BloodPressure 69.105469 19.355807 -1.843608 5.180157
3 SkinThickness 20.536458 15.952218 0.109372 -0.520072
4 Insulin 79.799479 115.244002 2.272251 7.214260
5 BMI 31.992578 7.884160 -0.428982 3.290443
6 DPF 0.471876 0.331329 1.919911 5.594954
7 Age 33.240885 11.760232 1.129597 0.643159
8 Outcome 0.348958 0.476951 0.635017 -1.600930

For Skewness:

  • Green: Fairly symmetrical.
  • Yellow: Moderate skewness.
  • Red: High skewness.

For Kurtosis:

  • Green: Similar to a normal distribution.
  • Blue: More outliers than a normal distribution.
  • Cyan: Fewer outliers than a normal distribution.

Note: Removing outliers can help the ML model better learn the features and their correlations. It should be done contextually and should not mean blindly removing outliers from every column.

Correlation¶

Importance of Correlation Matrix in Binary Classification¶

The correlation matrix, visualized as a heatmap, quantifies the linear relationships between pairs of features. In binary classification, understanding these relationships offers several advantages:

  • Feature Selection

    • Multicollinearity: Features with high inter-correlations can introduce instability in certain algorithms. Recognizing and handling these can improve model robustness.
    • Redundancy: Highly correlated features may carry redundant information. Removing one can reduce dimensionality without significant loss of information.
  • Feature Engineering Insights

    • Highly correlated features can hint at opportunities for new feature creation, potentially enhancing predictive power.
  • Detecting Data Issues

    • Unexpected correlations can signal data problems, like leakage, where a predictor inadvertently contains information about the target.
  • Understanding the Target

    • Examining each feature's correlation with the target provides an initial gauge of its potential predictive importance.
  • Algorithmic Efficiency

    • Some algorithms benefit from feature independence. Identifying and handling correlated features can improve performance.

Points to Remember:

  • Nature of Correlation: Pearson correlation measures linear relationships. Non-linear relationships may need other methods for detection.
  • Correlation ≠ Causation: A correlation doesn't imply a causal relationship.
  • Binary Features: For binary features in binary classification, consider using point-biserial correlation.

In essence, while invaluable, the correlation matrix is among many tools used in data preprocessing for binary classification.
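
Following the point-biserial note above, here is a minimal sketch (assuming scipy is available, as installed at the top of the notebook) that computes the point-biserial correlation between the binary Outcome and one continuous feature of the loaded df.

from scipy.stats import pointbiserialr

# Point-biserial correlation between the binary target and one continuous feature.
r, p_value = pointbiserialr(df['Outcome'], df['Glucose'])
print('point-biserial r = {0:.3f}, p-value = {1:.3g}'.format(r, p_value))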

In [29]:
df.corr()
Out[29]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DPF Age Outcome
Pregnancies 1.000000 0.129459 0.141282 -0.081672 -0.073535 0.017683 -0.033523 0.544341 0.221898
Glucose 0.129459 1.000000 0.152590 0.057328 0.331357 0.221071 0.137337 0.263514 0.466581
BloodPressure 0.141282 0.152590 1.000000 0.207371 0.088933 0.281805 0.041265 0.239528 0.065068
SkinThickness -0.081672 0.057328 0.207371 1.000000 0.436783 0.392573 0.183928 -0.113970 0.074752
Insulin -0.073535 0.331357 0.088933 0.436783 1.000000 0.197859 0.185071 -0.042163 0.130548
BMI 0.017683 0.221071 0.281805 0.392573 0.197859 1.000000 0.140647 0.036242 0.292695
DPF -0.033523 0.137337 0.041265 0.183928 0.185071 0.140647 1.000000 0.033561 0.173844
Age 0.544341 0.263514 0.239528 -0.113970 -0.042163 0.036242 0.033561 1.000000 0.238356
Outcome 0.221898 0.466581 0.065068 0.074752 0.130548 0.292695 0.173844 0.238356 1.000000

Pair Plots or Scatter Matrix:¶

To visualize pair-wise relationships and histograms for multiple numerical variables.

Pair Plots (Scatter Matrices)¶

A pair plot, often referred to as a scatter matrix in certain tools, is a matrix layout of scatter plots, showcasing the relationship between two variables. This visualization offers a comprehensive depiction of pairwise relationships and distributions in a dataset.

Key Insights from Pair Plots:¶

  1. Relationships between Variables: The off-diagonal subplots show scatter plots of two distinct variables, useful for discerning linear or non-linear relationships, anomalies, or clusters.

  2. Distribution of Individual Variables: Diagonal plots (often histograms or Kernel Density Estimates in tools like seaborn) display the distribution of individual variables.

  3. Comparison Across Classes: A categorical variable (here, Outcome) allows coloring points by class via the hue parameter, which helps in observing how the classes cluster across features.

  4. Visual Correlation Indication: Though scatter plots don't provide quantitative correlation values, visible patterns can hint at potential correlations.

  5. Spotting Outliers: Outliers or anomalies might become evident through scatter plots.

Points to Consider:¶

  • Dimensionality: With numerous features, pair plots can become cluttered and less intuitive.
  • Overplotting: With large datasets, overplotting can occur where data points overlap, obscuring distribution. Techniques like point transparency, hexbin plots, or 2D KDE plots can help.
  • Applicability: These plots are ideal for numeric data and aren't tailored for categorical variables. However, categorically coloring data points can enhance insights.

In essence, during the exploratory data analysis phase, pair plots serve as invaluable tools for gleaning insights and guiding further feature engineering or analysis.

In [30]:
import seaborn as sns
#sns.pairplot(df)

sns.set(font_scale=2)
plt.figure(figsize=(10, 8))
sns.set_style("white")
sns.set_palette("bright")
sns.pairplot(df,kind = 'reg',corner = True,hue="Outcome")
Out[30]:
<seaborn.axisgrid.PairGrid at 0x7f1575560e50>
<Figure size 1000x800 with 0 Axes>
[Figure: pair plot of features with regression fits, colored by Outcome]

Heatmaps¶

To visualize the correlation matrix.

In [31]:
#sns.heatmap(df.corr())

corr=df.corr().round(2)

sns.set(font_scale=1.15)
plt.figure(figsize=(14, 10))
sns.set_palette("bright")
sns.set_style("white")
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr,annot=True,cmap='gist_yarg_r',mask=mask,cbar=True)
plt.title('Correlation Plot')
Out[31]:
Text(0.5, 1.0, 'Correlation Plot')
[Figure: lower-triangle correlation heatmap of all features]
In [32]:
df.columns
Out[32]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DPF', 'Age', 'Outcome'],
      dtype='object')

Consider the last row or column of the correlation matrix, which shows the correlation of Outcome with each feature. Outcome is positively correlated with every feature, most strongly with Glucose, followed by BMI, Age, Pregnancies, DPF and Insulin.

Box Plots¶

To identify outliers and understand the spread of data.

A box plot, also known as a box-and-whisker plot, is a graphical representation of a dataset's five-number summary: the minimum, first quartile (Q1), median (Q2 or second quartile), third quartile (Q3), and maximum. It provides a visual summary of the data's distribution, spread, skewness, and the presence of outliers. Here's a detailed breakdown of what a box plot tells us:

Components of a Box Plot

  1. Box: The central box represents the interquartile range (IQR), which is the range between the first quartile (25th percentile) and the third quartile (75th percentile).

  2. Whiskers: The lines extending from the box, known as whiskers, indicate variability outside the IQR. They can either represent a specific percentile or extend to the minimum or maximum data values that are not considered outliers.

  • whis : float or (float, float), default: 1.5

    The position of the whiskers. If a float, the lower whisker is at the lowest datum above Q1 - whis*(Q3 - Q1), and the upper whisker at the highest datum below Q3 + whis*(Q3 - Q1), where Q1 and Q3 are the first and third quartiles.

    The default value of whis = 1.5 corresponds to Tukey's original definition of boxplots.

  3. Median Line: A line within the box marks the median of the data.

  4. Outliers: Data points that fall far from the other data points may be plotted individually and are considered outliers.

What It Tells Us

  1. Central Tendency: The line inside the box shows the median of the dataset, providing an idea of the dataset's central tendency.

  2. Variability/Spread: The length of the box (IQR) and whiskers indicate the spread of the data. A longer box or whiskers suggest higher variability, while shorter ones imply lower variability.

  3. Symmetry and Skewness:

  • If the median is centrally located within the box, the data is roughly symmetrical.
  • If the median is closer to the bottom of the box, the data is positively skewed.
  • If the median is closer to the top, the data is negatively skewed.

  4. Outliers: Points that are plotted individually outside the whiskers are potential outliers. These can indicate anomalies or errors in the data.

  5. Comparison Across Groups: When multiple box plots are displayed side-by-side, you can compare the distributions of different groups easily. This is useful for observing relationships between a numerical variable and a categorical variable.

  6. Detection of Anomalies: Extremely long or uneven whiskers, or an asymmetric box, can indicate potential issues with the data collection process or the presence of extreme values affecting the data distribution.

  7. Quantiles: The quartiles (Q1 and Q3) provide additional information on data distribution. For instance, if Q1 is closer to the median than Q3, it indicates that the lower half of the dataset is more condensed compared to the upper half.

  8. Tail Length: The length of the whiskers can indicate the "heaviness" of the tails in the distribution. Long whiskers suggest long tails, and short whiskers suggest short tails.

In summary, a box plot is a powerful tool for quickly visualizing complex datasets, providing insights into the data's shape, variability, central tendency, and outliers.
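
As a complement to the box plots below, here is a minimal sketch that applies the same Tukey 1.5*IQR whisker rule described above to df, counting per column how many values would be flagged as potential outliers (detection only; nothing is removed).

# Count values outside Tukey's whiskers (Q1 - 1.5*IQR, Q3 + 1.5*IQR) per column.
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
outlier_mask = df.lt(q1 - 1.5 * iqr) | df.gt(q3 + 1.5 * iqr)
print(outlier_mask.sum())   # number of flagged values in each column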

In [33]:
import numpy as np

# Number of columns and rows for the subplot array
n_cols = 4
n_rows = int(np.ceil( (len(df.columns)-1) / n_cols))
# Create a figure and an array of subplots
fig, ax = plt.subplots(n_rows, n_cols, figsize=(10, 10))

# Flatten the array of subplots if necessary
ax = ax.flatten()

# Loop through each column and each subplot
for i, column in enumerate(df.columns):
    if column == "Outcome":
      continue
    ax[i].boxplot(df[column])  # Create the boxplot
    ax[i].set_title(f"{column}")  # Set the title
    ax[i].set_xlabel(column)  # Set the x-axis label
    ax[i].set_ylabel('Values')  # Set the y-axis label

# Remove any unused subplots
for i in range(len(df.columns), n_cols * n_rows):
    fig.delaxes(ax[i])

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()  # Show the plot
[Figure: box plots of each feature]
In [34]:
f, axes = plt.subplots(nrows=2,ncols=4, figsize=(20,4))

for i,column in enumerate(df.columns):
  if column != 'Outcome':
    sns.boxplot(x="Outcome", y=column, data=df, palette=colors, ax=axes[int(i/4),i%4])
[Figure: box plots of each feature grouped by Outcome]

Checking and Removing Outliers¶

In [35]:
def histplot(df):
  rcParams['figure.figsize'] = 40,60
  sns.set(font_scale = 3)
  sns.set_style("white")
  sns.set_palette("bright")
  plt.subplots_adjust(hspace=0.5)
  i = 1;
  for name in feature_names:
      plt.subplot(5,2,i)
      sns.histplot(data=df, x=name, hue="Outcome",kde=True,palette="YlGnBu")
      i = i + 1

Logically, zero values do not make sense for most of these columns; they are effectively missing values. We shall replace the zero values with the mean of the corresponding column.

Pregnancies, however, can legitimately be 0, hence we will not replace zeros in that column.

In [36]:
zero_features = ['Glucose','BloodPressure','SkinThickness',"Insulin",'BMI']
total_count = df.shape[0]

print("Proportion of Zeros:")
for feature in zero_features:
    zero_count = df[df[feature]==0].shape[0]
    pcent = zero_count/total_count * 100
    print('{0}: {1}, {2:.2f}%'.format(feature, zero_count, pcent))
Proportion of Zeros:
Glucose: 5, 0.65%
BloodPressure: 35, 4.56%
SkinThickness: 227, 29.56%
Insulin: 374, 48.70%
BMI: 11, 1.43%
In [37]:
# Replace zeros with the column mean (note: the mean is computed before
# replacement, so the zero entries are still included in it).
mean = df[zero_features].mean()
df[zero_features] = df[zero_features].replace(0, mean)
In [38]:
histplot(df)
[Figure: feature histograms (with KDE) after zero replacement, split by Outcome]
In [39]:
df_stats2 = compute_stat(df)
df_styled2 = (df_stats2.style.map(skew_warning, subset=['Skewness']).map(kurtosis_warning, subset=['Kurtosis']))
df_styled2
Out[39]:
  Feature Mean Std Skewness Kurtosis
0 Pregnancies 3.845052 3.369578 0.901674 0.159220
1 Glucose 121.681605 30.436016 0.533225 -0.258820
2 BloodPressure 72.254807 12.115932 0.173050 1.079233
3 SkinThickness 26.606479 9.631241 1.226670 3.904657
4 Insulin 118.660163 93.080358 3.291825 14.141704
5 BMI 32.450805 6.875374 0.601103 0.921318
6 DPF 0.471876 0.331329 1.919911 5.594954
7 Age 33.240885 11.760232 1.129597 0.643159
8 Outcome 0.348958 0.476951 0.635017 -1.600930

Scaling¶

Quantile Transformation¶

What is a Quantile?¶

A quantile is a data point or set of data points that partition your data into "parts" of equal probability. For instance:

  • The median splits your data into two equal parts, with 50% of the data below and 50% above. It's also known as the 50th percentile or 0.5 quantile.
  • Quartiles divide your data into four parts: the first quartile (25th percentile or 0.25 quantile) has 25% of the data below it.
  • Percentiles divide your data into 100 equal parts.
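
A tiny numpy illustration of these definitions (toy numbers, not this dataset):

import numpy as np

x = np.array([2, 4, 4, 5, 7, 9, 11, 15])
print(np.quantile(x, [0.25, 0.5, 0.75]))   # Q1, median (0.5 quantile), Q3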

Quantile Transformation:¶

Quantile transformation, often called quantile normalization, involves converting the distribution of a given dataset to follow another specific distribution, commonly the standard normal distribution. The transformation is achieved by mapping the quantiles of the input data to the quantiles of the desired distribution.

Steps involved in Quantile Transformation:¶

  1. Rank Data: Rank the data points in the column or feature you wish to transform.
  2. Map to Uniform Distribution: Convert the ranks to the range [0, 1]. This step effectively maps your data to a uniform distribution.
  3. Map to Desired Distribution: Convert the uniformly distributed data to your target distribution, usually a standard normal distribution. This is done by applying the inverse of the cumulative distribution function (CDF) of the target distribution.
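
A minimal sketch of these three steps by hand (assuming scipy is available; x is any 1-D numeric array). The QuantileTransformer used later in this notebook does essentially this, interpolating between a fixed number of reference quantiles.

import numpy as np
from scipy.stats import norm, rankdata

def manual_quantile_transform(x):
    ranks = rankdata(x)                  # 1. rank the data points (ties get average ranks)
    uniform = (ranks - 0.5) / len(x)     # 2. map the ranks into the open interval (0, 1)
    return norm.ppf(uniform)             # 3. inverse normal CDF -> approximately N(0, 1)

print(manual_quantile_transform(np.array([1.0, 5.0, 2.0, 100.0])))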

Why use Quantile Transformation?¶

  1. Distributional Assumptions: Some machine learning algorithms and statistical methods work best when the input features are normally distributed. Quantile transformation can help meet this assumption.
  2. Outlier Handling: By its nature, quantile transformation can reduce the impact of outliers. Outliers get mapped to values closer to the bulk of the data than they were before the transformation.
  3. Homogenize Variances: Quantile transformation can lead to homogenized variances across features, which can be beneficial for some algorithms.

Caveats:¶

  1. Interpretability: Post-transformation, the data might lose its original interpretability. For instance, a transformed age column might no longer represent ages in a recognizable way.
  2. Invariance: The transformation is not invariant to the number of samples. Adding or removing samples can change the transformation.
In [40]:
from sklearn.preprocessing import QuantileTransformer

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

# During an initial quantile transformation of all columns we found that
# Pregnancies and Age do not fit a normal distribution well; transforming
# them actually increased their skewness and kurtosis.
# Hence we are not going to process those two columns.

# Remove and store columns
cols_to_remove = ['Pregnancies', 'Age']
stored_cols_data = {col: (X.columns.get_loc(col), X.pop(col)) for col in cols_to_remove}

scaler = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
X_scaled = scaler.fit_transform(X)

df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Add stored columns back to their original positions
for col, (idx, ser) in stored_cols_data.items():
    df_scaled.insert(idx, col, ser)

df_scaled["Outcome"] = df["Outcome"]
In [41]:
rcParams['figure.figsize'] = 40,60
sns.set(font_scale = 3)
sns.set_style("white")
sns.set_palette("bright")
plt.subplots_adjust(hspace=0.5)
i = 1;
for name in feature_names:
    plt.subplot(4,4,i)
    plt.title("Before")
    sns.histplot(data=df, x=name, hue="Outcome",kde=True,palette="YlGnBu")
    plt.subplot(4,4,i+1)
    plt.title("After")
    sns.histplot(data=df_scaled, x=name, hue="Outcome",kde=True,palette="YlGnBu")
    i = i + 2
[Figure: feature distributions before and after quantile transformation, split by Outcome]
In [42]:
df_stats3 = compute_stat(df_scaled)
df_styled3 = (df_stats3.style.map(skew_warning, subset=['Skewness']).map(kurtosis_warning, subset=['Kurtosis']))
df_styled3
Out[42]:
  Feature Mean Std Skewness Kurtosis
0 Pregnancies 3.845052 3.369578 0.901674 0.159220
1 Glucose -0.001308 1.018416 0.001914 1.189431
2 BloodPressure 0.001339 1.017289 0.009287 1.187188
3 SkinThickness 0.005014 1.019622 -0.145940 1.965079
4 Insulin 0.009181 0.978774 0.023428 1.908228
5 BMI -0.007668 1.044513 -0.279772 2.306622
6 Age 33.240885 11.760232 1.129597 0.643159
7 DPF -0.000369 1.023290 -0.000730 1.227367
8 Outcome 0.348958 0.476951 0.635017 -1.600930

QQ Plot¶

A Quantile-Quantile (QQ) plot is a graphical tool used to assess if a dataset follows a particular theoretical distribution. In the QQ plot, the x-axis represents the quantiles from the theoretical distribution, and the y-axis shows the quantiles from the observed data. If the points in the QQ plot fall approximately along a straight line (usually a 45-degree line), it suggests that the data are distributed in a way that is similar to the theoretical distribution.

How QQ Plots Work

Suppose you have a dataset X and you want to test if it follows a specific distribution (e.g., Normal, Exponential, etc.).

  1. Sort the data in ascending order.
  2. Calculate the empirical quantiles for the data. These are the y-values in the QQ plot.
  3. Calculate the corresponding quantiles from the theoretical distribution you're comparing against. These become the x-values.
  4. Plot the empirical quantiles against the theoretical quantiles.

Interpretation

  • Data falls along the line: If the data points roughly follow the 45-degree line, this indicates that the sample distribution is similar to the theoretical distribution.

  • Data deviates at the ends: If the data points deviate from the line at the ends, it suggests that the sample has "heavier" tails than the normal distribution (or whatever distribution you're comparing against).

  • Systematic curve: If the data points form a curve, this could indicate skewness in the data.

In an ideal QQ plot for data following a normal distribution, the points align perfectly along the 45-degree line, which starts at the origin.

In a QQ plot for data that is not normally distributed, the points curve away from the 45-degree line, indicating that the distribution is skewed or has kurtosis different from that of a normal distribution.

In [43]:
import statsmodels.api as sm
import pylab as py
import matplotlib.pyplot as plt

rcParams['figure.figsize'] = 20,60
sns.set(font_scale = 1)
sns.set_style("white")
total_features = len(feature_names)
fig, axes = plt.subplots(total_features, 2)
plt.subplots_adjust(hspace=.5)

for i,col in zip(range(total_features),feature_names):
    sm.qqplot(df[col], line ='q',ax=axes[i,0])
    title = col + "(Before)"
    axes[i,0].set_title(title)
    sm.qqplot(df_scaled[col], line ='q',ax=axes[i,1])
    title = col + "(After)"
    axes[i,1].set_title(title)

plt.show()
[Figure: QQ plots of each feature before and after transformation]

Classifiers¶

We will use the processed version of the dataset to perform classification and compare it to the results of the unprocessed version of the dataset.

In [44]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

classifiers = { "LogisiticRegression": LogisticRegression(max_iter=500),
                "KNearest": KNeighborsClassifier(),
                "Support Vector Classifier": SVC(),
                "DecisionTreeClassifier": DecisionTreeClassifier()
              }

Original Data

In [45]:
# Split data into training and test sets
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44)

# Turn the values into an array for feeding the classification algorithms.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
Classifiers:  LogisticRegression Has a training score of 75.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 69.0 % accuracy score
Classifiers:  SVC Has a training score of 75.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 70.0 % accuracy score

Transformed/Processed Data

In [46]:
# Split data into training and test sets
X = df_scaled.drop('Outcome', axis=1)
y = df_scaled['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44)

# Turn the values into an array for feeding the classification algorithms.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
Classifiers:  LogisticRegression Has a training score of 74.0 % accuracy score
Classifiers:  KNeighborsClassifier Has a training score of 68.0 % accuracy score
Classifiers:  SVC Has a training score of 67.0 % accuracy score
Classifiers:  DecisionTreeClassifier Has a training score of 68.0 % accuracy score

Visualize Dataset¶

In [47]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X_scaled)
df_pca = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
df_pca['Outcome'] = df["Outcome"]  # Assume `y` contains labels for each data point

# Plot
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='Outcome')
plt.show()
[Figure: PCA scatter plot of PC1 vs PC2, colored by Outcome]

PCA Visualization Interpretation¶

The plot visualizes the quantile-transformed feature space (six features, since Pregnancies and Age were excluded from X_scaled) reduced to 2 dimensions using PCA. The two principal components, PC1 and PC2, are plotted on the x-axis and y-axis respectively. The data points are color-coded based on the 'Outcome' - whether a person is diabetic (1) or not (0). From this visualization, we can draw the following inferences:

1. Overlap of Classes¶

There's a significant overlap between the diabetic and non-diabetic classes in the space defined by PC1 and PC2. This means in this 2D representation, there isn't a clear linear boundary that segregates the two groups perfectly.

2. Density and Clustering¶

Most of the data points are clustered around the center of the plot, especially between -2 and 2 on both axes. This dense region consists of a mix of both classes, making it challenging to differentiate between them solely based on these two principal components.

3. Spread Along PC1 vs. PC2¶

The data seems to have a broader spread along the PC1 axis than the PC2 axis. This suggests that PC1 might capture a more significant portion of the variance from the original data compared to PC2.

4. Outliers¶

There are a few outlier data points seen, especially towards the top-right and bottom-left corners of the plot. Both classes have such outliers, indicating extreme or rare observations in the original dataset.

5. Potential for Further Analysis¶

Given that there isn't a clear separation between the classes in this PCA visualization, it may be beneficial to explore other dimensionality reduction techniques, more complex classifiers, or delve deeper into feature engineering to discern patterns in higher dimensions.

In summary, while PCA provides a visual insight into the high-dimensional data, the distinction between diabetic and non-diabetic individuals in this representation appears complex, suggesting the need for more detailed analysis or modeling strategies.

Model Training¶

Model Creation¶

In [48]:
from pycaret.classification import *
In [49]:
clf1 = setup(data = df_scaled,
             target = 'Outcome',
             preprocess = False)
  Description Value
0 Session id 5912
1 Target Outcome
2 Target type Binary
3 Original data shape (768, 9)
4 Transformed data shape (768, 9)
5 Transformed train set shape (537, 9)
6 Transformed test set shape (231, 9)
7 Numeric features 8
In [50]:
top5 = compare_models(sort='AUC',
                      n_select = 5,
                      exclude=["ridge","qda","lda","dt","dummy","lightgbm","xgboost"]
                      #exclude=['svm','knn','nb']
                     )
  Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
lr Logistic Regression 0.7487 0.8291 0.5611 0.6761 0.6082 0.4262 0.4336 0.3870
rf Random Forest Classifier 0.7656 0.8272 0.6044 0.6996 0.6404 0.4693 0.4778 0.1520
gbc Gradient Boosting Classifier 0.7487 0.8249 0.6041 0.6628 0.6248 0.4379 0.4444 0.0800
et Extra Trees Classifier 0.7636 0.8232 0.5830 0.6999 0.6316 0.4602 0.4676 0.0980
nb Naive Bayes 0.7431 0.8098 0.6409 0.6378 0.6342 0.4373 0.4415 0.0120
ada Ada Boost Classifier 0.7411 0.7975 0.6038 0.6507 0.6213 0.4260 0.4303 0.0540
svm SVM - Linear Kernel 0.6613 0.7886 0.5772 0.4990 0.4526 0.2630 0.3151 0.0120
knn K Neighbors Classifier 0.6853 0.7163 0.5184 0.5548 0.5321 0.2964 0.2996 0.0220
  1. Accuracy: This is the ratio of the correctly predicted instances to the total instances in the dataset. It's a general measure of a model's performance.

  2. AUC: The Area Under the ROC Curve (AUC) is a metric used for classification problems. The higher the AUC, the better the model is at predicting 0 classes as 0 and 1 classes as 1. An excellent model has an AUC near 1, which means it has a good measure of separability. A poor model has an AUC near 0, which means it has the worst measure of separability; in effect it is inverting the predictions, classifying 0s as 1s and 1s as 0s. When the AUC is 0.5, the model has no class separation capacity whatsoever.

  3. Recall: Also known as Sensitivity, True Positive Rate, or Hit Rate. It measures the proportion of actual positive cases that were identified correctly. Recall = TP / (TP + FN).

  4. Prec.: This stands for Precision, which is the ratio of correctly predicted positive observations to the total predicted positives. Precision = TP / (TP + FP).

  5. F1: The F1 Score is the harmonic mean of precision and recall. It takes both false positives and false negatives into account and ranges between 0 and 1. F1 Score is a good way to summarize the evaluation metrics into a single number.

  6. Kappa: Cohen's Kappa is a statistic that measures the degree of agreement between the predicted and actual categories of a classification problem. It accounts for the possibility of the agreement occurring by chance.

  7. MCC: Matthews Correlation Coefficient is another evaluation metric for classification. It takes into account true and false positives and negatives and is generally regarded as a balanced measure that can be used even if the classes are of very different sizes.

  8. TT (Sec): This stands for "Time Taken (Seconds)" and indicates the amount of time in seconds it took to train the respective model.
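
For reference, the agreement-based metrics above (Kappa and MCC) are available directly in scikit-learn; a tiny sketch with made-up labels:

from sklearn.metrics import cohen_kappa_score, matthews_corrcoef

y_true = [0, 0, 1, 1, 1, 0, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0, 1, 0]

print("Cohen's kappa:", cohen_kappa_score(y_true, y_pred))
print("MCC:", matthews_corrcoef(y_true, y_pred))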

Create Models

This function trains and evaluates the performance of a given estimator using cross validation. The output of this function is a score grid with CV scores by fold.

In [51]:
lda = create_model('lda')
lr = create_model('lr')
et = create_model('et')
rf = create_model('rf')
gbc = create_model('gbc')
dt = create_model("dt")
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7407 0.8271 0.5789 0.6471 0.6111 0.4176 0.4190
1 0.7593 0.8511 0.6316 0.6667 0.6486 0.4658 0.4661
2 0.8148 0.8857 0.6842 0.7647 0.7222 0.5840 0.5860
3 0.6481 0.6722 0.5263 0.5000 0.5128 0.2377 0.2379
4 0.7037 0.7970 0.6316 0.5714 0.6000 0.3656 0.3668
5 0.7407 0.8602 0.4211 0.7273 0.5333 0.3710 0.3976
6 0.7778 0.8662 0.5263 0.7692 0.6250 0.4749 0.4921
7 0.7925 0.8667 0.5556 0.7692 0.6452 0.5038 0.5172
8 0.7736 0.8746 0.6111 0.6875 0.6471 0.4812 0.4830
9 0.6981 0.7778 0.4444 0.5714 0.5000 0.2886 0.2933
Mean 0.7449 0.8278 0.5611 0.6675 0.6045 0.4190 0.4259
Std 0.0475 0.0615 0.0797 0.0900 0.0664 0.0993 0.1002
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.6852 0.8241 0.4737 0.5625 0.5143 0.2839 0.2862
1 0.7778 0.8632 0.6316 0.7059 0.6667 0.5008 0.5025
2 0.8333 0.8992 0.6842 0.8125 0.7429 0.6209 0.6259
3 0.6667 0.6677 0.5789 0.5238 0.5500 0.2863 0.2872
4 0.7037 0.7895 0.6316 0.5714 0.6000 0.3656 0.3668
5 0.7407 0.8526 0.4211 0.7273 0.5333 0.3710 0.3976
6 0.7963 0.8632 0.5789 0.7857 0.6667 0.5248 0.5375
7 0.7925 0.8794 0.5556 0.7692 0.6452 0.5038 0.5172
8 0.7736 0.8825 0.6111 0.6875 0.6471 0.4812 0.4830
9 0.7170 0.7698 0.4444 0.6154 0.5161 0.3234 0.3320
Mean 0.7487 0.8291 0.5611 0.6761 0.6082 0.4262 0.4336
Std 0.0516 0.0666 0.0831 0.0968 0.0736 0.1092 0.1101
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7593 0.8451 0.6842 0.6500 0.6667 0.4785 0.4788
1 0.7778 0.8586 0.5789 0.7333 0.6471 0.4882 0.4954
2 0.8333 0.8827 0.7368 0.7778 0.7568 0.6301 0.6307
3 0.6111 0.6383 0.4211 0.4444 0.4324 0.1370 0.1371
4 0.7407 0.8068 0.6316 0.6316 0.6316 0.4316 0.4316
5 0.7593 0.8504 0.4737 0.7500 0.5806 0.4236 0.4456
6 0.7963 0.8722 0.5263 0.8333 0.6452 0.5123 0.5389
7 0.8113 0.8444 0.6111 0.7857 0.6875 0.5554 0.5644
8 0.8113 0.8659 0.6667 0.7500 0.7059 0.5677 0.5698
9 0.7358 0.7675 0.5000 0.6429 0.5625 0.3775 0.3836
Mean 0.7636 0.8232 0.5830 0.6999 0.6316 0.4602 0.4676
Std 0.0594 0.0695 0.0961 0.1063 0.0854 0.1291 0.1306
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7963 0.8534 0.7895 0.6818 0.7317 0.5689 0.5729
1 0.7963 0.8752 0.6316 0.7500 0.6857 0.5367 0.5410
2 0.7963 0.8827 0.6842 0.7222 0.7027 0.5479 0.5484
3 0.6481 0.6346 0.5263 0.5000 0.5128 0.2377 0.2379
4 0.7037 0.8023 0.6842 0.5652 0.6190 0.3802 0.3848
5 0.7407 0.8519 0.4211 0.7273 0.5333 0.3710 0.3976
6 0.7407 0.8414 0.4737 0.6923 0.5625 0.3874 0.4014
7 0.8491 0.8770 0.6667 0.8571 0.7500 0.6443 0.6547
8 0.8113 0.8675 0.6667 0.7500 0.7059 0.5677 0.5698
9 0.7736 0.7865 0.5000 0.7500 0.6000 0.4508 0.4688
Mean 0.7656 0.8272 0.6044 0.6996 0.6404 0.4693 0.4778
Std 0.0555 0.0710 0.1110 0.0959 0.0815 0.1180 0.1168
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7593 0.8737 0.7368 0.6364 0.6829 0.4906 0.4940
1 0.7963 0.9113 0.6316 0.7500 0.6857 0.5367 0.5410
2 0.8519 0.8602 0.7368 0.8235 0.7778 0.6672 0.6695
3 0.6481 0.6571 0.5263 0.5000 0.5128 0.2377 0.2379
4 0.6852 0.7639 0.7368 0.5385 0.6222 0.3634 0.3766
5 0.7407 0.8466 0.4737 0.6923 0.5625 0.3874 0.4014
6 0.7222 0.8376 0.4211 0.6667 0.5161 0.3350 0.3524
7 0.8113 0.8778 0.6111 0.7857 0.6875 0.5554 0.5644
8 0.7547 0.8635 0.6111 0.6471 0.6286 0.4457 0.4461
9 0.7170 0.7571 0.5556 0.5882 0.5714 0.3604 0.3607
Mean 0.7487 0.8249 0.6041 0.6628 0.6248 0.4379 0.4444
Std 0.0573 0.0724 0.1061 0.0989 0.0809 0.1199 0.1189
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.6852 0.6729 0.6316 0.5455 0.5854 0.3338 0.3361
1 0.7037 0.6752 0.5789 0.5789 0.5789 0.3504 0.3504
2 0.8333 0.8353 0.8421 0.7273 0.7805 0.6473 0.6518
3 0.6667 0.6226 0.4737 0.5294 0.5000 0.2512 0.2520
4 0.6111 0.5797 0.4737 0.4500 0.4615 0.1575 0.1576
5 0.7222 0.6895 0.5789 0.6111 0.5946 0.3836 0.3839
6 0.7963 0.7346 0.5263 0.8333 0.6452 0.5123 0.5389
7 0.7925 0.7484 0.6111 0.7333 0.6667 0.5178 0.5223
8 0.6415 0.6341 0.6111 0.4783 0.5366 0.2513 0.2563
9 0.6226 0.5929 0.5000 0.4500 0.4737 0.1808 0.1814
Mean 0.7075 0.6785 0.5827 0.5937 0.5823 0.3586 0.3631
Std 0.0736 0.0742 0.1025 0.1250 0.0926 0.1512 0.1549
In [52]:
interpret_model(rf)
[Figure: SHAP feature-importance summary plot for the Random Forest model]

For the Random Forest Classifier model, Glucose is the most important feature.

Stacking¶

In [53]:
stack_model = stack_models(estimator_list = top5, meta_model = top5[0],optimize = 'AUC')
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7778 0.8602 0.6842 0.6842 0.6842 0.5128 0.5128
1 0.7963 0.8707 0.6316 0.7500 0.6857 0.5367 0.5410
2 0.8148 0.8962 0.6842 0.7647 0.7222 0.5840 0.5860
3 0.6667 0.6466 0.5789 0.5238 0.5500 0.2863 0.2872
4 0.7222 0.8075 0.6842 0.5909 0.6341 0.4122 0.4151
5 0.7222 0.8692 0.3684 0.7000 0.4828 0.3170 0.3476
6 0.7778 0.8692 0.5789 0.7333 0.6471 0.4882 0.4954
7 0.8302 0.8841 0.6111 0.8462 0.7097 0.5940 0.6098
8 0.7736 0.8905 0.6111 0.6875 0.6471 0.4812 0.4830
9 0.7170 0.7762 0.5000 0.6000 0.5455 0.3424 0.3454
Mean 0.7599 0.8370 0.5933 0.6881 0.6308 0.4555 0.4623
Std 0.0485 0.0729 0.0930 0.0900 0.0753 0.1048 0.1034
In [54]:
plt.figure(figsize=(8, 8))
plot_model(stack_model, plot='boundary')
[Figure: decision boundary plot of the stacked model]
In [55]:
plt.figure(figsize=(8, 8))
plot_model(stack_model, plot = 'auc')
[Figure: ROC/AUC plot of the stacked model]

Reference:

  1. Classification: ROC Curve and AUC
  2. Understanding AUC - ROC Curve

ROC Curve and AUC

The Receiver Operating Characteristic (ROC) curve is a graphical representation used to evaluate the performance of a binary classification model. The curve plots two parameters:

  1. True Positive Rate (TPR): Also known as Sensitivity or Recall, this is plotted on the Y-axis. It's calculated as \begin{equation} \text{TPR} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Negatives (FN)}}. \end{equation}

  2. False Positive Rate (FPR): This is plotted on the X-axis and calculated as \begin{equation} \text{FPR} = \frac{\text{False Positives (FP)}}{\text{False Positives (FP)} + \text{True Negatives (TN)}}. \end{equation}

The ROC curve is generated by varying the decision threshold for classification and plotting TPR against FPR at each threshold. A perfect classifier's curve goes straight up the Y-axis to a TPR of 1 and then horizontally across to an FPR of 1, meaning it correctly classifies all positive cases before misclassifying any negative ones.
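
A small self-contained sketch of how this curve is traced out with scikit-learn (toy labels and scores, unrelated to this notebook's data); roc_curve returns one (FPR, TPR) point per candidate threshold:

import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

y_true  = np.array([0, 0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])   # predicted probability of class 1

fpr, tpr, thresholds = roc_curve(y_true, y_score)
for t, f, s in zip(thresholds, fpr, tpr):
    print('threshold {0}: FPR {1:.2f}, TPR {2:.2f}'.format(t, f, s))
print('AUC:', roc_auc_score(y_true, y_score))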

Relation to AUC

AUC stands for "Area Under the ROC Curve." It provides a single numeric value summarizing the overall performance of the classification model, making it easier to compare different classifiers. The AUC ranges from 0 to 1, where:

  • An AUC of 0.5 corresponds to a model that is no better than random classification.
  • An AUC of 1 indicates perfect classification.
  • An AUC between 0.5 and 1 suggests that the model has some discriminatory power, with higher values indicating better performance.

AUC is a good metric when:

  • You need a summary metric to compare different models.
  • Your dataset has imbalanced classes.
  • You want to evaluate a model's performance across different decision thresholds.

Here's how to interpret the AUC value:

\begin{aligned} 0.5 \leq \text{AUC} < 0.6 &: \text{Poor} \\ 0.6 \leq \text{AUC} < 0.7 &: \text{Fair} \\ 0.7 \leq \text{AUC} < 0.8 &: \text{Good} \\ 0.8 \leq \text{AUC} < 0.9 &: \text{Very Good} \\ 0.9 \leq \text{AUC} \leq 1.0 &: \text{Excellent} \end{aligned}

By evaluating both the ROC curve and the AUC, you can get a comprehensive understanding of your model's performance in terms of both its discrimination capabilities and its performance across various decision thresholds.

ROC Curve and Perfect Separability

For "perfect separability," the ROC curve moves straight up the y-axis before proceeding horizontally along the x-axis. This trajectory is indicative of impeccable classification because the True Positive Rate becomes 1 before any increment in the False Positive Rate.

Here's a step-by-step breakdown:

  1. Starting Point (0,0): This implies both TPR (True Positive Rate) and FPR (False Positive Rate) are zero. The threshold is set so high that every prediction is negative, resulting in neither true positives nor false positives.

  2. Perfect Classifier:

    • Vertical Movement: As the threshold decreases, and positive samples are rightly predicted before any negative samples are mispredicted as positive, the curve shifts straight up the y-axis. The TPR increases without changing the FPR, indicating that our classifier is accurately predicting positive samples without any errors.

    • Horizontal Movement: Once all the positive samples are correctly classified, any further reduction in the threshold will only affect the negative samples, pushing the curve horizontally on the x-axis. Here, while TPR remains constant at 1, FPR begins to increase since more negative samples are misclassified as positive. For a perfect classifier, this horizontal movement happens only after reaching the graph's top (i.e., TPR = 1).

  3. Ending Point (1,1): This position implies that every sample is predicted as positive, making both TPR and FPR equal to 1. All positive samples are correctly predicted, but all negative samples are also wrongly identified as positive.

In essence, for perfect separability in the ROC curve, the goal is to achieve a 100% True Positive Rate without any False Positives. A common descriptive phrase, "moves from 0 to 1 on the y-axis and then 0 to 1 on the x-axis," might sound as if FPR increases first. However, the accurate interpretation is that the TPR rises to 1 (perfect sensitivity) before the FPR starts increasing.

In [56]:
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)

    roc_auc = roc_auc_score(y_test, pred_proba)

    # ROC-AUC print
    print('accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))
    return confusion
In [57]:
#prediction
pred = stack_model.predict(X_test)
pred_proba = stack_model.predict_proba(X_test)[:,1]
#Accuracy
confusion_stack = get_clf_eval(y_test,pred,pred_proba)
accuracy: 0.9427, precision: 0.9853, recall: 0.8701,    F1: 0.9241, AUC:0.9923
In [58]:
plt.figure(figsize=(10,6))
sns.kdeplot(pred_proba,label="lda")
plt.title("Probability density plots")
Out[58]:
Text(0.5, 1.0, 'Probability density plots')
[Figure: density plot of the stacked model's predicted probabilities]
In [59]:
#pred = lda.predict(X_test)
#pred_proba = lda.predict_proba(X_test)[:,1]
#confusion_stack = get_clf_eval(y_test,pred,pred_proba)
stack_model
Out[59]:
StackingClassifier(cv=5,
                   estimators=[('Logistic Regression',
                                LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=None, max_iter=1000,
                                                   multi_class='auto',
                                                   n_jobs=None, penalty='l2',
                                                   random_state=5912,
                                                   solver='lbfgs', tol=0.0001,
                                                   verbose=0,
                                                   warm_start=False)),
                               ('Random Forest Classifier',
                                RandomForestClassifier(boots...
                                GaussianNB(priors=None, var_smoothing=1e-09))],
                   final_estimator=LogisticRegression(C=1.0, class_weight=None,
                                                      dual=False,
                                                      fit_intercept=True,
                                                      intercept_scaling=1,
                                                      l1_ratio=None,
                                                      max_iter=1000,
                                                      multi_class='auto',
                                                      n_jobs=None, penalty='l2',
                                                      random_state=5912,
                                                      solver='lbfgs',
                                                      tol=0.0001, verbose=0,
                                                      warm_start=False),
                   n_jobs=-1, passthrough=False, stack_method='auto',
                   verbose=0)
LogisticRegression(max_iter=1000, random_state=5912)
RandomForestClassifier(n_jobs=-1, random_state=5912)
GradientBoostingClassifier(random_state=5912)
ExtraTreesClassifier(n_jobs=-1, random_state=5912)
GaussianNB()
LogisticRegression(max_iter=1000, random_state=5912)
In [60]:
plt.figure(figsize=(8, 6))
ax = sns.heatmap(confusion_stack, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (Stacking)')
Out[60]:
Text(0.5, 1.0, 'Confusion Matrix (Stacking)')
No description has been provided for this image

Soft Voting¶

In [61]:
soft = blend_models(estimator_list = top5, optimize = 'AUC',method = 'soft')
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.7778 0.8752 0.7368 0.6667 0.7000 0.5242 0.5259
1 0.7963 0.8752 0.6842 0.7222 0.7027 0.5479 0.5484
2 0.8148 0.8902 0.6842 0.7647 0.7222 0.5840 0.5860
3 0.6667 0.6632 0.5263 0.5263 0.5263 0.2692 0.2692
4 0.7407 0.7955 0.6842 0.6190 0.6500 0.4449 0.4463
5 0.7593 0.8767 0.4737 0.7500 0.5806 0.4236 0.4456
6 0.7963 0.8722 0.6316 0.7500 0.6857 0.5367 0.5410
7 0.8113 0.8857 0.6111 0.7857 0.6875 0.5554 0.5644
8 0.7736 0.8825 0.6667 0.6667 0.6667 0.4952 0.4952
9 0.6981 0.7698 0.5000 0.5625 0.5294 0.3083 0.3095
Mean 0.7635 0.8386 0.6199 0.6814 0.6451 0.4689 0.4732
Std 0.0464 0.0703 0.0854 0.0845 0.0692 0.1018 0.1022

The method parameter: 'hard' uses predicted class labels for majority-rule voting, while 'soft' predicts the class label from the argmax of the summed predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers. The default, 'auto', tries 'soft' and falls back to 'hard' if the former is not supported.
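As a rough sketch of the difference, using scikit-learn's VotingClassifier directly (PyCaret's blend_models builds a similar estimator internally; the estimator choices below are illustrative, not the tuned top5 list):

from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

estimators = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier()),
    ('nb', GaussianNB()),
]

# Soft voting: average predict_proba across estimators, then take the argmax
soft_vote = VotingClassifier(estimators=estimators, voting='soft')

# Hard voting: majority rule over predicted labels; no predict_proba,
# which is why probability-based metrics such as ROC-AUC are unavailable
hard_vote = VotingClassifier(estimators=estimators, voting='hard')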

In [62]:
# @title
plt.figure(figsize=(8, 8))
plot_model(soft, plot='boundary')
No description has been provided for this image
In [63]:
plt.figure(figsize=(8, 8))
plot_model(soft, plot = 'auc')
No description has been provided for this image
In [64]:
#prediction
pred = soft.predict(X_test)
pred_proba = soft.predict_proba(X_test)[:,1]
#Accuracy
confusion_soft = get_clf_eval(y_test,pred,pred_proba)
accuracy: 0.9531, precision: 1.0000, recall: 0.8831,    F1: 0.9379, AUC:0.9916
In [65]:
plt.figure(figsize=(8, 6))
ax = sns.heatmap(confusion_soft, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (Soft Blending)')
Out[65]:
Text(0.5, 1.0, 'Confusion Matrix (Soft Blending)')
No description has been provided for this image

Hard Voting¶

In [66]:
blend_hard = blend_models(estimator_list = top5, optimize = 'AUC',method = 'hard')
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.8148 0.0000 0.7895 0.7143 0.7500 0.6035 0.6054
1 0.7778 0.0000 0.6316 0.7059 0.6667 0.5008 0.5025
2 0.8333 0.0000 0.6842 0.8125 0.7429 0.6209 0.6259
3 0.6481 0.0000 0.5789 0.5000 0.5366 0.2554 0.2572
4 0.7222 0.0000 0.6316 0.6000 0.6154 0.3982 0.3985
5 0.7222 0.0000 0.3684 0.7000 0.4828 0.3170 0.3476
6 0.7778 0.0000 0.5789 0.7333 0.6471 0.4882 0.4954
7 0.8491 0.0000 0.6667 0.8571 0.7500 0.6443 0.6547
8 0.8113 0.0000 0.6667 0.7500 0.7059 0.5677 0.5698
9 0.7358 0.0000 0.5000 0.6429 0.5625 0.3775 0.3836
Mean 0.7693 0.0000 0.6096 0.7016 0.6460 0.4774 0.4841
Std 0.0589 0.0000 0.1084 0.0972 0.0903 0.1282 0.1262
In [67]:
plt.figure(figsize=(8, 8))
plot_model(blend_hard, plot='boundary')
No description has been provided for this image
In [68]:
#prediction
pred = blend_hard.predict(X_test)
# Hard voting exposes only class labels (no predict_proba), so ROC-AUC is not computed here
confusion_hard = confusion_matrix( y_test, pred)
accuracy = accuracy_score(y_test , pred)
precision = precision_score(y_test , pred)
recall = recall_score(y_test , pred)
f1 = f1_score(y_test,pred)
print('accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f},\
F1: {3:.4f}'.format(accuracy, precision, recall, f1))
accuracy: 0.9375, precision: 0.9577, recall: 0.8831,F1: 0.9189
In [69]:
plt.figure(figsize=(8, 6))
ax = sns.heatmap(confusion_hard, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (Hard Blending)')
Out[69]:
Text(0.5, 1.0, 'Confusion Matrix (Hard Blending)')
No description has been provided for this image

Calibrating the model¶

References:¶

  • Calibration Techniques and it’s importance in Machine Learning
In [70]:
cali_model = calibrate_model(soft)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.8148 0.8782 0.7895 0.7143 0.7500 0.6035 0.6054
1 0.8148 0.8752 0.6842 0.7647 0.7222 0.5840 0.5860
2 0.8148 0.8962 0.6842 0.7647 0.7222 0.5840 0.5860
3 0.6852 0.6632 0.5263 0.5556 0.5405 0.3014 0.3016
4 0.7593 0.8060 0.6842 0.6500 0.6667 0.4785 0.4788
5 0.7222 0.8722 0.3684 0.7000 0.4828 0.3170 0.3476
6 0.8148 0.8707 0.6316 0.8000 0.7059 0.5735 0.5820
7 0.8113 0.8825 0.6111 0.7857 0.6875 0.5554 0.5644
8 0.7925 0.8841 0.6111 0.7333 0.6667 0.5178 0.5223
9 0.7170 0.7714 0.4444 0.6154 0.5161 0.3234 0.3320
Mean 0.7747 0.8400 0.6035 0.7084 0.6461 0.4838 0.4906
Std 0.0474 0.0698 0.1190 0.0754 0.0912 0.1164 0.1130

Model Calibration in PyCaret¶

In PyCaret, the calibrate_model function calibrates the probabilities of a classification model. Calibration ensures that the predicted probabilities of a model align more closely with the actual probability of the event.

Why is Calibration Important?¶

Some models may produce raw scores or probabilities that do not accurately represent the true likelihood of an event. For instance, if a model assigns an instance a probability of 0.8 for the positive class, but only 70% of instances with that score actually belong to the positive class, then the model is "miscalibrated". Calibration helps adjust these probabilities.
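One way to diagnose this is a reliability curve, which bins the predicted probabilities and compares each bin's mean prediction with the observed fraction of positives. A minimal sketch with scikit-learn's calibration_curve, reusing y_test and the pred_proba array from the evaluation cells above:

from sklearn.calibration import calibration_curve

# Bin the predicted probabilities and compare with the observed positive rate
prob_true, prob_pred = calibration_curve(y_test, pred_proba, n_bins=10)

plt.plot(prob_pred, prob_true, marker='o', label='model')
plt.plot([0, 1], [0, 1], linestyle='--', label='perfectly calibrated')
plt.xlabel('Mean predicted probability')
plt.ylabel('Observed fraction of positives')
plt.legend()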

Calibration Techniques:¶

  1. Platt Scaling:

    • Applies a logistic regression model to the model's scores.
    • Commonly used with Support Vector Machines.
  2. Isotonic Regression:

    • Uses a piecewise constant non-decreasing function.
    • Generally more flexible than Platt Scaling.

In PyCaret, Platt Scaling is the default technique. However, users can also choose Isotonic Regression.
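PyCaret's calibrate_model delegates to scikit-learn's CalibratedClassifierCV (visible in the finalized pipeline later in this notebook, where method='sigmoid' corresponds to Platt Scaling). A minimal sketch of the two options, with GaussianNB as a stand-in base estimator for illustration:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB

# Platt Scaling: fit a logistic (sigmoid) curve to the base model's scores
platt_cal = CalibratedClassifierCV(GaussianNB(), method='sigmoid', cv=5)

# Isotonic Regression: fit a non-decreasing step function instead
iso_cal = CalibratedClassifierCV(GaussianNB(), method='isotonic', cv=5)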

Calibration Techniques in Machine Learning¶

Reference: Calibration Techniques and it’s importance in Machine Learning

Calibration is the process of refining the predicted probabilities of a classification model to ensure they align closely with the true probabilities of the outcomes.

1. Platt Scaling:¶

Platt scaling, named after John Platt, is a logistic regression on the predicted probabilities of a model. It's especially useful when the output of the classifiers isn't a probability (like Support Vector Machines).

How it works:¶

  • First, the model is trained and produces a raw score for each instance (like the distance from the decision boundary in SVMs).
  • These scores are then used as input features for a logistic regression model, which is trained to predict the true class labels.
  • The output of the logistic regression model is a probability between 0 and 1.

This logistic calibration ensures the output probabilities better represent the true likelihood of positive class instances.
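In the standard formulation, Platt scaling fits two scalar parameters $A$ and $B$ on held-out data so that a raw score $f$ maps to a calibrated positive-class probability:

\begin{equation} P(y = 1 \mid f) = \frac{1}{1 + \exp(Af + B)} \end{equation}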

2. Isotonic Regression:¶

Unlike Platt Scaling, which fits a logistic curve to the predicted scores, isotonic regression fits a non-parametric, piecewise constant function. This method can capture more complex calibration curves.

How it works:¶

  • Start by sorting the instances by their predicted scores.
  • A non-decreasing step function is fit to these scores in a way that minimizes the mean squared error to the true class labels.
  • The resulting function maps raw model scores to calibrated probabilities.

Isotonic regression is more flexible than Platt Scaling and can be particularly useful when calibration curves exhibit a non-logistic shape. However, its flexibility can also lead to overfitting, especially with limited data.
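As a small illustration (a sketch with made-up scores and labels, not data from this notebook), scikit-learn's IsotonicRegression fits exactly such a non-decreasing mapping:

import numpy as np
from sklearn.isotonic import IsotonicRegression

# Toy raw scores and true labels, purely for illustration
raw_scores = np.array([0.10, 0.35, 0.40, 0.65, 0.80, 0.90])
y_true = np.array([0, 1, 0, 0, 1, 1])

# Fit a non-decreasing step function mapping raw scores to probabilities
iso = IsotonicRegression(out_of_bounds='clip')
calibrated = iso.fit_transform(raw_scores, y_true)
print(calibrated)  # non-decreasing with respect to the raw scores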

Using Calibration in PyCaret:¶

In PyCaret, the calibrate_model function provides an easy way to apply these calibration techniques:

from pycaret.classification import *
clf1 = setup(data, target = 'target_column')
lr = create_model('lr')  # Example: creating a logistic regression model
calibrated_lr = calibrate_model(lr, method='isotonic')  # Using isotonic regression for calibration
In [71]:
plt.figure(figsize=(8, 8))
plot_model(cali_model, plot='calibration')
<Figure size 800x800 with 0 Axes>
No description has been provided for this image

Finalizing the last model¶

In [72]:
final_model = finalize_model(cali_model)
In [73]:
final_model
Out[73]:
Pipeline(memory=Memory(location=None),
         steps=[('placeholder', None),
                ('actual_estimator',
                 CalibratedClassifierCV(cv=5, ensemble=True,
                                        estimator=VotingClassifier(estimators=[('Logistic '
                                                                                'Regression',
                                                                                LogisticRegression(C=1.0,
                                                                                                   class_weight=None,
                                                                                                   dual=False,
                                                                                                   fit_intercept=True,
                                                                                                   intercept_scaling=1,
                                                                                                   l1_ratio=None,
                                                                                                   max_iter=1000,
                                                                                                   multi_class='auto',
                                                                                                   n_jobs=None,
                                                                                                   penalty...
                                                                                                     min_samples_leaf=1,
                                                                                                     min_samples_split=2,
                                                                                                     min_weight_fraction_leaf=0.0,
                                                                                                     monotonic_cst=None,
                                                                                                     n_estimators=100,
                                                                                                     n_jobs=-1,
                                                                                                     oob_score=False,
                                                                                                     random_state=5912,
                                                                                                     verbose=0,
                                                                                                     warm_start=False)),
                                                                               ('Naive '
                                                                                'Bayes',
                                                                                GaussianNB(priors=None,
                                                                                           var_smoothing=1e-09))],
                                                                   flatten_transform=True,
                                                                   n_jobs=-1,
                                                                   verbose=False,
                                                                   voting='soft',
                                                                   weights=None),
                                        method='sigmoid', n_jobs=None))],
         verbose=False)
VotingClassifier(estimators=[('Logistic Regression',
                              LogisticRegression(max_iter=1000,
                                                 random_state=5912)),
                             ('Random Forest Classifier',
                              RandomForestClassifier(n_jobs=-1,
                                                     random_state=5912)),
                             ('Gradient Boosting Classifier',
                              GradientBoostingClassifier(random_state=5912)),
                             ('Extra Trees Classifier',
                              ExtraTreesClassifier(n_jobs=-1,
                                                   random_state=5912)),
                             ('Naive Bayes', GaussianNB())],
                 n_jobs=-1, voting='soft')
LogisticRegression(max_iter=1000, random_state=5912)
RandomForestClassifier(n_jobs=-1, random_state=5912)
GradientBoostingClassifier(random_state=5912)
ExtraTreesClassifier(n_jobs=-1, random_state=5912)
GaussianNB()
In [74]:
plt.figure(figsize=(8, 8))
plot_model(final_model, plot='threshold')
No description has been provided for this image

Discrimination threshold plot: finding the optimal decision threshold for a binary classifier.

Precision and Recall Trade-off¶

Precision and recall are two commonly used metrics in classification tasks, especially when dealing with imbalanced datasets. They often show a trade-off in many practical scenarios:

Definitions¶

  • Precision (also known as Positive Predictive Value) is given by: \begin{equation} \text{Precision} = \frac{\text{TP}}{\text{TP} + \text{FP}} \end{equation}

  • Recall (also known as Sensitivity or True Positive Rate) is given by: \begin{equation} \text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}} \end{equation}
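As a quick worked example, with counts roughly consistent with the stacking results reported earlier (inferred from the printed metrics, so treat them as approximate): taking TP = 67, FP = 1, and FN = 10,

\begin{equation} \text{Precision} = \frac{67}{67 + 1} \approx 0.985, \qquad \text{Recall} = \frac{67}{67 + 10} \approx 0.870 \end{equation}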

Threshold Adjustment¶

The trade-off is often evident when adjusting the threshold for predicting positive cases in probabilistic classifiers:

  • Lowering the threshold increases both TP and FP, which typically increases recall but might decrease precision.
  • Raising the threshold reduces both TP and FP, leading to higher precision but potentially reducing recall.

Illustration¶

If almost all cases are predicted as positive, recall will be very high, but precision will be low due to many negative cases being incorrectly predicted as positive.

Precision-Recall Curve¶

The trade-off can be visualized with a Precision-Recall curve: each threshold contributes one (recall, precision) point. Ideally the curve hugs the top-right corner, keeping precision close to 1 across the full range of recall.
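A sketch of how that curve can be drawn with scikit-learn, assuming y_test and pred_proba from one of the evaluation cells above (e.g. the soft-voting blend):

from sklearn.metrics import precision_recall_curve

# One (precision, recall) point per candidate threshold
precision, recall, thresholds = precision_recall_curve(y_test, pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')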

Balancing Precision and Recall¶

Depending on the application, one might prioritize precision over recall or vice versa. In case of diabetes detection, high recall might be prioritized, whereas in email spam detection, high precision might be prioritized.
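For example, if recall is the priority for diabetes screening, the default 0.5 cut-off can be lowered when turning probabilities into labels. A sketch, again assuming pred_proba and y_test from the cells above; the 0.3 threshold is an arbitrary illustrative value, not a tuned one:

# Lower the decision threshold to catch more positives
# (higher recall, usually at the cost of some precision)
threshold = 0.3  # illustrative value
pred_low_threshold = (pred_proba >= threshold).astype(int)
print('recall:   ', recall_score(y_test, pred_low_threshold))
print('precision:', precision_score(y_test, pred_low_threshold))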

In [75]:
#prediction
pred = final_model.predict(X_test)
#Accuracy
confusion_final = confusion_matrix(y_test, pred)  # keep final_model intact; store the matrix separately
accuracy = accuracy_score(y_test , pred)
precision = precision_score(y_test , pred)
recall = recall_score(y_test , pred)
f1 = f1_score(y_test,pred)
print('accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f},\
F1: {3:.4f}'.format(accuracy, precision, recall, f1))
accuracy: 0.9479, precision: 0.9718, recall: 0.8961,F1: 0.9324
In [76]:
plt.figure(figsize=(8, 6))
ax = sns.heatmap(confusion_final, cmap = 'YlGnBu',annot = True, fmt='d')
ax.set_title('Confusion Matrix (final_model)')
Out[76]:
Text(0.5, 1.0, 'Confusion Matrix (final_model)')
No description has been provided for this image