In [60]:
# Basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Model selection / tuning
from sklearn.model_selection import GridSearchCV, train_test_split

# Evaluation metrics
# (regression metrics kept for parity with the original notebook;
#  note: the original imported mean_squared_error twice — deduplicated here)
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
## Load data
In [61]:
# Load the Titanic training data (expects ../data/train.csv relative to the notebook).
df = pd.read_csv('../data/train.csv')

# Features: drop the target plus high-cardinality / mostly-missing identifier
# columns (Name, Ticket, Cabin). Note: `axis=1` is redundant when using the
# `columns=` keyword, so it is omitted here.
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
y = df['Survived']
## Transform data
In [62]:
# Build a ColumnTransformer with separate pipelines for numeric and
# categorical columns:
#   numeric    -> median impute + standardize
#   categorical-> mode impute + one-hot + scale (with_mean=False keeps the
#                 sparse one-hot output valid for StandardScaler)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("one_hot_encoder", OneHotEncoder()),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

preprocessor = ColumnTransformer(
    [
        # Consistent naming: both entries use the singular "_pipeline" suffix
        # (the original mixed "num_pipeline" / "cat_pipelines").
        ("num_pipeline", num_pipeline, num_features),
        ("cat_pipeline", cat_pipeline, cat_features),
    ]
)

# Keep the raw DataFrame X intact and store the transformed matrix under a new
# name: the original overwrote X with a numpy array, so re-running this cell
# crashed (ndarray has no .select_dtypes) — this version is re-run safe.
X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)
## Train and evaluate models
In [ ]:
def evaluate_model(true, predicted):
    """Compute standard classification metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    predicted : array-like
        Predicted class labels.

    Returns
    -------
    tuple of float
        (accuracy, precision, recall, f1) as computed by sklearn.metrics
        with their default (binary) settings.
    """
    return (
        accuracy_score(true, predicted),
        precision_score(true, predicted),
        recall_score(true, predicted),
        f1_score(true, predicted),
    )
# Candidate classifiers, keyed by display name.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "naive_bayes": GaussianNB(),
    "neural_network": MLPClassifier(),
}

# Hyper-parameter grids searched per model (empty dict -> defaults only).
params = {
    "Logistic Regression": {"max_iter": [100, 500, 1000]},
    "K-Neighbors": {"weights": ["uniform", "distance"]},
    "Decision Tree": {"splitter": ["best", "random"]},
    "Random Forest": {"criterion": ["gini", "entropy", "log_loss"]},
    "XGBClassifier": {},
    "naive_bayes": {},
    "neural_network": {"alpha": [0.0001, 0.001, 0.01]},
}

model_list = []
f1_list = []

metric_names = ("accuracy", "precision", "recall", "f1")

# Iterate over the dict directly instead of indexing by position
# (the original used range(len(list(models))) + repeated list() calls).
for name, model in models.items():
    # 3-fold grid search, then refit with the best params on the full training set.
    gs = GridSearchCV(model, params[name], cv=3)
    gs.fit(X_train, y_train)
    model.set_params(**gs.best_params_)
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate on train and test splits.
    train_metrics = evaluate_model(y_train, y_train_pred)
    test_metrics = evaluate_model(y_test, y_test_pred)

    model_list.append(name)
    f1_list.append(test_metrics[3])

    print(name)
    print('Model performance for Training set')
    for label, value in zip(metric_names, train_metrics):
        print("- {}: {:.4f}".format(label, value))
    print('----------------------------------')
    # BUG FIX: the original printed "Training set" for both halves of the report.
    print('Model performance for Test set')
    for label, value in zip(metric_names, test_metrics):
        print("- {}: {:.4f}".format(label, value))
    print('=' * 35)
    print('\n')
## Performance ranking of the models
In [64]:
# Rank models by test-set F1, descending.
# BUG FIX: np.c_ casts every entry to strings (dtype '<U32'), so the original
# argsort on result[:, 1] sorted the scores LEXICOGRAPHICALLY, not numerically
# (e.g. '0.9' > '0.85' holds only by accident of formatting). Sort on the
# float scores themselves instead.
result = np.c_[model_list, f1_list]
sorted_indices = np.argsort(np.asarray(f1_list, dtype=float))[::-1]
result = result[sorted_indices]
result
Out[64]:
array([['Random Forest', '0.7916666666666666'], ['Logistic Regression', '0.7586206896551724'], ['neural_network', '0.7536231884057971'], ['K-Neighbors', '0.75'], ['naive_bayes', '0.7435897435897436'], ['XGBClassifier', '0.7346938775510204'], ['Decision Tree', '0.7019867549668874']], dtype='<U32')