Battle of the Auto ML titans for People Analytics application

towards-data-science

This post was originally published by Shilpa Leo at Towards Data Science

Why Auto ML?

Image by author

Understanding Fundamentals

Image by author

Evaluating Auto ML libraries

# importing libraries
import pandas as pd
import numpy as np

#auto ML library 1 pycaret imports
import pycaret 
from pycaret import classification

#auto ML library 2 tpot imports
import tpot
from tpot import TPOTClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

# Load the HR dataset; the leading '../' steps up one folder from the
# current working directory before descending into data/.
data_df = pd.read_csv(filepath_or_buffer='../data/HR_comma_sep.csv')
print(data_df.shape)  # tuple describing the frame's dimensions
data_df.head()
Image by author
Image by author
# counting missing values in the 'salary' column (only this one column is
# inspected by the expression below, not the whole dataset)
data_df['salary'].isna().sum()
Image by author

1. Pycaret

# Initialise the pycaret classification experiment on data_df with 'left'
# as the target; session_id pins the seed so reruns are repeatable.
exp_clf101 = classification.setup(
    data=data_df,
    target='left',
    session_id=123,
)
Image by author

Image by author
# Re-run the experiment setup, this time explicitly declaring which columns
# pycaret must treat as numeric features.
exp_clf101 = classification.setup(
    data=data_df,
    target='left',
    session_id=123,
    numeric_features=[
        'number_project',
        'time_spend_company',
        'Work_accident',
        'promotion_last_5years',
    ],
)
Image by author

Image by author
def salary_num(val):
    """Encode a salary label as an ordinal integer.

    Returns 0 for 'low' and 1 for 'medium'. Every other value -- whatever
    it is -- falls through to 2 (presumably intended for the top salary
    band; nothing here validates the label).
    """
    if val == 'low':
        return 0
    return 1 if val == 'medium' else 2
# Overwrite the 'salary' column with the integer codes produced by salary_num.
# NOTE: this is not safe to run twice -- once the column holds integers, no
# value equals 'low' or 'medium', so salary_num maps every row to 2.
data_df['salary'] = data_df['salary'].apply(salary_num)
data_df.head()
Image by author

Image by author

Image by author
# Compare the candidate classifiers with results sorted by F1; the value
# returned by compare_models is kept as best_model (presumably the top-ranked
# model -- see the pycaret documentation).
best_model = classification.compare_models(sort='F1')
Image by author

Image by author
# Confusion matrix computed on the train split
classification.plot_model(
    best_model,
    plot='confusion_matrix',
    use_train_data=True,
)

# Confusion matrix computed on the test split (use_train_data left at its default)
classification.plot_model(
    best_model,
    plot='confusion_matrix',
)
Image by author

Image by author
# Predict on a copy of the dataset so data_df itself is not the object handed
# to predict_model.
# NOTE(review): data_df is the same frame given to classification.setup, so
# these predictions presumably cover rows the model was trained on -- confirm
# this is intended before reading the scores as generalisation performance.
copy_data_df = data_df.copy()
classification.predict_model(best_model, data=copy_data_df)
Image by author

Image by author

2. TPOT

# splitting features (X) and response (y)
# The response is selected by name -- 'left', the same target passed to
# classification.setup in the pycaret section -- instead of by position, so
# the split is correct whatever the column order of data_df is. When 'left'
# is already the first column this yields the same X and y as slicing
# data[:, 1:] and data[:, 0].
data = data_df.values  # raw array of the whole frame, as before
X = data_df.drop(columns='left').values
y = data_df['left'].values
print(X.shape, y.shape)

# splitting train and test: 70% of the rows go to train, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=1)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
Image by author

Image by author
# Model evaluation scheme: repeated stratified k-fold, seeded
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

# Search definition: F1 is the scoring metric, evaluated with the cv above
model = TPOTClassifier(
    generations=5,
    population_size=50,
    cv=cv,
    scoring='f1',
    verbosity=2,
    random_state=1,
    n_jobs=6,
)

# Fit the search on the train features and response
model.fit(X_train, y_train)
Image by author

Image by author

Image by author

Conclusion

Spread the word

This post was originally published by Shilpa Leo at Towards Data Science

Related posts