A supervised ML workflow for building a classification model on tabular data with categorical and continuous features.
Using the Palmer Penguins dataset from seaborn, train a random forest to predict penguin species. Use scikit-learn for preprocessing, modeling, and evaluation.
This script:
- Uses a stratified train/test split
- Performs imputation inside the pipeline
- Handles categorical and numerical features separately
- Runs a grid search to find the best model parameters
- Evaluates results with both a classification report and a confusion matrix
- Extracts feature importances with proper feature naming
Note: Using display for HTML tables
print(summarize(df)) and print(df.head()) render the tables as plain text. For nicer HTML-formatted tables in a notebook, use display() instead of print():
from IPython.display import display

display(df.head())

# Display summary
display(summarize(df))
from IPython.display import display
import seaborn as sns
import pandas as pd
from minieda import summarize  # pip install git+https://github.com/dbolotov/minieda.git
import time
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

pd.set_option("display.width", 220)  # set display width for printed tables

# Load dataset and display first few rows
df = sns.load_dataset("penguins")

print("----- SCRIPT OUTPUT -----")

print("\n----- First Few Rows of Data -----\n")
print(df.head())

# Display summary
print("\n----- Data Summary -----\n")
print(summarize(df))

# Per-class value count
print("\n----- Target class frequencies (normalized) -----\n")
print(df['species'].value_counts(normalize=True).rename(None).rename_axis(None))

# Define columns
cat_cols = ['island', 'sex']
num_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# Drop rows where the target is missing (can't model without target)
df = df.dropna(subset=['species'])

# Split the data
X = df[cat_cols + num_cols]
y = df['species']

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define preprocessing for numeric and categorical features
numeric_preprocessing = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

categorical_preprocessing = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='first', sparse_output=False))  # one-hot; drop first feature to avoid multicollinearity
])

# Combine into a column transformer
preprocessor = ColumnTransformer([
    ('num', numeric_preprocessing, num_cols),
    ('cat', categorical_preprocessing, cat_cols)
])

# Base pipeline
clf_pipeline = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# Define hyperparameter grid
param_grid = {
    'model__n_estimators': [20, 30, 40],
    'model__max_depth': [None],
    'model__min_samples_leaf': [1, 3, 5, 7],
    'model__max_features': ['sqrt']
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit grid search on training data
print("\n----- GRID SEARCH -----\n")
start_time = time.time()
grid_search.fit(X_train, y_train)
print(f"\nGrid search completed in {time.time() - start_time:.2f} seconds")

print("\n----- Best Grid Search Result -----")
print(f"Accuracy: {grid_search.best_score_:.4f} ± {grid_search.cv_results_['std_test_score'][grid_search.best_index_]:.4f}")
print("Parameters:")
pprint(grid_search.best_params_)

# Use best model from grid search
clf_pipeline = grid_search.best_estimator_

# Evaluate
print("\n----- EVALUATION -----")
print("\n----- Train/Test Accuracy -----\n")
print(f"Train accuracy: {clf_pipeline.score(X_train, y_train):.4f}")
print(f"Test accuracy: {clf_pipeline.score(X_test, y_test):.4f}")

y_test_pred = clf_pipeline.predict(X_test)

print("\n----- Classification Report -----\n")
print(classification_report(y_test, y_test_pred))

print("\n----- Confusion Matrix -----\n")
cm = confusion_matrix(y_test, y_test_pred, labels=clf_pipeline.classes_)
print(cm)

# Print normalized feature importances
model = clf_pipeline.named_steps['model']
encoded_feature_names = clf_pipeline.named_steps['pre'].get_feature_names_out()
feat_importance_df = pd.DataFrame({
    'feature': encoded_feature_names,
    'importance': model.feature_importances_
}).sort_values(by='importance', ascending=False)

print("\n----- Feature Importance -----\n")
print(feat_importance_df)
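Once fitted, the pipeline applies the same imputation, scaling, and encoding to any new data, so it can be used for prediction directly. A minimal sketch that could be appended to the script (the measurement values below are hypothetical, and this step is not part of the output transcript that follows):

# Optional: predict on a new observation with the fitted pipeline
# (hypothetical measurement values, for illustration only)
new_penguin = pd.DataFrame([{
    'island': 'Biscoe', 'sex': 'Female',
    'bill_length_mm': 45.0, 'bill_depth_mm': 14.5,
    'flipper_length_mm': 210.0, 'body_mass_g': 4800.0
}])
print(clf_pipeline.predict(new_penguin))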
----- SCRIPT OUTPUT -----
----- First Few Rows of Data -----
  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g     sex
0  Adelie  Torgersen            39.1           18.7              181.0       3750.0    Male
1  Adelie  Torgersen            39.5           17.4              186.0       3800.0  Female
2  Adelie  Torgersen            40.3           18.0              195.0       3250.0  Female
3  Adelie  Torgersen             NaN            NaN                NaN          NaN     NaN
4  Adelie  Torgersen            36.7           19.3              193.0       3450.0  Female
----- Data Summary -----
                     dtype  count  unique  unique_perc  missing  missing_perc  zero  zero_perc     top  freq     mean     std     min     50%     max   skew
bill_length_mm     float64    342     164        47.67        2          0.58     0        0.0                    43.92    5.46    32.1   44.45    59.6   0.05
bill_depth_mm      float64    342      80        23.26        2          0.58     0        0.0                    17.15    1.97    13.1    17.3    21.5  -0.14
flipper_length_mm  float64    342      55        15.99        2          0.58     0        0.0                   200.92   14.06   172.0   197.0   231.0   0.35
body_mass_g        float64    342      94        27.33        2          0.58     0        0.0                  4201.75  801.95  2700.0  4050.0  6300.0   0.47
species             object    344       3         0.87        0          0.00     0        0.0  Adelie   152
island              object    344       3         0.87        0          0.00     0        0.0  Biscoe   168
sex                 object    333       2         0.58       11          3.20     0        0.0    Male   168
----- Target class frequencies (normalized) -----
Adelie 0.441860
Gentoo 0.360465
Chinstrap 0.197674
dtype: float64
----- GRID SEARCH -----
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Grid search completed in 3.15 seconds
----- Best Grid Search Result -----
Accuracy: 0.9855 ± 0.0073
Parameters:
{'model__max_depth': None,
'model__max_features': 'sqrt',
'model__min_samples_leaf': 1,
'model__n_estimators': 30}
----- EVALUATION -----
----- Train/Test Accuracy -----
Train accuracy: 1.0000
Test accuracy: 1.0000
----- Classification Report -----
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        30
   Chinstrap       1.00      1.00      1.00        14
      Gentoo       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69
----- Confusion Matrix -----
[[30  0  0]
 [ 0 14  0]
 [ 0  0 25]]
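The rows and columns of the matrix follow clf_pipeline.classes_ (alphabetical: Adelie, Chinstrap, Gentoo). If labeled output is preferred, a small addition along these lines could be appended to the script (a sketch reusing cm and clf_pipeline from above):

# Optional: wrap the confusion matrix in a labeled DataFrame
cm_df = pd.DataFrame(cm, index=clf_pipeline.classes_, columns=clf_pipeline.classes_)
print(cm_df)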
----- Feature Importance -----
                  feature  importance
0     num__bill_length_mm    0.312770
1      num__bill_depth_mm    0.216133
2  num__flipper_length_mm    0.205913
4       cat__island_Dream    0.138414
3        num__body_mass_g    0.096331
5   cat__island_Torgersen    0.021391
6           cat__sex_Male    0.009048
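The importance table can also be visualized. A minimal sketch that could be appended to the script, reusing feat_importance_df (assumes matplotlib is available, which seaborn already requires):

# Optional: bar chart of the sorted feature importances
import matplotlib.pyplot as plt

sns.barplot(data=feat_importance_df, x='importance', y='feature')
plt.title('Random forest feature importances')
plt.tight_layout()
plt.show()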