{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example: Automated feature scaling\n", "------------------------------------\n", "\n", "This example shows how ATOM handles models that require automated feature scaling.\n", "\n", "Import the breast cancer dataset from [sklearn.datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-dataset). This is a small and easy-to-train dataset whose goal is to predict whether a patient has breast cancer or not." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import packages\n", "from sklearn.datasets import load_breast_cancer\n", "from atom import ATOMClassifier" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the data\n", "X, y = load_breast_cancer(return_X_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run the pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<< ================== ATOM ================== >>\n", "\n", "Configuration ==================== >>\n", "Algorithm task: Binary classification.\n", "\n", "Dataset stats ==================== >>\n", "Shape: (569, 31)\n", "Train set size: 456\n", "Test set size: 113\n", "-------------------------------------\n", "Memory: 141.24 kB\n", "Scaled: False\n", "Outlier values: 167 (1.2%)\n", "\n" ] } ], "source": [ "atom = ATOMClassifier(X, y, verbose=2, random_state=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acronymfullnameestimatormodulehandles_missingneeds_scalingaccepts_sparsenative_multilabelnative_multioutputvalidationsupports_engines
0CatBCatBoostCatBoostClassifiercatboost.coreTrueTrueTrueFalseFalsen_estimatorscatboost
1KNNKNearestNeighborsKNeighborsClassifiersklearn.neighbors._classificationFalseTrueTrueTrueTrueNonesklearn, sklearnex, cuml
2LGBLightGBMLGBMClassifierlightgbm.sklearnTrueTrueTrueFalseFalsen_estimatorslightgbm
3lSVMLinearSVMLinearSVCsklearn.svm._classesFalseTrueTrueFalseFalseNonesklearn, cuml
4LRLogisticRegressionLogisticRegressionsklearn.linear_model._logisticFalseTrueTrueFalseFalseNonesklearn, sklearnex, cuml
5MLPMultiLayerPerceptronMLPClassifiersklearn.neural_network._multilayer_perceptronFalseTrueTrueTrueFalsemax_itersklearn
6PAPassiveAggressivePassiveAggressiveClassifiersklearn.linear_model._passive_aggressiveFalseTrueTrueFalseFalsemax_itersklearn
7PercPerceptronPerceptronsklearn.linear_model._perceptronFalseTrueFalseFalseFalsemax_itersklearn
8RNNRadiusNearestNeighborsRadiusNeighborsClassifiersklearn.neighbors._classificationFalseTrueTrueTrueTrueNonesklearn
9RidgeRidgeRidgeClassifiersklearn.linear_model._ridgeFalseTrueTrueTrueFalseNonesklearn, sklearnex, cuml
10SGDStochasticGradientDescentSGDClassifiersklearn.linear_model._stochastic_gradientFalseTrueTrueFalseFalsemax_itersklearn
11SVMSupportVectorMachineSVCsklearn.svm._classesFalseTrueTrueFalseFalseNonesklearn, sklearnex, cuml
12XGBXGBoostXGBClassifierxgboost.sklearnTrueTrueTrueFalseFalsen_estimatorsxgboost
\n", "
" ], "text/plain": [ " acronym fullname estimator \\\n", "0 CatB CatBoost CatBoostClassifier \n", "1 KNN KNearestNeighbors KNeighborsClassifier \n", "2 LGB LightGBM LGBMClassifier \n", "3 lSVM LinearSVM LinearSVC \n", "4 LR LogisticRegression LogisticRegression \n", "5 MLP MultiLayerPerceptron MLPClassifier \n", "6 PA PassiveAggressive PassiveAggressiveClassifier \n", "7 Perc Perceptron Perceptron \n", "8 RNN RadiusNearestNeighbors RadiusNeighborsClassifier \n", "9 Ridge Ridge RidgeClassifier \n", "10 SGD StochasticGradientDescent SGDClassifier \n", "11 SVM SupportVectorMachine SVC \n", "12 XGB XGBoost XGBClassifier \n", "\n", " module handles_missing \\\n", "0 catboost.core True \n", "1 sklearn.neighbors._classification False \n", "2 lightgbm.sklearn True \n", "3 sklearn.svm._classes False \n", "4 sklearn.linear_model._logistic False \n", "5 sklearn.neural_network._multilayer_perceptron False \n", "6 sklearn.linear_model._passive_aggressive False \n", "7 sklearn.linear_model._perceptron False \n", "8 sklearn.neighbors._classification False \n", "9 sklearn.linear_model._ridge False \n", "10 sklearn.linear_model._stochastic_gradient False \n", "11 sklearn.svm._classes False \n", "12 xgboost.sklearn True \n", "\n", " needs_scaling accepts_sparse native_multilabel native_multioutput \\\n", "0 True True False False \n", "1 True True True True \n", "2 True True False False \n", "3 True True False False \n", "4 True True False False \n", "5 True True True False \n", "6 True True False False \n", "7 True False False False \n", "8 True True True True \n", "9 True True True False \n", "10 True True False False \n", "11 True True False False \n", "12 True True False False \n", "\n", " validation supports_engines \n", "0 n_estimators catboost \n", "1 None sklearn, sklearnex, cuml \n", "2 n_estimators lightgbm \n", "3 None sklearn, cuml \n", "4 None sklearn, sklearnex, cuml \n", "5 max_iter sklearn \n", "6 max_iter sklearn \n", "7 max_iter sklearn \n", "8 None sklearn \n", "9 
None sklearn, sklearnex, cuml \n", "10 max_iter sklearn \n", "11 None sklearn, sklearnex, cuml \n", "12 n_estimators xgboost " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check which models require feature scaling\n", "atom.available_models(needs_scaling=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Training ========================= >>\n", "Models: LR, Bag\n", "Metric: f1\n", "\n", "\n", "Results for LogisticRegression:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.9913\n", "Test evaluation --> f1: 0.9861\n", "Time elapsed: 0.120s\n", "-------------------------------------------------\n", "Time: 0.120s\n", "\n", "\n", "Results for Bagging:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.9982\n", "Test evaluation --> f1: 0.9444\n", "Time elapsed: 0.067s\n", "-------------------------------------------------\n", "Time: 0.067s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 0.194s\n", "-------------------------------------\n", "LogisticRegression --> f1: 0.9861 !\n", "Bagging --> f1: 0.9444\n" ] } ], "source": [ "# We fit two models: LR needs scaling and Bag doesn't\n", "atom.run([\"LR\", \"Bag\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Successfully created new branch: scaling.\n" ] } ], "source": [ "# Now, we create a new branch and scale the features before fitting the model\n", "atom.branch = \"scaling\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting Scaler...\n", "Scaling features...\n" ] } ], "source": [ "atom.scale()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "\n", "Training ========================= >>\n", "Models: LR_2\n", "Metric: f1\n", "\n", "\n", "Results for LogisticRegression:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.9913\n", "Test evaluation --> f1: 0.9861\n", "Time elapsed: 0.032s\n", "-------------------------------------------------\n", "Time: 0.032s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 0.035s\n", "-------------------------------------\n", "LogisticRegression --> f1: 0.9861\n" ] } ], "source": [ "atom.run(\"LR_2\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Analyze the results" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Scaler()\n", "None\n", "None\n" ] } ], "source": [ "# Let's compare the differences between the models\n", "print(atom.lr.scaler)\n", "print(atom.bag.scaler)\n", "print(atom.lr_2.scaler)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " x0 x1 x2\n", "0 -0.181875 0.356669 -0.147122\n", "1 1.162216 0.300578 1.159704\n", "2 1.056470 1.212060 0.933833\n", "3 0.277287 2.457753 0.188054\n", "4 -1.442482 -0.825921 -1.343434\n", "-----------------------------\n", " x0 x1 x2\n", "0 13.48 20.82 88.40\n", "1 18.31 20.58 120.80\n", "2 17.93 24.48 115.20\n", "3 15.13 29.81 96.71\n", "4 8.95 15.76 58.74\n", "-----------------------------\n", "True\n" ] } ], "source": [ "# And the data they use is different\n", "print(atom.lr.X.iloc[:5, :3])\n", "print(\"-----------------------------\")\n", "print(atom.bag.X.iloc[:5, :3])\n", "print(\"-----------------------------\")\n", "print(atom.lr_2.X_train.equals(atom.lr.X_train))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Pipeline(memory=Memory(location=None),\n", " ('steps', [('AutomatedScaler', Scaler())]),\n", " verbose=False)\n", "-----------------------------\n", "Pipeline(memory=Memory(location=None), steps=[], verbose=False)\n", "-----------------------------\n", "Pipeline(memory=Memory(location=None),\n", " ('steps', [('scaler', Scaler(verbose=2))]),\n", " verbose=False)\n" ] } ], "source": [ "# Note that the scaler is included in the model's pipeline\n", "print(atom.lr.pipeline)\n", "print(\"-----------------------------\")\n", "print(atom.bag.pipeline)\n", "print(\"-----------------------------\")\n", "print(atom.lr_2.pipeline)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "atom.plot_pipeline()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }