{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example: Automated feature scaling\n", "------------------------------------\n", "\n", "This example shows how ATOM automatically applies feature scaling for models that require it.\n", "\n", "Import the breast cancer dataset from [sklearn.datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-dataset). This is a small, easy-to-train dataset whose goal is to predict whether a patient has breast cancer." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import packages\n", "from sklearn.datasets import load_breast_cancer\n", "from atom import ATOMClassifier" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the data\n", "X, y = load_breast_cancer(return_X_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run the pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<< ================== ATOM ================== >>\n", "Algorithm task: binary classification.\n", "\n", "Dataset stats ==================== >>\n", "Shape: (569, 31)\n", "Train set size: 456\n", "Test set size: 113\n", "-------------------------------------\n", "Memory: 141.24 kB\n", "Scaled: False\n", "Outlier values: 167 (1.2%)\n", "\n" ] } ], "source": [ "atom = ATOMClassifier(X, y, verbose=2, random_state=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acronymmodelneeds_scaling
0AdaBAdaBoostFalse
1BagBaggingFalse
2BNBBernoulliNBFalse
3CatBCatBoostTrue
4CatNBCategoricalNBFalse
5CNBComplementNBFalse
6TreeDecisionTreeFalse
7DummyDummyFalse
8ETreeExtraTreeFalse
9ETExtraTreesFalse
10GNBGaussianNBFalse
11GPGaussianProcessFalse
12GBMGradientBoostingFalse
13hGBMHistGradientBoostingFalse
14KNNKNearestNeighborsTrue
15LGBLightGBMTrue
16LDALinearDiscriminantAnalysisFalse
17lSVMLinearSVMTrue
18LRLogisticRegressionTrue
19MLPMultiLayerPerceptronTrue
20MNBMultinomialNBFalse
21PAPassiveAggressiveTrue
22PercPerceptronTrue
23QDAQuadraticDiscriminantAnalysisFalse
24RNNRadiusNearestNeighborsTrue
25RFRandomForestFalse
26RidgeRidgeTrue
27SGDStochasticGradientDescentTrue
28SVMSupportVectorMachineTrue
29XGBXGBoostTrue
\n", "
" ], "text/plain": [ " acronym model needs_scaling\n", "0 AdaB AdaBoost False\n", "1 Bag Bagging False\n", "2 BNB BernoulliNB False\n", "3 CatB CatBoost True\n", "4 CatNB CategoricalNB False\n", "5 CNB ComplementNB False\n", "6 Tree DecisionTree False\n", "7 Dummy Dummy False\n", "8 ETree ExtraTree False\n", "9 ET ExtraTrees False\n", "10 GNB GaussianNB False\n", "11 GP GaussianProcess False\n", "12 GBM GradientBoosting False\n", "13 hGBM HistGradientBoosting False\n", "14 KNN KNearestNeighbors True\n", "15 LGB LightGBM True\n", "16 LDA LinearDiscriminantAnalysis False\n", "17 lSVM LinearSVM True\n", "18 LR LogisticRegression True\n", "19 MLP MultiLayerPerceptron True\n", "20 MNB MultinomialNB False\n", "21 PA PassiveAggressive True\n", "22 Perc Perceptron True\n", "23 QDA QuadraticDiscriminantAnalysis False\n", "24 RNN RadiusNearestNeighbors True\n", "25 RF RandomForest False\n", "26 Ridge Ridge True\n", "27 SGD StochasticGradientDescent True\n", "28 SVM SupportVectorMachine True\n", "29 XGB XGBoost True" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check which models require feature scaling\n", "atom.available_models()[[\"acronym\", \"model\", \"needs_scaling\"]]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Training ========================= >>\n", "Models: LR, Bag\n", "Metric: f1\n", "\n", "\n", "Results for LogisticRegression:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.9913\n", "Test evaluation --> f1: 0.9861\n", "Time elapsed: 0.182s\n", "-------------------------------------------------\n", "Total time: 0.182s\n", "\n", "\n", "Results for Bagging:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.9982\n", "Test evaluation --> f1: 0.9444\n", "Time elapsed: 0.378s\n", "-------------------------------------------------\n", "Total time: 
0.378s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 0.572s\n", "-------------------------------------\n", "LogisticRegression --> f1: 0.9861 !\n", "Bagging --> f1: 0.9444\n" ] } ], "source": [ "# We fit two models: LR needs scaling and Bag doesn't\n", "atom.run([\"LR\", \"Bag\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New branch scaling successfully created.\n" ] } ], "source": [ "# Now, we create a new branch and scale the features before fitting the model\n", "atom.branch = \"scaling\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting Scaler...\n", "Scaling features...\n" ] } ], "source": [ "atom.scale()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Training ========================= >>\n", "Models: LR2\n", "Metric: f1\n", "\n", "\n", "Results for LogisticRegression:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.9913\n", "Test evaluation --> f1: 0.9861\n", "Time elapsed: 0.123s\n", "-------------------------------------------------\n", "Total time: 0.123s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 0.133s\n", "-------------------------------------\n", "LogisticRegression --> f1: 0.9861\n" ] } ], "source": [ "atom.run(\"LR2\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Analyze the results" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Scaler()\n", "None\n", "None\n" ] } ], "source": [ "# Let's compare the differences between the models\n", "print(atom.lr.scaler)\n", "print(atom.bag.scaler)\n", "print(atom.lr2.scaler)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " x0 x1 x2\n", "0 -0.181875 0.356669 -0.147122\n", "1 1.162216 0.300578 1.159704\n", "2 1.056470 1.212060 0.933833\n", "3 0.277287 2.457753 0.188054\n", "4 -1.442482 -0.825921 -1.343434\n", "-----------------------------\n", " x0 x1 x2\n", "0 13.48 20.82 88.40\n", "1 18.31 20.58 120.80\n", "2 17.93 24.48 115.20\n", "3 15.13 29.81 96.71\n", "4 8.95 15.76 58.74\n", "-----------------------------\n", "True\n" ] } ], "source": [ "# The data differs per model: LR uses the scaled features and Bag the raw ones,\n", "# while LR2 (trained on the scaled branch) uses exactly the same data as LR\n", "print(atom.lr.X.iloc[:5, :3])\n", "print(\"-----------------------------\")\n", "print(atom.bag.X.iloc[:5, :3])\n", "print(\"-----------------------------\")\n", "print(atom.lr2.X_train.equals(atom.lr.X_train))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 Scaler()\n", "dtype: object\n", "-----------------------------\n", "Series([], Name: master, dtype: object)\n", "-----------------------------\n", "0 Scaler(verbose=2)\n", "dtype: object\n" ] } ], "source": [ "# Note that the scaler is part of the pipeline of LR and LR2, while Bag's pipeline is empty\n", "print(atom.lr.pipeline)\n", "print(\"-----------------------------\")\n", "print(atom.bag.pipeline)\n", "print(\"-----------------------------\")\n", "print(atom.lr2.pipeline)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "atom.plot_pipeline()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }