{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example: Automated feature scaling\n", "------------------------------------\n", "\n", "This example shows how ATOM automatically applies feature scaling for models that require it.\n", "\n", "Import the breast cancer dataset from [sklearn.datasets](https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-dataset). This is a small, easy-to-train dataset where the goal is to predict whether or not a patient has breast cancer." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import packages\n", "from sklearn.datasets import load_breast_cancer\n", "from atom import ATOMClassifier" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the data\n", "X, y = load_breast_cancer(return_X_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run the pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<< ================== ATOM ================== >>\n", "\n", "Configuration ==================== >>\n", "Algorithm task: Binary classification.\n", "\n", "Dataset stats ==================== >>\n", "Shape: (569, 31)\n", "Train set size: 456\n", "Test set size: 113\n", "-------------------------------------\n", "Memory: 141.24 kB\n", "Scaled: False\n", "Outlier values: 167 (1.2%)\n", "\n" ] } ], "source": [ "atom = ATOMClassifier(X, y, verbose=2, random_state=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | acronym | \n", "fullname | \n", "estimator | \n", "module | \n", "handles_missing | \n", "needs_scaling | \n", "accepts_sparse | \n", "native_multilabel | \n", "native_multioutput | \n", "validation | \n", "supports_engines | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "CatB | \n", "CatBoost | \n", "CatBoostClassifier | \n", "catboost.core | \n", "True | \n", "True | \n", "True | \n", "False | \n", "False | \n", "n_estimators | \n", "catboost | \n", "
1 | \n", "KNN | \n", "KNearestNeighbors | \n", "KNeighborsClassifier | \n", "sklearn.neighbors._classification | \n", "False | \n", "True | \n", "True | \n", "True | \n", "True | \n", "None | \n", "sklearn, sklearnex, cuml | \n", "
2 | \n", "LGB | \n", "LightGBM | \n", "LGBMClassifier | \n", "lightgbm.sklearn | \n", "True | \n", "True | \n", "True | \n", "False | \n", "False | \n", "n_estimators | \n", "lightgbm | \n", "
3 | \n", "lSVM | \n", "LinearSVM | \n", "LinearSVC | \n", "sklearn.svm._classes | \n", "False | \n", "True | \n", "True | \n", "False | \n", "False | \n", "None | \n", "sklearn, cuml | \n", "
4 | \n", "LR | \n", "LogisticRegression | \n", "LogisticRegression | \n", "sklearn.linear_model._logistic | \n", "False | \n", "True | \n", "True | \n", "False | \n", "False | \n", "None | \n", "sklearn, sklearnex, cuml | \n", "
5 | \n", "MLP | \n", "MultiLayerPerceptron | \n", "MLPClassifier | \n", "sklearn.neural_network._multilayer_perceptron | \n", "False | \n", "True | \n", "True | \n", "True | \n", "False | \n", "max_iter | \n", "sklearn | \n", "
6 | \n", "PA | \n", "PassiveAggressive | \n", "PassiveAggressiveClassifier | \n", "sklearn.linear_model._passive_aggressive | \n", "False | \n", "True | \n", "True | \n", "False | \n", "False | \n", "max_iter | \n", "sklearn | \n", "
7 | \n", "Perc | \n", "Perceptron | \n", "Perceptron | \n", "sklearn.linear_model._perceptron | \n", "False | \n", "True | \n", "False | \n", "False | \n", "False | \n", "max_iter | \n", "sklearn | \n", "
8 | \n", "RNN | \n", "RadiusNearestNeighbors | \n", "RadiusNeighborsClassifier | \n", "sklearn.neighbors._classification | \n", "False | \n", "True | \n", "True | \n", "True | \n", "True | \n", "None | \n", "sklearn | \n", "
9 | \n", "Ridge | \n", "Ridge | \n", "RidgeClassifier | \n", "sklearn.linear_model._ridge | \n", "False | \n", "True | \n", "True | \n", "True | \n", "False | \n", "None | \n", "sklearn, sklearnex, cuml | \n", "
10 | \n", "SGD | \n", "StochasticGradientDescent | \n", "SGDClassifier | \n", "sklearn.linear_model._stochastic_gradient | \n", "False | \n", "True | \n", "True | \n", "False | \n", "False | \n", "max_iter | \n", "sklearn | \n", "
11 | \n", "SVM | \n", "SupportVectorMachine | \n", "SVC | \n", "sklearn.svm._classes | \n", "False | \n", "True | \n", "True | \n", "False | \n", "False | \n", "None | \n", "sklearn, sklearnex, cuml | \n", "
12 | \n", "XGB | \n", "XGBoost | \n", "XGBClassifier | \n", "xgboost.sklearn | \n", "True | \n", "True | \n", "True | \n", "False | \n", "False | \n", "n_estimators | \n", "xgboost | \n", "