{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example: Feature engineering\n", "------------------------------\n", "\n", "This example shows how to use automated feature generation to improve a model's performance.\n", "\n", "The data used is a variation on the [Australian weather dataset](https://www.kaggle.com/jsphyg/weather-dataset-rattle-package) from Kaggle. You can download it from [here](https://github.com/tvdboom/ATOM/blob/master/examples/datasets/weatherAUS.csv). The goal of this dataset is to predict whether or not it will rain tomorrow by training a binary classifier on the target `RainTomorrow`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load the data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Import packages\n", "import pandas as pd\n", "from atom import ATOMClassifier" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Location | \n", "MinTemp | \n", "MaxTemp | \n", "Rainfall | \n", "Evaporation | \n", "Sunshine | \n", "WindGustDir | \n", "WindGustSpeed | \n", "WindDir9am | \n", "WindDir3pm | \n", "... | \n", "Humidity9am | \n", "Humidity3pm | \n", "Pressure9am | \n", "Pressure3pm | \n", "Cloud9am | \n", "Cloud3pm | \n", "Temp9am | \n", "Temp3pm | \n", "RainToday | \n", "RainTomorrow | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "MelbourneAirport | \n", "18.0 | \n", "26.9 | \n", "21.4 | \n", "7.0 | \n", "8.9 | \n", "SSE | \n", "41.0 | \n", "W | \n", "SSE | \n", "... | \n", "95.0 | \n", "54.0 | \n", "1019.5 | \n", "1017.0 | \n", "8.0 | \n", "5.0 | \n", "18.5 | \n", "26.0 | \n", "Yes | \n", "0 | \n", "
1 | \n", "Adelaide | \n", "17.2 | \n", "23.4 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "S | \n", "41.0 | \n", "S | \n", "WSW | \n", "... | \n", "59.0 | \n", "36.0 | \n", "1015.7 | \n", "1015.7 | \n", "NaN | \n", "NaN | \n", "17.7 | \n", "21.9 | \n", "No | \n", "0 | \n", "
2 | \n", "Cairns | \n", "18.6 | \n", "24.6 | \n", "7.4 | \n", "3.0 | \n", "6.1 | \n", "SSE | \n", "54.0 | \n", "SSE | \n", "SE | \n", "... | \n", "78.0 | \n", "57.0 | \n", "1018.7 | \n", "1016.6 | \n", "3.0 | \n", "3.0 | \n", "20.8 | \n", "24.1 | \n", "Yes | \n", "0 | \n", "
3 | \n", "Portland | \n", "13.6 | \n", "16.8 | \n", "4.2 | \n", "1.2 | \n", "0.0 | \n", "ESE | \n", "39.0 | \n", "ESE | \n", "ESE | \n", "... | \n", "76.0 | \n", "74.0 | \n", "1021.4 | \n", "1020.5 | \n", "7.0 | \n", "8.0 | \n", "15.6 | \n", "16.0 | \n", "Yes | \n", "1 | \n", "
4 | \n", "Walpole | \n", "16.4 | \n", "19.9 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "SE | \n", "44.0 | \n", "SE | \n", "SE | \n", "... | \n", "78.0 | \n", "70.0 | \n", "1019.4 | \n", "1018.9 | \n", "NaN | \n", "NaN | \n", "17.4 | \n", "18.1 | \n", "No | \n", "0 | \n", "
5 rows × 22 columns
\n", "\n", " | drop | \n", "corr_feature | \n", "corr_value | \n", "
---|---|---|---|
0 | \n", "MinTemp | \n", "Location + MinTemp, MinTemp + RainToday_No, Mi... | \n", "1.0, 0.9979, 1.0, 1.0, 1.0 | \n", "
1 | \n", "Location + MinTemp | \n", "MinTemp, MinTemp + RainToday_No, MinTemp + Win... | \n", "1.0, 0.9979, 1.0, 1.0, 1.0 | \n", "
2 | \n", "MinTemp + RainToday_No | \n", "MinTemp, Location + MinTemp, MinTemp + WindGus... | \n", "0.9979, 0.9979, 0.9978, 0.9979, 0.9979 | \n", "
3 | \n", "MinTemp - WindDir3pm | \n", "MinTemp, Location + MinTemp, MinTemp + RainTod... | \n", "1.0, 1.0, 0.9979, 0.9999, 1.0 | \n", "
4 | \n", "MinTemp - WindDir9am | \n", "MinTemp, Location + MinTemp, MinTemp + RainTod... | \n", "1.0, 1.0, 0.9979, 0.9999, 1.0 | \n", "
5 | \n", "MaxTemp | \n", "Temp3pm, Location + MaxTemp, Location + Temp3p... | \n", "0.9834, 1.0, 0.9834, 0.9999, 0.9985, 1.0 | \n", "
6 | \n", "Temp3pm | \n", "MaxTemp, Location + MaxTemp, Location + Temp3p... | \n", "0.9834, 0.9834, 1.0, 0.9833, 0.9825, 0.9835 | \n", "
7 | \n", "Location + MaxTemp | \n", "MaxTemp, Temp3pm, Location + Temp3pm, MaxTemp ... | \n", "1.0, 0.9834, 0.9834, 0.9999, 0.9985, 1.0 | \n", "
8 | \n", "Location + Temp3pm | \n", "MaxTemp, Temp3pm, Location + MaxTemp, MaxTemp ... | \n", "0.9834, 1.0, 0.9834, 0.9833, 0.9825, 0.9835 | \n", "
9 | \n", "MaxTemp - RainToday_Yes | \n", "MaxTemp, Temp3pm, Location + MaxTemp, Location... | \n", "0.9985, 0.9825, 0.9985, 0.9825, 0.9984, 0.9984 | \n", "
10 | \n", "MaxTemp - WindGustDir | \n", "MaxTemp, Temp3pm, Location + MaxTemp, Location... | \n", "1.0, 0.9835, 1.0, 0.9835, 0.9999, 0.9984 | \n", "
11 | \n", "Evaporation | \n", "Evaporation + RainToday_Yes, Evaporation + Win... | \n", "0.9936, 0.9999 | \n", "
12 | \n", "Evaporation + WindDir3pm | \n", "Evaporation, Evaporation + RainToday_Yes | \n", "0.9999, 0.9935 | \n", "
13 | \n", "Sunshine | \n", "Sunshine + WindDir3pm | \n", "0.9999 | \n", "
14 | \n", "WindGustDir | \n", "Location + WindGustDir | \n", "1.0 | \n", "
15 | \n", "WindGustSpeed | \n", "WindDir3pm + WindGustSpeed | \n", "1.0 | \n", "
16 | \n", "WindSpeed3pm | \n", "WindGustDir + WindSpeed3pm | \n", "1.0 | \n", "
17 | \n", "Humidity9am | \n", "Humidity9am + WindGustDir | \n", "1.0 | \n", "
18 | \n", "Humidity3pm | \n", "Evaporation + Humidity3pm, Humidity3pm + Sunsh... | \n", "0.9857, 0.9911, 1.0, 1.0, 0.9998, 1.0 | \n", "
19 | \n", "Humidity3pm + Sunshine | \n", "Humidity3pm, Evaporation + Humidity3pm, Humidi... | \n", "0.9911, 0.9804, 0.9911, 0.9911, 0.9907, 0.9911 | \n", "
20 | \n", "Humidity3pm + WindGustDir | \n", "Humidity3pm, Evaporation + Humidity3pm, Humidi... | \n", "1.0, 0.9857, 0.9911, 1.0, 0.9998, 1.0 | \n", "
21 | \n", "Humidity3pm - Location | \n", "Humidity3pm, Evaporation + Humidity3pm, Humidi... | \n", "1.0, 0.9857, 0.9911, 1.0, 0.9998, 1.0 | \n", "
22 | \n", "Humidity3pm - RainToday_No | \n", "Humidity3pm, Evaporation + Humidity3pm, Humidi... | \n", "0.9998, 0.9855, 0.9907, 0.9998, 0.9998, 0.9998 | \n", "
23 | \n", "Humidity3pm - RainToday_infrequent | \n", "Humidity3pm, Evaporation + Humidity3pm, Humidi... | \n", "1.0, 0.9857, 0.9911, 1.0, 1.0, 0.9998 | \n", "
24 | \n", "Cloud9am - RainToday_infrequent | \n", "Cloud9am | \n", "0.9992 | \n", "
25 | \n", "Temp9am | \n", "Temp9am + WindGustDir | \n", "1.0 | \n", "
26 | \n", "RainToday_Yes - WindDir3pm | \n", "RainToday_Yes | \n", "0.9944 | \n", "
27 | \n", "MinTemp + Temp3pm | \n", "MaxTemp + MinTemp | \n", "0.9949 | \n", "
\n", " | name | \n", "description | \n", "fitness | \n", "
---|---|---|---|
0 | \n", "x23 | \n", "add(add(sub(Cloud3pm, mul(RainToday_No, abs(Wi... | \n", "0.542362 | \n", "
1 | \n", "x24 | \n", "add(add(sub(Cloud3pm, mul(RainToday_No, abs(Wi... | \n", "0.542049 | \n", "
2 | \n", "x25 | \n", "add(add(sub(Cloud3pm, mul(RainToday_No, abs(Wi... | \n", "0.540022 | \n", "
3 | \n", "x26 | \n", "add(add(sub(Cloud3pm, mul(RainToday_No, Sunshi... | \n", "0.534542 | \n", "
4 | \n", "x27 | \n", "add(sub(sub(sub(Humidity3pm, Pressure3pm), mul... | \n", "0.533542 | \n", "
5 | \n", "x28 | \n", "add(add(sub(sub(Cloud3pm, mul(RainToday_No, Su... | \n", "0.533542 | \n", "
6 | \n", "x29 | \n", "add(sub(sub(sub(Humidity3pm, Pressure3pm), abs... | \n", "0.533542 | \n", "
7 | \n", "x30 | \n", "add(sub(sub(Humidity3pm, Pressure3pm), mul(Rai... | \n", "0.532984 | \n", "
8 | \n", "x31 | \n", "sub(Sunshine, add(add(sub(Cloud3pm, abs(WindSp... | \n", "0.532205 | \n", "
9 | \n", "x32 | \n", "sub(Sunshine, add(add(sub(Cloud3pm, abs(WindSp... | \n", "0.532200 | \n", "
10 | \n", "x33 | \n", "add(add(sub(Cloud3pm, abs(WindSpeed3pm)), Wind... | \n", "0.532200 | \n", "
11 | \n", "x34 | \n", "add(sub(sub(sub(Humidity3pm, Sunshine), Pressu... | \n", "0.532200 | \n", "
12 | \n", "x35 | \n", "add(sub(sub(sub(Humidity3pm, Pressure3pm), Sun... | \n", "0.532200 | \n", "
13 | \n", "x36 | \n", "sub(Sunshine, add(add(sub(Cloud3pm, abs(WindSp... | \n", "0.532200 | \n", "
14 | \n", "x37 | \n", "add(add(sub(Cloud3pm, abs(WindSpeed3pm)), Wind... | \n", "0.532200 | \n", "
15 | \n", "x38 | \n", "sub(add(add(sub(Humidity3pm, Pressure3pm), Win... | \n", "0.532200 | \n", "
16 | \n", "x39 | \n", "add(sub(sub(sub(sub(Humidity3pm, Pressure3pm),... | \n", "0.531546 | \n", "
17 | \n", "x40 | \n", "add(add(sub(Cloud3pm, abs(WindSpeed3pm)), Wind... | \n", "0.531546 | \n", "
18 | \n", "x41 | \n", "sub(Sunshine, add(add(sub(Cloud3pm, abs(abs(Wi... | \n", "0.531200 | \n", "
19 | \n", "x42 | \n", "add(sub(sub(sub(Humidity3pm, Pressure3pm), Sun... | \n", "0.531067 | \n", "