{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example: Memory considerations\n", "--------------------------------\n", "\n", "This example shows how to use the `memory` parameter to make efficient use of the available memory.\n", "\n", "The data used is a variation on the [Australian weather dataset](https://www.kaggle.com/jsphyg/weather-dataset-rattle-package) from Kaggle. You can download it from [here](https://github.com/tvdboom/ATOM/blob/master/examples/datasets/weatherAUS.csv). The goal of this dataset is to predict whether or not it will rain tomorrow training a binary classifier on target `RainTomorrow`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load the data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Import packages\n", "import os\n", "import tempfile\n", "import pandas as pd\n", "from atom import ATOMClassifier" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LocationMinTempMaxTempRainfallEvaporationSunshineWindGustDirWindGustSpeedWindDir9amWindDir3pm...Humidity9amHumidity3pmPressure9amPressure3pmCloud9amCloud3pmTemp9amTemp3pmRainTodayRainTomorrow
0MelbourneAirport18.026.921.47.08.9SSE41.0WSSE...95.054.01019.51017.08.05.018.526.0Yes0
1Adelaide17.223.40.0NaNNaNS41.0SWSW...59.036.01015.71015.7NaNNaN17.721.9No0
2Cairns18.624.67.43.06.1SSE54.0SSESE...78.057.01018.71016.63.03.020.824.1Yes0
3Portland13.616.84.21.20.0ESE39.0ESEESE...76.074.01021.41020.57.08.015.616.0Yes1
4Walpole16.419.90.0NaNNaNSE44.0SESE...78.070.01019.41018.9NaNNaN17.418.1No0
\n", "

5 rows × 22 columns

\n", "
" ], "text/plain": [ " Location MinTemp MaxTemp Rainfall Evaporation Sunshine \\\n", "0 MelbourneAirport 18.0 26.9 21.4 7.0 8.9 \n", "1 Adelaide 17.2 23.4 0.0 NaN NaN \n", "2 Cairns 18.6 24.6 7.4 3.0 6.1 \n", "3 Portland 13.6 16.8 4.2 1.2 0.0 \n", "4 Walpole 16.4 19.9 0.0 NaN NaN \n", "\n", " WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am \\\n", "0 SSE 41.0 W SSE ... 95.0 \n", "1 S 41.0 S WSW ... 59.0 \n", "2 SSE 54.0 SSE SE ... 78.0 \n", "3 ESE 39.0 ESE ESE ... 76.0 \n", "4 SE 44.0 SE SE ... 78.0 \n", "\n", " Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am \\\n", "0 54.0 1019.5 1017.0 8.0 5.0 18.5 \n", "1 36.0 1015.7 1015.7 NaN NaN 17.7 \n", "2 57.0 1018.7 1016.6 3.0 3.0 20.8 \n", "3 74.0 1021.4 1020.5 7.0 8.0 15.6 \n", "4 70.0 1019.4 1018.9 NaN NaN 17.4 \n", "\n", " Temp3pm RainToday RainTomorrow \n", "0 26.0 Yes 0 \n", "1 21.9 No 0 \n", "2 24.1 Yes 0 \n", "3 16.0 Yes 1 \n", "4 18.1 No 0 \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load data\n", "X = pd.read_csv(\"./datasets/weatherAUS.csv\")\n", "\n", "# Let's have a look\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Define a temp directory to store the files in this example\n", "tempdir = tempfile.gettempdir()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_size(filepath):\n", " \"\"\"Return the size of the object in MB.\"\"\"\n", " return f\"{os.path.getsize(filepath + '.pkl') / 1e6:.2f}MB\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run the pipeline" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<< ================== ATOM ================== >>\n", "\n", "Configuration ==================== >>\n", "Algorithm task: Binary classification.\n", "\n", "Dataset stats 
==================== >>\n", "Shape: (142193, 22)\n", "Train set size: 113755\n", "Test set size: 28438\n", "-------------------------------------\n", "Memory: 25.03 MB\n", "Scaled: False\n", "Missing values: 316559 (10.1%)\n", "Categorical features: 5 (23.8%)\n", "Duplicates: 45 (0.0%)\n", "\n" ] } ], "source": [ "atom = ATOMClassifier(X, y=\"RainTomorrow\", verbose=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the dataset takes ~25MB. We can reduce the size of the dataset using \n", "the shrink method, which reduces the dtypes to their smallest possible value." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Location            object\n", "MinTemp            float64\n", "MaxTemp            float64\n", "Rainfall           float64\n", "Evaporation        float64\n", "Sunshine           float64\n", "WindGustDir         object\n", "WindGustSpeed      float64\n", "WindDir9am          object\n", "WindDir3pm          object\n", "WindSpeed9am       float64\n", "WindSpeed3pm       float64\n", "Humidity9am        float64\n", "Humidity3pm        float64\n", "Pressure9am        float64\n", "Pressure3pm        float64\n", "Cloud9am           float64\n", "Cloud3pm           float64\n", "Temp9am            float64\n", "Temp3pm            float64\n", "RainToday           object\n", "RainTomorrow         int64\n", "dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atom.dtypes" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The column dtypes are successfully converted.\n" ] } ], "source": [ "atom.shrink(str2cat=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Location         category\n", "MinTemp           Float32\n", "MaxTemp           Float32\n", "Rainfall          Float32\n", "Evaporation       Float32\n", "Sunshine          Float32\n", "WindGustDir      category\n", "WindGustSpeed       Int16\n", "WindDir9am       category\n", "WindDir3pm       category\n", "WindSpeed9am        Int16\n", "WindSpeed3pm         Int8\n", "Humidity9am          Int8\n", "Humidity3pm          
Int8\n", "Pressure9am Float32\n", "Pressure3pm Float32\n", "Cloud9am Int8\n", "Cloud3pm Int8\n", "Temp9am Float32\n", "Temp3pm Float32\n", "RainToday category\n", "RainTomorrow Int8\n", "dtype: object" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atom.dtypes" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset stats ==================== >>\n", "Shape: (142193, 22)\n", "Train set size: 113755\n", "Test set size: 28438\n", "-------------------------------------\n", "Memory: 9.67 MB\n", "Scaled: False\n", "Missing values: 316559 (10.1%)\n", "Categorical features: 5 (23.8%)\n", "Duplicates: 45 (0.0%)\n" ] } ], "source": [ "# Let's check the memory usage again...\n", "# Notice the huge drop!\n", "atom.stats()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting Imputer...\n", "Imputing missing values...\n", " --> Imputing 637 missing values with mean (12.19) in column MinTemp.\n", " --> Imputing 322 missing values with mean (23.23) in column MaxTemp.\n", " --> Imputing 1406 missing values with mean (2.37) in column Rainfall.\n", " --> Imputing 60843 missing values with mean (5.48) in column Evaporation.\n", " --> Imputing 67816 missing values with mean (7.63) in column Sunshine.\n", " --> Imputing 9330 missing values with most_frequent (W) in column WindGustDir.\n", " --> Imputing 9270 missing values with mean (40.0) in column WindGustSpeed.\n", " --> Imputing 10013 missing values with most_frequent (N) in column WindDir9am.\n", " --> Imputing 3778 missing values with most_frequent (SE) in column WindDir3pm.\n", " --> Imputing 1348 missing values with mean (14.02) in column WindSpeed9am.\n", " --> Imputing 2630 missing values with mean (18.64) in column WindSpeed3pm.\n", " --> Imputing 1774 missing values with mean (68.82) in column 
Humidity9am.\n", " --> Imputing 3610 missing values with mean (51.45) in column Humidity3pm.\n", " --> Imputing 14014 missing values with mean (1017.64) in column Pressure9am.\n", " --> Imputing 13981 missing values with mean (1015.25) in column Pressure3pm.\n", " --> Imputing 53657 missing values with mean (4.44) in column Cloud9am.\n", " --> Imputing 57094 missing values with mean (4.5) in column Cloud3pm.\n", " --> Imputing 904 missing values with mean (16.99) in column Temp9am.\n", " --> Imputing 2726 missing values with mean (21.69) in column Temp3pm.\n", " --> Imputing 1406 missing values with most_frequent (No) in column RainToday.\n", "Fitting Encoder...\n", "Encoding categorical columns...\n", " --> Target-encoding feature Location. Contains 49 classes.\n", " --> Target-encoding feature WindGustDir. Contains 16 classes.\n", " --> Target-encoding feature WindDir9am. Contains 16 classes.\n", " --> Target-encoding feature WindDir3pm. Contains 16 classes.\n", " --> Ordinal-encoding feature RainToday. 
Contains 2 classes.\n", "\n", "Training ========================= >>\n", "Models: LDA\n", "Metric: f1\n", "\n", "\n", "Results for LinearDiscriminantAnalysis:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.5906\n", "Test evaluation --> f1: 0.5904\n", "Time elapsed: 0.942s\n", "-------------------------------------------------\n", "Time: 0.942s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 1.005s\n", "-------------------------------------\n", "LinearDiscriminantAnalysis --> f1: 0.5904\n", "Successfully created new branch: b2.\n", "Fitting Scaler...\n", "Scaling features...\n", "\n", "Training ========================= >>\n", "Models: LDA_scaled\n", "Metric: f1\n", "\n", "\n", "Results for LinearDiscriminantAnalysis:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.5906\n", "Test evaluation --> f1: 0.5904\n", "Time elapsed: 0.956s\n", "-------------------------------------------------\n", "Time: 0.956s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 1.017s\n", "-------------------------------------\n", "LinearDiscriminantAnalysis --> f1: 0.5904\n", "Successfully created new branch: b3.\n", "Fitting Normalizer...\n", "Normalizing features...\n", "\n", "Training ========================= >>\n", "Models: LDA_norm\n", "Metric: f1\n", "\n", "\n", "Results for LinearDiscriminantAnalysis:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.5955\n", "Test evaluation --> f1: 0.594\n", "Time elapsed: 0.929s\n", "-------------------------------------------------\n", "Time: 0.929s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 0.991s\n", "-------------------------------------\n", "LinearDiscriminantAnalysis --> f1: 0.594\n" ] } ], "source": [ "# Now, we create some new branches to train models with different transformers\n", "atom.impute()\n", "atom.encode()\n", "atom.run(\"LDA\")\n", "\n", 
"atom.branch = \"b2\"\n", "atom.scale()\n", "atom.run(\"LDA_scaled\")\n", "\n", "atom.branch = \"b3_from_main\"\n", "atom.normalize()\n", "atom.run(\"LDA_norm\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ATOMClassifier successfully saved.\n" ] }, { "data": { "text/plain": [ "'83.93MB'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If we save atom now, notice the size\n", "# This is because atom keeps a copy of every branch in memory\n", "filename = os.path.join(tempdir, \"atom1\")\n", "atom.save(filename)\n", "get_size(filename)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To avoid large memory usages, set the `memory` parameter." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<< ================== ATOM ================== >>\n", "\n", "Configuration ==================== >>\n", "Algorithm task: Binary classification.\n", "Cache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n", "\n", "Dataset stats ==================== >>\n", "Shape: (142193, 22)\n", "Train set size: 113755\n", "Test set size: 28438\n", "-------------------------------------\n", "Memory: 25.03 MB\n", "Scaled: False\n", "Missing values: 316559 (10.1%)\n", "Categorical features: 5 (23.8%)\n", "Duplicates: 45 (0.0%)\n", "\n", "The column dtypes are successfully converted.\n", "Loading cached results for Imputer...\n", "Loading cached results for Encoder...\n", "\n", "Training ========================= >>\n", "Models: LDA\n", "Metric: f1\n", "\n", "\n", "Results for LinearDiscriminantAnalysis:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.5914\n", "Test evaluation --> f1: 0.5892\n", "Time elapsed: 0.953s\n", "-------------------------------------------------\n", "Time: 0.953s\n", "\n", "\n", "Final results 
==================== >>\n", "Total time: 1.015s\n", "-------------------------------------\n", "LinearDiscriminantAnalysis --> f1: 0.5892\n", "Successfully created new branch: b2.\n", "Loading cached results for Scaler...\n", "\n", "Training ========================= >>\n", "Models: LDA_scaled\n", "Metric: f1\n", "\n", "\n", "Results for LinearDiscriminantAnalysis:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.5914\n", "Test evaluation --> f1: 0.5892\n", "Time elapsed: 0.971s\n", "-------------------------------------------------\n", "Time: 0.971s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 1.028s\n", "-------------------------------------\n", "LinearDiscriminantAnalysis --> f1: 0.5892\n", "Successfully created new branch: b3.\n", "Loading cached results for Normalizer...\n", "\n", "Training ========================= >>\n", "Models: LDA_norm\n", "Metric: f1\n", "\n", "\n", "Results for LinearDiscriminantAnalysis:\n", "Fit ---------------------------------------------\n", "Train evaluation --> f1: 0.5957\n", "Test evaluation --> f1: 0.5935\n", "Time elapsed: 0.924s\n", "-------------------------------------------------\n", "Time: 0.924s\n", "\n", "\n", "Final results ==================== >>\n", "Total time: 0.985s\n", "-------------------------------------\n", "LinearDiscriminantAnalysis --> f1: 0.5935\n" ] } ], "source": [ "atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\n", "atom.shrink(str2cat=True)\n", "atom.impute()\n", "atom.encode()\n", "atom.run(\"LDA\")\n", "\n", "atom.branch = \"b2\"\n", "atom.scale()\n", "atom.run(\"LDA_scaled\")\n", "\n", "atom.branch = \"b3_from_main\"\n", "atom.normalize()\n", "atom.run(\"LDA_norm\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ATOMClassifier successfully saved.\n" ] }, { "data": { "text/plain": [ "'24.78MB'" ] }, 
"execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# And now, it only takes a fraction of the previous size\n", "# This is because the data of inactive branches is now stored locally\n", "filename = os.path.join(tempdir, \"atom2\")\n", "atom.save(filename)\n", "get_size(filename)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Additionally, repeated calls to the same transformers with the same data will use the cached results. \n", "Don't forget to specify the `random_state` parameter to ensure the data remains exactly the same." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<< ================== ATOM ================== >>\n", "\n", "Configuration ==================== >>\n", "Algorithm task: Binary classification.\n", "Cache storage: C:\\Users\\Mavs\\AppData\\Local\\Temp\\joblib\n", "\n", "Dataset stats ==================== >>\n", "Shape: (142193, 22)\n", "Train set size: 113755\n", "Test set size: 28438\n", "-------------------------------------\n", "Memory: 25.03 MB\n", "Scaled: False\n", "Missing values: 316559 (10.1%)\n", "Categorical features: 5 (23.8%)\n", "Duplicates: 45 (0.0%)\n", "\n", "The column dtypes are successfully converted.\n" ] } ], "source": [ "atom = ATOMClassifier(X, y=\"RainTomorrow\", memory=tempdir, verbose=1, random_state=1)\n", "atom.shrink(str2cat=True)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading cached results for Imputer...\n", "Loading cached results for Encoder...\n" ] } ], "source": [ "# Note the transformers are no longer fitted,\n", "# instead the results are immediately read from cache\n", "atom.impute()\n", "atom.encode()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LocationMinTempMaxTempRainfallEvaporationSunshineWindGustDirWindGustSpeedWindDir9amWindDir3pm...Humidity9amHumidity3pmPressure9amPressure3pmCloud9amCloud3pmTemp9amTemp3pmRainTodayRainTomorrow
00.07076713.030.5000000.0000006.8000010.0000000.27267759.00.2549950.282496...19.0000008.000001013.5999761008.0000000.0000002.0000019.60000029.9000000.00
10.1301638.825.2000010.0000005.000007.6142010.28516750.00.269670.278696...68.84221851.502391011.2000121006.5000004.4466573.0000015.90000023.7000010.01
20.26204319.926.6000008.0000005.464917.6142010.2665857.00.2549950.250291...81.00000081.000001013.0999761008.5999764.4466574.5092224.50000024.7000011.01
30.18391219.631.9000002.6000005.464917.6142010.2665859.00.2697750.220975...70.00000042.000001001.2000121002.4000242.0000008.0000025.79999922.0000001.00
40.25856915.322.40000016.0000004.200003.3000000.19446439.00.2458240.189182...83.00000063.000001025.5000001023.5999766.0000006.0000016.90000021.1000001.01
..................................................................
1421880.2787469.021.7999990.0000005.464917.6142010.15827633.00.2035970.277443...44.00000038.000001017.6609811015.2703964.4466574.5092216.60000021.1000000.01
1421890.30756211.519.2000010.8000002.000007.0000000.15827622.00.1439460.187433...73.00000052.000001021.2999881018.7999883.0000004.0000017.10000018.4000000.00
1421900.19783917.529.10000035.5999985.464917.6142010.15827633.00.2035970.180537...77.00000046.000001015.2000121013.7000124.4466574.5092221.00000028.7999991.00
1421910.3718535.918.0000000.4000000.800006.7000000.28516726.00.2549950.278696...92.00000065.000001028.0000001025.3000493.0000002.000009.40000016.6000000.00
1421920.29781810.218.1000000.2000005.464917.6142010.20588724.00.1500670.221562...84.00000094.000001018.0999761016.0000004.4466574.5092215.30000016.0000000.00
\n", "

142193 rows × 22 columns

\n", "
" ], "text/plain": [ " Location MinTemp MaxTemp Rainfall Evaporation Sunshine \\\n", "0 0.070767 13.0 30.500000 0.000000 6.80000 10.000000 \n", "1 0.130163 8.8 25.200001 0.000000 5.00000 7.614201 \n", "2 0.262043 19.9 26.600000 8.000000 5.46491 7.614201 \n", "3 0.183912 19.6 31.900000 2.600000 5.46491 7.614201 \n", "4 0.258569 15.3 22.400000 16.000000 4.20000 3.300000 \n", "... ... ... ... ... ... ... \n", "142188 0.278746 9.0 21.799999 0.000000 5.46491 7.614201 \n", "142189 0.307562 11.5 19.200001 0.800000 2.00000 7.000000 \n", "142190 0.197839 17.5 29.100000 35.599998 5.46491 7.614201 \n", "142191 0.371853 5.9 18.000000 0.400000 0.80000 6.700000 \n", "142192 0.297818 10.2 18.100000 0.200000 5.46491 7.614201 \n", "\n", " WindGustDir WindGustSpeed WindDir9am WindDir3pm ... Humidity9am \\\n", "0 0.272677 59.0 0.254995 0.282496 ... 19.000000 \n", "1 0.285167 50.0 0.26967 0.278696 ... 68.842218 \n", "2 0.26658 57.0 0.254995 0.250291 ... 81.000000 \n", "3 0.26658 59.0 0.269775 0.220975 ... 70.000000 \n", "4 0.194464 39.0 0.245824 0.189182 ... 83.000000 \n", "... ... ... ... ... ... ... \n", "142188 0.158276 33.0 0.203597 0.277443 ... 44.000000 \n", "142189 0.158276 22.0 0.143946 0.187433 ... 73.000000 \n", "142190 0.158276 33.0 0.203597 0.180537 ... 77.000000 \n", "142191 0.285167 26.0 0.254995 0.278696 ... 92.000000 \n", "142192 0.205887 24.0 0.150067 0.221562 ... 84.000000 \n", "\n", " Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am \\\n", "0 8.00000 1013.599976 1008.000000 0.000000 2.00000 19.600000 \n", "1 51.50239 1011.200012 1006.500000 4.446657 3.00000 15.900000 \n", "2 81.00000 1013.099976 1008.599976 4.446657 4.50922 24.500000 \n", "3 42.00000 1001.200012 1002.400024 2.000000 8.00000 25.799999 \n", "4 63.00000 1025.500000 1023.599976 6.000000 6.00000 16.900000 \n", "... ... ... ... ... ... ... 
\n", "142188 38.00000 1017.660981 1015.270396 4.446657 4.50922 16.600000 \n", "142189 52.00000 1021.299988 1018.799988 3.000000 4.00000 17.100000 \n", "142190 46.00000 1015.200012 1013.700012 4.446657 4.50922 21.000000 \n", "142191 65.00000 1028.000000 1025.300049 3.000000 2.00000 9.400000 \n", "142192 94.00000 1018.099976 1016.000000 4.446657 4.50922 15.300000 \n", "\n", " Temp3pm RainToday RainTomorrow \n", "0 29.900000 0.0 0 \n", "1 23.700001 0.0 1 \n", "2 24.700001 1.0 1 \n", "3 22.000000 1.0 0 \n", "4 21.100000 1.0 1 \n", "... ... ... ... \n", "142188 21.100000 0.0 1 \n", "142189 18.400000 0.0 0 \n", "142190 28.799999 1.0 0 \n", "142191 16.600000 0.0 0 \n", "142192 16.000000 0.0 0 \n", "\n", "[142193 rows x 22 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "atom.dataset" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }