# =========================================================
# Setup: install required packages if they are missing
# =========================================================

import sys
import subprocess
import importlib

required_packages = [
    "numpy",
    "pandas",
    "matplotlib",
    "scikit-learn",
    "jupyter"
]

for package in required_packages:
    try:
        importlib.import_module(package.replace("-", "_"))
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

print("All required packages are available.")

Installing scikit-learn...
All required packages are available.


import inspect
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

RANDOM_STATE = 123
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)


def rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def make_onehot_encoder():
    """Create a OneHotEncoder compatible with different scikit-learn versions."""
    params = inspect.signature(OneHotEncoder).parameters
    if "sparse_output" in params:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    else:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


def make_bagging_regressor(base_estimator, **kwargs):
    """Create a BaggingRegressor compatible with old and new scikit-learn versions."""
    params = inspect.signature(BaggingRegressor).parameters
    if "estimator" in params:
        return BaggingRegressor(estimator=base_estimator, **kwargs)
    else:
        return BaggingRegressor(base_estimator=base_estimator, **kwargs)

library(modeldata)
data(ames, package = "modeldata")
write.csv(ames, "data/AmesHousing1.csv", row.names = FALSE)


DATA_FILE = "data/AmesHousing1.csv"

ames = pd.read_csv(DATA_FILE)
ames.shape

(2930, 74)


ames.head()


ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 74 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MS_SubClass         2930 non-null   object 
 1   MS_Zoning           2930 non-null   object 
 2   Lot_Frontage        2930 non-null   int64  
 3   Lot_Area            2930 non-null   int64  
 4   Street              2930 non-null   object 
 5   Alley               2930 non-null   object 
 6   Lot_Shape           2930 non-null   object 
 7   Land_Contour        2930 non-null   object 
 8   Utilities           2930 non-null   object 
 9   Lot_Config          2930 non-null   object 
 10  Land_Slope          2930 non-null   object 
 11  Neighborhood        2930 non-null   object 
 12  Condition_1         2930 non-null   object 
 13  Condition_2         2930 non-null   object 
 14  Bldg_Type           2930 non-null   object 
 15  House_Style         2930 non-null   object 
 16  Overall_Cond        2930 non-null   object 
 17  Year_Built          2930 non-null   int64  
 18  Year_Remod_Add      2930 non-null   int64  
 19  Roof_Style          2930 non-null   object 
 20  Roof_Matl           2930 non-null   object 
 21  Exterior_1st        2930 non-null   object 
 22  Exterior_2nd        2930 non-null   object 
 23  Mas_Vnr_Type        2930 non-null   object 
 24  Mas_Vnr_Area        2930 non-null   int64  
 25  Exter_Cond          2930 non-null   object 
 26  Foundation          2930 non-null   object 
 27  Bsmt_Cond           2930 non-null   object 
 28  Bsmt_Exposure       2930 non-null   object 
 29  BsmtFin_Type_1      2930 non-null   object 
 30  BsmtFin_SF_1        2930 non-null   int64  
 31  BsmtFin_Type_2      2930 non-null   object 
 32  BsmtFin_SF_2        2930 non-null   int64  
 33  Bsmt_Unf_SF         2930 non-null   int64  
 34  Total_Bsmt_SF       2930 non-null   int64  
 35  Heating             2930 non-null   object 
 36  Heating_QC          2930 non-null   object 
 37  Central_Air         2930 non-null   object 
 38  Electrical          2930 non-null   object 
 39  First_Flr_SF        2930 non-null   int64  
 40  Second_Flr_SF       2930 non-null   int64  
 41  Gr_Liv_Area         2930 non-null   int64  
 42  Bsmt_Full_Bath      2930 non-null   int64  
 43  Bsmt_Half_Bath      2930 non-null   int64  
 44  Full_Bath           2930 non-null   int64  
 45  Half_Bath           2930 non-null   int64  
 46  Bedroom_AbvGr       2930 non-null   int64  
 47  Kitchen_AbvGr       2930 non-null   int64  
 48  TotRms_AbvGrd       2930 non-null   int64  
 49  Functional          2930 non-null   object 
 50  Fireplaces          2930 non-null   int64  
 51  Garage_Type         2930 non-null   object 
 52  Garage_Finish       2930 non-null   object 
 53  Garage_Cars         2930 non-null   int64  
 54  Garage_Area         2930 non-null   int64  
 55  Garage_Cond         2930 non-null   object 
 56  Paved_Drive         2930 non-null   object 
 57  Wood_Deck_SF        2930 non-null   int64  
 58  Open_Porch_SF       2930 non-null   int64  
 59  Enclosed_Porch      2930 non-null   int64  
 60  Three_season_porch  2930 non-null   int64  
 61  Screen_Porch        2930 non-null   int64  
 62  Pool_Area           2930 non-null   int64  
 63  Pool_QC             2930 non-null   object 
 64  Fence               2930 non-null   object 
 65  Misc_Feature        2930 non-null   object 
 66  Misc_Val            2930 non-null   int64  
 67  Mo_Sold             2930 non-null   int64  
 68  Year_Sold           2930 non-null   int64  
 69  Sale_Type           2930 non-null   object 
 70  Sale_Condition      2930 non-null   object 
 71  Sale_Price          2930 non-null   int64  
 72  Longitude           2930 non-null   float64
 73  Latitude            2930 non-null   float64
dtypes: float64(2), int64(32), object(40)
memory usage: 1.7+ MB


ames["Sale_Price"].describe()

count      2930.000000
mean     180796.060068
std       79886.692357
min       12789.000000
25%      129500.000000
50%      160000.000000
75%      213500.000000
max      755000.000000
Name: Sale_Price, dtype: float64


plt.figure(figsize=(7, 4))
plt.hist(ames["Sale_Price"], bins=40)
plt.xlabel("Sale_Price")
plt.ylabel("Frequency")
plt.title("Distribution of sale prices")
plt.show()


ames = ames.copy()
ames["Sale_Price"] = ames["Sale_Price"] / 1000

ames["Sale_Price"].describe()

count    2930.000000
mean      180.796060
std        79.886692
min        12.789000
25%       129.500000
50%       160.000000
75%       213.500000
max       755.000000
Name: Sale_Price, dtype: float64


ames["Sale_Price"] = ames["Sale_Price"] / 1000


y = ames["Sale_Price"]
X = ames.drop(columns="Sale_Price")

# Create response bins for approximate stratification
price_bins = pd.qcut(y, q=5, duplicates="drop")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=RANDOM_STATE,
    stratify=price_bins
)

X_train.shape, X_test.shape

((2051, 73), (879, 73))


pd.DataFrame({
    "train": y_train.describe(),
    "test": y_test.describe()
})


numeric_features = X_train.select_dtypes(include=["number", "bool"]).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=["number", "bool"]).columns.tolist()

len(numeric_features), len(categorical_features)

(33, 40)


print("Numerical variables:")
print(numeric_features)
print("\nCategorical variables:")
print(categorical_features)

Numerical variables:
['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add', 'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Gr_Liv_Area', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr', 'Kitchen_AbvGr', 'TotRms_AbvGrd', 'Fireplaces', 'Garage_Cars', 'Garage_Area', 'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch', 'Pool_Area', 'Misc_Val', 'Mo_Sold', 'Year_Sold', 'Longitude', 'Latitude']

Categorical variables:
['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities', 'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type', 'House_Style', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Cond', 'Foundation', 'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC', 'Central_Air', 'Electrical', 'Functional', 'Garage_Type', 'Garage_Finish', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence', 'Misc_Feature', 'Sale_Type', 'Sale_Condition']


# ColumnTransformer allows us to apply different preprocessing steps
# to different types of variables (e.g. one-hot encoding for categorical
# variables while leaving numerical variables unchanged).

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", make_onehot_encoder(), categorical_features)
    ],
    remainder="drop"
)


small_tree = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", DecisionTreeRegressor(
        random_state=RANDOM_STATE,
        max_leaf_nodes=12,
        min_samples_split=10,
        min_samples_leaf=5
    ))
])

small_tree.fit(X_train, y_train)

yhat_train_small = small_tree.predict(X_train)
yhat_test_small = small_tree.predict(X_test)

rmse_train_small = rmse(y_train, yhat_train_small)
rmse_test_small = rmse(y_test, yhat_test_small)

rmse_train_small, rmse_test_small

(0.04006331231478953, 0.04053771485439569)


plt.figure(figsize=(6, 6))
plt.scatter(yhat_test_small, y_test, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()])
plt.xlabel("Predicted Sale_Price")
plt.ylabel("Observed Sale_Price")
plt.title(f"Small regression tree: test RMSE = {rmse_test_small:.2f}")
plt.show()


# Recover fitted preprocessing and model objects
fitted_preprocess = small_tree.named_steps["preprocess"]
fitted_tree = small_tree.named_steps["model"]

feature_names = fitted_preprocess.get_feature_names_out().tolist()

plt.figure(figsize=(22, 10))
plot_tree(
    fitted_tree,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    max_depth=3,
    fontsize=8
)
plt.title("Small regression tree, first levels")
plt.show()


deep_tree = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", DecisionTreeRegressor(
        random_state=RANDOM_STATE,
        min_samples_split=2,
        min_samples_leaf=1,
        ccp_alpha=0.0
    ))
])

deep_tree.fit(X_train, y_train)

yhat_train_deep = deep_tree.predict(X_train)
yhat_test_deep = deep_tree.predict(X_test)

rmse_train_deep = rmse(y_train, yhat_train_deep)
rmse_test_deep = rmse(y_test, yhat_test_deep)

n_leaves_deep = deep_tree.named_steps["model"].get_n_leaves()
n_nodes_deep = deep_tree.named_steps["model"].tree_.node_count

rmse_train_deep, rmse_test_deep, n_leaves_deep, n_nodes_deep

(1.061519644547594e-18, 0.03936974461135565, 1961, 3921)


plt.figure(figsize=(6, 6))
plt.scatter(yhat_test_deep, y_test, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()])
plt.xlabel("Predicted Sale_Price")
plt.ylabel("Observed Sale_Price")
plt.title(f"Deep regression tree: test RMSE = {rmse_test_deep:.2f}")
plt.show()


# First fit the preprocessor on the training set and transform train/test data.
# This is useful for obtaining the cost-complexity pruning path.
preprocess_for_pruning = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", make_onehot_encoder(), categorical_features)
    ],
    remainder="drop"
)

X_train_enc = preprocess_for_pruning.fit_transform(X_train)
X_test_enc = preprocess_for_pruning.transform(X_test)

base_tree_for_path = DecisionTreeRegressor(random_state=RANDOM_STATE)
path = base_tree_for_path.cost_complexity_pruning_path(X_train_enc, y_train)
ccp_alphas = path.ccp_alphas

# Remove the largest alpha, which usually corresponds to the root-only tree
ccp_alphas = ccp_alphas[:-1]
len(ccp_alphas), ccp_alphas[:5], ccp_alphas[-5:]

(1923,
 array([0.00000000e+00, 6.84787910e-13, 9.07118413e-13, 2.43783519e-12,
        2.43783520e-12]),
 array([0.00019159, 0.00019389, 0.00030867, 0.00037756, 0.00077785]))


# Select a manageable grid of alpha values, including zero
if len(ccp_alphas) > 30:
    alpha_grid = np.unique(np.quantile(ccp_alphas, np.linspace(0, 1, 30)))
else:
    alpha_grid = np.unique(ccp_alphas)

alpha_grid = np.unique(np.r_[0, alpha_grid])
alpha_grid[:10], len(alpha_grid)

(array([0.00000000e+00, 3.90053632e-11, 8.12611734e-11, 2.43783520e-10,
        2.94978059e-10, 7.01592159e-10, 9.75134081e-10, 1.52364700e-09,
        2.19405168e-09, 3.43065760e-09]),
 30)


cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

pruning_results = []

for alpha in alpha_grid:
    fold_rmse = []
    fold_leaves = []

    for train_idx, valid_idx in cv.split(X_train_enc):
        X_tr, X_val = X_train_enc[train_idx], X_train_enc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        tree = DecisionTreeRegressor(random_state=RANDOM_STATE, ccp_alpha=alpha)
        tree.fit(X_tr, y_tr)
        pred_val = tree.predict(X_val)

        fold_rmse.append(rmse(y_val, pred_val))
        fold_leaves.append(tree.get_n_leaves())

    pruning_results.append({
        "ccp_alpha": alpha,
        "cv_rmse": np.mean(fold_rmse),
        "cv_rmse_sd": np.std(fold_rmse),
        "mean_leaves": np.mean(fold_leaves)
    })

pruning_results = pd.DataFrame(pruning_results)
pruning_results.sort_values("cv_rmse").head()


best_alpha = pruning_results.loc[pruning_results["cv_rmse"].idxmin(), "ccp_alpha"]
best_alpha

5.944817180287715e-06


plt.figure(figsize=(7, 4))
plt.plot(pruning_results["mean_leaves"], pruning_results["cv_rmse"], marker="o")
plt.xlabel("Mean number of terminal nodes")
plt.ylabel("Cross-validated RMSE")
plt.title("Cost-complexity pruning path")
plt.gca().invert_xaxis()
plt.show()


pruned_tree = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", DecisionTreeRegressor(
        random_state=RANDOM_STATE,
        ccp_alpha=best_alpha
    ))
])

pruned_tree.fit(X_train, y_train)

yhat_train_pruned = pruned_tree.predict(X_train)
yhat_test_pruned = pruned_tree.predict(X_test)

rmse_train_pruned = rmse(y_train, yhat_train_pruned)
rmse_test_pruned = rmse(y_test, yhat_test_pruned)

n_leaves_pruned = pruned_tree.named_steps["model"].get_n_leaves()

rmse_train_pruned, rmse_test_pruned, n_leaves_pruned

(0.023531267305817956, 0.03759359618015671, 70)


results = pd.DataFrame([
    {"Model": "Small regression tree", "Train RMSE": rmse_train_small, "Test RMSE": rmse_test_small},
    {"Model": "Deep regression tree", "Train RMSE": rmse_train_deep, "Test RMSE": rmse_test_deep},
    {"Model": "Pruned regression tree", "Train RMSE": rmse_train_pruned, "Test RMSE": rmse_test_pruned},
])

results.round(2)


base_tree = DecisionTreeRegressor(random_state=RANDOM_STATE)

bagging_model = make_bagging_regressor(
    base_tree,
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

bagging = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", bagging_model)
])

bagging.fit(X_train, y_train)

yhat_train_bag = bagging.predict(X_train)
yhat_test_bag = bagging.predict(X_test)

rmse_train_bag = rmse(y_train, yhat_train_bag)
rmse_test_bag = rmse(y_test, yhat_test_bag)

rmse_train_bag, rmse_test_bag

(0.011324611622751354, 0.024128478508996607)


plt.figure(figsize=(6, 6))
plt.scatter(yhat_test_bag, y_test, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()])
plt.xlabel("Predicted Sale_Price")
plt.ylabel("Observed Sale_Price")
plt.title(f"Bagged trees: test RMSE = {rmse_test_bag:.2f}")
plt.show()


bagging_estimator = bagging.named_steps["model"]
oob_pred_bag = bagging_estimator.oob_prediction_

# In rare cases, an observation may have no OOB prediction if the number of trees is very small.
# With 100 trees this should almost never happen, but we keep the check for safety.
valid_oob = ~np.isnan(oob_pred_bag)
oob_rmse_bag = rmse(y_train.iloc[valid_oob], oob_pred_bag[valid_oob])

oob_rmse_bag, bagging_estimator.oob_score_

(0.03007534767303115, 0.8637683053664331)


plt.figure(figsize=(6, 6))
plt.scatter(oob_pred_bag[valid_oob], y_train.iloc[valid_oob], alpha=0.6)
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()])
plt.xlabel("OOB predicted Sale_Price")
plt.ylabel("Observed Sale_Price")
plt.title(f"Bagged trees: OOB RMSE = {oob_rmse_bag:.2f}")
plt.show()


rf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(
        n_estimators=500,
        random_state=RANDOM_STATE,
        oob_score=True,
        n_jobs=-1
    ))
])

rf.fit(X_train, y_train)

yhat_train_rf = rf.predict(X_train)
yhat_test_rf = rf.predict(X_test)

rmse_train_rf = rmse(y_train, yhat_train_rf)
rmse_test_rf = rmse(y_test, yhat_test_rf)

rf_estimator = rf.named_steps["model"]
oob_pred_rf = rf_estimator.oob_prediction_
oob_rmse_rf = rmse(y_train, oob_pred_rf)

rmse_train_rf, rmse_test_rf, oob_rmse_rf, rf_estimator.oob_score_

(0.011069803313013132,
 0.023542720857479493,
 0.029792784810295785,
 0.8663161188925272)


plt.figure(figsize=(6, 6))
plt.scatter(yhat_test_rf, y_test, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()])
plt.xlabel("Predicted Sale_Price")
plt.ylabel("Observed Sale_Price")
plt.title(f"Random forest: test RMSE = {rmse_test_rf:.2f}")
plt.show()


rf_for_grid = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(
        random_state=RANDOM_STATE,
        oob_score=False,
        n_jobs=-1
    ))
])

param_grid = {
    "model__n_estimators": [100, 300],
    "model__max_features": [0.33, 0.50, 1.00]
}

cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

grid = GridSearchCV(
    estimator=rf_for_grid,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
    return_train_score=True
)

grid.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=123, shuffle=True),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('num',
                                                                         'passthrough',
                                                                         ['Lot_Frontage',
                                                                          'Lot_Area',
                                                                          'Year_Built',
                                                                          'Year_Remod_Add',
                                                                          'Mas_Vnr_Area',
                                                                          'BsmtFin_SF_1',
                                                                          'BsmtFin_SF_2',
                                                                          'Bsmt_Unf_SF',
                                                                          'Total_Bsmt_SF',
                                                                          'First_Flr_SF',
                                                                          'Second_Flr_SF',
                                                                          'Gr_Liv_Area',
                                                                          'Bsmt_Full_Bath'...
                                                                          'Foundation',
                                                                          'Bsmt_Cond',
                                                                          'Bsmt_Exposure',
                                                                          'BsmtFin_Type_1',
                                                                          'BsmtFin_Type_2',
                                                                          'Heating',
                                                                          'Heating_QC',
                                                                          'Central_Air',
                                                                          'Electrical', ...])])),
                                       ('model',
                                        RandomForestRegressor(n_jobs=-1,
                                                              random_state=123))]),
             n_jobs=-1,
             param_grid={'model__max_features': [0.33, 0.5, 1.0],
                         'model__n_estimators': [100, 300]},
             return_train_score=True, scoring='neg_root_mean_squared_error')

GridSearchCV(cv=KFold(n_splits=5, random_state=123, shuffle=True),
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('num',
                                                                         'passthrough',
                                                                         ['Lot_Frontage',
                                                                          'Lot_Area',
                                                                          'Year_Built',
                                                                          'Year_Remod_Add',
                                                                          'Mas_Vnr_Area',
                                                                          'BsmtFin_SF_1',
                                                                          'BsmtFin_SF_2',
                                                                          'Bsmt_Unf_SF',
                                                                          'Total_Bsmt_SF',
                                                                          'First_Flr_SF',
                                                                          'Second_Flr_SF',
                                                                          'Gr_Liv_Area',
                                                                          'Bsmt_Full_Bath'...
                                                                          'Foundation',
                                                                          'Bsmt_Cond',
                                                                          'Bsmt_Exposure',
                                                                          'BsmtFin_Type_1',
                                                                          'BsmtFin_Type_2',
                                                                          'Heating',
                                                                          'Heating_QC',
                                                                          'Central_Air',
                                                                          'Electrical', ...])])),
                                       ('model',
                                        RandomForestRegressor(n_jobs=-1,
                                                              random_state=123))]),
             n_jobs=-1,
             param_grid={'model__max_features': [0.33, 0.5, 1.0],
                         'model__n_estimators': [100, 300]},
             return_train_score=True, scoring='neg_root_mean_squared_error')

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['Lot_Frontage', 'Lot_Area',
                                                   'Year_Built',
                                                   'Year_Remod_Add',
                                                   'Mas_Vnr_Area',
                                                   'BsmtFin_SF_1',
                                                   'BsmtFin_SF_2',
                                                   'Bsmt_Unf_SF',
                                                   'Total_Bsmt_SF',
                                                   'First_Flr_SF',
                                                   'Second_Flr_SF',
                                                   'Gr_Liv_Area',
                                                   'Bsmt_Full_Bath',
                                                   'Bsmt_Half_Bath',
                                                   'Full_Bath', 'Half_Bath',
                                                   'Bedroom_AbvGr',
                                                   'Kitchen_AbvGr',...
                                                   'Land_Slope', 'Neighborhood',
                                                   'Condition_1', 'Condition_2',
                                                   'Bldg_Type', 'House_Style',
                                                   'Overall_Cond', 'Roof_Style',
                                                   'Roof_Matl', 'Exterior_1st',
                                                   'Exterior_2nd',
                                                   'Mas_Vnr_Type', 'Exter_Cond',
                                                   'Foundation', 'Bsmt_Cond',
                                                   'Bsmt_Exposure',
                                                   'BsmtFin_Type_1',
                                                   'BsmtFin_Type_2', 'Heating',
                                                   'Heating_QC', 'Central_Air',
                                                   'Electrical', ...])])),
                ('model', RandomForestRegressor(n_jobs=-1, random_state=123))])

ColumnTransformer(transformers=[('num', 'passthrough',
                                 ['Lot_Frontage', 'Lot_Area', 'Year_Built',
                                  'Year_Remod_Add', 'Mas_Vnr_Area',
                                  'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
                                  'Total_Bsmt_SF', 'First_Flr_SF',
                                  'Second_Flr_SF', 'Gr_Liv_Area',
                                  'Bsmt_Full_Bath', 'Bsmt_Half_Bath',
                                  'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr',
                                  'Kitchen_AbvGr', 'TotRms_AbvGrd',
                                  'Fireplaces', '...
                                  'Lot_Shape', 'Land_Contour', 'Utilities',
                                  'Lot_Config', 'Land_Slope', 'Neighborhood',
                                  'Condition_1', 'Condition_2', 'Bldg_Type',
                                  'House_Style', 'Overall_Cond', 'Roof_Style',
                                  'Roof_Matl', 'Exterior_1st', 'Exterior_2nd',
                                  'Mas_Vnr_Type', 'Exter_Cond', 'Foundation',
                                  'Bsmt_Cond', 'Bsmt_Exposure',
                                  'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating',
                                  'Heating_QC', 'Central_Air', 'Electrical', ...])])

['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add', 'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF', 'Gr_Liv_Area', 'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr', 'Kitchen_AbvGr', 'TotRms_AbvGrd', 'Fireplaces', 'Garage_Cars', 'Garage_Area', 'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch', 'Pool_Area', 'Misc_Val', 'Mo_Sold', 'Year_Sold', 'Longitude', 'Latitude']

passthrough

['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour', 'Utilities', 'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1', 'Condition_2', 'Bldg_Type', 'House_Style', 'Overall_Cond', 'Roof_Style', 'Roof_Matl', 'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type', 'Exter_Cond', 'Foundation', 'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating', 'Heating_QC', 'Central_Air', 'Electrical', 'Functional', 'Garage_Type', 'Garage_Finish', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence', 'Misc_Feature', 'Sale_Type', 'Sale_Condition']

OneHotEncoder(handle_unknown='ignore', sparse_output=False)

RandomForestRegressor(n_jobs=-1, random_state=123)


grid_results = pd.DataFrame(grid.cv_results_)
grid_summary = grid_results[[
    "param_model__n_estimators",
    "param_model__max_features",
    "mean_train_score",
    "mean_test_score",
    "std_test_score"
]].copy()

grid_summary["Train RMSE"] = -grid_summary["mean_train_score"]
grid_summary["CV RMSE"] = -grid_summary["mean_test_score"]
grid_summary["CV RMSE SD"] = grid_summary["std_test_score"]

grid_summary = grid_summary.drop(columns=["mean_train_score", "mean_test_score", "std_test_score"])
grid_summary.sort_values("CV RMSE").round(3)


best_rf = grid.best_estimator_
yhat_test_best_rf = best_rf.predict(X_test)
rmse_test_best_rf = rmse(y_test, yhat_test_best_rf)

grid.best_params_, rmse_test_best_rf

({'model__max_features': 0.33, 'model__n_estimators': 300},
 0.023187989371563064)


boosting = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", GradientBoostingRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=2,
        random_state=RANDOM_STATE
    ))
])

boosting.fit(X_train, y_train)

yhat_train_boost = boosting.predict(X_train)
yhat_test_boost = boosting.predict(X_test)

rmse_train_boost = rmse(y_train, yhat_train_boost)
rmse_test_boost = rmse(y_test, yhat_test_boost)

rmse_train_boost, rmse_test_boost

(0.014080600391711794, 0.021989690150901835)


plt.figure(figsize=(6, 6))
plt.scatter(yhat_test_boost, y_test, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()])
plt.xlabel("Predicted Sale_Price")
plt.ylabel("Observed Sale_Price")
plt.title(f"Gradient boosting: test RMSE = {rmse_test_boost:.2f}")
plt.show()


results = pd.DataFrame([
    {"Model": "Small regression tree", "Train RMSE": rmse_train_small, "Test RMSE": rmse_test_small, "OOB RMSE": np.nan},
    {"Model": "Deep regression tree", "Train RMSE": rmse_train_deep, "Test RMSE": rmse_test_deep, "OOB RMSE": np.nan},
    {"Model": "Pruned regression tree", "Train RMSE": rmse_train_pruned, "Test RMSE": rmse_test_pruned, "OOB RMSE": np.nan},
    {"Model": "Bagged trees", "Train RMSE": rmse_train_bag, "Test RMSE": rmse_test_bag, "OOB RMSE": oob_rmse_bag},
    {"Model": "Random forest", "Train RMSE": rmse_train_rf, "Test RMSE": rmse_test_rf, "OOB RMSE": oob_rmse_rf},
    {"Model": "Random forest, tuned", "Train RMSE": np.nan, "Test RMSE": rmse_test_best_rf, "OOB RMSE": np.nan},
    {"Model": "Gradient boosting", "Train RMSE": rmse_train_boost, "Test RMSE": rmse_test_boost, "OOB RMSE": np.nan},
])

results.round(5)


plt.figure(figsize=(9, 4))
plot_data = results.dropna(subset=["Test RMSE"])
plt.barh(plot_data["Model"], plot_data["Test RMSE"])
plt.xlabel("Test RMSE")
plt.title("Test error comparison")
plt.gca().invert_yaxis()
plt.show()


def get_original_feature_mapping(fitted_preprocessor, numeric_features, categorical_features):
    """Return transformed feature names and their corresponding original feature names."""
    feature_names = []
    original_names = []

    # Numerical variables
    for col in numeric_features:
        feature_names.append(f"num__{col}")
        original_names.append(col)

    # Categorical variables
    encoder = fitted_preprocessor.named_transformers_["cat"]
    for col, categories in zip(categorical_features, encoder.categories_):
        for cat in categories:
            feature_names.append(f"cat__{col}_{cat}")
            original_names.append(col)

    return np.array(feature_names), np.array(original_names)


def aggregate_importances_by_original_variable(fitted_pipeline, importances, numeric_features, categorical_features):
    fitted_preprocessor = fitted_pipeline.named_steps["preprocess"]
    transformed_names, original_names = get_original_feature_mapping(
        fitted_preprocessor, numeric_features, categorical_features
    )

    # Safety check: use actual transformed names if lengths differ
    actual_names = fitted_preprocessor.get_feature_names_out()
    if len(original_names) != len(importances):
        raise ValueError(
            f"Length mismatch: {len(original_names)} original names vs {len(importances)} importances."
        )

    out = pd.DataFrame({
        "transformed_feature": actual_names,
        "original_feature": original_names,
        "importance": importances
    })

    grouped = (
        out.groupby("original_feature", as_index=False)["importance"]
        .sum()
        .sort_values("importance", ascending=False)
    )

    return grouped, out


rf_importances = rf.named_steps["model"].feature_importances_
rf_grouped_importance, rf_dummy_importance = aggregate_importances_by_original_variable(
    rf, rf_importances, numeric_features, categorical_features
)

rf_grouped_importance.head(20)


top_rf = rf_grouped_importance.head(20).sort_values("importance")

plt.figure(figsize=(7, 7))
plt.barh(top_rf["original_feature"], top_rf["importance"])
plt.xlabel("Aggregated impurity-based importance")
plt.title("Random forest variable importance")
plt.show()


bagging_estimator = bagging.named_steps["model"]

bag_importances = np.mean(
    [est.feature_importances_ for est in bagging_estimator.estimators_],
    axis=0
)

bag_grouped_importance, bag_dummy_importance = aggregate_importances_by_original_variable(
    bagging, bag_importances, numeric_features, categorical_features
)

bag_grouped_importance.head(20)


top_bag = bag_grouped_importance.head(20).sort_values("importance")

plt.figure(figsize=(7, 7))
plt.barh(top_bag["original_feature"], top_bag["importance"])
plt.xlabel("Aggregated impurity-based importance")
plt.title("Bagging variable importance")
plt.show()


boost_importances = boosting.named_steps["model"].feature_importances_

boost_grouped_importance, boost_dummy_importance = aggregate_importances_by_original_variable(
    boosting, boost_importances, numeric_features, categorical_features
)

boost_grouped_importance.head(20)


top_boost = boost_grouped_importance.head(20).sort_values("importance")

plt.figure(figsize=(7, 7))
plt.barh(top_boost["original_feature"], top_boost["importance"])
plt.xlabel("Aggregated impurity-based importance")
plt.title("Gradient boosting variable importance")
plt.show()

	MS_SubClass	MS_Zoning	Lot_Frontage	Lot_Area	Street	Alley	Lot_Shape	Land_Contour	Utilities	Lot_Config	Land_Slope	Neighborhood	Condition_1	Condition_2	Bldg_Type	House_Style	Overall_Cond	Year_Built	Year_Remod_Add	Roof_Style	Roof_Matl	Exterior_1st	Exterior_2nd	Mas_Vnr_Type	Mas_Vnr_Area	Exter_Cond	Foundation	Bsmt_Cond	Bsmt_Exposure	BsmtFin_Type_1	BsmtFin_SF_1	BsmtFin_Type_2	BsmtFin_SF_2	Bsmt_Unf_SF	Total_Bsmt_SF	Heating	Heating_QC	Central_Air	Electrical	First_Flr_SF	Second_Flr_SF	Gr_Liv_Area	Bsmt_Full_Bath	Full_Bath	Half_Bath	Bedroom_AbvGr	Kitchen_AbvGr	TotRms_AbvGrd	Functional	Fireplaces	Garage_Type	Garage_Finish	Garage_Cars	Garage_Area	Garage_Cond	Paved_Drive	Wood_Deck_SF	Open_Porch_SF	Screen_Porch	Pool_QC	Fence	Misc_Feature	Misc_Val	Mo_Sold	Year_Sold	Sale_Type	Sale_Condition	Sale_Price	Longitude	Latitude
0	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	141	31770	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Corner	Gtl	North_Ames	Norm	Norm	OneFam	One_Story	Average	1960	1960	Hip	CompShg	BrkFace	Plywood	Stone	112	Typical	CBlock	Good	Gd	BLQ	2	Unf	0	441	1080	GasA	Fair	Y	SBrkr	1656	0	1656	1	1	0	3	1	7	Typ	2	Attchd	Fin	2	528	Typical	Partial_Pavement	210	62	0	No_Pool	No_Fence	None	0	5	2010	WD	Normal	215000	-93.619754	42.054035
1	One_Story_1946_and_Newer_All_Styles	Residential_High_Density	80	11622	Pave	No_Alley_Access	Regular	Lvl	AllPub	Inside	Gtl	North_Ames	Feedr	Norm	OneFam	One_Story	Above_Average	1961	1961	Gable	CompShg	VinylSd	VinylSd	None	0	Typical	CBlock	Typical	No	Rec	6	LwQ	144	270	882	GasA	Typical	Y	SBrkr	896	0	896	0	1	0	2	1	5	Typ	0	Attchd	Unf	1	730	Typical	Paved	140	0	120	No_Pool	Minimum_Privacy	None	0	6	2010	WD	Normal	105000	-93.619756	42.053014
2	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	81	14267	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Corner	Gtl	North_Ames	Norm	Norm	OneFam	One_Story	Above_Average	1958	1958	Hip	CompShg	Wd Sdng	Wd Sdng	BrkFace	108	Typical	CBlock	Typical	No	ALQ	1	Unf	0	406	1329	GasA	Typical	Y	SBrkr	1329	0	1329	0	1	1	3	1	6	Typ	0	Attchd	Unf	1	312	Typical	Paved	393	36	0	No_Pool	No_Fence	Gar2	12500	6	2010	WD	Normal	172000	-93.619387	42.052659
3	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	93	11160	Pave	No_Alley_Access	Regular	Lvl	AllPub	Corner	Gtl	North_Ames	Norm	Norm	OneFam	One_Story	Average	1968	1968	Hip	CompShg	BrkFace	BrkFace	None	0	Typical	CBlock	Typical	No	ALQ	1	Unf	0	1045	2110	GasA	Excellent	Y	SBrkr	2110	0	2110	1	2	1	3	1	8	Typ	2	Attchd	Fin	2	522	Typical	Paved	0	0	0	No_Pool	No_Fence	None	0	4	2010	WD	Normal	244000	-93.617320	42.051245
4	Two_Story_1946_and_Newer	Residential_Low_Density	74	13830	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Inside	Gtl	Gilbert	Norm	Norm	OneFam	Two_Story	Average	1997	1998	Gable	CompShg	VinylSd	VinylSd	None	0	Typical	PConc	Typical	No	GLQ	3	Unf	0	137	928	GasA	Good	Y	SBrkr	928	701	1629	0	2	1	3	1	6	Typ	1	Attchd	Fin	2	482	Typical	Paved	212	34	0	No_Pool	Minimum_Privacy	None	0	3	2010	WD	Normal	189900	-93.638933	42.060899

	train	test
count	2051.000000	879.000000
mean	0.181511	0.179128
std	0.081504	0.075999
min	0.012789	0.039300
25%	0.129875	0.128800
50%	0.161750	0.159000
75%	0.213370	0.214000
max	0.755000	0.625000

	ccp_alpha	cv_rmse	cv_rmse_sd	mean_leaves
28	5.944817e-06	0.040637	0.001843	70.0
27	2.126540e-06	0.041285	0.001799	141.6
26	1.148171e-06	0.041637	0.001558	202.0
25	6.857951e-07	0.041740	0.001797	270.8
24	4.408642e-07	0.041935	0.001722	331.2

Lab 2: Ensembles of Trees with Python¶

1. Required packages¶

2. The dataset¶

3. Exploratory data analysis and response transformation¶

4. Train/test split¶

5. Preprocessing: numerical and categorical variables¶

6. A first regression tree¶

Plotting the tree¶

7. A deeper tree¶

8. Cost-complexity pruning¶

9. Comparing single-tree models¶

10. Bagging regression trees¶

Out-of-bag error¶

11. Random forests¶

12. Small grid search for random forests¶

13. Fitting a boosted regression tree¶

14. Final comparison of models¶

15. Variable importance¶

Random forest variable importance¶

Bagging variable importance¶

Gradient boosting variable importance¶

16. Interpretation and final remarks¶

Questions¶

	Model	Train RMSE	Test RMSE
0	Small regression tree	0.04	0.04
1	Deep regression tree	0.00	0.04
2	Pruned regression tree	0.02	0.04

	param_model__n_estimators	param_model__max_features	Train RMSE	CV RMSE	CV RMSE SD
1	300	0.33	0.011	0.029	0.004
3	300	0.5	0.011	0.029	0.003
0	100	0.33	0.011	0.029	0.004
2	100	0.5	0.011	0.029	0.003
5	300	1.0	0.011	0.030	0.003
4	100	1.0	0.011	0.030	0.003

	Model	Train RMSE	Test RMSE	OOB RMSE
0	Small regression tree	0.04006	0.04054	NaN
1	Deep regression tree	0.00000	0.03937	NaN
2	Pruned regression tree	0.02353	0.03759	NaN
3	Bagged trees	0.01132	0.02413	0.03008
4	Random forest	0.01107	0.02354	0.02979
5	Random forest, tuned	NaN	0.02319	NaN
6	Gradient boosting	0.01408	0.02199	NaN

	original_feature	importance
27	Garage_Cars	0.371428
31	Gr_Liv_Area	0.143861
70	Year_Built	0.107074
22	First_Flr_SF	0.064728
67	Total_Bsmt_SF	0.063642
71	Year_Remod_Add	0.029867
21	Fireplaces	0.021677
41	Lot_Area	0.017580
39	Latitude	0.015600
24	Full_Bath	0.012617
63	Second_Flr_SF	0.012032
40	Longitude	0.011265
26	Garage_Area	0.009645
47	Mas_Vnr_Area	0.008869
52	Neighborhood	0.006379
11	Bsmt_Unf_SF	0.006280
5	BsmtFin_Type_1	0.006095
51	Mo_Sold	0.004872
53	Open_Porch_SF	0.004695
18	Exterior_1st	0.004500

	original_feature	importance
27	Garage_Cars	0.379990
31	Gr_Liv_Area	0.139771
70	Year_Built	0.109861
22	First_Flr_SF	0.065338
67	Total_Bsmt_SF	0.059246
71	Year_Remod_Add	0.027277
21	Fireplaces	0.023897
41	Lot_Area	0.018439
39	Latitude	0.014095
63	Second_Flr_SF	0.012497
24	Full_Bath	0.012018
40	Longitude	0.011135
26	Garage_Area	0.010156
47	Mas_Vnr_Area	0.008213
11	Bsmt_Unf_SF	0.005991
52	Neighborhood	0.005972
51	Mo_Sold	0.005142
5	BsmtFin_Type_1	0.005023
53	Open_Porch_SF	0.004488
8	Bsmt_Exposure	0.004294

	original_feature	importance
31	Gr_Liv_Area	0.200391
27	Garage_Cars	0.194399
67	Total_Bsmt_SF	0.158931
70	Year_Built	0.149455
22	First_Flr_SF	0.050951
21	Fireplaces	0.040193
63	Second_Flr_SF	0.030393
71	Year_Remod_Add	0.022049
39	Latitude	0.018524
52	Neighborhood	0.011237
26	Garage_Area	0.009242
41	Lot_Area	0.008823
8	Bsmt_Exposure	0.008225
59	Roof_Style	0.007736
5	BsmtFin_Type_1	0.007508
30	Garage_Type	0.007132
54	Overall_Cond	0.007104
36	Kitchen_AbvGr	0.005000
53	Open_Porch_SF	0.004487
11	Bsmt_Unf_SF	0.004259