Random forests

Bagging and Random Forests are both ensemble methods that reduce overfitting and improve predictive performance by combining many decision trees. Random Forests go a step further: at each split, a tree considers only a random subset of the features, which decorrelates the trees and makes the ensemble more robust and less prone to overfitting. As a result, Random Forests are often the preferred choice for decision tree-based ensembles across a wide range of tasks.
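In scikit-learn terms, the practical difference comes down to per-split feature subsampling. Here is a minimal sketch of the contrast; the `make_regression` toy data and the specific hyperparameters are illustrative assumptions, not a recommendation:

from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Toy regression data, purely for illustration
X, y = make_regression(n_samples=500, n_features=20, random_state=0)

# Bagging: each tree sees a bootstrap sample, but every split
# considers all 20 features
bagging = BaggingRegressor(DecisionTreeRegressor(), n_estimators=100, random_state=0)

# Random forest: each split considers only a random subset of the
# features (here a third of them), which decorrelates the trees
forest = RandomForestRegressor(n_estimators=100, max_features=1/3, random_state=0)

for model in (bagging, forest):
    model.fit(X, y)
    print(type(model).__name__, round(model.score(X, y), 3))

With max_features set to use all features, a random forest is essentially bagged trees; the feature subsampling is what distinguishes the two methods.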

Boston dataset

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Boston housing data and drop the stray index column
boston = pd.read_csv("../ISLP_datasets/Boston.csv").drop("Unnamed: 0", axis=1)

# Median house value (medv) is the target; the remaining columns are features
y = boston['medv']
X = boston.drop('medv', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train a random forest model:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
print("Train score:", rf.score(X_train, y_train))
print("Test score:", rf.score(X_test, y_test))
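A fitted forest also exposes impurity-based feature importances through its feature_importances_ attribute, which gives a quick view of which predictors drive the predictions. A short sketch, assuming the Boston CSV loaded successfully above:

# One importance value per feature; the values sum to 1
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))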

Bagging

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error

# Load the California Housing dataset as an example
data = fetch_california_housing()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a base decision tree regressor
base_model = DecisionTreeRegressor(random_state=42)

# Create a Bagging Regressor with 100 base models (decision trees)
bagging_model = BaggingRegressor(base_model, n_estimators=100, random_state=42)

# Train the Bagging Regressor on the training data
bagging_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging_model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
Mean Squared Error: 0.26

In this code:

  • We load the California Housing dataset from scikit-learn as an example regression dataset.

  • The dataset is split into training and testing sets using train_test_split.

  • We create a base model, which is a decision tree regressor.

  • We create a Bagging Regressor with 100 base models (decision trees) using BaggingRegressor.

  • The Bagging Regressor is trained on the training data using fit.

  • We make predictions on the test data using predict.

  • Finally, we evaluate the model’s performance using the mean squared error (MSE).

You can modify this code to work with your own dataset and adjust hyperparameters as needed. Bagging can also be applied to classification tasks using BaggingClassifier in scikit-learn.
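The classification interface is the same. Here is a minimal sketch using BaggingClassifier; the built-in iris dataset and the hyperparameter choices are illustrative assumptions:

from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Small built-in classification dataset, purely for illustration
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag 100 decision tree classifiers; class predictions are made by majority vote
clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print(f"Test accuracy: {clf.score(X_test, y_test):.2f}")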

Random forest

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the California Housing dataset as an example
data = fetch_california_housing()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor with 100 trees (estimators)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest Regressor on the training data
rf_regressor.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_regressor.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")
Mean Squared Error: 0.26
R-squared (R2) Score: 0.81

In this code:

  • We load the California Housing dataset from scikit-learn as an example regression dataset.

  • The dataset is split into training and testing sets using train_test_split.

  • We create a Random Forest Regressor with 100 decision trees (estimators) using RandomForestRegressor. You can adjust the n_estimators parameter to change the number of trees in the forest.

  • The Random Forest Regressor is trained on the training data using fit.

  • We make predictions on the test data using predict.

  • Finally, we evaluate the model’s performance using metrics such as mean squared error (MSE) and R-squared (R2) score.

You can adapt this code to your own dataset and tune hyperparameters as needed; a short tuning sketch follows. Random Forest Regression is a powerful technique for solving regression tasks, as it combines the strengths of multiple decision trees while mitigating their weaknesses, such as overfitting.
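As a starting point for such tuning, here is a minimal sketch using GridSearchCV; the parameter grid below is an illustrative assumption, not a recommendation, and real searches usually cover more values:

from sklearn.model_selection import GridSearchCV

# Small illustrative grid over the two most influential hyperparameters
param_grid = {
    "n_estimators": [100, 300],
    "max_features": [1.0, "sqrt", 1/3],
}

# 3-fold cross-validated search, scored by (negated) mean squared error
search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                      cv=3, scoring="neg_mean_squared_error")
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)
print(f"Best CV MSE: {-search.best_score_:.2f}")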