# Daily Dose of Data Science


[Condense Random Forest into a Decision Tree](https://blog.dailydoseofds.com/p/condense-random-forest-into-a-decision)

Author: Avi Chawla

## Imports

In [None]:
import seaborn as sns

sns.set()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Global plot styling: preferred font first, generic sans-serif as the fallback.
plt.rcParams.update({'font.family': ['Comic Sans MS', 'sans-serif']})

# One colour per class label; indexed by the integer label when scattering points.
colors = ['#fe7c73', '#2471A3']

### Dummy classification dataset

In [None]:
# Synthetic classification problem with two informative features, so the
# decision regions of each model can be drawn directly in the feature plane.
# random_state pins the generated sample: without it every fresh run produces
# a different dataset and none of the scores below can be reproduced.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.5,  # smaller values place the class clusters closer together
    random_state=42,
)

# Scatter the samples, coloured by class label via the `colors` lookup list.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=[colors[i] for i in y])
# The synthetic features have no meaningful units, so hide the tick labels.
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

In [None]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split (and every score computed from it) is the same on each run.
# NOTE: X and y are rebound to the training portion here. Re-running this cell
# without regenerating the dataset first would split the already-reduced
# training set again.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree

In [None]:
# Train an unconstrained decision tree on the training split.
# random_state makes the fit repeatable: scikit-learn randomly permutes the
# features at every split, so equally good splits may otherwise be resolved
# differently from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X, y)

# Decision region plot: evaluate the tree on a 100 x 100 grid that spans the
# training data, with the bounds rounded outwards to whole numbers.
x0_grid = np.linspace(np.floor(X[:, 0].min()), np.ceil(X[:, 0].max()), 100)
x1_grid = np.linspace(np.floor(X[:, 1].min()), np.ceil(X[:, 1].max()), 100)
xx, yy = np.meshgrid(x0_grid, x1_grid)
Z = dtree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)  # back to grid shape for contourf

fig, ax = plt.subplots()
ax.contourf(xx, yy, Z, alpha=0.5, cmap='coolwarm_r')
ax.set_title("Decision Tree", fontsize = 20, fontweight = "bold", pad=15)
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

## Score on test set

In [None]:
# Mean accuracy of the decision tree on the held-out test split.
dtree.score(X=X_test, y=y_test)

# Train Random Forest model

In [None]:
# Train a random forest of 100 cost-complexity-pruned trees on the training split.
# random_state fixes the bootstrap samples and the per-split feature
# sub-sampling, so the fitted forest and its scores are the same on every run.
rfmodel = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,   # fraction of the features considered at each split
    ccp_alpha=0.003,    # minimal cost-complexity pruning strength for each tree
    random_state=42,
)
rfmodel.fit(X, y)

# Decision region plot: evaluate the forest on a 100 x 100 grid that spans the
# training data, with the bounds rounded outwards to whole numbers.
x0_grid = np.linspace(np.floor(X[:, 0].min()), np.ceil(X[:, 0].max()), 100)
x1_grid = np.linspace(np.floor(X[:, 1].min()), np.ceil(X[:, 1].max()), 100)
xx, yy = np.meshgrid(x0_grid, x1_grid)
Z = rfmodel.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)  # back to grid shape for contourf

fig, ax = plt.subplots()
ax.contourf(xx, yy, Z, alpha=0.5, cmap='coolwarm_r')
ax.set_title("Random Forest", fontsize = 20, fontweight = "bold")
ax.set_xticklabels([])
ax.set_yticklabels([])

plt.show()

## Measure training and test accuracy

In [None]:
# Mean accuracy of the forest as a (training, test) pair: first on the data it
# was fitted on, then on the held-out split.
tuple(
    rfmodel.score(features, labels)
    for features, labels in ((X, y), (X_test, y_test))
)

# New decision tree model

In [None]:
# Proxy labels: the random forest's own predictions on the training features.
# These, rather than the true labels y, are the target for the new tree below.
y1 = rfmodel.predict(X=X)

## Train Decision tree on proxy labels

In [None]:
# Train a fresh, unconstrained decision tree on the forest's proxy labels (y1)
# instead of the true labels. This rebinds `dtree`, replacing the earlier tree.
# random_state makes the fit repeatable: scikit-learn randomly permutes the
# features at every split, so equally good splits may otherwise be resolved
# differently from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X, y1)

# Decision region plot: evaluate the tree on a 100 x 100 grid that spans the
# training data, with the bounds rounded outwards to whole numbers.
x0_grid = np.linspace(np.floor(X[:, 0].min()), np.ceil(X[:, 0].max()), 100)
x1_grid = np.linspace(np.floor(X[:, 1].min()), np.ceil(X[:, 1].max()), 100)
xx, yy = np.meshgrid(x0_grid, x1_grid)
Z = dtree.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)  # back to grid shape for contourf

fig, ax = plt.subplots()
ax.contourf(xx, yy, Z, alpha=0.5, cmap='coolwarm_r')
ax.set_title("Decision Tree", fontsize = 20, fontweight = "bold", pad=15)
ax.set_xticklabels([])
ax.set_yticklabels([])

plt.show()

## Prediction accuracy of new decision tree model

In [None]:
# Accuracy of the new tree against the proxy labels it was trained on.
# The tree is grown without any depth or pruning limit, so it is expected to
# reproduce the forest's training predictions almost exactly (i.e. overfit).
# The cell previously only recomputed y1 and displayed nothing.
dtree.score(X, y1)

## Run-time comparison

In [None]:
%%timeit

# Time inference over the training features with the single decision tree
# (`dtree` is the tree most recently fitted above, i.e. on the proxy labels).
dtree.predict(X)

In [None]:
%%timeit

# Time inference over the same training features with the full random forest,
# for comparison with the single-tree timing.
rfmodel.predict(X)

## Test accuracy comparison

In [None]:
# Mean accuracy on the held-out test split (true labels) of the tree that was
# trained on the forest's proxy labels.
dtree.score(X=X_test, y=y_test)

In [None]:
# Mean accuracy of the random forest on the same held-out test split, for
# comparison with the condensed tree.
rfmodel.score(X=X_test, y=y_test)