# Daily Dose of Data Science

This notebook contains the code that accompanies the post on improving the performance of a Random Forest model by reducing its number of trees.

[Reduce Trees in Random Forest Model](https://blog.dailydoseofds.com/p/reduce-trees-in-random-forest-model)

Author: Avi Chawla

In [None]:
import seaborn as sns
import copy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

import warnings
# Install a global "ignore" filter so no warnings are shown in cell output.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

In [None]:
# Colour palette shared by the plots below; entries are looked up by position,
# so the order must not change.
colors = [
    "#fe7c73", "#2471A3", "#3498DB", "#27AE60", "#82E0AA",
    "#D35400", "#5D6D7E", "#E74C3C", "#21618C", "#B7950B",
    "#46C7C7", "#00B9FF", "#FF7051",
    "orange", "darkorange", "tomato", "coral", "limegreen", "lightsalmon",
]

## Create dataset

In [None]:
# Generate a synthetic two-feature classification dataset for the random forest.
# The fixed random_state makes the data, and every number derived from it,
# identical on each Restart & Run All.
X, y = make_classification(
    n_samples=1500,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.5,
    random_state=42,
)

# Scatter plot of the two features, coloured by class label.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=[colors[i] for i in y])
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

# Sequential split of the 1500 rows:
#   *_train (first 1000) - fit the forest
#   *_test  (next 300)   - validation split
#   *_final (last 200)   - hold-out split
X_train, X_test, X_final = X[:1000], X[1000:1300], X[1300:]
y_train, y_test, y_final = y[:1000], y[1000:1300], y[1300:]

## Train Random Forest Model

In [None]:
# Baseline forest of 100 trees; the seed keeps the fitted trees reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    max_samples=0.8,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Score of the full forest on the X_test / y_test split.
model.score(X_test, y_test)

In [None]:
# One [tree position, score on X_test] row per tree of the fitted forest.
per_tree_rows = [
    [tree_idx, estimator.score(X_test, y_test)]
    for tree_idx, estimator in enumerate(model.estimators_)
]

# 2-D array: column 0 = tree position, column 1 = accuracy
model_accs = np.array(per_tree_rows)

In [None]:
# Row order that lists the trees from highest to lowest accuracy (column 1)
sorted_indices = np.flip(np.argsort(model_accs[:, 1]))

# Tree positions (column 0) in that order, cast to int so they can index a list
model_ids = model_accs[sorted_indices, 0].astype(int)

In [None]:
# Reorder the forest's trees according to model_ids.
# The ids refer to positions in the list as it is *before* this assignment, so
# this cell should run only once per computed model_ids.
model.estimators_ = [model.estimators_[tree_id] for tree_id in model_ids]

In [None]:
# Score forests made of the first k trees of model.estimators_, for
# k = 2 .. total_models. Each row of `result` is
# [k, score on X_test, score on X_final, score on X_train].
result = []
total_models = len(model.estimators_)

# A shallow copy is enough here: only `estimators_` is rebound on the copy, and
# the sliced trees are shared with `model` in any case, so deep-copying the
# whole forest on every iteration would be wasted work.
small_model = copy.copy(model)

# `total_models + 1` so that the full forest is evaluated too
for i in range(2, total_models + 1):
    small_model.estimators_ = model.estimators_[:i]
    result.append([
        i,
        small_model.score(X_test, y_test),
        small_model.score(X_final, y_final),
        small_model.score(X_train, y_train),
    ])

result = np.array(result)

In [None]:
# Row of `result` with the highest value in column 1 (the X_test score)
max_index = result[:, 1].argmax()
max_index

In [None]:
n_trees = result[:, 0]
best_n_trees, best_val_acc = result[max_index][0], result[max_index][1]

fig, ax = plt.subplots()

# Accuracy as trees are added (column 1: X_test scores, column 3: X_train scores)
ax.plot(n_trees, result[:, 1], c=colors[0], label="Validation")
# Column 2 (X_final scores) is left out of the figure:
# ax.plot(n_trees, result[:, 2], c=colors[3], label="Test")
ax.plot(n_trees, result[:, 3], c=colors[11], label="Train")

# Mark the row selected by max_index
ax.axvline(x=best_n_trees, ymax=0.95, ls="--")
ax.scatter([best_n_trees], [best_val_acc], zorder=0, s=60, c=colors[1])

axis_font = dict(fontsize=15, fontweight="bold")
ax.set_xlabel("Number of Trees", **axis_font)
ax.set_ylabel("Random Forest accuracy", **axis_font)
ax.set_title("Cumulative accuracy plot", fontsize=20, fontweight="bold")
ax.legend()
# plt.savefig("Cumulative_acc1.jpeg", dpi = 600, bbox_inches = "tight")
plt.show()

## Create final model

In [None]:
# Number of trees stored in the row of `result` selected by max_index
best_n_trees = int(result[max_index][0])

# Copy the fitted forest, then rebind its tree list to the first best_n_trees
# trees of `model`; no other attribute of the copy is changed here.
small_model = copy.deepcopy(model)
small_model.estimators_ = model.estimators_[:best_n_trees]

### RF Model with 100 trees

In [None]:
# Accuracy of the full forest on the X_test split
model.score(X_test, y_test)

In [None]:
# Accuracy of the full forest on the training split
model.score(X_train, y_train)

In [None]:
# Accuracy of the full forest on the X_final hold-out split
model.score(X_final, y_final)

In [None]:
# Run-time: time prediction over X_test with the full forest
%timeit model.predict(X_test)

### RF Model with top trees 

In [None]:
# Accuracy of the reduced forest on the X_test split
small_model.score(X_test, y_test)

In [None]:
# Accuracy of the reduced forest on the training split
small_model.score(X_train, y_train)

In [None]:
# Accuracy of the reduced forest on the X_final hold-out split
small_model.score(X_final, y_final)

In [None]:
# Run-time: time prediction over X_test with the reduced forest
%timeit small_model.predict(X_test)

In [None]:
# NOTE(review): presumably the speed-up ratio of the two %timeit results above
# (full forest time / reduced forest time), typed in by hand from an earlier
# run — confirm, and recompute after re-running the timing cells.
4.69/0.358

In [None]:
# NOTE(review): `scr` is not defined anywhere in the visible notebook, so this
# cell would raise NameError on a fresh run — confirm whether it is leftover
# scratch input and can be removed.
scr