# Daily Dose of Data Science

[MissForest and kNN Imputation for Data Missing at Random](https://blog.dailydoseofds.com/p/missforest-and-knn-imputation-for)

Author: Avi Chawla

## kNN imputation

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer, KNNImputer

# Notebook-wide setup: hide every warning so cell outputs stay uncluttered,
# then switch on seaborn's default plot styling.
warnings.simplefilter("ignore")
sns.set()

In [None]:
# Preferred font first, generic sans-serif second.
plt.rcParams['font.family'] = ['Comic Sans MS', 'sans-serif']

# Shared palette; the plotting cells below pick entries by index.
colors = [
    '#fe7c73', '#2471A3', '#3498DB', '#27AE60', '#82E0AA',
    '#D35400', '#5D6D7E', '#E74C3C', '#21618C', '#B7950B',
    '#46C7C7', '#00B9FF', '#FF7051',
    "orange", "darkorange", "tomato", "coral", "limegreen", "lightsalmon",
]

In [None]:
# Build a synthetic (size x 3) dataset and blank out roughly 20% of its entries.
# A fixed seed keeps the data -- and every histogram derived from it --
# identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

size = 1000
data = rng.normal(loc=2, scale=2, size=(size, 3))  # draws from N(mean=2, std=2)
mask = rng.random((size, 3)) < 0.2  # True where a value gets removed (p = 0.2 per entry)
data[mask] = np.nan  # mark the selected entries as missing

In [None]:
# Baseline 1: fill the gaps using the 'mean' strategy.
mean_imputer = SimpleImputer(strategy='mean')
data_imputed_mean = mean_imputer.fit_transform(data)

# Baseline 2: fill the gaps with the constant 0.
zero_imputer = SimpleImputer(fill_value=0, strategy='constant')
data_imputed_zero = zero_imputer.fit_transform(data)

In [None]:
# Histogram of the dataset before any imputation, all columns pooled into one 1-D array.
# `data` still contains the NaN placeholders at this point.
# NOTE(review): relies on matplotlib leaving NaNs out of the bins -- confirm for your version.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    data.ravel(),
    bins=45,
    color=colors[1],
    edgecolor='black',
)
ax.set_title("Original Distribution", fontsize=20, fontweight="bold", pad=10)
ax.set_xlabel("Value", fontsize=15, fontweight="bold")
ax.set_ylabel("Frequency", fontsize=15, fontweight="bold")

plt.show()

In [None]:
# Histogram of the mean-imputed data, all columns pooled into one 1-D array.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    data_imputed_mean.ravel(),
    bins=45,
    color=colors[16],
    edgecolor='black',
)
ax.set_title("Mean Imputation", fontsize=20, fontweight="bold", pad=10)
ax.set_xlabel("Value", fontsize=15, fontweight="bold")
ax.set_ylabel("Frequency", fontsize=15, fontweight="bold")

plt.show()

In [None]:
# Histogram of the zero-imputed data, all columns pooled into one 1-D array.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    data_imputed_zero.ravel(),
    bins=45,
    color=colors[16],
    edgecolor='black',
)
ax.set_title("Zero Imputation", fontsize=20, fontweight="bold", pad=10)
ax.set_xlabel("Value", fontsize=15, fontweight="bold")
ax.set_ylabel("Frequency", fontsize=15, fontweight="bold")

plt.show()

In [None]:
# kNN imputation with 5 neighbours: fit on the incomplete data,
# then fill the gaps in that same data.
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputer.fit(data)
data_imputed_knn = knn_imputer.transform(data)

In [None]:
# Histogram of the kNN-imputed data, all columns pooled into one 1-D array.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    data_imputed_knn.ravel(),
    bins=50,
    color=colors[18],
    edgecolor='black',
)
ax.set_title("kNN Imputation", fontsize=20, fontweight="bold", pad=10)
ax.set_xlabel("Value", fontsize=15, fontweight="bold")
ax.set_ylabel("Frequency", fontsize=15, fontweight="bold")

plt.show()


## MissForest imputation

In [None]:
!pip install missingpy

In [None]:
# Compatibility shim: make scikit-learn's private `sklearn.neighbors._base`
# module importable under the name `sklearn.neighbors.base` as well.
# NOTE(review): presumably needed because missingpy (imported in the next cell)
# still imports the old `sklearn.neighbors.base` path -- confirm.
import sklearn.neighbors._base
import sys

# Registering the alias in sys.modules lets `import sklearn.neighbors.base` resolve to `_base`.
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [None]:
# This import has to stay below the cell that registers the
# `sklearn.neighbors.base` alias in sys.modules, so it cannot move to the top import cell.
from missingpy import MissForest

# MissForest imputes with random forests; pinning random_state keeps the
# imputed values (and the histogram below) the same for a given input.
imputer = MissForest(random_state=42)
data_imputed_missforest = imputer.fit_transform(data)

In [None]:
# Histogram of the MissForest-imputed data, all columns pooled into one 1-D array.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    data_imputed_missforest.ravel(),
    bins=50,
    color=colors[18],
    edgecolor='black',
)
ax.set_title("MissForest Imputation", fontsize=20, fontweight="bold", pad=10)
ax.set_xlabel("Value", fontsize=15, fontweight="bold")
ax.set_ylabel("Frequency", fontsize=15, fontweight="bold")

plt.show()
