# Daily Dose of Data Science

Post: [The Most Overlooked Source of Optimization in Data Pipelines](https://www.blog.dailydoseofds.com/p/the-most-overlooked-source-of-optimization)

Author: Avi Chawla

## Imports

In [None]:
import pandas as pd
import numpy as np
import random
import string


## Numeric data

In [None]:
# Shape of the all-numeric benchmark frame
num_rows, num_columns = 1000000, 20

# One array of random integers (0 inclusive, 100001 exclusive) per column,
# keyed "column_1" through "column_<num_columns>"
data = {
    f"column_{col+1}": np.random.randint(0, 100001, size=num_rows)
    for col in range(num_columns)
}

# Wrap the generated columns in the DataFrame used by the timing cells
df = pd.DataFrame(data)

### CSV save run-time

In [None]:
%%timeit 

# Timed body: write df to "data.csv" as CSV, leaving out the row index.
df.to_csv("data.csv", index=False)

### Pickle save run-time

In [None]:
%%timeit 

# Timed body: serialize df to "data.pickle" with DataFrame.to_pickle.
df.to_pickle("data.pickle")

### Parquet save run-time

In [None]:
%%timeit 

# Timed body: write df to "data.parquet" with DataFrame.to_parquet.
# NOTE(review): this presumably needs a Parquet engine (e.g. pyarrow)
# installed; none is imported in this notebook - confirm.
df.to_parquet("data.parquet")

### JSON save run-time

In [None]:
%%timeit 

# Timed body: write df to "data.json" using to_json's default settings.
df.to_json("data.json")

### Feather save run-time

In [None]:
%%timeit 

# Timed body: write df to "data.feather" with DataFrame.to_feather.
# NOTE(review): presumably relies on pyarrow being installed - confirm.
df.to_feather("data.feather")

### CSV load run-time

In [None]:
%%timeit 

# Timed body: parse "data.csv" back into a DataFrame with pd.read_csv.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_csv("data.csv")

### Pickle load run-time

In [None]:
%%timeit 

# Timed body: load "data.pickle" back into a DataFrame with pd.read_pickle.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_pickle("data.pickle")

### Parquet load run-time

In [None]:
%%timeit 

# Timed body: load "data.parquet" back into a DataFrame with pd.read_parquet.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_parquet("data.parquet")

### JSON load run-time

In [None]:
%%timeit 

# Timed body: parse "data.json" back into a DataFrame with pd.read_json.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_json("data.json")

### Feather load run-time

In [None]:
%%timeit 

# Timed body: load "data.feather" back into a DataFrame with pd.read_feather.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_feather("data.feather")

## Mixed data

In [None]:
# Shape of the mixed benchmark frame: integer columns plus string columns
num_rows = 1000000
num_numeric_columns, num_string_columns = 10, 10

# Integer columns come first: random values (0 inclusive, 100001 exclusive),
# keyed "numeric_1" through "numeric_<num_numeric_columns>"
data = {
    f"numeric_{col+1}": np.random.randint(0, 100001, size=num_rows)
    for col in range(num_numeric_columns)
}

# String columns follow: every cell is 5 letters sampled (with replacement)
# from string.ascii_letters, keyed "string_1" through "string_<num_string_columns>"
data.update(
    (
        f"string_{col+1}",
        [''.join(random.choices(string.ascii_letters, k=5)) for _ in range(num_rows)],
    )
    for col in range(num_string_columns)
)

# Assemble the DataFrame used by the mixed-data timing cells
df = pd.DataFrame(data)

### CSV save run-time

In [None]:
%%timeit 

# Timed body: write df to "data_string.csv" as CSV, leaving out the row index.
df.to_csv("data_string.csv", index=False)

### Pickle save run-time

In [None]:
%%timeit 

# Timed body: serialize df to "data_string.pickle" with DataFrame.to_pickle.
df.to_pickle("data_string.pickle")

### Parquet save run-time

In [None]:
%%timeit 

# Timed body: write df to "data_string.parquet" with DataFrame.to_parquet.
# NOTE(review): this presumably needs a Parquet engine (e.g. pyarrow)
# installed; none is imported in this notebook - confirm.
df.to_parquet("data_string.parquet")

### JSON save run-time

In [None]:
%%timeit 

# Timed body: write df to "data_string.json" using to_json's default settings.
df.to_json("data_string.json")

### Feather save run-time

In [None]:
%%timeit 

# Timed body: write df to "data_string.feather" with DataFrame.to_feather.
# NOTE(review): presumably relies on pyarrow being installed - confirm.
df.to_feather("data_string.feather")

### CSV load run-time

In [None]:
%%timeit 

# Timed body: parse "data_string.csv" back into a DataFrame with pd.read_csv.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_csv("data_string.csv")

### Pickle load run-time

In [None]:
%%timeit 

# Timed body: load "data_string.pickle" back into a DataFrame with pd.read_pickle.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_pickle("data_string.pickle")

### Parquet load run-time

In [None]:
%%timeit 

# Timed body: load "data_string.parquet" back into a DataFrame with pd.read_parquet.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_parquet("data_string.parquet")

### JSON load run-time

In [None]:
%%timeit 

# Timed body: parse "data_string.json" back into a DataFrame with pd.read_json.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_json("data_string.json")

### Feather load run-time

In [None]:
%%timeit 

# Timed body: load "data_string.feather" back into a DataFrame with pd.read_feather.
# NOTE(review): %%timeit appears to run this body in its own scope, so this
# `df` presumably does not replace the notebook-level `df` - confirm.
df = pd.read_feather("data_string.feather")