Data 100 Review Lab – Bootstrapping

Suraj Rampure, Neil Shah

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the season table from a CSV file located next to the notebook.
data = pd.read_csv("season_data.csv")
In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()
Out[3]:
Rk Player Pos Age Tm G GS MP FG FGA ... FT% ORB DRB TRB AST STL BLK TOV PF PTS
0 1 Alex Abrines\abrinal01 SG 25 OKC 31 2 19.0 1.8 5.1 ... 0.923 0.2 1.4 1.5 0.6 0.5 0.2 0.5 1.7 5.3
1 2 Quincy Acy\acyqu01 PF 28 PHO 10 0 12.3 0.4 1.8 ... 0.700 0.3 2.2 2.5 0.8 0.1 0.4 0.4 2.4 1.7
2 3 Jaylen Adams\adamsja01 PG 22 ATL 34 1 12.6 1.1 3.2 ... 0.778 0.3 1.4 1.8 1.9 0.4 0.1 0.8 1.3 3.2
3 4 Steven Adams\adamsst01 C 25 OKC 80 80 33.4 6.0 10.1 ... 0.500 4.9 4.6 9.5 1.6 1.5 1.0 1.7 2.6 13.9
4 5 Bam Adebayo\adebaba01 C 21 MIA 82 28 23.3 3.4 5.9 ... 0.735 2.0 5.3 7.3 2.2 0.9 0.8 1.5 2.5 8.9

5 rows × 30 columns

In [4]:
# Treat the full PTS column as the population that later cells sample from,
# and plot its distribution.
pop = data['PTS']
pop.hist()
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f17eb64d9b0>
In [5]:
def create_ci(population=None, sample_size=200, n_resamples=1000,
              confidence=95, random_state=None):
    """Build one percentile-bootstrap confidence interval for the mean.

    A simple random sample (without replacement) is drawn from the
    population, then resampled with replacement ``n_resamples`` times at the
    same size. The interval endpoints are the percentiles of the resampled
    means that cut off equal tails around the central ``confidence`` percent.

    Called with no arguments this reproduces the original behaviour: a sample
    of 200 from the notebook-level ``pop``, 1000 resamples, and the 2.5th /
    97.5th percentiles.

    Parameters
    ----------
    population : pandas.Series, optional
        Values to sample from. Defaults to the notebook-level ``pop``.
    sample_size : int, default 200
        Size of the initial sample and of every bootstrap resample.
    n_resamples : int, default 1000
        Number of bootstrap resamples to draw.
    confidence : float, default 95
        Confidence level in percent, strictly between 0 and 100.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for reproducible output. ``None`` keeps pandas'
        default behaviour of drawing from NumPy's global random state.

    Returns
    -------
    list
        ``[left, right]`` endpoints of the interval.

    Raises
    ------
    ValueError
        If ``confidence`` is outside (0, 100) or ``n_resamples`` is below 1.
        pandas also raises ``ValueError`` when ``sample_size`` exceeds the
        population size.
    """
    if population is None:
        population = pop  # Series defined in an earlier notebook cell
    if not 0 < confidence < 100:
        raise ValueError("confidence must be strictly between 0 and 100")
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    # One generator is shared by every draw. Handing the same integer seed to
    # each .sample() call would make all bootstrap resamples identical.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    sample = population.sample(n=sample_size, random_state=rng)

    stats = [
        sample.sample(n=sample_size, replace=True, random_state=rng).mean()
        for _ in range(n_resamples)
    ]

    # Split the excluded probability mass evenly between the two tails.
    tail = (100 - confidence) / 2.0
    left = np.percentile(stats, tail)
    right = np.percentile(stats, 100 - tail)

    return [left, right]
In [6]:
# Build a single interval to see what the function returns.
create_ci()
Out[6]:
[7.5358625, 9.1600375]
In [7]:
# Repeat the whole procedure 100 times; each call draws its own fresh sample
# from `pop` before bootstrapping, so the intervals differ from run to run.
all_cis = [create_ci() for _ in range(100)]
In [8]:
# Coverage check: tally the intervals whose endpoints bracket the
# population mean (inclusive on both sides).
m = pop.mean()
count = sum(1 for lower, upper in all_cis if lower <= m <= upper)
In [9]:
# Number of intervals in `all_cis` that contained the population mean.
count
Out[9]:
99