nycflights13 flights data

# Attach the data package plus the wrangling, plotting and inference tooling
# used by every example below.
library(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)

# Fix the RNG seed so the 500-row sample drawn below is reproducible.
set.seed(2017)

# Working data set: drop rows with any missing value, sample 500 flights,
# derive the two-level `season` and `day_hour` columns, and keep only the
# columns used in the examples.
fli_small <- flights %>%
  na.omit() %>%
  sample_n(size = 500) %>%
  mutate(season = case_when(
    month %in% c(10:12, 1:3) ~ "winter",
    month %in% c(4:9) ~ "summer"
  )) %>%
  mutate(day_hour = case_when(
    # NOTE(review): an `hour` of 0 matches neither branch and would yield NA.
    between(hour, 1, 12) ~ "morning",
    between(hour, 13, 24) ~ "not morning"
  )) %>%
  select(
    arr_delay, dep_delay, season,
    day_hour, origin, carrier
  )

arr_delay, dep_delay
season ("winter", "summer"), day_hour ("morning", "not morning")
origin ("EWR", "JFK", "LGA")
carrier

Observed stat
| stat |
|---|
| 11.49 |
# Null distribution of the mean departure delay under H0: mu = 10.
# `generate()` is called without `type`; the message below reports the
# type that infer selected.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 10) %>%
  generate(reps = 1000) %>%
  calculate(stat = "mean")
## Setting `type = "bootstrap"` in `generate()`.
| p_value |
|---|
| 0.356 |
Observed stat
# Null distribution of the t statistic for departure delay under H0: mu = 8.
# `generate()` is called without `type`; the message below reports the
# type that infer selected.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  generate(reps = 1000) %>%
  calculate(stat = "t")
## Setting `type = "bootstrap"` in `generate()`.
| p_value |
|---|
| 0.018 |
Observed stat
| stat |
|---|
| -2 |
# Null distribution of the median departure delay under H0: median = -1.
# `generate()` is called without `type`; the message below reports the
# type that infer selected.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", med = -1) %>%
  generate(reps = 1000) %>%
  calculate(stat = "median")
## Setting `type = "bootstrap"` in `generate()`.
| p_value |
|---|
| 0.018 |
Observed stat
# Observed proportion of flights whose `day_hour` is "morning".
# The outer parentheses print the result as well as assigning it.
(p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop"))
| stat |
|---|
| 0.452 |
# Null distribution of the "morning" proportion under H0: p = 0.5.
# `generate()` is called without `type`; the message below reports the
# type that infer selected.
null_distn <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  hypothesize(null = "point", p = .5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")
## Setting `type = "simulate"` in `generate()`.
| p_value |
|---|
| 0.036 |
Logical variables will be coerced to factors:
# Same one-proportion test, but with a logical response column; `success`
# is therefore given as the string "TRUE".
null_distn <- fli_small %>%
  mutate(day_hour_logical = (day_hour == "morning")) %>%
  specify(response = day_hour_logical, success = "TRUE") %>%
  hypothesize(null = "point", p = .5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")
## Setting `type = "simulate"` in `generate()`.
Not yet implemented.
Observed stat
# Observed difference in "morning" proportions between seasons; `order`
# fixes the direction of subtraction (winter first, summer second).
(d_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("winter", "summer")))
| stat |
|---|
| 0.0044 |
# Null distribution of the difference in "morning" proportions under
# independence of `day_hour` and `season`.
# `generate()` is called without `type`; the message below reports the
# type that infer selected.
null_distn <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
## Setting `type = "permute"` in `generate()`.
| p_value |
|---|
| 0.954 |
Standardized observed stat
# Observed z statistic for the difference in "morning" proportions between
# seasons (winter first, summer second).
(z_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("winter", "summer")))
| stat |
|---|
| 0.0985 |
# Null distribution of the z statistic under independence of `day_hour`
# and `season`.
# `generate()` is called without `type`; the message below reports the
# type that infer selected.
null_distn <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "z", order = c("winter", "summer"))
## Setting `type = "permute"` in `generate()`.
| p_value |
|---|
| 0.95 |
Note the similarities in this plot and the previous one.
Observed stat
Note the need to add in the hypothesized values here to compute the observed statistic.
# Observed goodness-of-fit chi-square statistic for `origin`. The
# hypothesized proportions are supplied because the statistic is computed
# against them.
(Chisq_hat <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)
  ) %>%
  calculate(stat = "Chisq"))
| stat |
|---|
| 7.009 |
# Simulated null distribution of the goodness-of-fit chi-square statistic
# for `origin` under the hypothesized proportions.
null_distn <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)
  ) %>%
  generate(reps = 1000, type = "simulate") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the region at or beyond the observed
# statistic in the "greater" direction.
visualize(null_distn) +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
| p_value |
|---|
| 0.037 |
Observed stat
| stat |
|---|
| 0.5284 |
# Observed chi-square statistic for independence of `day_hour` and `origin`.
# It is recomputed here because `Chisq_hat` would otherwise still hold the
# goodness-of-fit statistic for `origin`, and the wrong value would be shaded.
Chisq_hat <- fli_small %>%
  specify(day_hour ~ origin) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "Chisq")

# Permutation null distribution of the chi-square statistic under
# independence.
null_distn <- fli_small %>%
  specify(day_hour ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the "greater" tail from the observed
# statistic.
visualize(null_distn) +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
| p_value |
|---|
| 0.77 |
Observed stat
# Observed difference in mean departure delay between seasons (summer
# first, winter second).
(d_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter")))
| stat |
|---|
| 3 |
# Permutation null distribution of the difference in mean departure delay
# under independence of `dep_delay` and `season`.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Plot the null distribution with two-sided shading around the observed
# difference.
visualize(null_distn) +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
| p_value |
|---|
| 0.338 |
Standardized observed stat
# Observed two-sample t statistic for departure delay by season (summer
# first, winter second).
(t_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter")))
| stat |
|---|
| 0.8909 |
# Permutation null distribution of the two-sample t statistic under
# independence of `dep_delay` and `season`.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Plot the null distribution with two-sided shading around the observed t.
visualize(null_distn) +
  shade_p_value(obs_stat = t_hat, direction = "two_sided")
| p_value |
|---|
| 0.4 |
Note the similarities in this plot and the previous one.
Observed stat
# Observed difference in median departure delay between seasons (summer
# first, winter second).
(d_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "diff in medians", order = c("summer", "winter")))
| stat |
|---|
| 1 |
# Permutation null distribution of the difference in median departure delay
# under independence of `dep_delay` and `season`.
null_distn <- fli_small %>%
  # Formula interface; the alternative spelling is
  # specify(response = dep_delay, explanatory = season).
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))

# Plot the null distribution with two-sided shading around the observed
# difference.
visualize(null_distn) +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
| p_value |
|---|
| 0.64 |
Observed stat
| stat |
|---|
| 0.6858 |
# Observed F statistic for arrival delay across origins. It is defined here
# because the plot below needs `F_hat`, which is not assigned anywhere else
# in this section.
F_hat <- fli_small %>%
  specify(arr_delay ~ origin) %>%
  calculate(stat = "F")

# Permutation null distribution of the F statistic under independence of
# `arr_delay` and `origin`.
null_distn <- fli_small %>%
  specify(arr_delay ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")

# Plot the null distribution and shade the "greater" tail from the observed
# F.
visualize(null_distn) +
  shade_p_value(obs_stat = F_hat, direction = "greater")
| p_value |
|---|
| 0.529 |
Observed stat
| stat |
|---|
| 0.9916 |
# Observed regression slope of arrival delay on departure delay. It is
# defined here because the plot below needs `slope_hat`, which is not
# assigned anywhere else in this section.
slope_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")

# Permutation null distribution of the slope under independence of
# `arr_delay` and `dep_delay`.
null_distn <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")

# Plot the null distribution with two-sided shading around the observed
# slope.
visualize(null_distn) +
  shade_p_value(obs_stat = slope_hat, direction = "two_sided")
| p_value |
|---|
| 0 |
Observed stat
# Observed correlation between arrival delay and departure delay.
(correlation_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation"))
| stat |
|---|
| 0.8951 |
# Permutation null distribution of the correlation under independence of
# `arr_delay` and `dep_delay`.
null_distn <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Plot the null distribution with two-sided shading around the observed
# correlation.
visualize(null_distn) +
  shade_p_value(obs_stat = correlation_hat, direction = "two_sided")
| p_value |
|---|
| 0 |
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.
Point estimate
| stat |
|---|
| 6.154 |
# Bootstrap distribution of the mean arrival delay.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| 2.606 | 9.602 |
| lower | upper |
|---|---|
| 2.609 | 9.699 |
Point estimate
| stat |
|---|
| 3.3 |
# Bootstrap distribution of the one-sample t statistic for arrival delay.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| 1.625 | 4.879 |
| lower | upper |
|---|---|
| 1.7 | 4.9 |
Point estimate
# Point estimate: proportion of flights whose `day_hour` is "morning".
(p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop"))
| stat |
|---|
| 0.452 |
# Bootstrap distribution of the "morning" proportion.
boot <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| 0.406 | 0.496 |
| lower | upper |
|---|---|
| 0.4079 | 0.4961 |
Not yet implemented.
Point estimate
# Point estimate: difference in mean arrival delay between seasons (summer
# first, winter second).
(d_hat <- fli_small %>%
  specify(arr_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter")))
| stat |
|---|
| 5.629 |
# Bootstrap distribution of the difference in mean arrival delay between
# seasons.
boot <- fli_small %>%
  specify(arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| -2.025 | 12.54 |
| lower | upper |
|---|---|
| -1.605 | 12.86 |
Standardized point estimate
# Standardized point estimate: two-sample t statistic for arrival delay by
# season (summer first, winter second).
(t_hat <- fli_small %>%
  specify(arr_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter")))
| stat |
|---|
| 1.511 |
# Bootstrap distribution of the two-sample t statistic for arrival delay by
# season.
boot <- fli_small %>%
  specify(arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| -0.3589 | 3.736 |
| lower | upper |
|---|---|
| -0.5783 | 3.601 |
Point estimate
# Point estimate: difference in "morning" proportions between seasons
# (summer first, winter second).
(d_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("summer", "winter")))
| stat |
|---|
| -0.0044 |
# Bootstrap distribution of the difference in "morning" proportions between
# seasons.
boot <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| -0.0957 | 0.0818 |
| lower | upper |
|---|---|
| -0.0914 | 0.0826 |
Standardized point estimate
# Standardized point estimate: z statistic for the difference in "morning"
# proportions between seasons (summer first, winter second).
(z_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("summer", "winter")))
| stat |
|---|
| -0.0985 |
# Bootstrap distribution of the z statistic for the difference in "morning"
# proportions between seasons.
boot <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "z", order = c("summer", "winter"))

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| -1.962 | 1.788 |
| lower | upper |
|---|---|
| -2.042 | 1.845 |
Point estimate
| stat |
|---|
| 0.9916 |
# Bootstrap distribution of the regression slope of arrival delay on
# departure delay.
boot <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope")

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| 0.9463 | 1.032 |
| lower | upper |
|---|---|
| 0.9468 | 1.036 |
Point estimate
# Point estimate: correlation between arrival delay and departure delay.
(correlation_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation"))
| stat |
|---|
| 0.8951 |
# Bootstrap distribution of the correlation between arrival delay and
# departure delay.
boot <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "correlation")

# Confidence interval from the bootstrap distribution, using the defaults
# of `get_ci()`; the parentheses print it.
(percentile_ci <- get_ci(boot))
| 2.5% | 97.5% |
|---|---|
| 0.827 | 0.9332 |
| lower | upper |
|---|---|
| 0.8418 | 0.9485 |
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.