nycflights13 flights data

Note: The `type` argument in `generate()` is automatically filled based on the entries for `specify()` and `hypothesize()`. It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed.
library(nycflights13)
library(dplyr)
library(stringr)
library(infer)
set.seed(2017)
# Draw a 500-row sample of `flights` (the draw depends on the set.seed() call
# that precedes this block) and derive two two-level grouping columns:
#   half_year: "h1" for months 1-6,  "h2" for months 7-12
#   day_hour:  "morning" for hours 1-12, "not morning" for hours 13-24
# Only the columns used in the analyses below are kept.
# NOTE(review): an `hour` of 0 matches neither day_hour branch and would
# presumably come out as NA -- confirm whether that value can occur in the data.
fli_small <- flights %>%
  sample_n(size = 500) %>%
  mutate(
    half_year = case_when(
      between(month, 1, 6) ~ "h1",
      between(month, 7, 12) ~ "h2"
    ),
    day_hour = case_when(
      between(hour, 1, 12) ~ "morning",
      between(hour, 13, 24) ~ "not morning"
    )
  ) %>%
  select(arr_delay, dep_delay, half_year, day_hour, origin, carrier)

# Variables kept in `fli_small`:
#   - arr_delay, dep_delay
#   - half_year ("h1", "h2"), day_hour ("morning", "not morning")
#   - origin ("EWR", "JFK", "LGA")
#   - carrier
#
# The recommended approach is to use specify() %>% calculate():
# Observed t statistic for arr_delay across the two half_year groups,
# with the group order given as "h1", "h2".
obs_t <- fli_small %>%
  specify(formula = arr_delay ~ half_year) %>%
  calculate(
    stat = "t",
    order = c("h1", "h2")
  )
## Warning: Removed 15 rows containing missing values.
The observed \(t\) statistic is 0.8685.
Or using `t_test()` in infer:
# Same observed statistic via the t_test() wrapper: run the two-sided test
# and extract the `statistic` column as a plain vector.
obs_t <- fli_small %>%
  t_test(
    formula = arr_delay ~ half_year,
    alternative = "two_sided",
    order = c("h1", "h2")
  ) %>%
  dplyr::pull(statistic)

# The observed t statistic is 0.8685.
Or using another shortcut function in infer:
# Same observed statistic via the t_stat() shortcut.
# NOTE(review): newer infer releases reportedly deprecate t_stat() in favour
# of calculate(stat = "t") -- confirm against the installed infer version.
obs_t <- fli_small %>%
  t_stat(formula = arr_delay ~ half_year, order = c("h1", "h2"))

# Value of obs_t as rendered in the original output:
## | statistic |
## |---|
## | 0.8685 |
# Null distribution of the t statistic under independence of arr_delay and
# half_year: 1000 permutation replicates, each reduced to a t statistic.
t_null_distn <- fli_small %>%
  # alt: formula = arr_delay ~ half_year
  specify(response = arr_delay, explanatory = half_year) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(
    stat = "t",
    order = c("h1", "h2")
  )
## Warning: Removed 15 rows containing missing values.
# Plot the permutation null distribution, passing the observed statistic and
# a two-sided direction.
# NOTE(review): newer infer releases reportedly deprecate `obs_stat` /
# `direction` in visualize() in favour of `+ shade_p_value(...)` -- confirm
# against the installed infer version before changing.
t_null_distn %>%
  visualize(obs_stat = obs_t, direction = "two_sided")

# Two-sided p-value of the observed statistic against the same distribution.
t_null_distn %>%
  get_pvalue(obs_stat = obs_t, direction = "two_sided")
## | p_value |
## |---|
## | 0.43 |
# Theoretical-method plot: the same specify/hypothesize/calculate pipeline,
# but with no generate() step before visualize(method = "theoretical").
fli_small %>%
  # alt: formula = arr_delay ~ half_year
  specify(response = arr_delay, explanatory = half_year) %>%
  hypothesize(null = "independence") %>%
  # generate() ## Not used for theoretical
  calculate(
    stat = "t",
    order = c("h1", "h2")
  ) %>%
  visualize(
    method = "theoretical",
    obs_stat = obs_t,
    direction = "two_sided"
  )
## Warning: Removed 15 rows containing missing values.
## Warning: Check to make sure the conditions have been met for the
## theoretical method. {infer} currently does not check these for you.
# Combined plot: 1000 permutation replicates of the t statistic, drawn with
# visualize(method = "both").
fli_small %>%
  # alt: formula = arr_delay ~ half_year
  specify(response = arr_delay, explanatory = half_year) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(
    stat = "t",
    order = c("h1", "h2")
  ) %>%
  visualize(
    method = "both",
    obs_stat = obs_t,
    direction = "two_sided"
  )
## Warning: Check to make sure the conditions have been met for the
## theoretical method. `infer` currently does not check these for you.
# Two-sided p-value from the t_test() wrapper, extracted as a plain vector.
fli_small %>%
  t_test(
    formula = arr_delay ~ half_year,
    order = c("h1", "h2"),
    alternative = "two_sided"
  ) %>%
  dplyr::pull(p_value)
## [1] 0.3855