nycflights13 flights datavignettes/observed_stat_examples.Rmd
observed_stat_examples.Rmdlibrary(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
set.seed(2017)
fli_small <- flights %>%
na.omit() %>%
sample_n(size = 500) %>%
mutate(season = case_when(
month %in% c(10:12, 1:3) ~ "winter",
month %in% c(4:9) ~ "summer"
)) %>%
mutate(day_hour = case_when(
between(hour, 1, 12) ~ "morning",
between(hour, 13, 24) ~ "not morning"
)) %>%
select(arr_delay, dep_delay, season,
day_hour, origin, carrier)arr_delay, dep_delay
season ("winter", "summer"),day_hour ("morning", "not morning")origin ("EWR", "JFK", "LGA")carrier
Observed stat
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 11.5
null_distn <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", mu = 10) %>%
generate(reps = 1000) %>%
calculate(stat = "mean")## Setting `type = "bootstrap"` in `generate()`.

## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.356
Observed stat
null_distn <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", mu = 8) %>%
generate(reps = 1000) %>%
calculate(stat = "t")## Setting `type = "bootstrap"` in `generate()`.

## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.018
Observed stat
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -2
null_distn <- fli_small %>%
specify(response = dep_delay) %>%
hypothesize(null = "point", med = -1) %>%
generate(reps = 1000) %>%
calculate(stat = "median")## Setting `type = "bootstrap"` in `generate()`.

## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.018
Observed stat
( p_hat <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
calculate(stat = "prop") )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.452
null_distn <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
hypothesize(null = "point", p = .5) %>%
generate(reps = 1000) %>%
calculate(stat = "prop")## Setting `type = "simulate"` in `generate()`.

## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.036
Logical variables will be coerced to factors:
null_distn <- fli_small %>%
mutate(day_hour_logical = (day_hour == "morning")) %>%
specify(response = day_hour_logical, success = "TRUE") %>%
hypothesize(null = "point", p = .5) %>%
generate(reps = 1000) %>%
calculate(stat = "prop")## Setting `type = "simulate"` in `generate()`.
Observed stat
( d_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "diff in props", order = c("winter", "summer")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.00438
null_distn <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000) %>%
calculate(stat = "diff in props", order = c("winter", "summer"))## Setting `type = "permute"` in `generate()`.

## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.954
Standardized observed stat
( z_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "z", order = c("winter", "summer")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.0985
null_distn <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000) %>%
calculate(stat = "z", order = c("winter", "summer"))## Setting `type = "permute"` in `generate()`.

## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.95
Note the similarities in this plot and the previous one.
Observed stat
Note the need to add in the hypothesized values here to compute the observed statistic.
( Chisq_hat <- fli_small %>%
specify(response = origin) %>%
hypothesize(null = "point",
p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
calculate(stat = "Chisq") )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 7.01
null_distn <- fli_small %>%
specify(response = origin) %>%
hypothesize(null = "point",
p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
generate(reps = 1000, type = "simulate") %>%
calculate(stat = "Chisq")
visualize(null_distn) +
shade_p_value(obs_stat = Chisq_hat, direction = "greater")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.037
Observed stat
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.528
null_distn <- fli_small %>%
specify(day_hour ~ origin) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
visualize(null_distn) +
shade_p_value(obs_stat = Chisq_hat, direction = "greater")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.77
Observed stat
( d_hat <- fli_small %>%
specify(dep_delay ~ season) %>%
calculate(stat = "diff in means", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 3
null_distn <- fli_small %>%
specify(dep_delay ~ season) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("summer", "winter"))
visualize(null_distn) +
shade_p_value(obs_stat = d_hat, direction = "two_sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.338
Standardized observed stat
( t_hat <- fli_small %>%
specify(dep_delay ~ season) %>%
calculate(stat = "t", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.891
null_distn <- fli_small %>%
specify(dep_delay ~ season) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "t", order = c("summer", "winter"))
visualize(null_distn) +
shade_p_value(obs_stat = t_hat, direction = "two_sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.4
Note the similarities in this plot and the previous one.
Observed stat
( d_hat <- fli_small %>%
specify(dep_delay ~ season) %>%
calculate(stat = "diff in medians", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 1
null_distn <- fli_small %>%
specify(dep_delay ~ season) %>% # alt: response = dep_delay,
# explanatory = season
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in medians", order = c("summer", "winter"))
visualize(null_distn) +
shade_p_value(obs_stat = d_hat, direction = "two_sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.64
Observed stat
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.686
null_distn <- fli_small %>%
specify(arr_delay ~ origin) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "F")
visualize(null_distn) +
shade_p_value(obs_stat = F_hat, direction = "greater")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0.529
Observed stat
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.992
null_distn <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "slope")
visualize(null_distn) +
shade_p_value(obs_stat = slope_hat, direction = "two_sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
Observed stat
( correlation_hat <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
calculate(stat = "correlation") )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.895
null_distn <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "correlation")
visualize(null_distn) +
shade_p_value(obs_stat = correlation_hat, direction = "two_sided")
## # A tibble: 1 x 1
## p_value
## <dbl>
## 1 0
Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.
Point estimate
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 6.15
boot <- fli_small %>%
specify(response = arr_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean")
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 2.61 9.60

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 2.61 9.70

Point estimate
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 3.30
boot <- fli_small %>%
specify(response = arr_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "t")
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 1.62 4.88

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 1.70 4.90

Point estimate
( p_hat <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
calculate(stat = "prop") )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.452
boot <- fli_small %>%
specify(response = day_hour, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop")
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 0.406 0.496

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.408 0.496

Point estimate
( d_hat <- fli_small %>%
specify(arr_delay ~ season) %>%
calculate(stat = "diff in means", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 5.63
boot <- fli_small %>%
specify(arr_delay ~ season) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 -2.03 12.5

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 -1.61 12.9

Standardized point estimate
( t_hat <- fli_small %>%
specify(arr_delay ~ season) %>%
calculate(stat = "t", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 1.51
boot <- fli_small %>%
specify(arr_delay ~ season) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "t", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 -0.359 3.74

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 -0.578 3.60

Point estimate
( d_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "diff in props", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -0.00438
boot <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in props", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 -0.0957 0.0818

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 -0.0914 0.0826

Standardized point estimate
( z_hat <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
calculate(stat = "z", order = c("summer", "winter")) )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 -0.0985
boot <- fli_small %>%
specify(day_hour ~ season, success = "morning") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "z", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 -1.96 1.79

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 -2.04 1.85

Point estimate
## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.992
boot <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "slope")
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 0.946 1.03

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.947 1.04

Point estimate
( correlation_hat <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
calculate(stat = "correlation") )## # A tibble: 1 x 1
## stat
## <dbl>
## 1 0.895
boot <- fli_small %>%
specify(arr_delay ~ dep_delay) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "correlation")
( percentile_ci <- get_ci(boot) )## # A tibble: 1 x 2
## `2.5%` `97.5%`
## <dbl> <dbl>
## 1 0.827 0.933

## # A tibble: 1 x 2
## lower upper
## <dbl> <dbl>
## 1 0.842 0.948
