Basic prediction from a linear model:
suppressPackageStartupMessages(library(dplyr))
snake_river_visits <- readRDS('data/snake_river_visits.rds')
str(snake_river_visits)
## 'data.frame': 410 obs. of 4 variables:
## $ n_visits: num 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
## $ income : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
## $ travel : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
snake_river_explanatory <- readRDS('data/snake_river_explanatory.rds')
str(snake_river_explanatory)
## tibble [24 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ gender: Factor w/ 2 levels "male","female": 1 2 1 2 1 2 1 2 1 2 ...
## $ income: Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 1 1 2 2 3 3 4 4 1 1 ...
## $ travel: Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 1 1 1 1 1 1 1 1 2 2 ...
## - attr(*, "spec")=List of 3
## ..$ cols :List of 3
## .. ..$ gender:List of 3
## .. .. ..$ levels : NULL
## .. .. ..$ ordered : logi FALSE
## .. .. ..$ include_na: logi FALSE
## .. .. ..- attr(*, "class")= chr [1:2] "collector_factor" "collector"
## .. ..$ income:List of 3
## .. .. ..$ levels : NULL
## .. .. ..$ ordered : logi FALSE
## .. .. ..$ include_na: logi FALSE
## .. .. ..- attr(*, "class")= chr [1:2] "collector_factor" "collector"
## .. ..$ travel:List of 3
## .. .. ..$ levels : NULL
## .. .. ..$ ordered : logi FALSE
## .. .. ..$ include_na: logi FALSE
## .. .. ..- attr(*, "class")= chr [1:2] "collector_factor" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr [1:2] "collector_guess" "collector"
## ..$ skip : num 1
## ..- attr(*, "class")= chr "col_spec"
model <- lm(n_visits ~ gender + income + travel, snake_river_visits)
snake_river_explanatory %>%
mutate(predicted_n_visits = predict(model, .))%>%
arrange(desc(predicted_n_visits))
## # A tibble: 24 x 4
## gender income travel predicted_n_visits
## <fct> <fct> <fct> <dbl>
## 1 female [$0,$25k] [0h,0.25h] 72.4
## 2 female ($25k,$55k] [0h,0.25h] 70.4
## 3 male [$0,$25k] [0h,0.25h] 61.5
## 4 male ($25k,$55k] [0h,0.25h] 59.6
## 5 female ($95k,$Inf) [0h,0.25h] 53.8
## 6 female ($55k,$95k] [0h,0.25h] 53.1
## 7 female [$0,$25k] (0.25h,4h] 45.8
## 8 female ($25k,$55k] (0.25h,4h] 43.8
## 9 male ($95k,$Inf) [0h,0.25h] 42.9
## 10 male ($55k,$95k] [0h,0.25h] 42.2
## # … with 14 more rows
When you draw a smooth trend line in plot using ggplot2::geom_smooth()
, you’re, in fact, using a generalized additive model (GAM). This sort of model is ideal for fitting nonlinear curves. You can use mgcv::gam()
to run the model. Thesyntax for running this GAM takes the following form:
gam(response ~ s(explanatory_var1) + explanatory_var2, data = dataset)
Here, s()
means “make the variable smooth”, where smooth very roughly means nonlinear.
library(ggplot2)
df <- readRDS('data/nass.corn.rds')
states_data <- readRDS('data/states.rds')
# get BEA region of each state
df <- df %>%
inner_join(states_data, by = c('state' = 'name'))
ggplot(df, aes(year, yield_bushels_per_acre)) +
geom_line(aes(group = state)) +
geom_smooth() +
facet_wrap(vars(bea_region))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
We can see in the plots that the trend lines fit well. So we can use GAM model and make predictions.
suppressPackageStartupMessages(library(mgcv))
bea_regions <- readRDS('data/BEA_regions.rds')$region_name
model <- gam(yield_bushels_per_acre ~ s(year) + bea_region, data = df)
predict_this <- data.frame(
year = 2050,
bea_region = bea_regions
)
# Predict the yield
pred_yield <- predict(model, predict_this, type = "response")
predict_this %>%
mutate(pred_yield = pred_yield)
## year bea_region pred_yield
## 1 2050 New England 222.6950
## 2 2050 Mideast 212.8918
## 3 2050 Great Lakes 220.8467
## 4 2050 Plains 211.3505
## 5 2050 Southeast 197.5393
## 6 2050 Southwest 207.8766
## 7 2050 Rocky Mountain 212.9105
## 8 2050 Far West 227.9843
A custom function cut_by_quantile()
that converts a numeric vector into a categorical variable where quantiles define the cut points:
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels = NULL,
interval_type = c("(lo, hi]", "[lo, hi)")) {
interval_type <- match.arg(interval_type)
probs <- seq(0, 1, length.out = n + 1)
qtiles <- quantile(x, probs, na.rm = na.rm, names = FALSE)
right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}
# Remove the interval_type argument from the call
cut_by_quantile(snake_river_visits$n_visits)
## [1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [9] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [17] [0,1] [0,1] [0,1] (10,35] (35,350] (10,35] [0,1] (2,10]
## [25] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (35,350]
## [33] (35,350] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350]
## [41] (35,350] (35,350] (2,10] (35,350] (35,350] (35,350] (2,10] (2,10]
## [49] (1,2] (10,35] (10,35] (10,35] (35,350] (35,350] (10,35] (10,35]
## [57] (35,350] (2,10] (35,350] (10,35] (2,10] (10,35] (35,350] (35,350]
## [65] (35,350] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] (35,350]
## [73] (10,35] (10,35] (2,10] (10,35] (10,35] (10,35] (2,10] (35,350]
## [81] (35,350] (35,350] (10,35] (10,35] (10,35] [0,1] (35,350] (2,10]
## [89] (10,35] (35,350] (10,35] (10,35] (35,350] (10,35] [0,1] [0,1]
## [97] (2,10] [0,1] (2,10] (2,10] (35,350] (2,10] (2,10] (2,10]
## [105] (10,35] (1,2] (10,35] (35,350] (2,10] (35,350] (2,10] (35,350]
## [113] (1,2] (35,350] (1,2] (35,350] (10,35] [0,1] [0,1] [0,1]
## [121] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [129] (2,10] [0,1] [0,1] (1,2] (1,2] [0,1] [0,1] (1,2]
## [137] [0,1] [0,1] [0,1] (1,2] (1,2] (2,10] (2,10] (2,10]
## [145] (2,10] (2,10] [0,1] [0,1] (1,2] [0,1] (1,2] (2,10]
## [153] [0,1] [0,1] [0,1] [0,1] (35,350] (10,35] (35,350] (10,35]
## [161] (2,10] (35,350] (2,10] [0,1] (10,35] (2,10] (10,35] (10,35]
## [169] (35,350] (10,35] (10,35] (35,350] (35,350] (10,35] (35,350] (10,35]
## [177] (10,35] (2,10] (35,350] (10,35] (10,35] (10,35] (35,350] (35,350]
## [185] [0,1] (10,35] (35,350] (10,35] (10,35] (2,10] (35,350] (10,35]
## [193] (35,350] (10,35] (35,350] (10,35] (35,350] (10,35] (35,350] [0,1]
## [201] (10,35] (35,350] (35,350] [0,1] [0,1] (2,10] (35,350] (2,10]
## [209] (1,2] (2,10] (10,35] (2,10] [0,1] (2,10] (35,350] (10,35]
## [217] (35,350] (10,35] (35,350] (10,35] (1,2] (10,35] (2,10] (35,350]
## [225] (2,10] (10,35] (10,35] (10,35] [0,1] (10,35] (10,35] (10,35]
## [233] (35,350] (10,35] (2,10] (2,10] [0,1] [0,1] [0,1] [0,1]
## [241] (2,10] (1,2] [0,1] [0,1] [0,1] (2,10] (10,35] [0,1]
## [249] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [257] [0,1] [0,1] [0,1] [0,1] (35,350] (2,10] [0,1] (35,350]
## [265] (2,10] (35,350] (2,10] (1,2] (35,350] (2,10] (2,10] (2,10]
## [273] (1,2] (10,35] (2,10] (10,35] (35,350] (2,10] (35,350] (35,350]
## [281] (2,10] (2,10] (35,350] (10,35] (2,10] (35,350] (10,35] (2,10]
## [289] (1,2] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1]
## [297] (1,2] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [305] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [313] [0,1] (1,2] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [321] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1]
## [329] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [337] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [345] [0,1] (2,10] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [353] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] (1,2] [0,1]
## [361] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1] [0,1]
## [369] [0,1] [0,1] (1,2] [0,1] [0,1] [0,1] (35,350] (10,35]
## [377] (35,350] (35,350] (35,350] (10,35] (35,350] (35,350] (35,350] (35,350]
## [385] (2,10] (2,10] (2,10] (10,35] (35,350] (10,35] (10,35] (10,35]
## [393] (10,35] (10,35] (10,35] (10,35] (35,350] (10,35] (35,350] (2,10]
## [401] (10,35] (1,2] (10,35] (35,350] (1,2] [0,1] (2,10] [0,1]
## [409] (2,10] (1,2]
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]