Statistics in R

Back to home

Models
– Linear models
– Non-linear models with GAM
Quantiles

Models

Linear models

Basic prediction from a linear model:

suppressPackageStartupMessages(library(dplyr))

snake_river_visits <- readRDS('data/snake_river_visits.rds')
str(snake_river_visits)

## 'data.frame':    410 obs. of  4 variables:
##  $ n_visits: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gender  : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
##  $ income  : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
##  $ travel  : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...

snake_river_explanatory <- readRDS('data/snake_river_explanatory.rds')
str(snake_river_explanatory)

## tibble [24 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ gender: Factor w/ 2 levels "male","female": 1 2 1 2 1 2 1 2 1 2 ...
##  $ income: Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 1 1 2 2 3 3 4 4 1 1 ...
##  $ travel: Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 1 1 1 1 1 1 1 1 2 2 ...
##  - attr(*, "spec")=List of 3
##   ..$ cols   :List of 3
##   .. ..$ gender:List of 3
##   .. .. ..$ levels    : NULL
##   .. .. ..$ ordered   : logi FALSE
##   .. .. ..$ include_na: logi FALSE
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_factor" "collector"
##   .. ..$ income:List of 3
##   .. .. ..$ levels    : NULL
##   .. .. ..$ ordered   : logi FALSE
##   .. .. ..$ include_na: logi FALSE
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_factor" "collector"
##   .. ..$ travel:List of 3
##   .. .. ..$ levels    : NULL
##   .. .. ..$ ordered   : logi FALSE
##   .. .. ..$ include_na: logi FALSE
##   .. .. ..- attr(*, "class")= chr [1:2] "collector_factor" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr [1:2] "collector_guess" "collector"
##   ..$ skip   : num 1
##   ..- attr(*, "class")= chr "col_spec"

model <- lm(n_visits ~ gender + income + travel, snake_river_visits)

snake_river_explanatory %>%
  mutate(predicted_n_visits = predict(model, .))%>%
  arrange(desc(predicted_n_visits))

## # A tibble: 24 x 4
##    gender income      travel     predicted_n_visits
##    <fct>  <fct>       <fct>                   <dbl>
##  1 female [$0,$25k]   [0h,0.25h]               72.4
##  2 female ($25k,$55k] [0h,0.25h]               70.4
##  3 male   [$0,$25k]   [0h,0.25h]               61.5
##  4 male   ($25k,$55k] [0h,0.25h]               59.6
##  5 female ($95k,$Inf) [0h,0.25h]               53.8
##  6 female ($55k,$95k] [0h,0.25h]               53.1
##  7 female [$0,$25k]   (0.25h,4h]               45.8
##  8 female ($25k,$55k] (0.25h,4h]               43.8
##  9 male   ($95k,$Inf) [0h,0.25h]               42.9
## 10 male   ($55k,$95k] [0h,0.25h]               42.2
## # … with 14 more rows

(https://learn.datacamp.com/courses/introduction-to-writing-functions-in-r)

Non-linear models with GAM

When you draw a smooth trend line in plot using ggplot2::geom_smooth(), you’re, in fact, using a generalized additive model (GAM). This sort of model is ideal for fitting nonlinear curves. You can use mgcv::gam() to run the model. Thesyntax for running this GAM takes the following form:

gam(response ~ s(explanatory_var1) + explanatory_var2, data = dataset)

Here, s() means “make the variable smooth”, where smooth very roughly means nonlinear.

library(ggplot2)

df <- readRDS('data/nass.corn.rds')
states_data <- readRDS('data/states.rds')

# get BEA region of each state
df <- df %>%
  inner_join(states_data, by = c('state' = 'name'))

ggplot(df, aes(year, yield_bushels_per_acre)) +
  geom_line(aes(group = state)) +
  geom_smooth() +
  facet_wrap(vars(bea_region))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

We can see in the plots that the trend lines fit well. So we can use GAM model and make predictions.

suppressPackageStartupMessages(library(mgcv))

bea_regions <- readRDS('data/BEA_regions.rds')$region_name

model <- gam(yield_bushels_per_acre ~ s(year) + bea_region, data = df)

predict_this <- data.frame(
  year = 2050,
  bea_region = bea_regions
)

# Predict the yield
pred_yield <- predict(model, predict_this, type = "response")

predict_this %>%
  mutate(pred_yield = pred_yield)

##   year     bea_region pred_yield
## 1 2050    New England   222.6950
## 2 2050        Mideast   212.8918
## 3 2050    Great Lakes   220.8467
## 4 2050         Plains   211.3505
## 5 2050      Southeast   197.5393
## 6 2050      Southwest   207.8766
## 7 2050 Rocky Mountain   212.9105
## 8 2050       Far West   227.9843

(https://learn.datacamp.com/courses/introduction-to-writing-functions-in-r)

Quantiles

A custom function cut_by_quantile() that converts a numeric vector into a categorical variable where quantiles define the cut points:

cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels = NULL, 
                            interval_type = c("(lo, hi]", "[lo, hi)")) {
  interval_type <- match.arg(interval_type)
  probs <- seq(0, 1, length.out = n + 1)
  qtiles <- quantile(x, probs, na.rm = na.rm, names = FALSE)
  right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
  cut(x, qtiles, labels = labels, right = right, include.lowest = TRUE)
}

# Remove the interval_type argument from the call
cut_by_quantile(snake_river_visits$n_visits)

##   [1] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
##   [9] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
##  [17] [0,1]    [0,1]    [0,1]    (10,35]  (35,350] (10,35]  [0,1]    (2,10]  
##  [25] (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    (35,350]
##  [33] (35,350] (35,350] (35,350] (35,350] (10,35]  (35,350] (35,350] (35,350]
##  [41] (35,350] (35,350] (2,10]   (35,350] (35,350] (35,350] (2,10]   (2,10]  
##  [49] (1,2]    (10,35]  (10,35]  (10,35]  (35,350] (35,350] (10,35]  (10,35] 
##  [57] (35,350] (2,10]   (35,350] (10,35]  (2,10]   (10,35]  (35,350] (35,350]
##  [65] (35,350] (35,350] (10,35]  (10,35]  (35,350] (10,35]  [0,1]    (35,350]
##  [73] (10,35]  (10,35]  (2,10]   (10,35]  (10,35]  (10,35]  (2,10]   (35,350]
##  [81] (35,350] (35,350] (10,35]  (10,35]  (10,35]  [0,1]    (35,350] (2,10]  
##  [89] (10,35]  (35,350] (10,35]  (10,35]  (35,350] (10,35]  [0,1]    [0,1]   
##  [97] (2,10]   [0,1]    (2,10]   (2,10]   (35,350] (2,10]   (2,10]   (2,10]  
## [105] (10,35]  (1,2]    (10,35]  (35,350] (2,10]   (35,350] (2,10]   (35,350]
## [113] (1,2]    (35,350] (1,2]    (35,350] (10,35]  [0,1]    [0,1]    [0,1]   
## [121] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [129] (2,10]   [0,1]    [0,1]    (1,2]    (1,2]    [0,1]    [0,1]    (1,2]   
## [137] [0,1]    [0,1]    [0,1]    (1,2]    (1,2]    (2,10]   (2,10]   (2,10]  
## [145] (2,10]   (2,10]   [0,1]    [0,1]    (1,2]    [0,1]    (1,2]    (2,10]  
## [153] [0,1]    [0,1]    [0,1]    [0,1]    (35,350] (10,35]  (35,350] (10,35] 
## [161] (2,10]   (35,350] (2,10]   [0,1]    (10,35]  (2,10]   (10,35]  (10,35] 
## [169] (35,350] (10,35]  (10,35]  (35,350] (35,350] (10,35]  (35,350] (10,35] 
## [177] (10,35]  (2,10]   (35,350] (10,35]  (10,35]  (10,35]  (35,350] (35,350]
## [185] [0,1]    (10,35]  (35,350] (10,35]  (10,35]  (2,10]   (35,350] (10,35] 
## [193] (35,350] (10,35]  (35,350] (10,35]  (35,350] (10,35]  (35,350] [0,1]   
## [201] (10,35]  (35,350] (35,350] [0,1]    [0,1]    (2,10]   (35,350] (2,10]  
## [209] (1,2]    (2,10]   (10,35]  (2,10]   [0,1]    (2,10]   (35,350] (10,35] 
## [217] (35,350] (10,35]  (35,350] (10,35]  (1,2]    (10,35]  (2,10]   (35,350]
## [225] (2,10]   (10,35]  (10,35]  (10,35]  [0,1]    (10,35]  (10,35]  (10,35] 
## [233] (35,350] (10,35]  (2,10]   (2,10]   [0,1]    [0,1]    [0,1]    [0,1]   
## [241] (2,10]   (1,2]    [0,1]    [0,1]    [0,1]    (2,10]   (10,35]  [0,1]   
## [249] [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [257] [0,1]    [0,1]    [0,1]    [0,1]    (35,350] (2,10]   [0,1]    (35,350]
## [265] (2,10]   (35,350] (2,10]   (1,2]    (35,350] (2,10]   (2,10]   (2,10]  
## [273] (1,2]    (10,35]  (2,10]   (10,35]  (35,350] (2,10]   (35,350] (35,350]
## [281] (2,10]   (2,10]   (35,350] (10,35]  (2,10]   (35,350] (10,35]  (2,10]  
## [289] (1,2]    [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [297] (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [305] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [313] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [321] [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]    [0,1]    [0,1]   
## [329] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [337] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [345] [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [353] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [361] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [369] [0,1]    [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    (35,350] (10,35] 
## [377] (35,350] (35,350] (35,350] (10,35]  (35,350] (35,350] (35,350] (35,350]
## [385] (2,10]   (2,10]   (2,10]   (10,35]  (35,350] (10,35]  (10,35]  (10,35] 
## [393] (10,35]  (10,35]  (10,35]  (10,35]  (35,350] (10,35]  (35,350] (2,10]  
## [401] (10,35]  (1,2]    (10,35]  (35,350] (1,2]    [0,1]    (2,10]   [0,1]   
## [409] (2,10]   (1,2]   
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]

Statistics in R

My personal knowledge base

Back to home

Contents

Models

Linear models

(https://learn.datacamp.com/courses/introduction-to-writing-functions-in-r)

Non-linear models with GAM

(https://learn.datacamp.com/courses/introduction-to-writing-functions-in-r)

Quantiles