My personal notes from DataCamp’s courses:
It’s unfair to compare R velocity with C velocity. Though C running code time is faster, R writing code is simpler and faster.
Simply keeping R up to date, is the first step to optimization.
# Print the R version details using version
version
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## system x86_64, mingw32
## status
## major 3
## minor 6.3
## year 2020
## month 02
## day 29
## svn rev 77875
## language R
## version.string R version 3.6.3 (2020-02-29)
## nickname Holding the Windsock
Read datasets files is a common task that we want to optimize. R provides a native format .rds
that, if used without compression, is suitable for fast reading and writing.
# How long does it take to read movies from CSV?
system.time(read.csv('data/movies.csv'))
## user system elapsed
## 0.49 0.00 0.48
# How long does it take to read movies from RDS?
system.time(readRDS('data/movies.rds'))
## user system elapsed
## 0.05 0.00 0.04
Using microbenckmark package:
# Load the microbenchmark package
library(microbenchmark)
# Compare the two functions
compare <- microbenchmark(read.csv('data/movies.csv'),
readRDS('data/movies.rds'),
times = 10)
# Print compare
compare
## Unit: milliseconds
## expr min lq mean median uq
## read.csv("data/movies.csv") 454.5975 458.2171 468.16173 462.89610 467.3220
## readRDS("data/movies.rds") 55.8068 56.4693 67.94085 59.28785 68.7131
## max neval
## 508.0392 10
## 101.7521 10
Getting data and benchmarking my machine performance:
# Load the benchmarkme package
library(benchmarkme)
# Assign the variable ram to the amount of RAM on this machine
ram <- get_ram()
ram
## NA B
# Assign the variable cpu to the cpu specs
cpu <- get_cpu()
cpu
## $vendor_id
## [1] "GenuineIntel"
##
## $model_name
## [1] "Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz"
##
## $no_of_cores
## [1] 4
# Run the io benchmark
res <- benchmark_io(runs = 1, size = 5)
## Preparing read/write io
## # IO benchmarks (2 tests) for size 5 MB:
## Writing a csv with 625000 values: 1.87 (sec).
## Reading a csv with 625000 values: 0.53 (sec).
# Plot the results
plot(res)
## You are ranked 102 out of 135 machines.
## Press return to get next plot
## You are ranked 125 out of 135 machines.
# Contribute to community
# upload_results(res)
Avoid growing a vector. Prefer to define it previously with correct size and type. Each time you increase the size of a vector, R requests the OS for more memory, which causes the code run slowly.
n <- 30000
# Slow code
growing <- function(n) {
x <- NULL
for(i in 1:n)
x <- c(x, rnorm(1))
x
}
# Fast code
pre_allocate <- function(n) {
x <- numeric(n) # Pre-allocate
for(i in 1:n)
x[i] <- rnorm(1)
x
}
# Use <- with system.time() to store the results of performance measures
system.time(res_grow <- growing(n))
## user system elapsed
## 1.61 0.04 1.64
system.time(res_allocate <- pre_allocate(n))
## user system elapsed
## 0.06 0.00 0.06
Calling an native R function eventually leads to C or FORTRAN code, which are heavily optimized.
Use vectorized solution whenever possible:
n <- 1e6
x <- vector("numeric", n)
microbenchmark(
x <- rnorm(n), # vectorized solution
{
for(i in seq_along(x)) # not vectorized
x[i] <- rnorm(1)
},
times = 10)
## Unit: milliseconds
## expr min lq
## x <- rnorm(n) 74.3023 75.5989
## { for (i in seq_along(x)) x[i] <- rnorm(1) } 1885.9603 1968.0757
## mean median uq max neval
## 84.91396 77.93325 83.0169 120.2617 10
## 2101.44342 1979.77340 2111.3313 2596.8162 10
Work with matrices whenever possible to optimize your code. Matrices provide better performance, mainly when extracting rows.
mat <- matrix(rnorm(100000), ncol = 1000)
df <- as.data.frame(mat)
# extracting columns
microbenchmark(mat[,1], df[,1])
## Unit: nanoseconds
## expr min lq mean median uq max neval
## mat[, 1] 700 1100 1706 1400 2000 8100 100
## df[, 1] 6700 7650 12693 13800 15500 52900 100
# extracting rows
microbenchmark(mat[1,], df[1,])
## Unit: microseconds
## expr min lq mean median uq max neval
## mat[1, ] 5.1 6.35 21.166 17.40 32.00 57.1 100
## df[1, ] 5092.8 5571.40 6336.108 6093.75 6706.55 10513.8 100
Use Profvis package to profile your code and find bottlenecks.
library(profvis)
profvis({
data(movies, package = "ggplot2movies") # Load data
braveheart <- movies[7288,]
movies <- movies[movies$Action == 1,]
plot(movies$year, movies$rating, xlab = "Year", ylab="Rating")
model <- loess(rating ~ year, data = movies) # loess regression line
j <- order(movies$year)
lines(movies$year[j], model$fitted[j], col="forestgreen", lwd=2)
points(braveheart$year, braveheart$rating, pch = 21, bg = "steelblue",
cex = 3)
})
Parallel package allow processing in multiple cores. Not every routine can benefit, or even be executed, in parallel. Only loops with no dependency among iterations. Besides that, if you have few iterations, multi thread communication may not compensate for using multiple cores.
library(parallel)
play <- function() {
total <- no_of_rolls <- 0
while(total < 10) {
total <- total + sample(1:6, 1)
# If even. Reset to 0
if(total %% 2 == 0) total <- 0
no_of_rolls <- no_of_rolls + 1
}
no_of_rolls
}
# Set the number of games to play
no_of_games <- 1e5
## Time serial version
system.time(serial <- sapply(1:no_of_games, function(i) play()))
## user system elapsed
## 8.53 0.00 8.56
## Set up cluster
cl <- makeCluster(2) # 2 cores
clusterExport(cl, "play")
## Time parallel version
system.time(par <- parSapply(cl, 1:no_of_games, function(i) play()))
## user system elapsed
## 0.04 0.02 5.41
## Stop cluster
stopCluster(cl)
An excelent example using:
foreach
loop structuredoParallel
packagesuppressPackageStartupMessages(library(dplyr))
library(stringr)
library(janeaustenr)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
library(foreach)
library(microbenchmark)
janeausten_words <- function() {
# Names of the six books contained in janeaustenr
books <- austen_books()$book %>% unique %>% as.character
# Vector of words from all six books
words <- sapply(books, extract_words) %>% unlist
words
}
extract_words <- function(book_name) {
# extract the text of the book
text <- subset(austen_books(), book == book_name)$text
# extract words from the text and convert to lowercase
str_extract_all(text, boundary("word")) %>% unlist %>% tolower
}
max_frequency <- function(letter, words, min_length = 1) {
w <- select_words(letter, words = words, min_length = min_length)
frequency <- table(w)
frequency[which.max(frequency)]
}
select_words <- function(letter, words, min_length = 1) {
min_length_words <- words[nchar(words) >= min_length]
grep(paste0("^", letter), min_length_words, value = TRUE)
}
# Vector of words from all six books
words <- janeausten_words()
# function that runs in parallel
freq_doPar <- function(cores, min_length = 5) {
# Register a cluster of size cores
registerDoParallel(cores = cores)
# foreach loop
foreach(let = letters, .combine = c,
.export = c('max_frequency', "select_words", "words"),
.packages = c('janeaustenr', 'stringr')) %dopar%
max_frequency(let, words = words, min_length = min_length)
}
# function that runs sequentially
freq_seq <- function(min_length = 5) {
foreach(let = letters, .combine = c) %do%
max_frequency(let, words = words, min_length = min_length)
}
microbenchmark(
{ res_seq <- freq_seq() },
{ res_par <- freq_doPar(2) },
times = 1,
unit = 's')
## Unit: seconds
## expr min lq mean median uq
## { res_seq <- freq_seq() } 7.485440 7.485440 7.485440 7.485440 7.485440
## { res_par <- freq_doPar(2) } 7.559246 7.559246 7.559246 7.559246 7.559246
## max neval
## 7.485440 1
## 7.559246 1
all.equal(res_par, res_seq)
## [1] TRUE
You must use clusterSetRNGStream
in order to set seeds in parallel clusters to reproduce random numbers.
The code bellow shows how to activate RNG stream in parallel programming:
# Create a cluster
cl <- makeCluster(2)
# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "Mersenne-Twister" "Inversion" "Rejection"
##
## [[2]]
## [1] "Mersenne-Twister" "Inversion" "Rejection"
# Set the RNG seed on workers
clusterSetRNGStream(cl, iseed = 100)
# Check RNGkind on workers
clusterCall(cl, RNGkind)
## [[1]]
## [1] "L'Ecuyer-CMRG" "Inversion" "Rejection"
##
## [[2]]
## [1] "L'Ecuyer-CMRG" "Inversion" "Rejection"
stopCluster(cl)
Using RNG stream in parallel:
mean_of_rnorm <- function(n) {
random_numbers <- rnorm(n)
mean(random_numbers)
}
n_vec <- rep(1000, 5)
cl <- makeCluster(2)
replicate(
n = 3,
expr = {
# Set the cluster's RNG stream seed to 1234
clusterSetRNGStream(cl, iseed = 1234)
clusterApply(cl, n_vec, mean_of_rnorm)
}
)
## [,1] [,2] [,3]
## [1,] -0.008597904 -0.008597904 -0.008597904
## [2,] -0.006089337 -0.006089337 -0.006089337
## [3,] -0.01398052 -0.01398052 -0.01398052
## [4,] -0.06629339 -0.06629339 -0.06629339
## [5,] 0.004297755 0.004297755 0.004297755
stopCluster(cl)