My personal notes from DataCamp’s course
Read functions like read.csv() and read.delim() can accept URLs in place of local paths. You can also use download.file() to save a local copy first.
csv_url <- paste0("http://s3.amazonaws.com/assets.datacamp.com/production",
                  "/course_1561/datasets/chickwts.csv")
# Download the file with download.file()
download.file(url = csv_url, destfile = 'data/feed_data.csv')
# Read it in with read.csv()
csv_data <- read.csv('data/feed_data.csv')
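Since these read functions accept URLs directly, the local copy is optional; a minimal sketch reading the same CSV straight from csv_url:
# Read the CSV directly from the URL, without writing a local file
csv_data_direct <- read.csv(csv_url)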
R has many packages that wrap web APIs; googling "CRAN <name of the API>" is a good way to find them.
Example with pageviews, a client for Wikipedia's pageviews API:
# Load pageviews
library(pageviews)
# Get the pageviews for "Hadley Wickham"
hadley_pageviews <- article_pageviews(project = "en.wikipedia",
                                      article = "Hadley Wickham")
# Examine the resulting object
str(hadley_pageviews)
## 'data.frame': 1 obs. of 8 variables:
## $ project : chr "wikipedia"
## $ language : chr "en"
## $ article : chr "Hadley_Wickham"
## $ access : chr "all-access"
## $ agent : chr "all-agents"
## $ granularity: chr "daily"
## $ date : POSIXct, format: "2015-10-01"
## $ views : num 53
The httr package lets you interact with APIs directly.
library(httr)
url <- paste0("https://wikimedia.org/api/rest_v1/metrics/pageviews/",
              "per-article/en.wikipedia.org/all-access/all-agents/",
              "Hadley_Wickham/daily/20170101/20170102")
# Make a GET request to url and save the results
pageview_response <- GET(url)
# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)
# Examine the results with str()
str(pageview_data)
## List of 1
## $ items:List of 2
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010100"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 45
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010200"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 86
The function httr::http_error() helps you check response status codes:
fake_url <- "http://google.com/fakepagethatdoesnotexist"
# Make the GET request
request_result <- GET(fake_url)
# Check request_result
if(http_error(request_result)){
  warning("The request failed")
} else {
  content(request_result)
}
## Warning: The request failed
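If you want the numeric code itself rather than a TRUE/FALSE, httr's status_code() returns it (a small aside beyond the course exercise):
# Inspect the raw HTTP status code of the response
status_code(request_result)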
Use the query argument of GET() for parameter-based APIs:
# Create list with nationality and country elements
query_params <- list(nationality = 'americans',
                     country = 'antigua')
# Make parameter-based call to httpbin, with query_params
parameter_response <- GET('https://httpbin.org/get', query = query_params)
# Print parameter_response
parameter_response
## Response [https://httpbin.org/get?nationality=americans&country=antigua]
## Date: 2020-06-14 23:22
## Status: 200
## Content-Type: application/json
## Size: 470 B
## {
## "args": {
## "country": "antigua",
## "nationality": "americans"
## },
## "headers": {
## "Accept": "application/json, text/xml, application/xml, */*",
## "Accept-Encoding": "deflate, gzip, br",
## "Host": "httpbin.org",
## "User-Agent": "libcurl/7.68.0 r-curl/4.3 httr/1.4.1",
## ...
Typical pattern for consuming a rate-limited API:
# Construct a vector of 2 URLs
urls <- c('http://httpbin.org/status/404', 'http://httpbin.org/status/301')
for(url in urls){
  # Send a GET request to url
  result <- GET(url)
  # Delay for 5 seconds between requests to respect the rate limit
  Sys.sleep(5)
}
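In practice you usually also want to keep each response; a small variation of the loop above (my own sketch) that stores them in a named list:
results <- list()
for(url in urls){
  # Keep each response, keyed by its URL
  results[[url]] <- GET(url)
  # Pause between requests to respect the rate limit
  Sys.sleep(5)
}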
A function tying it all together:
get_pageviews <- function(article_title){
  url <- paste(
    paste0("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/",
           "en.wikipedia/all-access/all-agents"),
    article_title,
    "daily/2015100100/2015103100",
    sep = "/"
  )
  response <- GET(url, user_agent("my@email.com this is a test"))
  # Is there an HTTP error?
  if(http_error(response)){
    # Throw an R error
    stop("the request failed")
  }
  # Return the response's content
  content(response)
}
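Example usage (a hypothetical call, not run here since it needs internet access):
# Fetch daily pageviews for October 2015 for a given article
hadley_views <- get_pageviews("Hadley_Wickham")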
toJSON() and fromJSON() from the jsonlite package:
library(jsonlite)
# Stringify some data
jsoncars <- toJSON(mtcars[1:5,], pretty=TRUE)
jsoncars
## [
## {
## "mpg": 21,
## "cyl": 6,
## "disp": 160,
## "hp": 110,
## "drat": 3.9,
## "wt": 2.62,
## "qsec": 16.46,
## "vs": 0,
## "am": 1,
## "gear": 4,
## "carb": 4,
## "_row": "Mazda RX4"
## },
## {
## "mpg": 21,
## "cyl": 6,
## "disp": 160,
## "hp": 110,
## "drat": 3.9,
## "wt": 2.875,
## "qsec": 17.02,
## "vs": 0,
## "am": 1,
## "gear": 4,
## "carb": 4,
## "_row": "Mazda RX4 Wag"
## },
## {
## "mpg": 22.8,
## "cyl": 4,
## "disp": 108,
## "hp": 93,
## "drat": 3.85,
## "wt": 2.32,
## "qsec": 18.61,
## "vs": 1,
## "am": 1,
## "gear": 4,
## "carb": 1,
## "_row": "Datsun 710"
## },
## {
## "mpg": 21.4,
## "cyl": 6,
## "disp": 258,
## "hp": 110,
## "drat": 3.08,
## "wt": 3.215,
## "qsec": 19.44,
## "vs": 1,
## "am": 0,
## "gear": 3,
## "carb": 1,
## "_row": "Hornet 4 Drive"
## },
## {
## "mpg": 18.7,
## "cyl": 8,
## "disp": 360,
## "hp": 175,
## "drat": 3.15,
## "wt": 3.44,
## "qsec": 17.02,
## "vs": 0,
## "am": 0,
## "gear": 3,
## "carb": 2,
## "_row": "Hornet Sportabout"
## }
## ]
# Parse it back
fromJSON(jsoncars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Retrieve a data frame directly from a URL that returns JSON
data1 <- fromJSON("https://api.github.com/users/hadley/orgs")
names(data1)
## [1] "login" "id" "node_id"
## [4] "url" "repos_url" "events_url"
## [7] "hooks_url" "issues_url" "members_url"
## [10] "public_members_url" "avatar_url" "description"
data1$login
## [1] "ggobi" "rstudio" "rstats"
## [4] "ropensci" "rjournal" "r-dbi"
## [7] "RConsortium" "tidyverse" "r-lib"
## [10] "rstudio-education"
# Nested data frames:
data2 <- fromJSON("https://api.github.com/users/hadley/repos")
names(data2)
## [1] "id" "node_id" "name"
## [4] "full_name" "private" "owner"
## [7] "html_url" "description" "fork"
## [10] "url" "forks_url" "keys_url"
## [13] "collaborators_url" "teams_url" "hooks_url"
## [16] "issue_events_url" "events_url" "assignees_url"
## [19] "branches_url" "tags_url" "blobs_url"
## [22] "git_tags_url" "git_refs_url" "trees_url"
## [25] "statuses_url" "languages_url" "stargazers_url"
## [28] "contributors_url" "subscribers_url" "subscription_url"
## [31] "commits_url" "git_commits_url" "comments_url"
## [34] "issue_comment_url" "contents_url" "compare_url"
## [37] "merges_url" "archive_url" "downloads_url"
## [40] "issues_url" "pulls_url" "milestones_url"
## [43] "notifications_url" "labels_url" "releases_url"
## [46] "deployments_url" "created_at" "updated_at"
## [49] "pushed_at" "git_url" "ssh_url"
## [52] "clone_url" "svn_url" "homepage"
## [55] "size" "stargazers_count" "watchers_count"
## [58] "language" "has_issues" "has_projects"
## [61] "has_downloads" "has_wiki" "has_pages"
## [64] "forks_count" "mirror_url" "archived"
## [67] "disabled" "open_issues_count" "license"
## [70] "forks" "open_issues" "watchers"
## [73] "default_branch"
names(data2$owner)
## [1] "login" "id" "node_id"
## [4] "avatar_url" "gravatar_id" "url"
## [7] "html_url" "followers_url" "following_url"
## [10] "gists_url" "starred_url" "subscriptions_url"
## [13] "organizations_url" "repos_url" "events_url"
## [16] "received_events_url" "type" "site_admin"
data2$owner$login
## [1] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
## [9] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
## [17] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
## [25] "hadley" "hadley" "hadley" "hadley" "hadley" "hadley"
# Flatten the data into a regular non-nested dataframe
names(flatten(data2))
## [1] "id" "node_id"
## [3] "name" "full_name"
## [5] "private" "html_url"
## [7] "description" "fork"
## [9] "url" "forks_url"
## [11] "keys_url" "collaborators_url"
## [13] "teams_url" "hooks_url"
## [15] "issue_events_url" "events_url"
## [17] "assignees_url" "branches_url"
## [19] "tags_url" "blobs_url"
## [21] "git_tags_url" "git_refs_url"
## [23] "trees_url" "statuses_url"
## [25] "languages_url" "stargazers_url"
## [27] "contributors_url" "subscribers_url"
## [29] "subscription_url" "commits_url"
## [31] "git_commits_url" "comments_url"
## [33] "issue_comment_url" "contents_url"
## [35] "compare_url" "merges_url"
## [37] "archive_url" "downloads_url"
## [39] "issues_url" "pulls_url"
## [41] "milestones_url" "notifications_url"
## [43] "labels_url" "releases_url"
## [45] "deployments_url" "created_at"
## [47] "updated_at" "pushed_at"
## [49] "git_url" "ssh_url"
## [51] "clone_url" "svn_url"
## [53] "homepage" "size"
## [55] "stargazers_count" "watchers_count"
## [57] "language" "has_issues"
## [59] "has_projects" "has_downloads"
## [61] "has_wiki" "has_pages"
## [63] "forks_count" "mirror_url"
## [65] "archived" "disabled"
## [67] "open_issues_count" "forks"
## [69] "open_issues" "watchers"
## [71] "default_branch" "owner.login"
## [73] "owner.id" "owner.node_id"
## [75] "owner.avatar_url" "owner.gravatar_id"
## [77] "owner.url" "owner.html_url"
## [79] "owner.followers_url" "owner.following_url"
## [81] "owner.gists_url" "owner.starred_url"
## [83] "owner.subscriptions_url" "owner.organizations_url"
## [85] "owner.repos_url" "owner.events_url"
## [87] "owner.received_events_url" "owner.type"
## [89] "owner.site_admin" "license.key"
## [91] "license.name" "license.spdx_id"
## [93] "license.url" "license.node_id"
# Flatten directly (more efficient):
data3 <- fromJSON("https://api.github.com/users/hadley/repos", flatten = TRUE)
identical(data3, flatten(data2))
## [1] TRUE
dplyr::bind_rows() is a handy helper for turning lists parsed from JSON into data frames:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
url <- paste0("https://wikimedia.org/api/rest_v1/metrics/pageviews/",
              "per-article/en.wikipedia.org/all-access/all-agents/",
              "Hadley_Wickham/daily/20170101/20170102")
# Make a GET request to url and save the results
pageview_response <- GET(url)
# Call content() to retrieve the data the server sent back
pageview_data <- content(pageview_response)
# Examine the results with str()
str(pageview_data)
## List of 1
## $ items:List of 2
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010100"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 45
## ..$ :List of 7
## .. ..$ project : chr "en.wikipedia"
## .. ..$ article : chr "Hadley_Wickham"
## .. ..$ granularity: chr "daily"
## .. ..$ timestamp : chr "2017010200"
## .. ..$ access : chr "all-access"
## .. ..$ agent : chr "all-agents"
## .. ..$ views : int 86
pageview_data[["items"]] %>% bind_rows()
## # A tibble: 2 x 7
## project article granularity timestamp access agent views
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 en.wikipedia Hadley_Wickham daily 2017010100 all-access all-agents 45
## 2 en.wikipedia Hadley_Wickham daily 2017010200 all-access all-agents 86
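The timestamp column comes back as a character; one way to parse it into proper dates (my own addition, plain base R):
pageviews_df <- bind_rows(pageview_data[["items"]])
# Keep the yyyymmdd part of "2017010100" and parse it as a Date
pageviews_df$date <- as.Date(substr(pageviews_df$timestamp, 1, 8), format = "%Y%m%d")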
Just as jsonlite handles JSON, the xml2 package handles XML:
library(xml2)
cd <- read_xml(xml2_example("cd_catalog.xml"))
class(cd)
## [1] "xml_document" "xml_node"
xml_structure(xml_child(cd, 1))
## <CD>
## <TITLE>
## {text}
## <ARTIST>
## {text}
## <COUNTRY>
## {text}
## <COMPANY>
## {text}
## <PRICE>
## {text}
## <YEAR>
## {text}
# working with xpaths
xml_find_all(cd, xpath = '/CATALOG/CD/ARTIST')
## {xml_nodeset (26)}
## [1] <ARTIST>Bob Dylan</ARTIST>
## [2] <ARTIST>Bonnie Tylor</ARTIST>
## [3] <ARTIST>Dolly Parton</ARTIST>
## [4] <ARTIST>Gary More</ARTIST>
## [5] <ARTIST>Eros Ramazzotti</ARTIST>
## [6] <ARTIST>Bee Gees</ARTIST>
## [7] <ARTIST>Dr.Hook</ARTIST>
## [8] <ARTIST>Rod Stewart</ARTIST>
## [9] <ARTIST>Andrea Bocelli</ARTIST>
## [10] <ARTIST>Percy Sledge</ARTIST>
## [11] <ARTIST>Savage Rose</ARTIST>
## [12] <ARTIST>Many</ARTIST>
## [13] <ARTIST>Kenny Rogers</ARTIST>
## [14] <ARTIST>Will Smith</ARTIST>
## [15] <ARTIST>Van Morrison</ARTIST>
## [16] <ARTIST>Jorn Hoel</ARTIST>
## [17] <ARTIST>Cat Stevens</ARTIST>
## [18] <ARTIST>Sam Brown</ARTIST>
## [19] <ARTIST>T`Pau</ARTIST>
## [20] <ARTIST>Tina Turner</ARTIST>
## ...
# create data frame
cds <- xml_find_all(cd, xpath = '/CATALOG/CD')
df <- data.frame(
  title   = xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/TITLE')),
  artist  = xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/ARTIST')),
  country = xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/COUNTRY')),
  company = xml_text(xml_find_all(cd, xpath = '/CATALOG/CD/COMPANY')),
  price   = xml_double(xml_find_all(cd, xpath = '/CATALOG/CD/PRICE')),
  year    = xml_integer(xml_find_all(cd, xpath = '/CATALOG/CD/YEAR'))
)
df
## title artist country company price year
## 1 Empire Burlesque Bob Dylan USA Columbia 10.9 1985
## 2 Hide your heart Bonnie Tylor UK CBS Records 9.9 1988
## 3 Greatest Hits Dolly Parton USA RCA 9.9 1982
## 4 Still got the blues Gary More UK Virgin redords 10.2 1990
## 5 Eros Eros Ramazzotti EU BMG 9.9 1997
## 6 One night only Bee Gees UK Polydor 10.9 1998
## 7 Sylvias Mother Dr.Hook UK CBS 8.1 1973
## 8 Maggie May Rod Stewart UK Pickwick 8.5 1990
## 9 Romanza Andrea Bocelli EU Polydor 10.8 1996
## 10 When a man loves a woman Percy Sledge USA Atlantic 8.7 1987
## 11 Black angel Savage Rose EU Mega 10.9 1995
## 12 1999 Grammy Nominees Many USA Grammy 10.2 1999
## 13 For the good times Kenny Rogers UK Mucik Master 8.7 1995
## 14 Big Willie style Will Smith USA Columbia 9.9 1997
## 15 Tupelo Honey Van Morrison UK Polydor 8.2 1971
## 16 Soulsville Jorn Hoel Norway WEA 7.9 1996
## 17 The very best of Cat Stevens UK Island 8.9 1990
## 18 Stop Sam Brown UK A and M 8.9 1988
## 19 Bridge of Spies T`Pau UK Siren 7.9 1987
## 20 Private Dancer Tina Turner UK Capitol 8.9 1983
## 21 Midt om natten Kim Larsen EU Medley 7.8 1983
## 22 Pavarotti Gala Concert Luciano Pavarotti UK DECCA 9.9 1991
## 23 The dock of the bay Otis Redding USA Atlantic 7.9 1987
## 24 Picture book Simply Red EU Elektra 7.2 1985
## 25 Red The Communards UK London 7.8 1987
## 26 Unchain my heart Joe Cocker USA EMI 8.2 1987
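To pull a single value instead of a node set, xml_find_first() combined with xml_text() also works (a small aside):
# Title of the first CD in the catalog
xml_text(xml_find_first(cd, xpath = '/CATALOG/CD/TITLE'))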
Use the rvest package to extract data from HTML web pages.
library(rvest)
# Hadley Wickham's Wikipedia page
test_url <- "https://en.wikipedia.org/wiki/Hadley_Wickham"
# Read the URL stored as "test_url" with read_html()
test_xml <- read_html(test_url)
test_xml
## {html_document}
## <html class="client-nojs" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject ...
# XPath that matches elements whose class contains "vcard"
xpath <- paste0("//*[contains(concat( \" \", @class, \" \" ),",
                " concat( \" \", \"vcard\", \" \" ))]")
# Use html_node() to grab the node with the XPATH
node <- html_node(x = test_xml, xpath = xpath)
node
## {html_node}
## <table class="infobox biography vcard" style="width:22em">
## [1] <tbody>\n<tr><th colspan="2" style="text-align:center;font-size:125%;font ...
# Look directly for the class "fn" with the css argument
page_name <- html_node(x = node, css = '.fn')
page_name
## {html_node}
## <div class="fn" style="display:inline">
# Extract the text from page_name
page_title <- html_text(page_name)
page_title
## [1] "Hadley Wickham"
Use rvest::html_table() to convert tables in web pages into data frames:
wiki_table <- html_table(node)
colnames(wiki_table) <- c("key", "value")
cleaned_table <- subset(wiki_table, !key == '')
str(cleaned_table)
## 'data.frame': 8 obs. of 2 variables:
## $ key : chr "Born" "Alma mater" "Known for" "Awards" ...
## $ value: chr "(1979-10-14) 14 October 1979 (age 40)Hamilton, New Zealand" "Iowa State University, University of Auckland" "R programming language packages" "John Chambers Award (2006)\nFellow of the American Statistical Association (2015)" ...
More examples with the css argument:
# Select the table elements
html_nodes(test_xml, css = 'table')
## {xml_nodeset (2)}
## [1] <table class="infobox biography vcard" style="width:22em"><tbody>\n<tr><t ...
## [2] <table class="nowraplinks hlist navbox-inner" style="border-spacing:0;bac ...
# Select elements with class = "infobox"
html_nodes(test_xml, css = '.infobox')
## {xml_nodeset (1)}
## [1] <table class="infobox biography vcard" style="width:22em"><tbody>\n<tr><t ...
# Select elements with id = "firstHeading"
html_nodes(test_xml, css = '#firstHeading')
## {xml_nodeset (1)}
## [1] <h1 id="firstHeading" class="firstHeading" lang="en">Hadley Wickham</h1>
Wrapping everything up in a usable function that extracts the infobox from Wikipedia pages:
library(httr)
library(rvest)
library(xml2)
get_infobox <- function(title){
  base_url <- "https://en.wikipedia.org/w/api.php"
  # Ask the MediaWiki parse API for the requested page, as XML
  query_params <- list(action = "parse",
                       page = title,
                       format = "xml")
  resp <- GET(url = base_url, query = query_params)
  resp_xml <- content(resp)
  # The parsed page HTML is stored as text inside the XML response
  page_html <- read_html(xml_text(resp_xml))
  # Grab the infobox node and the page's full name (class "fn")
  infobox_element <- html_node(x = page_html, css = ".infobox")
  page_name <- html_node(x = infobox_element, css = ".fn")
  page_title <- html_text(page_name)
  # Convert the infobox to a data frame and drop rows with empty keys
  wiki_table <- html_table(infobox_element)
  colnames(wiki_table) <- c("key", "value")
  cleaned_table <- subset(wiki_table, key != "")
  # Prepend the full name as its own row
  name_df <- data.frame(key = "Full name", value = page_title)
  wiki_table <- rbind(name_df, cleaned_table)
  wiki_table
}
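Example usage (not run here; the output depends on the live Wikipedia page):
# Fetch and clean the infobox for a given page title
get_infobox(title = "Hadley Wickham")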