rm(list=ls())
library(LalRUtils)
libreq(data.table, magrittr, tidyverse, microbenchmark,
nycflights13, tidyfast, dtplyr)
input <- if (file.exists("flights14.csv")) {
"flights14.csv"
} else {
"https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
}
flights <- fread(input)
flights %>% head
fwrite(flights, "flights14.csv")
DT = data.table(
ID = c("b","b","b","a","a","c"),
a = 1:6,
b = 7:12,
c = 13:18
)
DT
flights[origin == "JFK" & month == 6L] %>% head
Slice and return vector
flights[, arr_delay] %>% head
Slice and return dt
flights[, .(arr_delay)] %>% head
flights[order(origin, -dest)] %>% head
setorder(flights, origin, -dest)
flights %>% head
# example data
carsDT = data.table(mtcars, keep.rownames = TRUE)
# quick inspection
carsDT %>% head
carsDT[, .(mean_hp = mean(hp), mean_wt = mean(wt)), by=.(am, vs)]
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = c("hp", "wt", "disp")]
:=
syntax¶# example data
DT = data.table(
x = letters[c(1, 2, 3, 4, 5)],
y = c(1, 2, 3, 4, 5),
z = c(1, 2, 3, 4, 5) > 3
)
# create one
DT[, u := 5:1]
# creating multiple
DT[, `:=`(v = 2, w = 3L)]
# creating with dynamic names
nms = c("a", "b", "c")
DT[, (nms) := .(1, 2, 3)]
nms = c("u", "v", "w", "a", "b", "c")
DT[, (nms) := NULL]
DT
DT[, b := "Aardvark"][y > 1, b := "Zebra", verbose = TRUE]
cols = c("hp", "wt", "disp")
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = cols] %>%
setnames(cols, sprintf("mean_%s", cols)) %>% print
keep_cols = c("arr_delay", "dep_delay")
flights[, ..keep_cols] %>% head
drop_cols = c("arr_delay", "dep_delay")
flights[, -..drop_cols] %>% head
flights[, .(.N), by = .(origin)]
flights[carrier == "AA", .N, by = origin]
flights2 = copy(flights)
setkey(flights2, carrier, origin, dest)
key(flights2)
collapse_dplyr = function() {
flights %>% as_tibble() %>%
group_by(carrier, origin, dest) %>%
summarize(mean_arr_delay = mean(arr_delay),
pressure = mean(dep_delay))
}
collapse_dt = function() {
flights[,
.(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
by = .(carrier, origin, dest)]
}
collapse_dt_key = function() {
flights2[,
.(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
by = .(carrier, origin, dest)]
}
microbenchmark(collapse_dplyr(), collapse_dt(), collapse_dt_key(), times = 10) %>%
print
keybied = flights[carrier == "AA",
.(mean(arr_delay), mean(dep_delay)),
keyby = .(origin, dest, month)]
key(keybied)
flights[, .N, .(dep_delay>0, arr_delay>0)]
.SD
notation¶.SD
stands for Subset of Data
DT
flights[carrier == "AA", ## Only on trips with carrier "AA"
lapply(.SD, mean), ## compute the mean
by = .(origin, dest, month), ## for every 'origin,dest,month'
.SDcols = c("arr_delay", "dep_delay")] %>% ## for just those specified in .SDcols
head
emp = 'Employee EmployeeName Department Salary
1 Alice 11 800
2 Bob 11 600
3 Carla 12 900
4 Daniel 12 1000
5 Evelyn 13 800
6 Ferdinand 21 700'
dept = '
table Departments
Department DepartmentName Manager
11 Production 1
12 Sales 4
13 Marketing 5
14 Research NA'
X = fread(emp)
Y = fread(dept)
X
Y
merge(X, Y, all=F) # inner
merge(X, Y, all.x=T) # right
merge(X, Y, all.y=T) # left
merge(X, Y, all=T) # cartesian
# example data
a = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_),
t = c(1L, 2L, 1L, 2L, NA_integer_), x = 11:15)
b = data.table(id = 1:2, y = c(11L, 15L))
a
b
a[b, on=.(id)]
Different keys
a[b, on=.(id, x = y)]
a[b, on=.(id), y := i.y ]
a
mDT = data.table(id = 1:3, x_dn = 10L, x_up = 13L)
mDT
a[mDT, on=.(id, x >= x_dn, x <= x_up), .(id, i.x_dn, i.x_up, x.x)]
s = 'id y a b
A X 1 3
A Z 2 4
B X 1 3
B Z 2 4'
DT = fread(s)
dcast(DT, id ~ y, value.var = c('a', 'b'))
s = 'id a_X a_Z b_X b_Z
A 1 2 3 4
B 1 2 3 4'
DT = fread(s)
DT
melt(DT,
id.vars = c("id"),
measure = patterns("^a", "^b"),
variable.name = "y",
value.name = c("a", "b"))
DT = data.table(a = 1:5)
DT[, `:=`(b = letters[a], c = 1 * (a > 3))][, d := .I , by= .(c)][]
DT[, l := shift(b, 1), by = c][]
carsDT[, .N, by=.(drat_bin = cut(drat, 4))]
dplyr
and data.table
with dtplyr
¶collapse_dtplyr = function() {
flights %>% lazy_dt() %>%
group_by(carrier, origin, dest) %>%
summarize(mean_arr_delay = mean(arr_delay),
pressure = mean(dep_delay))
}
## Just compare dtplyr with normal dplyr and data.table versions (i.e. no keys)
microbenchmark::microbenchmark(collapse_dplyr(), collapse_dt_key(), collapse_dt(), collapse_dtplyr(), times = 10) %>%
print