In [1]:
rm(list=ls())
library(LalRUtils)
libreq(data.table, magrittr, tidyverse, microbenchmark, 
       nycflights13, tidyfast, dtplyr)
     wants            loaded
[1,] "data.table"     TRUE  
[2,] "magrittr"       TRUE  
[3,] "tidyverse"      TRUE  
[4,] "microbenchmark" TRUE  
[5,] "nycflights13"   TRUE  
[6,] "tidyfast"       TRUE  
[7,] "dtplyr"         TRUE  
In [2]:
input <- if (file.exists("flights14.csv")) {
   "flights14.csv"
} else {
  "https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv"
}
In [3]:
flights <- fread(input)
flights %>% head
A data.table: 6 × 11
yearmonthdaydep_delayarr_delaycarrierorigindestair_timedistancehour
<int><int><int><int><int><chr><chr><chr><int><int><int>
20141114 13AAJFKLAX3592475 9
201411-3 13AAJFKLAX363247511
201411 2 9AAJFKLAX351247519
201411-8-26AALGAPBI1571035 7
201411 2 1AAJFKLAX350247513
201411 4 0AAEWRLAX339245418
In [4]:
fwrite(flights, "flights14.csv")

Creating DT

In [4]:
DT = data.table(
  ID = c("b","b","b","a","a","c"),
  a = 1:6,
  b = 7:12,
  c = 13:18
)
DT
A data.table: 6 × 4
IDabc
<chr><int><int><int>
b1 713
b2 814
b3 915
a41016
a51117
c61218

Slicing

In [5]:
flights[origin == "JFK" & month == 6L] %>% head
A data.table: 6 × 11
yearmonthdaydep_delayarr_delaycarrierorigindestair_timedistancehour
<int><int><int><int><int><chr><chr><chr><int><int><int>
201461 -9 -5AAJFKLAX3242475 8
201461-10-13AAJFKLAX329247512
201461 18 -1AAJFKLAX3262475 7
201461 -6-16AAJFKLAX320247510
201461 -4-45AAJFKLAX326247518
201461 -6-23AAJFKLAX329247514

Slice and return vector

In [6]:
flights[, arr_delay] %>% head
  1. 13
  2. 13
  3. 9
  4. -26
  5. 1
  6. 0

Slice and return dt

In [8]:
flights[, .(arr_delay)] %>% head
A data.table: 6 × 1
arr_delay
<int>
13
13
9
-26
1
0

Sorting

In [7]:
flights[order(origin, -dest)] %>% head
A data.table: 6 × 11
yearmonthdaydep_delayarr_delaycarrierorigindestair_timedistancehour
<int><int><int><int><int><chr><chr><chr><int><int><int>
20141 5 6 49EVEWRXNA19511318
20141 6 7 13EVEWRXNA19011318
20141 7-6-13EVEWRXNA17911318
20141 8-7-12EVEWRXNA18411318
20141 916 7EVEWRXNA18111318
201411366 66EVEWRXNA18811319

Sorting inplace

In [8]:
setorder(flights, origin, -dest)
flights %>% head
A data.table: 6 × 11
yearmonthdaydep_delayarr_delaycarrierorigindestair_timedistancehour
<int><int><int><int><int><chr><chr><chr><int><int><int>
20141 5 6 49EVEWRXNA19511318
20141 6 7 13EVEWRXNA19011318
20141 7-6-13EVEWRXNA17911318
20141 8-7-12EVEWRXNA18411318
20141 916 7EVEWRXNA18111318
201411366 66EVEWRXNA18811319

Subset and Calculate

In [9]:
# example data
carsDT = data.table(mtcars, keep.rownames = TRUE)
# quick inspection
carsDT %>% head
A data.table: 6 × 12
rnmpgcyldisphpdratwtqsecvsamgearcarb
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
Mazda RX4 21.061601103.902.62016.460144
Mazda RX4 Wag 21.061601103.902.87517.020144
Datsun 710 22.84108 933.852.32018.611141
Hornet 4 Drive 21.462581103.083.21519.441031
Hornet Sportabout18.783601753.153.44017.020032
Valiant 18.162251052.763.46020.221031

Aggregation

In [10]:
carsDT[, .(mean_hp = mean(hp), mean_wt = mean(wt)), by=.(am, vs)]
A data.table: 4 × 4
amvsmean_hpmean_wt
<dbl><dbl><dbl><dbl>
10180.832.857
11 80.572.028
01102.143.194
00194.174.104
In [11]:
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = c("hp", "wt", "disp")]
A data.table: 4 × 5
amvshpwtdisp
<dbl><dbl><dbl><dbl><dbl>
10180.832.857206.2
11 80.572.028 89.8
01102.143.194175.1
00194.174.104357.6

:= syntax

Creating Columns

In [12]:
# example data
DT = data.table(
  x = letters[c(1, 2, 3, 4, 5)], 
  y = c(1, 2, 3, 4, 5), 
  z = c(1, 2, 3, 4, 5) > 3
)
In [13]:
# create one
DT[, u := 5:1]
# creating multiple
DT[, `:=`(v = 2, w = 3L)]
# creating with dynamic names
nms = c("a", "b", "c")
DT[, (nms) := .(1, 2, 3)]

Dropping columns

In [14]:
nms = c("u", "v", "w", "a", "b", "c")
DT[, (nms) := NULL]
In [15]:
DT
A data.table: 5 × 3
xyz
<chr><dbl><lgl>
a1FALSE
b2FALSE
c3FALSE
d4 TRUE
e5 TRUE

Conditional Replace

In [16]:
DT[, b := "Aardvark"][y > 1, b := "Zebra", verbose = TRUE]
Detected that j uses these columns: b 
Assigning to 4 row subset of 5 rows
RHS_list_of_columns == false
In [17]:
cols = c("hp", "wt", "disp")
carsDT[, lapply(.SD, mean), by=.(am, vs), .SDcols = cols] %>% 
  setnames(cols, sprintf("mean_%s", cols)) %>% print
   am vs mean_hp mean_wt mean_disp
1:  1  0  180.83   2.857     206.2
2:  1  1   80.57   2.028      89.8
3:  0  1  102.14   3.194     175.1
4:  0  0  194.17   4.104     357.6

Column selection

In [18]:
keep_cols = c("arr_delay", "dep_delay")
flights[, ..keep_cols] %>% head
A data.table: 6 × 2
arr_delaydep_delay
<int><int>
49 6
13 7
-13-6
-12-7
716
6666
In [19]:
drop_cols = c("arr_delay", "dep_delay")
flights[, -..drop_cols] %>% head
A data.table: 6 × 9
yearmonthdaycarrierorigindestair_timedistancehour
<int><int><int><chr><chr><chr><int><int><int>
20141 5EVEWRXNA19511318
20141 6EVEWRXNA19011318
20141 7EVEWRXNA17911318
20141 8EVEWRXNA18411318
20141 9EVEWRXNA18111318
2014113EVEWRXNA18811319

Grouping summaries

In [20]:
flights[, .(.N), by = .(origin)]
A data.table: 3 × 2
originN
<chr><int>
EWR87400
JFK81483
LGA84433
In [21]:
flights[carrier == "AA", .N, by = origin]
A data.table: 3 × 2
originN
<chr><int>
EWR 2649
JFK11923
LGA11730

Keys

In [22]:
flights2 = copy(flights)
setkey(flights2, carrier, origin, dest)
key(flights2)
  1. 'carrier'
  2. 'origin'
  3. 'dest'
In [23]:
collapse_dplyr = function() {
  flights %>% as_tibble() %>% 
    group_by(carrier, origin, dest) %>% 
    summarize(mean_arr_delay = mean(arr_delay), 
              pressure = mean(dep_delay))
  }

collapse_dt = function() {
  flights[, 
    .(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
            by = .(carrier, origin, dest)]
  }

collapse_dt_key = function() {
  flights2[, 
    .(mean_arr_delay = mean(arr_delay), pressure = mean(dep_delay)),
            by = .(carrier, origin, dest)]
  }
In [27]:
microbenchmark(collapse_dplyr(), collapse_dt(), collapse_dt_key(), times = 10) %>% 
    print
Unit: milliseconds
              expr    min     lq   mean median    uq   max neval cld
  collapse_dplyr() 38.479 39.400 47.325 43.363 44.39 95.52    10   b
     collapse_dt()  7.241  7.746  9.833  8.969 11.65 13.94    10  a 
 collapse_dt_key() 19.357 19.479 20.962 19.678 20.44 29.73    10  a 

Create key as part of agg step

In [28]:
keybied = flights[carrier == "AA",
        .(mean(arr_delay), mean(dep_delay)),
        keyby = .(origin, dest, month)] 
key(keybied)
  1. 'origin'
  2. 'dest'
  3. 'month'

Expressions in By

In [29]:
flights[, .N, .(dep_delay>0, arr_delay>0)]
A data.table: 4 × 3
dep_delayarr_delayN
<lgl><lgl><int>
TRUE TRUE 72836
FALSEFALSE119304
FALSE TRUE 34583
TRUEFALSE 26593

.SD notation

.SD stands for Subset of Data

In [30]:
DT
A data.table: 5 × 4
xyzb
<chr><dbl><lgl><chr>
a1FALSEAardvark
b2FALSEZebra
c3FALSEZebra
d4 TRUEZebra
e5 TRUEZebra
In [32]:
flights[carrier == "AA",                           ## Only on trips with carrier "AA"
        lapply(.SD, mean),                         ## compute the mean
        by = .(origin, dest, month),               ## for every 'origin,dest,month'
        .SDcols = c("arr_delay", "dep_delay")] %>% ## for just those specified in .SDcols 
        head
A data.table: 6 × 5
origindestmontharr_delaydep_delay
<chr><chr><int><dbl><dbl>
EWRPHX 7-5.103 0.2759
EWRPHX 8 3.548 6.2258
EWRPHX 9-4.233-1.6667
EWRPHX10-3.032-4.2903
EWRMIA 111.01112.1236
EWRMIA 2 1.564 4.7564

Merges

In [33]:
emp = 'Employee	EmployeeName	Department	Salary
1	Alice	11	800
2	Bob	11	600
3	Carla	12	900
4	Daniel	12	1000
5	Evelyn	13	800
6	Ferdinand	21	700'

dept = '
table Departments
Department	DepartmentName	Manager
11	Production	1
12	Sales	4
13	Marketing	5
14	Research	NA'

X = fread(emp)
Y = fread(dept)
In [34]:
X
Y
A data.table: 6 × 4
EmployeeEmployeeNameDepartmentSalary
<int><chr><int><int>
1Alice 11 800
2Bob 11 600
3Carla 12 900
4Daniel 121000
5Evelyn 13 800
6Ferdinand21 700
A data.table: 4 × 3
DepartmentDepartmentNameManager
<int><chr><int>
11Production 1
12Sales 4
13Marketing 5
14Research NA
In [37]:
merge(X, Y, all=F) # inner
merge(X, Y, all.x=T) # right
merge(X, Y, all.y=T) # left
merge(X, Y, all=T) # cartesian
A data.table: 5 × 6
DepartmentEmployeeEmployeeNameSalaryDepartmentNameManager
<int><int><chr><int><chr><int>
111Alice 800Production1
112Bob 600Production1
123Carla 900Sales 4
124Daniel1000Sales 4
135Evelyn 800Marketing 5
A data.table: 6 × 6
DepartmentEmployeeEmployeeNameSalaryDepartmentNameManager
<int><int><chr><int><chr><int>
111Alice 800Production 1
112Bob 600Production 1
123Carla 900Sales 4
124Daniel 1000Sales 4
135Evelyn 800Marketing 5
216Ferdinand 700NA NA
A data.table: 6 × 6
DepartmentEmployeeEmployeeNameSalaryDepartmentNameManager
<int><int><chr><int><chr><int>
11 1Alice 800Production 1
11 2Bob 600Production 1
12 3Carla 900Sales 4
12 4Daniel1000Sales 4
13 5Evelyn 800Marketing 5
14NANA NAResearch NA
A data.table: 7 × 6
DepartmentEmployeeEmployeeNameSalaryDepartmentNameManager
<int><int><chr><int><chr><int>
11 1Alice 800Production 1
11 2Bob 600Production 1
12 3Carla 900Sales 4
12 4Daniel 1000Sales 4
13 5Evelyn 800Marketing 5
14NANA NAResearch NA
21 6Ferdinand 700NA NA

DT idiomatic merges

In [38]:
# example data
a = data.table(id = c(1L, 1L, 2L, 3L, NA_integer_), 
               t = c(1L, 2L, 1L, 2L, NA_integer_), x = 11:15)
b = data.table(id = 1:2, y = c(11L, 15L))

a
b
A data.table: 5 × 3
idtx
<int><int><int>
1 111
1 212
2 113
3 214
NANA15
A data.table: 2 × 2
idy
<int><int>
111
215

Equi-joins

In [39]:
a[b, on=.(id)]
A data.table: 3 × 4
idtxy
<int><int><int><int>
111111
121211
211315

Different keys

In [40]:
a[b, on=.(id, x = y)]
A data.table: 2 × 3
idtx
<int><int><int>
1 111
2NA15
In [41]:
a[b, on=.(id), y := i.y ]
a
A data.table: 5 × 4
idtxy
<int><int><int><int>
1 11111
1 21211
2 11315
3 214NA
NANA15NA

Non-Equi Joins

In [42]:
mDT = data.table(id = 1:3, x_dn = 10L, x_up = 13L)
mDT
A data.table: 3 × 3
idx_dnx_up
<int><int><int>
11013
21013
31013
In [43]:
a[mDT, on=.(id, x >= x_dn, x <= x_up), .(id, i.x_dn, i.x_up, x.x)]
A data.table: 4 × 4
idi.x_dni.x_upx.x
<int><int><int><int>
1101311
1101312
2101313
31013NA

Long to wide

In [24]:
s = 'id y a b
A X 1 3
A Z 2 4
B X 1 3
B Z 2 4'
DT = fread(s)

dcast(DT, id ~ y, value.var = c('a', 'b'))
A data.table: 2 × 5
ida_Xa_Zb_Xb_Z
<chr><int><int><int><int>
A1234
B1234

Wide to long

In [45]:
s = 'id a_X a_Z b_X b_Z
A 1 2 3 4
B 1 2 3 4'
DT = fread(s)
DT

melt(DT,
    id.vars = c("id"),
    measure = patterns("^a", "^b"),
    variable.name = "y",
    value.name = c("a", "b"))
A data.table: 2 × 5
ida_Xa_Zb_Xb_Z
<chr><int><int><int><int>
A1234
B1234
A data.table: 4 × 4
idyab
<chr><fct><int><int>
A113
B113
A224
B224

Advanced operations

In [46]:
DT = data.table(a = 1:5) 
DT[, `:=`(b = letters[a], c = 1 * (a > 3))][, d := .I , by= .(c)][]
A data.table: 5 × 4
abcd
<int><chr><dbl><int>
1a01
2b02
3c03
4d14
5e15
In [47]:
DT[, l := shift(b, 1), by = c][]
A data.table: 5 × 5
abcdl
<int><chr><dbl><int><chr>
1a01NA
2b02a
3c03b
4d14NA
5e15d

Plot by group

Merges

Binned summary

In [48]:
carsDT[, .N, by=.(drat_bin = cut(drat, 4))]
A data.table: 4 × 2
drat_binN
<fct><int>
(3.84,4.39]11
(2.76,3.3] 13
(3.3,3.84] 6
(4.39,4.93] 2

Combining dplyr and data.table with dtplyr

https://atrebas.github.io/post/2019-03-03-datatable-dplyr

In [50]:
collapse_dtplyr = function() {
  flights %>% lazy_dt() %>% 
    group_by(carrier, origin, dest) %>% 
    summarize(mean_arr_delay = mean(arr_delay), 
              pressure = mean(dep_delay))
  }

## Just compare dtplyr with normal dplyr and data.table versions (i.e. no keys)
In [54]:
microbenchmark::microbenchmark(collapse_dplyr(), collapse_dt_key(), collapse_dt(), collapse_dtplyr(), times = 10) %>% 
    print
Unit: milliseconds
              expr    min     lq   mean median     uq     max neval cld
  collapse_dplyr() 38.607 39.551 41.795 40.784 42.971  48.918    10   b
 collapse_dt_key() 19.307 19.527 21.858 20.520 24.677  28.266    10  ab
     collapse_dt()  7.333  7.681 28.249 10.530 11.551 193.618    10  ab
 collapse_dtplyr()  3.296  3.512  3.809  3.561  3.646   6.179    10  a