# data.table introduction @ Why R? 2019 # data ---- # https://github.com/arunsrinivasan/user2017-data.table-tutorial DF1 = data.frame(id = c(rep(1L,4L), 2L, 2L, 1L, 2L, 1L), code = c("c","b","c","c","a","a","b","a","c"), valA = 1:9, valB = 10:18, stringsAsFactors = FALSE) DF2 = data.frame(id = c(3L, 1L, 1L, 2L, 3L), code = c("b","a","c","c","d"), mul = 5:1, stringsAsFactors = FALSE) library(data.table) packageVersion("data.table") DT1 = as.data.table(DF1) DT2 = as.data.table(DF2) # 1a. Subset all rows where id column equals 1 & code column is not equal to "c" ---- # df DF1[DF1\$id == 1 & DF1\$code != "c",] subset(DF1, id == 1 & code != "c") # dt DT1[id == 1 & code != "c"] # 2. Select valA and valB columns from DF1 ---- # df DF1[DF1\$id == 1 & DF1\$code != "c", c("valA","valB")] subset(DF1, id == 1 & code != "c", select = c("valA","valB")) # dt DT1[id == 1 & code != "c", c("valA","valB")] DT1[id == 1 & code != "c", .(valA, valB)] # 3. Get sum(valA) and sum(valB) for id > 1 as a 1-row, 2-col data.frame ---- # df as.data.frame(lapply( DF1[DF1\$id > 1, c("valA","valB")], sum )) with( DF1[DF1\$id > 1,], data.frame(valA=sum(valA), valB=sum(valB)) ) with( subset(DF1, id > 1), data.frame(valA=sum(valA), valB=sum(valB)) ) # dt DT1[id > 1, .(valA=sum(valA), valB=sum(valB))] # chaining queries ---- # df? DF1[DF1\$id > 1, ][DF1[DF1\$id > 1, "valA"] > 5,] subset(DF1, id > 1 )[subset(DF1, id > 1)\$valA > 5,] ## not possible to refer to self! tmp = DF1[DF1\$id > 1,] tmp[tmp\$valA > 5,] # magrittr pipes alias dot as self if (requireNamespace("magrittr", quietly=TRUE)) { `%>%` = magrittr::`%>%` DF1[DF1\$id > 1,] %>% .[.\$valA > 5,] DF1 %>% .[.\$id > 1,] %>% .[.\$valA > 5,] } # dt DT1[id > 1 ][valA > 5] DT1[ ][id > 1 ][valA > 5 ][ ][][] # 4. Replace valB with valB+1 for all rows where code == "c" ---- # df DF = DF1 DF tmp = DF\$code=="c" DF[tmp, "valB"] = DF[tmp, "valB"] + 1L DF # dt DT = copy(DT1) DT DT[code=="c", valB := valB + 1L] DT # reference semantics ---- # why we need copy? # data.table is memory conservative # imagine df/dt is of 1 GB size df = data.frame(a = 1:2) dt = as.data.table(df) # data.frame by default copy (copy-on-modify) df[1L, "a"] = 5L ## we need 2 GB memory # data.table by default updates in-place dt[1L, a := 5L] ## we need 1 GB memory copy(dt)[1L, a := 5L] ## we need 2 GB memory # 5. Add a new column valC column with values equal to valB^2 - valA^2 ---- # df (DF = DF1) DF\$valC = DF\$valB^2 - DF\$valA^2 DF # dt (DT = copy(DT1)) DT[, valC := valB^2 - valA^2] DT # 6. Get sum(valA) and sum(valB) grouped by id and code (i.e., for each unique combination of id,code) ---- # df aggregate(DF1[, c("valA","valB")], by = DF1[, c("id","code")], FUN = sum) # df faster and more flexible tmp = split(DF1, f = DF1[, c("id","code")], drop = TRUE) tmp2 = lapply(tmp, function(df) list( id = df\$id[1L], code = df\$code[1L], valA = sum(df\$valA), valB = sum(df\$valB) )) do.call("rbind.data.frame", tmp2) # dt DT1[, .(valA = sum(valA), valB = sum(valB)), by = .(id, code)] # apply different functions to different columns? sum(valA), mean(valB) DT1[, .(valA = sum(valA), valB = mean(valB)), by = .(id, code)] # 7. Get sum(valA) and sum(valB) grouped by id for id >= 2 & code %in% c("a", "c") ---- # df tmp = DF1\$id >= 2 & DF1\$code %in% c("a","c") aggregate(DF1[tmp, c("valA","valB")], by = DF1[tmp, "id", drop = FALSE], FUN = sum) # dt DT1[id >= 2 & code %in% c("a","c"), .(valA = sum(valA), valB = sum(valB)), by = id] # scale API for `j` expression using .SD or curly brackets ---- # what is j? # DT[i, j, by, ...] ## ?data.table # a list of columns or an expression that evaluates to such # list of columns? # data.frame is a list! d = list(a = 1:2, b = 11:12) d is.data.frame(d) class(d) = "data.frame" ## no coercion needed! attr(d, "row.names") = 1:2 ## only setting attributes is.data.frame(d) is.list(d) # data.table is a data.frame! is.data.frame(DT1) is.list(DT1) lapply(DT1, uniqueN) ## function(x) length(unique(x)) DT1[, .SD][, .SD][, .SD] ## .SD is a self data.table ## list in `j` is returned as data.table DT1[, lapply(.SD, uniqueN)] ## any expression, also multi-line, is valid! it has to return a list DT1[, { tmp1 = uniqueN(id) tmp2 = uniqueN(code) tmp3 = uniqueN(valA) tmp4 = uniqueN(valB) list(id = tmp1, code = tmp2, valA = tmp3, valB = tmp4) }] lapply(DT1[, c("valA","valB")], sum) DT1[, lapply(.SD, sum), .SDcols = c("valA","valB")] DT1[, lapply(.SD, sum), .SDcols = valA:valB] DT1[, lapply(.SD, sum), .SDcols = patterns("^val")] sapply(DT1, is.numeric) DT1[, lapply(.SD, sum), .SDcols = sapply(DT1, is.numeric)] # .SD while grouping - .SDcols is all columns except grouping columns DT1[code != "c", lapply(.SD, sum), by = .(id, code)] DT1[code != "c", lapply(.SD, sum), by = id, .SDcols = patterns("^val")] # combine lists l = c( list(1, 2), lapply(11:13, as.numeric) ) str(l) as.data.table(l) DT1[code != "c", c(list(firstA = first(valA), lastA = last(valA)), lapply(.SD, sum)), by = id, .SDcols = patterns("^val")] # 8. Replace valA with max(valA)-min(valA) grouped by code ---- # df tmp = split(DF1, f = DF1[, c("code")], drop = TRUE) tmp2 = lapply(tmp, function(df) { grp_tmp = max(df\$valA) - min(df\$valA) df\$valA = rep(grp_tmp, nrow(df)) df }) do.call("rbind.data.frame", tmp2) # dt (DT = copy(DT1)) DT[, valA := max(valA) - min(valA), by = code] DT # 9. Create a new col named valD with max(valB)-min(valA) grouped by code ---- # df tmp = split(DF1, f = DF1[, c("code")], drop = TRUE) tmp2 = lapply(tmp, function(df) { grp_tmp = max(df\$valB) - min(df\$valA) df\$valD = rep(grp_tmp, nrow(df)) df }) do.call("rbind.data.frame", tmp2) # dt (DT = copy(DT1)) DT[, valD := max(valB) - min(valA), by = code] DT # 10. Subset DF1 by DF2 on id,code column. That is, for each row of DF2\$id, DF2\$code, get valA and valB cols from DF1. Include rows that have no matches as well ---- # df ans = merge(DF1, DF2, by=c("id", "code"), all.y=TRUE) ans[, c("valA","valB")] # dt ans = merge(DT1, DT2, by=c("id", "code"), all.y=TRUE) ans[, c("valA","valB")] # dt better DT1[DT2, on=.(id, code), .(valA, valB)] # R's argument matching by names and position ---- args(`[.data.frame`) `[.foo` = function(x, i, j, by, on) match.call() foo = structure(list(NULL), class="foo") foo[bar, on=.(id, code), .(valA, valB)] # 11. Same as (10), but fetch just the first matching row of DF1 for each row of DF2\$id, DF2\$code. Exclude non-matching rows ---- DT1[DT2, on=.(id, code), .(valA, valB), mult="first", ## first matching row nomatch=NULL] ## inner join (exclude non-matching rows), default nomatch=NA # why is RIGHT OUTER JOIN (all.y=TRUE) the default one? ---- x = 1:5 x x[4:6] ## nomatch=NA names(x) = c("a","b","c","d","e") x[c("d","e","f")] ## nomatch=NA ## LHS [1:5] join RHS [4:6] ## names(LHS) [a,b,c,d,e] join RHS [d,e,f] ## RIGHT OUTER JOIN mx = matrix(1:9, 3, 3, dimnames=list(paste0("row",1:3), paste0("col",1:3))) mx try(mx[c(1L,3L,4L),]) ## nomatch=error try(mx[c("row1","row3","row4"),]) ## nomatch=error ## ERROR df = as.data.frame(mx) df df[c(1L,3L,4L),] ## nomatch=NA df[c("row1","row3","row4"),] ## nomatch=NA ## RIGHT OUTER JOIN dt = as.data.table(df, keep.rownames=TRUE) dt dt[c(1L,3L,4L),] ## nomatch=NA try(dt[c("row1","row3","row4")]) ## does not use rownames setkey(dt, rn) ## does use key! (clustered index) dt[c("row1","row3","row4")] setkey(dt, NULL) dt[c("row1","row3","row4"), on=.(rn)] setindex(dt, rn) ## also does use indexes (non-clustered index) dt[c("row1","row3","row4"), on=.(rn), verbose=TRUE] ## RIGHT OUTER JOIN dt[c("row1","row3","row4"), on=.(rn), nomatch=NULL] ## INNER JOIN ## nomatch=NULL DT1[DT2, on=.(id, code)] ## RIGHT OUTER JOIN DT2[DT1, on=.(id, code)] ## LEFT OUTER JOIN DT1[DT2, on=.(id, code), nomatch=NULL] ## INNER JOIN # NOTE: during "update on join" the LEFT OUTER JOIN is being used # 12. For every row of DF2\$id, DF2\$code that matches with DF1’s, update valA with valA*mul ---- (DT = copy(DT1)) DT2 DT[DT2, on=.(id, code), valA := valA * mul] DT # 13. Add a new column val to DF1 with values from DF2\$mul where DF2\$id, DF2\$code matches with DF1’s. Rows that don’t match should have NA ---- (DT = copy(DT1)) DT[DT2, on=.(id, code), mul := i.mul] DT # 14. Compute sum(valA)*mul for every row of DF2\$id, DF2\$code by matching it against DF1 ---- DT1[DT2, on=.(id, code), sum(valA) * mul, by = .EACHI] if (!interactive()) q("no")