## ----run----------------------------------------------------------------- library(data.table) library(data.cube) ## ----tiny---------------------------------------------------------------- set.seed(1) # array ar = array(rnorm(8,10,5), rep(2,3), dimnames = list(color = c("green","red"), year = c("2014","2015"), country = c("IN","UK"))) # sorted # cube normalized to star schema just on natural keys dc = as.data.cube(ar) # slice ar["green", "2015",] dc["green", "2015"] format(dc["green", "2015"]) # dice ar[c("green","red"), c("2014","2015"),] dc[c("green","red"), c("2014","2015")] format(dc[c("green","red"), c("2014","2015")]) # exact tabular representation of array is just a formatting on the cube ar["green", c("2014","2015"),] format(dc["green", c("2014","2015")], dcast = TRUE, formula = year ~ country) ar[,"2015", c("UK","IN")] format(dc[,"2015", c("UK","IN")], dcast = TRUE, formula = color ~ country) # sorted dimensions levels ## ----hierarchies--------------------------------------------------------- # populate fact, dimensions and hierarchies X = populate_star(N=1e5) lapply(X[1:2], sapply, ncol) lapply(X[1:2], sapply, nrow) str(X[3L], max.level=3L) # hierarchy defined as list of levels and attributes on each level dc = as.data.cube(X) # as.data.cube.list print(dc) # slice dc["Mazda RX4"] dc["Mazda RX4",,"BTC"] # dice dc[,, c("CNY","BTC"), c("GA","IA","AD")] # custom order of arguments matching by dimension name dc[geography = c("GA","IA","AD"), currency = c("CNY","BTC")] # check dimensions names(dc$dimensions) # foreign keys and measures names(dc) # use dimensions hierarchy attributes for slice and dice, mix filters from various levels in hierarchy dc["Mazda RX4",, .(curr_type = "crypto"),, .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))] # same as above but more verbose dc[product = "Mazda RX4", customer = .(), currency = .(curr_type = "crypto"), geography = .(), time = .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))] # data.cube `[` operator returns another cube so queries can be chained dc[,,, .(geog_region_name = "North Central") ][,,, .(geog_abb = c("IA","NV","MO")), .(time_year = 2014L) ] ## ----aggregate_collapse-------------------------------------------------- dc[product = `-`, customer = .(), currency = .(curr_type = "crypto"), geography = `-`, time = `-`(time_year = 2013:2014)] ## ----memory_efficiency--------------------------------------------------- # ~1e5 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826 dc = as.data.cube(populate_star(N=1e5)) ## estimated size of memory required to store an base R `array` for single numeric measure sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3)) ## fact table of *cube* object having multiple measures sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3)) # ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826 dc = as.data.cube(populate_star(N=1e6)) ## estimated size of memory required to store an base R `array` for single numeric measure sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3)) ## fact table of *cube* object having multiple measures sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3)) # ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 3652 # twice bigger time dimension dc = as.data.cube(populate_star(N=1e6, Y = c(2005L,2014L))) ## estimated size of memory required to store an base R `array` for single numeric measure sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3)) ## fact table of *cube* object having multiple measures sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3))