Subset multidimensional data with cube

Jan Gorecki, 2015-11-19

R cube class defined in data.cube package.

Please note there is a new updated oop cube class called data.cube. See Subset and aggregate multidimensional data with data.cube vignette instead.

characteristic of [.cube

what's the same in [.array

what's more than [.array

what's different to [.array

start session

if(!"data.cube" %in% rownames(installed.packages())) install.packages(
    "data.cube", repos = paste0("https://", c("jangorecki.github.io/data.cube","cran.rstudio.com"))
)
library(data.table)
library(data.cube)

tiny example

set.seed(1)
# array
ar = array(rnorm(8,10,5), 
           rep(2,3), 
           dimnames = list(color = c("green","red"), 
                           year = c("2014","2015"), 
                           country = c("UK","IN")))
# cube normalized to star schema
cb = as.cube(ar)

ar["green","2015",]
##        UK        IN 
##  5.821857 12.437145
cb["green","2015",]
## <cube>
## fact:
##   fact 2 rows x 2 cols (0.00 MB)
## dims:
##   country 2 rows x 1 cols (0.00 MB)
## total size: 0.00 MB
ar["green",c("2014","2015"),]
##       country
## year         UK       IN
##   2014 6.867731 11.64754
##   2015 5.821857 12.43715
cb["green",c("2014","2015"),]
## <cube>
## fact:
##   fact 4 rows x 3 cols (0.00 MB)
## dims:
##   year 2 rows x 1 cols (0.00 MB)
##   country 2 rows x 1 cols (0.00 MB)
## total size: 0.00 MB
# tabular representation of array is just a formatting on the cube
format(cb["green",c("2014","2015"),], 
       dcast = TRUE, 
       formula = year ~ country)
##    year       IN       UK
## 1: 2014 11.64754 6.867731
## 2: 2015 12.43715 5.821857
ar[,"2015",c("UK","IN")]
##        country
## color          UK       IN
##   green  5.821857 12.43715
##   red   17.976404 13.69162
cb[,"2015",c("UK","IN")]
## <cube>
## fact:
##   fact 4 rows x 3 cols (0.00 MB)
## dims:
##   color 2 rows x 1 cols (0.00 MB)
##   country 2 rows x 1 cols (0.00 MB)
## total size: 0.00 MB
format(cb[,"2015",c("UK","IN")], 
       dcast = TRUE, 
       formula = color ~ country)
##    color       IN        UK
## 1: green 12.43715  5.821857
## 2:   red 13.69162 17.976404

hierarchies example

# as.cube.list - investigate X to see structure
X = populate_star(N=1e5)
lapply(X, sapply, ncol)
## $fact
## sales 
##     7 
## 
## $dims
##   product  customer  currency geography      time 
##         5         4         2         4         8 
## 
## $hierarchies
## $hierarchies$product
## NULL
## 
## $hierarchies$customer
## NULL
## 
## $hierarchies$currency
## NULL
## 
## $hierarchies$geography
## NULL
## 
## $hierarchies$time
## NULL
lapply(X, sapply, nrow)
## $fact
## sales 
## 99999 
## 
## $dims
##   product  customer  currency geography      time 
##        32        32        49        50      1826 
## 
## $hierarchies
## $hierarchies$product
## NULL
## 
## $hierarchies$customer
## NULL
## 
## $hierarchies$currency
## NULL
## 
## $hierarchies$geography
## NULL
## 
## $hierarchies$time
## NULL
cb = as.cube(X)
print(cb)
## <cube>
## fact:
##   sales 99999 rows x 7 cols (5.35 MB)
## dims:
##   product 32 rows x 5 cols (0.00 MB)
##   customer 32 rows x 4 cols (0.00 MB)
##   currency 49 rows x 2 cols (0.00 MB)
##   geography 50 rows x 4 cols (0.01 MB)
##   time 1826 rows x 8 cols (0.09 MB)
## total size: 5.46 MB
# slice
cb["Mazda RX4"]
## <cube>
## fact:
##   sales 3135 rows x 6 cols (0.15 MB)
## dims:
##   customer 32 rows x 4 cols (0.00 MB)
##   currency 49 rows x 2 cols (0.00 MB)
##   geography 50 rows x 4 cols (0.01 MB)
##   time 1826 rows x 8 cols (0.09 MB)
## total size: 0.26 MB
cb["Mazda RX4",,"BTC"]
## <cube>
## fact:
##   sales 61 rows x 5 cols (0.01 MB)
## dims:
##   customer 32 rows x 4 cols (0.00 MB)
##   geography 50 rows x 4 cols (0.01 MB)
##   time 1826 rows x 8 cols (0.09 MB)
## total size: 0.11 MB
# dice
cb[,, c("CNY","BTC"), c("GA","IA","AD")]
## <cube>
## fact:
##   sales 164 rows x 7 cols (0.01 MB)
## dims:
##   product 32 rows x 5 cols (0.00 MB)
##   customer 32 rows x 4 cols (0.00 MB)
##   currency 2 rows x 2 cols (0.00 MB)
##   geography 2 rows x 4 cols (0.00 MB)
##   time 1826 rows x 8 cols (0.09 MB)
## total size: 0.12 MB
# check dimensions
cb$dims
## [1] "product"   "customer"  "currency"  "geography" "time"
# use dimensions hierarchy attributes for slice and dice, mix filters from various levels in hierarchy
cb["Mazda RX4",, .(curr_type = "crypto"),, .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))]
## <cube>
## fact:
##   sales 64 rows x 6 cols (0.01 MB)
## dims:
##   customer 32 rows x 4 cols (0.00 MB)
##   currency 11 rows x 2 cols (0.00 MB)
##   geography 50 rows x 4 cols (0.01 MB)
##   time 181 rows x 8 cols (0.01 MB)
## total size: 0.04 MB
# same as above but more verbose
cb[product = "Mazda RX4",
   customer = .(),
   currency = .(curr_type = "crypto"),
   geography = .(),
   time = .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))]
## <cube>
## fact:
##   sales 64 rows x 6 cols (0.01 MB)
## dims:
##   customer 32 rows x 4 cols (0.00 MB)
##   currency 11 rows x 2 cols (0.00 MB)
##   geography 50 rows x 4 cols (0.01 MB)
##   time 181 rows x 8 cols (0.01 MB)
## total size: 0.04 MB
# cube `[` operator returns another cube so queries can be chained
cb[,,, .(geog_region_name = "North Central")
   ][,,, .(geog_abb = c("IA","NV","MO")), .(time_year = 2014L)
     ]
## <cube>
## fact:
##   sales 737 rows x 7 cols (0.05 MB)
## dims:
##   product 32 rows x 5 cols (0.00 MB)
##   customer 32 rows x 4 cols (0.00 MB)
##   currency 49 rows x 2 cols (0.00 MB)
##   geography 2 rows x 4 cols (0.00 MB)
##   time 365 rows x 8 cols (0.02 MB)
## total size: 0.08 MB

scalability

# ~1e5 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
cb = as.cube(populate_star(N=1e5))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(cb)) * 8)/(1024^3))
## [1] "array: 34.13 GB"
## fact table of *cube* object having multiple measures
sprintf("cube: %.2f GB", as.numeric(object.size(cb$env$fact$sales))/(1024^3))
## [1] "cube: 0.01 GB"
# ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
cb = as.cube(populate_star(N=1e6))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(cb)) * 8)/(1024^3))
## [1] "array: 34.13 GB"
## fact table of *cube* object having multiple measures
sprintf("cube: %.2f GB", as.numeric(object.size(cb$env$fact$sales))/(1024^3))
## [1] "cube: 0.05 GB"
# ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 3652
cb = as.cube(populate_star(N=1e6, Y = c(2005L,2014L))) # bigger time dimension
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(cb)) * 8)/(1024^3))
## [1] "array: 68.26 GB"
## fact table of *cube* object having multiple measures
sprintf("cube: %.2f GB", as.numeric(object.size(cb$env$fact$sales))/(1024^3))
## [1] "cube: 0.05 GB"

examples

Lots of examples can be found in tests: tests/tests-sub-.cube.R.
Feel free to PR your use case for future regression testing.