cube
Jan Gorecki, 2015-11-19
R cube class defined in data.cube package.
Please note there is a new updated oop cube class called
data.cube
. See Subset and aggregate multidimensional data with data.cube vignette instead.
[.cube
[.array
[.array
[.array
if(!"data.cube" %in% rownames(installed.packages())) install.packages(
"data.cube", repos = paste0("https://", c("jangorecki.github.io/data.cube","cran.rstudio.com"))
)
library(data.table)
library(data.cube)
set.seed(1)
# array
ar = array(rnorm(8,10,5),
rep(2,3),
dimnames = list(color = c("green","red"),
year = c("2014","2015"),
country = c("UK","IN")))
# cube normalized to star schema
cb = as.cube(ar)
ar["green","2015",]
## UK IN
## 5.821857 12.437145
cb["green","2015",]
## <cube>
## fact:
## fact 2 rows x 2 cols (0.00 MB)
## dims:
## country 2 rows x 1 cols (0.00 MB)
## total size: 0.00 MB
ar["green",c("2014","2015"),]
## country
## year UK IN
## 2014 6.867731 11.64754
## 2015 5.821857 12.43715
cb["green",c("2014","2015"),]
## <cube>
## fact:
## fact 4 rows x 3 cols (0.00 MB)
## dims:
## year 2 rows x 1 cols (0.00 MB)
## country 2 rows x 1 cols (0.00 MB)
## total size: 0.00 MB
# tabular representation of array is just a formatting on the cube
format(cb["green",c("2014","2015"),],
dcast = TRUE,
formula = year ~ country)
## year IN UK
## 1: 2014 11.64754 6.867731
## 2: 2015 12.43715 5.821857
ar[,"2015",c("UK","IN")]
## country
## color UK IN
## green 5.821857 12.43715
## red 17.976404 13.69162
cb[,"2015",c("UK","IN")]
## <cube>
## fact:
## fact 4 rows x 3 cols (0.00 MB)
## dims:
## color 2 rows x 1 cols (0.00 MB)
## country 2 rows x 1 cols (0.00 MB)
## total size: 0.00 MB
format(cb[,"2015",c("UK","IN")],
dcast = TRUE,
formula = color ~ country)
## color IN UK
## 1: green 12.43715 5.821857
## 2: red 13.69162 17.976404
# as.cube.list - investigate X to see structure
X = populate_star(N=1e5)
lapply(X, sapply, ncol)
## $fact
## sales
## 7
##
## $dims
## product customer currency geography time
## 5 4 2 4 8
##
## $hierarchies
## $hierarchies$product
## NULL
##
## $hierarchies$customer
## NULL
##
## $hierarchies$currency
## NULL
##
## $hierarchies$geography
## NULL
##
## $hierarchies$time
## NULL
lapply(X, sapply, nrow)
## $fact
## sales
## 99999
##
## $dims
## product customer currency geography time
## 32 32 49 50 1826
##
## $hierarchies
## $hierarchies$product
## NULL
##
## $hierarchies$customer
## NULL
##
## $hierarchies$currency
## NULL
##
## $hierarchies$geography
## NULL
##
## $hierarchies$time
## NULL
cb = as.cube(X)
print(cb)
## <cube>
## fact:
## sales 99999 rows x 7 cols (5.35 MB)
## dims:
## product 32 rows x 5 cols (0.00 MB)
## customer 32 rows x 4 cols (0.00 MB)
## currency 49 rows x 2 cols (0.00 MB)
## geography 50 rows x 4 cols (0.01 MB)
## time 1826 rows x 8 cols (0.09 MB)
## total size: 5.46 MB
# slice
cb["Mazda RX4"]
## <cube>
## fact:
## sales 3135 rows x 6 cols (0.15 MB)
## dims:
## customer 32 rows x 4 cols (0.00 MB)
## currency 49 rows x 2 cols (0.00 MB)
## geography 50 rows x 4 cols (0.01 MB)
## time 1826 rows x 8 cols (0.09 MB)
## total size: 0.26 MB
cb["Mazda RX4",,"BTC"]
## <cube>
## fact:
## sales 61 rows x 5 cols (0.01 MB)
## dims:
## customer 32 rows x 4 cols (0.00 MB)
## geography 50 rows x 4 cols (0.01 MB)
## time 1826 rows x 8 cols (0.09 MB)
## total size: 0.11 MB
# dice
cb[,, c("CNY","BTC"), c("GA","IA","AD")]
## <cube>
## fact:
## sales 164 rows x 7 cols (0.01 MB)
## dims:
## product 32 rows x 5 cols (0.00 MB)
## customer 32 rows x 4 cols (0.00 MB)
## currency 2 rows x 2 cols (0.00 MB)
## geography 2 rows x 4 cols (0.00 MB)
## time 1826 rows x 8 cols (0.09 MB)
## total size: 0.12 MB
# check dimensions
cb$dims
## [1] "product" "customer" "currency" "geography" "time"
# use dimensions hierarchy attributes for slice and dice, mix filters from various levels in hierarchy
cb["Mazda RX4",, .(curr_type = "crypto"),, .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))]
## <cube>
## fact:
## sales 64 rows x 6 cols (0.01 MB)
## dims:
## customer 32 rows x 4 cols (0.00 MB)
## currency 11 rows x 2 cols (0.00 MB)
## geography 50 rows x 4 cols (0.01 MB)
## time 181 rows x 8 cols (0.01 MB)
## total size: 0.04 MB
# same as above but more verbose
cb[product = "Mazda RX4",
customer = .(),
currency = .(curr_type = "crypto"),
geography = .(),
time = .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))]
## <cube>
## fact:
## sales 64 rows x 6 cols (0.01 MB)
## dims:
## customer 32 rows x 4 cols (0.00 MB)
## currency 11 rows x 2 cols (0.00 MB)
## geography 50 rows x 4 cols (0.01 MB)
## time 181 rows x 8 cols (0.01 MB)
## total size: 0.04 MB
# cube `[` operator returns another cube so queries can be chained
cb[,,, .(geog_region_name = "North Central")
][,,, .(geog_abb = c("IA","NV","MO")), .(time_year = 2014L)
]
## <cube>
## fact:
## sales 737 rows x 7 cols (0.05 MB)
## dims:
## product 32 rows x 5 cols (0.00 MB)
## customer 32 rows x 4 cols (0.00 MB)
## currency 49 rows x 2 cols (0.00 MB)
## geography 2 rows x 4 cols (0.00 MB)
## time 365 rows x 8 cols (0.02 MB)
## total size: 0.08 MB
# ~1e5 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
cb = as.cube(populate_star(N=1e5))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(cb)) * 8)/(1024^3))
## [1] "array: 34.13 GB"
## fact table of *cube* object having multiple measures
sprintf("cube: %.2f GB", as.numeric(object.size(cb$env$fact$sales))/(1024^3))
## [1] "cube: 0.01 GB"
# ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
cb = as.cube(populate_star(N=1e6))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(cb)) * 8)/(1024^3))
## [1] "array: 34.13 GB"
## fact table of *cube* object having multiple measures
sprintf("cube: %.2f GB", as.numeric(object.size(cb$env$fact$sales))/(1024^3))
## [1] "cube: 0.05 GB"
# ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 3652
cb = as.cube(populate_star(N=1e6, Y = c(2005L,2014L))) # bigger time dimension
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(cb)) * 8)/(1024^3))
## [1] "array: 68.26 GB"
## fact table of *cube* object having multiple measures
sprintf("cube: %.2f GB", as.numeric(object.size(cb$env$fact$sales))/(1024^3))
## [1] "cube: 0.05 GB"
Lots of examples can be found in tests: tests/tests-sub-.cube.R.
Feel free to PR your use case for future regression testing.