data.cube
Jan Gorecki, 2016-05-04
R data.cube class defined in data.cube package.
[.data.cube
[.array
[.array
data.table
[.array
If not installed use install.packages("data.cube", repos = paste0("https://", c("jangorecki.gitlab.io/data.cube","Rdatatable.github.io/data.table","cran.rstudio.com")))
.
library(data.table)
library(data.cube)
2x2x2 array vs data.cube.
set.seed(1)
# array
ar = array(rnorm(8,10,5), rep(2,3),
dimnames = list(color = c("green","red"),
year = c("2014","2015"),
country = c("IN","UK"))) # sorted
# cube normalized to star schema just on natural keys
dc = as.data.cube(ar)
# slice
ar["green", "2015",]
## IN UK
## 5.821857 12.437145
dc["green", "2015"]
## <data.cube>
## fact:
## 2 rows x 1 dimensions x 1 measures (0.00 MB)
## dimensions:
## country : 2 entities x 1 levels (0.00 MB)
## total size: 0.00 MB
format(dc["green", "2015"])
## country value
## 1: IN 5.821857
## 2: UK 12.437145
# dice
ar[c("green","red"), c("2014","2015"),]
## , , country = IN
##
## year
## color 2014 2015
## green 6.867731 5.821857
## red 10.918217 17.976404
##
## , , country = UK
##
## year
## color 2014 2015
## green 11.647539 12.43715
## red 5.897658 13.69162
dc[c("green","red"), c("2014","2015")]
## <data.cube>
## fact:
## 8 rows x 3 dimensions x 1 measures (0.00 MB)
## dimensions:
## color : 2 entities x 1 levels (0.00 MB)
## year : 2 entities x 1 levels (0.00 MB)
## country : 2 entities x 1 levels (0.00 MB)
## total size: 0.01 MB
format(dc[c("green","red"), c("2014","2015")])
## color year country value
## 1: green 2014 IN 6.867731
## 2: green 2014 UK 11.647539
## 3: green 2015 IN 5.821857
## 4: green 2015 UK 12.437145
## 5: red 2014 IN 10.918217
## 6: red 2014 UK 5.897658
## 7: red 2015 IN 17.976404
## 8: red 2015 UK 13.691624
# exact tabular representation of array is just a formatting on the cube
ar["green", c("2014","2015"),]
## country
## year IN UK
## 2014 6.867731 11.64754
## 2015 5.821857 12.43715
format(dc["green", c("2014","2015")],
dcast = TRUE,
formula = year ~ country)
## year IN UK
## 1: 2014 6.867731 11.64754
## 2: 2015 5.821857 12.43715
ar[,"2015", c("UK","IN")]
## country
## color UK IN
## green 12.43715 5.821857
## red 13.69162 17.976404
format(dc[,"2015", c("UK","IN")],
dcast = TRUE,
formula = color ~ country) # sorted dimensions levels
## color IN UK
## 1: green 5.821857 12.43715
## 2: red 17.976404 13.69162
Filtering on attributes in hierarchies of dimension.
# populate fact, dimensions and hierarchies
X = populate_star(N=1e5)
lapply(X[1:2], sapply, ncol)
## $fact
## sales
## 7
##
## $dims
## product customer currency geography time
## 5 4 2 4 8
lapply(X[1:2], sapply, nrow)
## $fact
## sales
## 99999
##
## $dims
## product customer currency geography time
## 32 32 49 50 1826
str(X[3L], max.level=3L) # hierarchy defined as list of levels and attributes on each level
## List of 1
## $ hierarchies:List of 5
## ..$ product :List of 1
## .. ..$ :List of 5
## ..$ customer :List of 1
## .. ..$ :List of 4
## ..$ currency :List of 1
## .. ..$ :List of 2
## ..$ geography:List of 1
## .. ..$ :List of 3
## ..$ time :List of 2
## .. ..$ monthly:List of 4
## .. ..$ weekly :List of 4
dc = as.data.cube(X) # as.data.cube.list
print(dc)
## <data.cube>
## fact:
## 99999 rows x 5 dimensions x 2 measures (5.35 MB)
## dimensions:
## product : 32 entities x 5 levels (0.01 MB)
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 49 entities x 2 levels (0.01 MB)
## geography : 50 entities x 3 levels (0.02 MB)
## time : 1826 entities x 6 levels (0.12 MB)
## total size: 5.53 MB
# slice
dc["Mazda RX4"]
## <data.cube>
## fact:
## 3135 rows x 4 dimensions x 2 measures (0.15 MB)
## dimensions:
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 49 entities x 2 levels (0.01 MB)
## geography : 50 entities x 3 levels (0.02 MB)
## time : 1826 entities x 6 levels (0.12 MB)
## total size: 0.32 MB
dc["Mazda RX4",,"BTC"]
## <data.cube>
## fact:
## 61 rows x 3 dimensions x 2 measures (0.01 MB)
## dimensions:
## customer : 32 entities x 4 levels (0.01 MB)
## geography : 50 entities x 3 levels (0.02 MB)
## time : 1826 entities x 6 levels (0.12 MB)
## total size: 0.16 MB
# dice
dc[,, c("CNY","BTC"), c("GA","IA","AD")]
## <data.cube>
## fact:
## 164 rows x 5 dimensions x 2 measures (0.01 MB)
## dimensions:
## product : 32 entities x 5 levels (0.01 MB)
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 2 entities x 2 levels (0.00 MB)
## geography : 2 entities x 3 levels (0.01 MB)
## time : 1826 entities x 6 levels (0.12 MB)
## total size: 0.18 MB
# custom order of arguments matching by dimension name
dc[geography = c("GA","IA","AD"), currency = c("CNY","BTC")]
## <data.cube>
## fact:
## 164 rows x 5 dimensions x 2 measures (0.01 MB)
## dimensions:
## product : 32 entities x 5 levels (0.01 MB)
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 2 entities x 2 levels (0.00 MB)
## geography : 2 entities x 3 levels (0.01 MB)
## time : 1826 entities x 6 levels (0.12 MB)
## total size: 0.18 MB
# check dimensions
names(dc$dimensions)
## [1] "product" "customer" "currency" "geography" "time"
# foreign keys and measures
names(dc)
## [1] "prod_name" "cust_profile" "curr_name" "geog_abb"
## [5] "time_date" "amount" "value"
# use dimensions hierarchy attributes for slice and dice, mix filters from various levels in hierarchy
dc["Mazda RX4",, .(curr_type = "crypto"),, .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))]
## <data.cube>
## fact:
## 64 rows x 4 dimensions x 2 measures (0.01 MB)
## dimensions:
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 11 entities x 2 levels (0.01 MB)
## geography : 50 entities x 3 levels (0.02 MB)
## time : 181 entities x 6 levels (0.02 MB)
## total size: 0.07 MB
# same as above but more verbose
dc[product = "Mazda RX4",
customer = .(),
currency = .(curr_type = "crypto"),
geography = .(),
time = .(time_year = 2014L, time_quarter_name = c("Q1","Q2"))]
## <data.cube>
## fact:
## 64 rows x 4 dimensions x 2 measures (0.01 MB)
## dimensions:
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 11 entities x 2 levels (0.01 MB)
## geography : 50 entities x 3 levels (0.02 MB)
## time : 181 entities x 6 levels (0.02 MB)
## total size: 0.07 MB
# data.cube `[` operator returns another cube so queries can be chained
dc[,,, .(geog_region_name = "North Central")
][,,, .(geog_abb = c("IA","NV","MO")), .(time_year = 2014L)
]
## <data.cube>
## fact:
## 737 rows x 5 dimensions x 2 measures (0.05 MB)
## dimensions:
## product : 32 entities x 5 levels (0.01 MB)
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 49 entities x 2 levels (0.01 MB)
## geography : 2 entities x 3 levels (0.01 MB)
## time : 365 entities x 6 levels (0.04 MB)
## total size: 0.13 MB
Collapse dimension to sub-aggregate cube can be done with `-`
quoted symbol. It is possible to filter out on that dimension before collapsing it using filter similarly to .(...)
as `-`(...)
.
Filtering on attributes and group by customer and currency.
dc[product = `-`,
customer = .(),
currency = .(curr_type = "crypto"),
geography = `-`,
time = `-`(time_year = 2013:2014)]
## <data.cube>
## fact:
## 352 rows x 2 dimensions x 2 measures (0.01 MB)
## dimensions:
## customer : 32 entities x 4 levels (0.01 MB)
## currency : 11 entities x 2 levels (0.01 MB)
## total size: 0.03 MB
Memory data.cube vs array.
# ~1e5 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
dc = as.data.cube(populate_star(N=1e5))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3))
## [1] "array: 34.13 GB"
## fact table of *cube* object having multiple measures
sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3))
## [1] "data.cube: 0.01 GB"
# ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 1826
dc = as.data.cube(populate_star(N=1e6))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3))
## [1] "array: 34.13 GB"
## fact table of *cube* object having multiple measures
sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3))
## [1] "data.cube: 0.05 GB"
# ~1e6 facts for 5 dims of cardinalities: 32, 32, 49, 50, 3652
# twice bigger time dimension
dc = as.data.cube(populate_star(N=1e6, Y = c(2005L,2014L)))
## estimated size of memory required to store an base R `array` for single numeric measure
sprintf("array: %.2f GB", (prod(dim(dc)) * 8)/(1024^3))
## [1] "array: 68.26 GB"
## fact table of *cube* object having multiple measures
sprintf("data.cube: %.2f GB", as.numeric(object.size(dc$fact$data))/(1024^3))
## [1] "data.cube: 0.05 GB"
Memory cap is the most common issue on scalability. The is addressed by sharding fact table using big.data.table
class instead of data.table
in fact
class object. CI process includes basic testing of that feature on old cube
class, it wasn't yet implemented for data.cube
.
R single CPU limitation is now being addressed by parallelizing some data.table calls. This is now being developed in quite a low level and pretty well scalable way.
Sharding can also outsource CPU computation to nodes.