This function is a convenience wrapper around dplyr's summarize().
The user only needs to define a function on the dataset that returns a named vector or a named list (with atomic entries of length 1).
dplyr_wrapper(data, group_by, fun, check_fun = TRUE)
Argument | Description |
---|---|
data | (`dataframe`). A dataframe with a grouping variable. |
group_by | (`character()`). Name(s) of the column(s) containing the identifiers by which the dataset should be grouped, e.g. different user IDs. |
fun | (`function`). A function that takes a dataframe as input and returns a (named) vector or list of the desired length. |
check_fun | (`logical(1)`). If |
(`dataframe`)
# Number of used chrome apps fun1 = function(data) { c(uses_chrome = nrow( dplyr::filter(data, RUNNING_TASKS_baseActivity_mPackage == "com.android.chrome")) ) } dplyr_wrapper(data = studentlife_small, group_by = "userId", fun = fun1)#> userId uses_chrome #> 1 00 3104 #> 2 01 1635 #> 3 02 2811# mean, max, sd of a column fun2 = function(data) { c(mean_sepal_length = mean(data$Sepal.Length), max_sepal_length = max(data$Sepal.Length), sd_sepal_length = sd(data$Sepal.Length) ) } dplyr_wrapper(data = iris, group_by = "Species", fun = fun2)#> Species mean_sepal_length max_sepal_length sd_sepal_length #> 1 setosa 5.006 5.8 0.3524897 #> 2 versicolor 5.936 7.0 0.5161711 #> 3 virginica 6.588 7.9 0.6358796# return list fun3 = function(data) { list(mean_sepal_length = mean(data$Sepal.Length), max_sepal_length = max(data$Sepal.Length), sd_sepal_length = sd(data$Sepal.Length) ) } dplyr_wrapper(data = iris, group_by = "Species", fun = fun3)#> Species mean_sepal_length max_sepal_length sd_sepal_length #> 1 setosa 5.006 5.8 0.3524897 #> 2 versicolor 5.936 7.0 0.5161711 #> 3 virginica 6.588 7.9 0.6358796# group by two columns df = data.frame(id = c(rep(1, 10), rep(2, 10))) df$task = rep(c(rep("task1", 5), rep("task2", 5)), 2) df$hour = rep(c(rep("hour1", 3), rep("hour2", 2), rep("hour1", 2), rep("hour2", 3)), 2) df$x = 1:20 fun4 = function(data) c(mean_x = mean(data$x)) dplyr_wrapper(data = df, group_by = c("id", "task"), fun = fun4)#> Warning: `group_by_()` is deprecated as of dplyr 0.7.0. #> Please use `group_by()` instead. #> See vignette('programming') for more help #> This warning is displayed once every 8 hours. #> Call `lifecycle::last_warnings()` to see where this warning was generated.#> id task1 task2 #> 1 1 3 8 #> 2 2 13 18