-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Closed as not planned
Labels
Milestone
Description
I am testing out the dev version of dplyr and have noticed some performance regressions when using `summarize()` with a large number of groups. Calling `n()` with a large number of groups produces a ~400x increase in runtime, whereas using `max()` produces a ~10x increase in runtime.
Performance on 0.8.5
library(dplyr, warn.conflicts = FALSE)
packageVersion("dplyr")
#> [1] '0.8.5'
set.seed(42)
many_grps <- data.frame(grp = sample(1:1e5,
1e6,
replace = TRUE),
val = runif(1e6)) %>%
group_by(grp)
n_groups(many_grps)
#> [1] 99997
set.seed(42)
few_grps <- data.frame(grp = sample(1:100,
1e6,
replace = TRUE),
val = runif(1e6)) %>%
group_by(grp)
n_groups(few_grps)
#> [1] 100
microbenchmark::microbenchmark(summarize(many_grps, n = n()),
summarize(many_grps, m = max(val)),
summarize(few_grps, n = n()),
summarize(few_grps, m = max(val)),
times = 5,
unit = 'ms')
#> Unit: milliseconds
#> expr min lq mean median
#> summarize(many_grps, n = n()) 2.474665 2.531869 2.786606 2.743778
#> summarize(many_grps, m = max(val)) 17.693114 19.297248 22.774355 20.640482
#> summarize(few_grps, n = n()) 0.144234 0.154476 0.182271 0.175776
#> summarize(few_grps, m = max(val)) 8.792012 10.393226 11.908963 10.482098
#> uq max neval cld
#> 3.037315 3.145401 5 a
#> 27.791042 28.449888 5 c
#> 0.190168 0.246701 5 a
#> 14.835186 15.042294 5 b
Created on 2020-03-21 by the reprex package (v0.3.0)
Performance on current dev version
library(dplyr, warn.conflicts = FALSE)
packageVersion("dplyr")
#> [1] '0.8.99.9002'
set.seed(42)
many_grps <- data.frame(grp = sample(1:1e5,
1e6,
replace = TRUE),
val = runif(1e6)) %>%
group_by(grp)
n_groups(many_grps)
#> [1] 99997
set.seed(42)
few_grps <- data.frame(grp = sample(1:100,
1e6,
replace = TRUE),
val = runif(1e6)) %>%
group_by(grp)
n_groups(few_grps)
#> [1] 100
microbenchmark::microbenchmark(summarize(many_grps, n = n()),
summarize(many_grps, m = max(val)),
summarize(few_grps, n = n()),
summarize(few_grps, m = max(val)),
times = 5,
unit = 'ms')
#> Unit: milliseconds
#> expr min lq mean
#> summarize(many_grps, n = n()) 1129.489705 1170.905902 1177.632328
#> summarize(many_grps, m = max(val)) 164.942559 180.437278 212.037870
#> summarize(few_grps, n = n()) 2.286928 2.307006 2.408776
#> summarize(few_grps, m = max(val)) 13.396918 14.531448 15.026723
#> median uq max neval cld
#> 1177.479608 1188.72916 1221.55727 5 c
#> 205.139373 218.82458 290.84556 5 b
#> 2.324175 2.40646 2.71931 5 a
#> 14.850954 15.62192 16.73238 5 a
Created on 2020-03-21 by the reprex package (v0.3.0)
tungttnguyen, foundinblank, chrislim5, MajoroMask and seabbs