From 23116467d233682e6c9a900f91a81354db785633 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 5 Jan 2026 01:17:26 +0100 Subject: [PATCH 1/4] add fix, test and NEWS --- NEWS.md | 2 ++ R/data.table.R | 8 ++++++++ inst/tests/tests.Rraw | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/NEWS.md b/NEWS.md index 23e8d5c873..3f21d573e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,8 @@ 3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR. +4. Grouping operations with constant `list()` expressions in `j` are now optimized to avoid per-group allocation overhead, [#712](https://github.com/Rdatatable/data.table/issues/712). Thanks @macrakis for the report and @ben-schwen for the fix. + ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025 ### BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index 27c985e44c..a16e5c850f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1653,6 +1653,14 @@ replace_dot_alias = function(e) { if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit # Optimization to reduce overhead of calling lapply over and over for each group oldjsub = jsub + + # Optimization: unwrap constant list() expressions to avoid per-group allocation + # e.g., list(1) -> 1, where the value is a simple atomic constant + if (jsub %iscall% "list" && length(jsub) == 2L && !is.null(jsub[[2L]]) && !is.call(jsub[[2L]]) && is_constantish(jsub[[2L]])) { + jsub = jsub[[2L]] + if (verbose) catf("Optimized j from list(constant) to bare constant\n") + } + funi = 1L # Fix for #985 # converted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then. 
.massageSD = function(jsub) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fcf78e9f36..3f24ef6726 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21978,3 +21978,10 @@ local({ test(2357.1, fread(f), DT) test(2357.2, fread(paste0("file://", f)), DT) }) + +# dt[, j=list(var), by] is slower than dt[, j=var, by], #712 +dt = data.table(x=rep(1:3, 2L), y=1L) +test(2358.1, dt[, .(1), by=x, verbose=TRUE], dt[, 1, by=x], output="lapply optimization changed j from") +dt = data.table(x=1:5, key="x") +test(2358.2, dt[dt, list(1), by=.EACHI, verbose=TRUE], dt[dt, 1, by=.EACHI], output="lapply optimization changed j from") +test(2358.3, dt[dt, list(x), by=.EACHI, verbose=TRUE], dt[dt, x, by=.EACHI], output="lapply optimization changed j from") From 7f8dbdd143693de4188425057afb3bf57002125c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 17 Jan 2026 17:52:58 +0100 Subject: [PATCH 2/4] fix merge --- NEWS.md | 2 +- R/data.table.R | 18 ++++++++++++++++++ inst/tests/tests.Rraw | 6 +++--- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9bc050e7dc..6f3c3e452d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,7 +56,7 @@ 5. The data.table test suite is a bit more robust to lacking UTF-8 support via a new `requires_utf8` argument to `test()` to skip tests when UTF-8 support is not available, [#7336](https://github.com/Rdatatable/data.table/issues/7336). Thanks @MichaelChirico for the suggestion and @ben-schwen for the implementation. -4. Grouping operations with constant `list()` expressions in `j` are now optimized to avoid per-group allocation overhead, [#712](https://github.com/Rdatatable/data.table/issues/712). Thanks @macrakis for the report and @ben-schwen for the fix. +6. Grouping operations with constant `list()` expressions in `j` are now optimized to avoid per-group allocation overhead, [#712](https://github.com/Rdatatable/data.table/issues/712). 
Thanks @macrakis for the report and @ben-schwen for the fix. ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025 diff --git a/R/data.table.R b/R/data.table.R index fdb652bc67..407b485707 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -189,6 +189,18 @@ replace_dot_alias = function(e) { list(jsub=jsub, jvnames=jvnames, funi=funi+1L) } +# Unwrap a single-element list() in j to avoid per-group list allocation overhead +# e.g., list(1) -> 1 or list(x) -> x, where the sole element is a bare constant or symbol accepted by is_constantish(), #712 +# return NULL for no optimization possible +.optimize_constant_list = function(jsub) { + if (!jsub %iscall% "list") return(NULL) + if (length(jsub) != 2L) return(NULL) + if (is.null(jsub[[2L]])) return(NULL) + if (is.call(jsub[[2L]])) return(NULL) + if (!is_constantish(jsub[[2L]])) return(NULL) + jsub[[2L]] +} + # Optimize .SD subsetting patterns like .SD[1], head(.SD), first(.SD) # return NULL for no optimization possible .optimize_sd_subset = function(jsub, sdvars, SDenv, envir) { @@ -505,6 +517,12 @@ replace_dot_alias = function(e) { return(list(GForce=FALSE, jsub=jsub, jvnames=jvnames)) } + # Step 0: Unwrap single-element list() to avoid per-group allocation, #712; both deparse() calls are capped to one line so the message prints once + if (!is.null(unwrapped_consts <- .optimize_constant_list(jsub))) { + if (verbose) catf("Optimized j from '%s' to bare constant '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L), deparse(unwrapped_consts, width.cutoff=200L, nlines=1L)) + jsub = unwrapped_consts + } + # Step 1: Apply lapply(.SD) optimization lapply_result = .optimize_lapply(jsub, jvnames, sdvars, SDenv, verbose, envir) jsub = lapply_result$jsub diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5da639d76a..850c6e2d29 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21487,7 +21487,7 @@ test(2362.53, optimize=0:2, dt[, list(lapply(.SD, sum), list()), b, verbose=TRUE # dt[, j=list(var), by] is slower than dt[, j=var, by], #712 dt = data.table(x=rep(1:3, 2L), y=1L) -test(2363.1, dt[, .(1), by=x, 
verbose=TRUE], dt[, 1, by=x], output="lapply optimization changed j from") +test(2363.1, dt[, .(1), by=x, verbose=TRUE], dt[, 1, by=x], output="Optimized j from.*to bare constant") dt = data.table(x=1:5, key="x") -test(2363.2, dt[dt, list(1), by=.EACHI, verbose=TRUE], dt[dt, 1, by=.EACHI], output="lapply optimization changed j from") -test(2363.3, dt[dt, list(x), by=.EACHI, verbose=TRUE], dt[dt, x, by=.EACHI], output="lapply optimization changed j from") +test(2363.2, dt[dt, list(1), by=.EACHI, verbose=TRUE], dt[dt, 1, by=.EACHI], output="Optimized j from.*to bare constant") +test(2363.3, dt[dt, list(x), by=.EACHI, verbose=TRUE], dt[dt, x, by=.EACHI], output="Optimized j from.*to bare constant") From bb738718b8d21722d0a7accd9a4c340b9ac73fd4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 17 Jan 2026 18:10:37 +0100 Subject: [PATCH 3/4] remove not needed cases --- R/data.table.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 407b485707..c971dba852 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -195,8 +195,6 @@ replace_dot_alias = function(e) { .optimize_constant_list = function(jsub) { if (!jsub %iscall% "list") return(NULL) if (length(jsub) != 2L) return(NULL) - if (is.null(jsub[[2L]])) return(NULL) - if (is.call(jsub[[2L]])) return(NULL) if (!is_constantish(jsub[[2L]])) return(NULL) jsub[[2L]] } From d58528349aa227c85de40ebd719181d8cfefa47f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 17 Jan 2026 18:13:02 +0100 Subject: [PATCH 4/4] restore comment --- R/data.table.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/data.table.R b/R/data.table.R index c971dba852..c3896ab0de 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2039,6 +2039,7 @@ replace_dot_alias = function(e) { SDenv$.NGRP = length(f__) lockBinding(".NGRP", SDenv) + # Determine GForce-optimized query gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, 
parent.frame()) GForce = gforce_result$GForce jsub = gforce_result$jsub