From d94889a96a24d9f49be80a80de01e93edd4e0ac8 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 16:32:08 -0500 Subject: [PATCH 01/26] Add udf_preimage logic --- datafusion/expr/src/udf.rs | 63 ++++++++++ .../simplify_expressions/expr_simplifier.rs | 64 +++++++++- .../optimizer/src/simplify_expressions/mod.rs | 1 + .../src/simplify_expressions/udf_preimage.rs | 112 ++++++++++++++++++ 4 files changed, 237 insertions(+), 3 deletions(-) create mode 100644 datafusion/optimizer/src/simplify_expressions/udf_preimage.rs diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 0654370ac7ebf..09c62278c2b00 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -232,6 +232,25 @@ impl ScalarUDF { self.inner.is_nullable(args, schema) } + /// Return a preimage + /// + /// See [`ScalarUDFImpl::preimage`] for more details. + pub fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &SimplifyContext, + ) -> Result> { + self.inner.preimage(args, lit_expr, info) + } + + /// Return inner column from function args + /// + /// See [`ScalarUDFImpl::column_expr`] + pub fn column_expr(&self, args: &[Expr]) -> Option { + self.inner.column_expr(args) + } + /// Invoke the function on `args`, returning the appropriate result. /// /// See [`ScalarUDFImpl::invoke_with_args`] for details. @@ -696,6 +715,37 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { Ok(ExprSimplifyResult::Original(args)) } + /// Returns the [preimage] for this function and the specified scalar value, if any. + /// + /// A preimage is a single contiguous [`Interval`] of values where the function + /// will always return `lit_value` + /// + /// This rewrite is described in the [ClickHouse Paper] and is particularly + /// useful for simplifying expressions `date_part` or equivalent functions. 
The + /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you + /// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates + /// covering the entire year of 2024. Thus, you can rewrite the expression to `k + /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. + /// + /// This should only return a preimage if the function takes a single argument + /// + /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf + /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image + fn preimage( + &self, + _args: &[Expr], + _lit_expr: &Expr, + _info: &SimplifyContext, + ) -> Result> { + Ok(None) + } + + // Return the inner column expression from this function + fn column_expr(&self, _args: &[Expr]) -> Option { + None + } + + /// Returns true if some of this `exprs` subexpressions may not be evaluated /// and thus any side effects (like divide by zero) may not be encountered. /// @@ -926,6 +976,19 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.inner.simplify(args, info) } + fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &SimplifyContext, + ) -> Result> { + self.inner.preimage(args, lit_expr, info) + } + + fn column_expr(&self, args: &[Expr]) -> Option { + self.inner.column_expr(args) + } + fn conditional_arguments<'a>( &self, args: &'a [Expr], diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 55bff5849c5cb..15a58683a54b4 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -38,8 +38,7 @@ use datafusion_common::{ tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_expr::{ - BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, - binary::BinaryTypeCoercer, lit, or, + BinaryExpr, Case, 
ColumnarValue, Expr, Like, Operator, Volatility, and, binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -51,7 +50,7 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::analyzer::type_coercion::TypeCoercionRewriter; +use crate::{analyzer::type_coercion::TypeCoercionRewriter, simplify_expressions::udf_preimage::rewrite_with_preimage}; use crate::simplify_expressions::SimplifyContext; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ @@ -1952,12 +1951,71 @@ impl TreeNodeRewriter for Simplifier<'_> { })) } + // ======================================= + // preimage_in_comparison + // ======================================= + // + // For case: + // date_part(expr as 'YEAR') op literal + // + // Background: + // Datasources such as Parquet can prune partitions using simple predicates, + // but they cannot do so for complex expressions. + // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. + // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if get_preimage(&left, &right, info)?.0.is_some() + && get_preimage(&left, &right, info)?.1.is_some() => + { + // todo use let binding (if let Some(interval) = ...) once stabilized to avoid computing this thrice😢 + let (Some(interval), Some(col_expr)) = + get_preimage(left.as_ref(), &right, info)? + else { + unreachable!( + "The above if statement insures interval and col_expr are Some" + ) + }; + rewrite_with_preimage(info, interval, op, Box::new(col_expr))? 
+ } + // literal op date_part(literal, expression) + // --> + // date_part(literal, expression) op_swap literal + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if get_preimage(&right, &left, info)?.0.is_some() + && get_preimage(&right, &left, info)?.1.is_some() + && op.swap().is_some() => + { + let swapped = op.swap().unwrap(); + let (Some(interval), Some(col_expr)) = get_preimage(&right, &left, info)? + else { + unreachable!( + "The above if statement insures interval and col_expr are Some" + ) + }; + rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? + } + + // no additional rewrites possible expr => Transformed::no(expr), }) } } +fn get_preimage( + left_expr: &Expr, + right_expr: &Expr, + info: &SimplifyContext, +) -> Result<(Option, Option)> { + let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { + return Ok((None, None)); + }; + Ok(( + func.preimage(args, right_expr, info)?, + func.column_expr(args), + )) +} + fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs index 3ab76119cca84..b85b000821ad8 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -24,6 +24,7 @@ mod regex; pub mod simplify_exprs; pub mod simplify_literal; mod simplify_predicates; +mod udf_preimage; mod unwrap_cast; mod utils; diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs new file mode 100644 index 0000000000000..61317f2a5dd76 --- /dev/null +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::{Result, internal_err, tree_node::Transformed}; +use datafusion_expr::{BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext}; +use datafusion_expr_common::interval_arithmetic::Interval; + +/// Rewrites a binary expression using its "preimage" +/// +/// Specifically it rewrites expressions of the form ` OP x` (e.g. ` = +/// x`) where `` is known to have a pre-image (aka the entire single +/// range for which it is valid) +/// +/// This rewrite is described in the [ClickHouse Paper] and is particularly +/// useful for simplifying expressions `date_part` or equivalent functions. The +/// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you +/// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates +/// covering the entire year of 2024. Thus, you can rewrite the expression to `k +/// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. 
+/// +/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf +/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image +/// +pub(super) fn rewrite_with_preimage( + _info: &SimplifyContext, + preimage_interval: Interval, + op: Operator, + expr: Box, +) -> Result> { + let (lower, upper) = preimage_interval.into_bounds(); + let (lower, upper) = (lit(lower), lit(upper)); + + let rewritten_expr = match op { + // < x ==> < upper + // >= x ==> >= lower + Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op, + right: Box::new(lower), + }), + // > x ==> >= upper + Operator::Gt => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(upper), + }), + // <= x ==> < upper + Operator::LtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(upper), + }), + // = x ==> ( >= lower) and ( < upper) + // + // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) + // but since x is always not NULL => ( >= lower) and ( < upper) + Operator::Eq | Operator::IsNotDistinctFrom => and( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(lower), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(upper), + }), + ), + // != x ==> ( < lower) or ( >= upper) + Operator::NotEq => or( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(upper), + }), + ), + // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is not NULL) or ( is not NULL and x is NULL) + // but given that x is always not NULL => ( < lower) or ( >= upper) or ( is NULL) + Operator::IsDistinctFrom => Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower.clone()), + }) + 
.or(Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(upper), + })) + .or(expr.is_null()), + _ => return internal_err!("Expect comparison operators"), + }; + Ok(Transformed::yes(rewritten_expr)) +} \ No newline at end of file From 4aa7f4e7d32688c01b00670ea8a54ed351937b9f Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 20:37:19 -0500 Subject: [PATCH 02/26] Cargo fmt --- datafusion/expr/src/udf.rs | 3 +-- .../src/simplify_expressions/expr_simplifier.rs | 9 ++++++--- .../optimizer/src/simplify_expressions/udf_preimage.rs | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 09c62278c2b00..55c50f6dd8567 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -232,7 +232,7 @@ impl ScalarUDF { self.inner.is_nullable(args, schema) } - /// Return a preimage + /// Return a preimage /// /// See [`ScalarUDFImpl::preimage`] for more details. pub fn preimage( @@ -745,7 +745,6 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { None } - /// Returns true if some of this `exprs` subexpressions may not be evaluated /// and thus any side effects (like divide by zero) may not be encountered. 
/// diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 15a58683a54b4..5dd19ef152c07 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -38,7 +38,8 @@ use datafusion_common::{ tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_expr::{ - BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or + BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, + binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or, }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -50,7 +51,6 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::{analyzer::type_coercion::TypeCoercionRewriter, simplify_expressions::udf_preimage::rewrite_with_preimage}; use crate::simplify_expressions::SimplifyContext; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ @@ -58,6 +58,10 @@ use crate::simplify_expressions::unwrap_cast::{ is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist, unwrap_cast_in_comparison_for_binary, }; +use crate::{ + analyzer::type_coercion::TypeCoercionRewriter, + simplify_expressions::udf_preimage::rewrite_with_preimage, +}; use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map; use datafusion_expr_common::casts::try_cast_literal_to_type; use indexmap::IndexSet; @@ -1995,7 +1999,6 @@ impl TreeNodeRewriter for Simplifier<'_> { rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? 
} - // no additional rewrites possible expr => Transformed::no(expr), }) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 61317f2a5dd76..465a23318a372 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -16,7 +16,9 @@ // under the License. use datafusion_common::{Result, internal_err, tree_node::Transformed}; -use datafusion_expr::{BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext}; +use datafusion_expr::{ + BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext, +}; use datafusion_expr_common::interval_arithmetic::Interval; /// Rewrites a binary expression using its "preimage" @@ -109,4 +111,4 @@ pub(super) fn rewrite_with_preimage( _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) -} \ No newline at end of file +} From 2329c12966df305ffa779bef522990c0096b29d7 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:27:00 -0500 Subject: [PATCH 03/26] Fix err in rewrite_with_preimage --- .../src/simplify_expressions/udf_preimage.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 465a23318a372..f31a360d7c4d7 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -48,11 +48,10 @@ pub(super) fn rewrite_with_preimage( let rewritten_expr = match op { // < x ==> < upper - // >= x ==> >= lower - Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + Operator::Lt => Expr::BinaryExpr(BinaryExpr { left: expr, op, - right: Box::new(lower), + right: Box::new(upper), }), // > x ==> >= upper Operator::Gt => Expr::BinaryExpr(BinaryExpr { @@ -66,6 
+65,12 @@ pub(super) fn rewrite_with_preimage( op: Operator::Lt, right: Box::new(upper), }), + // >= x ==> >= lower + Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(lower), + }), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) From 7ac83255cf90c4e17f64f6f17431084d240d8ab6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:28:25 -0500 Subject: [PATCH 04/26] Rewrite the preimage_in_comparison --- .../simplify_expressions/expr_simplifier.rs | 82 ++++++++++++------- 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 5dd19ef152c07..9ca6c2b2e42c6 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1967,36 +1967,43 @@ impl TreeNodeRewriter for Simplifier<'_> { // but they cannot do so for complex expressions. // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. - Expr::BinaryExpr(BinaryExpr { left, op, right }) - if get_preimage(&left, &right, info)?.0.is_some() - && get_preimage(&left, &right, info)?.1.is_some() => - { - // todo use let binding (if let Some(interval) = ...) once stabilized to avoid computing this thrice😢 - let (Some(interval), Some(col_expr)) = - get_preimage(left.as_ref(), &right, info)? - else { - unreachable!( - "The above if statement insures interval and col_expr are Some" - ) - }; - rewrite_with_preimage(info, interval, op, Box::new(col_expr))? 
- } - // literal op date_part(literal, expression) - // --> - // date_part(literal, expression) op_swap literal - Expr::BinaryExpr(BinaryExpr { left, op, right }) - if get_preimage(&right, &left, info)?.0.is_some() - && get_preimage(&right, &left, info)?.1.is_some() - && op.swap().is_some() => - { - let swapped = op.swap().unwrap(); - let (Some(interval), Some(col_expr)) = get_preimage(&right, &left, info)? - else { - unreachable!( - "The above if statement insures interval and col_expr are Some" - ) - }; - rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? + // NOTE: we only consider immutable UDFs with literal RHS values + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + let is_preimage_op = matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom + ); + if !is_preimage_op { + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + } + + if let (Some(interval), Some(col_expr)) = + get_preimage(left.as_ref(), right.as_ref(), info)? + { + rewrite_with_preimage(info, interval, op, Box::new(col_expr))? + } else if let Some(swapped) = op.swap() { + if let (Some(interval), Some(col_expr)) = + get_preimage(right.as_ref(), left.as_ref(), info)? + { + rewrite_with_preimage( + info, + interval, + swapped, + Box::new(col_expr), + )? 
+ } else { + Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) + } + } else { + Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) + } } // no additional rewrites possible @@ -2013,12 +2020,27 @@ fn get_preimage( let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { return Ok((None, None)); }; + if !is_literal_or_literal_cast(right_expr) { + return Ok((None, None)); + } + if func.signature().volatility != Volatility::Immutable { + return Ok((None, None)); + } Ok(( func.preimage(args, right_expr, info)?, func.column_expr(args), )) } +fn is_literal_or_literal_cast(expr: &Expr) -> bool { + match expr { + Expr::Literal(_, _) => true, + Expr::Cast(Cast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), + Expr::TryCast(TryCast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), + _ => false, + } +} + fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), From 7a3e8b3a7ed100662ec2228fa47cce3a778128fb Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:30:45 -0500 Subject: [PATCH 05/26] cargo fmt --- .../src/simplify_expressions/expr_simplifier.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 9ca6c2b2e42c6..07e3edf026c53 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1981,7 +1981,11 @@ impl TreeNodeRewriter for Simplifier<'_> { | Operator::IsNotDistinctFrom ); if !is_preimage_op { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); } if let (Some(interval), Some(col_expr)) = @@ -2036,7 +2040,9 @@ fn 
is_literal_or_literal_cast(expr: &Expr) -> bool { match expr { Expr::Literal(_, _) => true, Expr::Cast(Cast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), - Expr::TryCast(TryCast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), + Expr::TryCast(TryCast { expr, .. }) => { + matches!(expr.as_ref(), Expr::Literal(_, _)) + } _ => false, } } From fbd5dcce014d1b09d54556f38c33b435a8329baf Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:53:40 -0500 Subject: [PATCH 06/26] Fix ci --- .../src/simplify_expressions/expr_simplifier.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 07e3edf026c53..c99978a1cc4c6 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1969,16 +1969,16 @@ impl TreeNodeRewriter for Simplifier<'_> { // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. 
// NOTE: we only consider immutable UDFs with literal RHS values Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + use datafusion_expr::Operator::*; let is_preimage_op = matches!( op, - Operator::Eq - | Operator::NotEq - | Operator::Lt - | Operator::LtEq - | Operator::Gt - | Operator::GtEq - | Operator::IsDistinctFrom - | Operator::IsNotDistinctFrom + Eq | NotEq + | Lt + | LtEq + | Gt + | GtEq + | IsDistinctFrom + | IsNotDistinctFrom ); if !is_preimage_op { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { From d9207352a327f08245a1d1716d6450d05bbfab84 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 10 Jan 2026 12:13:03 -0500 Subject: [PATCH 07/26] Fix GtEq, Lt logic --- .../src/simplify_expressions/udf_preimage.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index f31a360d7c4d7..960c8df322d15 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -47,11 +47,12 @@ pub(super) fn rewrite_with_preimage( let (lower, upper) = (lit(lower), lit(upper)); let rewritten_expr = match op { - // < x ==> < upper - Operator::Lt => Expr::BinaryExpr(BinaryExpr { + // < x ==> < lower + // >= x ==> >= lower + Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { left: expr, op, - right: Box::new(upper), + right: Box::new(lower), }), // > x ==> >= upper Operator::Gt => Expr::BinaryExpr(BinaryExpr { @@ -65,12 +66,6 @@ pub(super) fn rewrite_with_preimage( op: Operator::Lt, right: Box::new(upper), }), - // >= x ==> >= lower - Operator::GtEq => Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(lower), - }), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) From c2b0cd39b5b1d75e853490b748f5e1faaa55ba06 Mon Sep 
17 00:00:00 2001 From: sdf-jkl Date: Sun, 18 Jan 2026 16:50:34 -0500 Subject: [PATCH 08/26] Replace BinaryExpression with binary_expr() fn --- .../src/simplify_expressions/udf_preimage.rs | 58 ++++--------------- 1 file changed, 11 insertions(+), 47 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 960c8df322d15..4bfa98d679292 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -17,7 +17,7 @@ use datafusion_common::{Result, internal_err, tree_node::Transformed}; use datafusion_expr::{ - BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext, + Expr, Operator, and, binary_expr, lit, or, simplify::SimplifyContext, }; use datafusion_expr_common::interval_arithmetic::Interval; @@ -49,65 +49,29 @@ pub(super) fn rewrite_with_preimage( let rewritten_expr = match op { // < x ==> < lower // >= x ==> >= lower - Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { - left: expr, - op, - right: Box::new(lower), - }), + Operator::Lt | Operator::GtEq => binary_expr(*expr, op, lower), // > x ==> >= upper - Operator::Gt => Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(upper), - }), + Operator::Gt => binary_expr(*expr, Operator::GtEq, upper), // <= x ==> < upper - Operator::LtEq => Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::Lt, - right: Box::new(upper), - }), + Operator::LtEq => binary_expr(*expr, Operator::Lt, upper), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) // but since x is always not NULL => ( >= lower) and ( < upper) Operator::Eq | Operator::IsNotDistinctFrom => and( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(lower), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, 
- op: Operator::Lt, - right: Box::new(upper), - }), + binary_expr(*expr.clone(), Operator::GtEq, lower), + binary_expr(*expr, Operator::Lt, upper), ), // != x ==> ( < lower) or ( >= upper) Operator::NotEq => or( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::Lt, - right: Box::new(lower), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(upper), - }), + binary_expr(*expr.clone(), Operator::Lt, lower), + binary_expr(*expr, Operator::GtEq, upper), ), // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is not NULL) or ( is not NULL and x is NULL) // but given that x is always not NULL => ( < lower) or ( >= upper) or ( is NULL) - Operator::IsDistinctFrom => Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::Lt, - right: Box::new(lower.clone()), - }) - .or(Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(upper), - })) - .or(expr.is_null()), + Operator::IsDistinctFrom => binary_expr(*expr.clone(), Operator::Lt, lower) + .or(binary_expr(*expr.clone(), Operator::GtEq, upper)) + .or(expr.is_null()), _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) From a0b6564eb0403ec1780cc87baa7de2a4cfb670fc Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 18 Jan 2026 20:46:17 -0500 Subject: [PATCH 09/26] Add unit tests + add doc part about upper bound --- .../src/simplify_expressions/udf_preimage.rs | 196 +++++++++++++++++- 1 file changed, 194 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 4bfa98d679292..f03426ddf52c3 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -31,8 +31,9 @@ use datafusion_expr_common::interval_arithmetic::Interval; /// useful for simplifying expressions 
`date_part` or equivalent functions. The /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you /// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates -/// covering the entire year of 2024. Thus, you can rewrite the expression to `k -/// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. +/// covering the entire year of 2024. Thus, you can rewrite the expression to +/// `k >= '2024-01-01' AND k < '2025-01-01'`, which uses an inclusive lower bound +/// and exclusive upper bound and is often more optimizable. /// /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image @@ -76,3 +77,194 @@ pub(super) fn rewrite_with_preimage( }; Ok(Transformed::yes(rewritten_expr)) } + +#[cfg(test)] +mod test { + use std::any::Any; + use std::sync::Arc; + + use arrow::datatypes::{DataType, Field}; + use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; + use datafusion_expr::{ + ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, + Signature, Volatility, and, binary_expr, col, expr::ScalarFunction, lit, + simplify::SimplifyContext, + }; + + use super::Interval; + use crate::simplify_expressions::ExprSimplifier; + + #[derive(Debug, PartialEq, Eq, Hash)] + struct PreimageUdf { + signature: Signature, + } + + impl ScalarUDFImpl for PreimageUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "preimage_func" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Int32) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + Ok(ColumnarValue::Scalar(ScalarValue::Int32(Some(500)))) + } + + fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + _info: &SimplifyContext, + ) -> Result> { + if args.len() != 1 { + return Ok(None); + } + match 
lit_expr { + Expr::Literal(ScalarValue::Int32(Some(500)), _) => { + Ok(Some(Interval::try_new( + ScalarValue::Int32(Some(100)), + ScalarValue::Int32(Some(200)), + )?)) + } + _ => Ok(None), + } + } + + fn column_expr(&self, args: &[Expr]) -> Option { + args.get(0).cloned() + } + } + + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { + let simplifier = ExprSimplifier::new( + SimplifyContext::default().with_schema(Arc::clone(schema)), + ); + + simplifier.simplify(expr).unwrap() + } + + fn preimage_udf_expr() -> Expr { + let udf = ScalarUDF::new_from_impl(PreimageUdf { + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + }); + + Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(udf), vec![col("x")])) + } + + fn test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![Field::new("x", DataType::Int32, false)].into(), + Default::default(), + ) + .unwrap(), + ) + } + + #[test] + fn test_preimage_eq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::Eq, lit(500)); + let expected = and( + binary_expr(col("x"), Operator::GtEq, lit(100)), + binary_expr(col("x"), Operator::Lt, lit(200)), + ); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_noteq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::NotEq, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(100)).or(binary_expr( + col("x"), + Operator::GtEq, + lit(200), + )); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_eq_rewrite_swapped() { + let schema = test_schema(); + let expr = binary_expr(lit(500), Operator::Eq, preimage_udf_expr()); + let expected = and( + binary_expr(col("x"), Operator::GtEq, lit(100)), + binary_expr(col("x"), Operator::Lt, lit(200)), + ); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn 
test_preimage_lt_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::Lt, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(100)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_lteq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::LtEq, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(200)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_gt_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::Gt, lit(500)); + let expected = binary_expr(col("x"), Operator::GtEq, lit(200)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_gteq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::GtEq, lit(500)); + let expected = binary_expr(col("x"), Operator::GtEq, lit(100)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_is_not_distinct_from_rewrite() { + let schema = test_schema(); + let expr = + binary_expr(preimage_udf_expr(), Operator::IsNotDistinctFrom, lit(500)); + let expected = and( + binary_expr(col("x"), Operator::GtEq, lit(100)), + binary_expr(col("x"), Operator::Lt, lit(200)), + ); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_is_distinct_from_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::IsDistinctFrom, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(100)) + .or(binary_expr(col("x"), Operator::GtEq, lit(200))) + .or(col("x").is_null()); + + assert_eq!(optimize_test(expr, &schema), expected); + } +} From 0a24d6096938b67c8930025beb459d2263d63882 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 18 Jan 2026 20:55:19 -0500 Subject: [PATCH 10/26] Fix docs --- 
datafusion/expr/src/udf.rs | 6 +++++- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 55c50f6dd8567..affb439d81fd7 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -720,6 +720,9 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// A preimage is a single contiguous [`Interval`] of values where the function /// will always return `lit_value` /// + /// Implementations should return intervals with an inclusive lower bound and + /// exclusive upper bound. + /// /// This rewrite is described in the [ClickHouse Paper] and is particularly /// useful for simplifying expressions `date_part` or equivalent functions. The /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you @@ -727,7 +730,8 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// covering the entire year of 2024. Thus, you can rewrite the expression to `k /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. /// - /// This should only return a preimage if the function takes a single argument + /// Implementations must also provide [`ScalarUDFImpl::column_expr`] so the + /// optimizer can identify which argument maps to the preimage interval. 
/// /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index d6aa3d9ccf341..38e78eda5c7f6 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1977,14 +1977,17 @@ impl TreeNodeRewriter for Simplifier<'_> { // ======================================= // // For case: - // date_part(expr as 'YEAR') op literal + // date_part('YEAR', expr) op literal // // Background: // Datasources such as Parquet can prune partitions using simple predicates, // but they cannot do so for complex expressions. // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. - // NOTE: we only consider immutable UDFs with literal RHS values + // Rewrites use inclusive lower and exclusive upper bounds when + // translating an equality into a range. + // NOTE: we only consider immutable UDFs with literal RHS values and + // UDFs that provide both `preimage` and `column_expr`. 
Expr::BinaryExpr(BinaryExpr { left, op, right }) => { use datafusion_expr::Operator::*; let is_preimage_op = matches!( From 59235de4b398aa275165880e1515725a1299302d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jan 2026 13:34:32 -0500 Subject: [PATCH 11/26] clippy --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index f03426ddf52c3..a4ec0bf6aeafe 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -141,7 +141,7 @@ mod test { } fn column_expr(&self, args: &[Expr]) -> Option { - args.get(0).cloned() + args.first().cloned() } } From 9f845e74245c1903abd49d034d10316623b321a2 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 19 Jan 2026 14:40:17 -0500 Subject: [PATCH 12/26] Make test field nullable --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index a4ec0bf6aeafe..8e7278cb0fb45 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -164,7 +164,7 @@ mod test { fn test_schema() -> DFSchemaRef { Arc::new( DFSchema::from_unqualified_fields( - vec![Field::new("x", DataType::Int32, false)].into(), + vec![Field::new("x", DataType::Int32, true)].into(), Default::default(), ) .unwrap(), From 510b5bc11d49e91fbb2817df4d2c286a7125898d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jan 2026 19:55:43 -0500 Subject: [PATCH 13/26] Add tests for additional cases --- .../src/simplify_expressions/udf_preimage.rs | 217 +++++++++++++----- 1 file changed, 156 insertions(+), 61 
deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index a4ec0bf6aeafe..50959719da61c 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -16,9 +16,7 @@ // under the License. use datafusion_common::{Result, internal_err, tree_node::Transformed}; -use datafusion_expr::{ - Expr, Operator, and, binary_expr, lit, or, simplify::SimplifyContext, -}; +use datafusion_expr::{Expr, Operator, and, lit, or, simplify::SimplifyContext}; use datafusion_expr_common::interval_arithmetic::Interval; /// Rewrites a binary expression using its "preimage" @@ -46,32 +44,32 @@ pub(super) fn rewrite_with_preimage( ) -> Result> { let (lower, upper) = preimage_interval.into_bounds(); let (lower, upper) = (lit(lower), lit(upper)); + let expr = *expr; let rewritten_expr = match op { // < x ==> < lower // >= x ==> >= lower - Operator::Lt | Operator::GtEq => binary_expr(*expr, op, lower), + Operator::Lt => expr.lt(lower), + Operator::GtEq => expr.gt_eq(lower), // > x ==> >= upper - Operator::Gt => binary_expr(*expr, Operator::GtEq, upper), + Operator::Gt => expr.gt_eq(upper), // <= x ==> < upper - Operator::LtEq => binary_expr(*expr, Operator::Lt, upper), + Operator::LtEq => expr.lt(upper), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) // but since x is always not NULL => ( >= lower) and ( < upper) - Operator::Eq | Operator::IsNotDistinctFrom => and( - binary_expr(*expr.clone(), Operator::GtEq, lower), - binary_expr(*expr, Operator::Lt, upper), - ), + Operator::Eq | Operator::IsNotDistinctFrom => { + and(expr.clone().gt_eq(lower), expr.lt(upper)) + } // != x ==> ( < lower) or ( >= upper) - Operator::NotEq => or( - binary_expr(*expr.clone(), Operator::Lt, lower), - binary_expr(*expr, Operator::GtEq, upper), - ), + 
Operator::NotEq => or(expr.clone().lt(lower), expr.gt_eq(upper)), // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is not NULL) or ( is not NULL and x is NULL) // but given that x is always not NULL => ( < lower) or ( >= upper) or ( is NULL) - Operator::IsDistinctFrom => binary_expr(*expr.clone(), Operator::Lt, lower) - .or(binary_expr(*expr.clone(), Operator::GtEq, upper)) + Operator::IsDistinctFrom => expr + .clone() + .lt(lower) + .or(expr.clone().gt_eq(upper)) .or(expr.is_null()), _ => return internal_err!("Expect comparison operators"), }; @@ -86,17 +84,56 @@ mod test { use arrow::datatypes::{DataType, Field}; use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::{ - ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, - Signature, Volatility, and, binary_expr, col, expr::ScalarFunction, lit, - simplify::SimplifyContext, + BinaryExpr, ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, + ScalarUDFImpl, Signature, Volatility, and, col, lit, simplify::SimplifyContext, }; use super::Interval; use crate::simplify_expressions::ExprSimplifier; + fn is_distinct_from(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::IsDistinctFrom, + right: Box::new(right), + }) + } + + fn is_not_distinct_from(left: Expr, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op: Operator::IsNotDistinctFrom, + right: Box::new(right), + }) + } + #[derive(Debug, PartialEq, Eq, Hash)] struct PreimageUdf { + /// Defaults to an exact signature with one Int32 argument and Immutable volatility signature: Signature, + /// If true, returns a preimage; otherwise, returns None + enabled: bool, + } + + impl PreimageUdf { + fn new() -> Self { + Self { + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + enabled: true, + } + } + + /// Set the enabled flag + fn with_enabled(mut self, enabled: bool) -> 
Self { + self.enabled = enabled; + self + } + + /// Set the volatility + fn with_volatility(mut self, volatility: Volatility) -> Self { + self.signature.volatility = volatility; + self + } } impl ScalarUDFImpl for PreimageUdf { @@ -126,6 +163,9 @@ mod test { lit_expr: &Expr, _info: &SimplifyContext, ) -> Result> { + if !self.enabled { + return Ok(None); + } if args.len() != 1 { return Ok(None); } @@ -146,19 +186,24 @@ mod test { } fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { - let simplifier = ExprSimplifier::new( - SimplifyContext::default().with_schema(Arc::clone(schema)), - ); - - simplifier.simplify(expr).unwrap() + let simplify_context = SimplifyContext::default().with_schema(Arc::clone(schema)); + ExprSimplifier::new(simplify_context) + .simplify(expr) + .unwrap() } fn preimage_udf_expr() -> Expr { - let udf = ScalarUDF::new_from_impl(PreimageUdf { - signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), - }); + ScalarUDF::new_from_impl(PreimageUdf::new()).call(vec![col("x")]) + } - Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(udf), vec![col("x")])) + fn non_immutable_udf_expr() -> Expr { + ScalarUDF::new_from_impl(PreimageUdf::new().with_volatility(Volatility::Volatile)) + .call(vec![col("x")]) + } + + fn no_preimage_udf_expr() -> Expr { + ScalarUDF::new_from_impl(PreimageUdf::new().with_enabled(false)) + .call(vec![col("x")]) } fn test_schema() -> DFSchemaRef { @@ -171,100 +216,150 @@ mod test { ) } + fn test_schema_xy() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Int32, false), + ] + .into(), + Default::default(), + ) + .unwrap(), + ) + } + #[test] fn test_preimage_eq_rewrite() { + // Equality rewrite when preimage and column expression are available. 
let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::Eq, lit(500)); - let expected = and( - binary_expr(col("x"), Operator::GtEq, lit(100)), - binary_expr(col("x"), Operator::Lt, lit(200)), - ); + let expr = preimage_udf_expr().eq(lit(500)); + let expected = and(col("x").gt_eq(lit(100)), col("x").lt(lit(200))); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_noteq_rewrite() { + // Inequality rewrite expands to disjoint ranges. let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::NotEq, lit(500)); - let expected = binary_expr(col("x"), Operator::Lt, lit(100)).or(binary_expr( - col("x"), - Operator::GtEq, - lit(200), - )); + let expr = preimage_udf_expr().not_eq(lit(500)); + let expected = col("x").lt(lit(100)).or(col("x").gt_eq(lit(200))); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_eq_rewrite_swapped() { + // Equality rewrite works when the literal appears on the left. let schema = test_schema(); - let expr = binary_expr(lit(500), Operator::Eq, preimage_udf_expr()); - let expected = and( - binary_expr(col("x"), Operator::GtEq, lit(100)), - binary_expr(col("x"), Operator::Lt, lit(200)), - ); + let expr = lit(500).eq(preimage_udf_expr()); + let expected = and(col("x").gt_eq(lit(100)), col("x").lt(lit(200))); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_lt_rewrite() { + // Less-than comparison rewrites to the lower bound. let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::Lt, lit(500)); - let expected = binary_expr(col("x"), Operator::Lt, lit(100)); + let expr = preimage_udf_expr().lt(lit(500)); + let expected = col("x").lt(lit(100)); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_lteq_rewrite() { + // Less-than-or-equal comparison rewrites to the upper bound. 
let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::LtEq, lit(500)); - let expected = binary_expr(col("x"), Operator::Lt, lit(200)); + let expr = preimage_udf_expr().lt_eq(lit(500)); + let expected = col("x").lt(lit(200)); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_gt_rewrite() { + // Greater-than comparison rewrites to the upper bound (inclusive). let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::Gt, lit(500)); - let expected = binary_expr(col("x"), Operator::GtEq, lit(200)); + let expr = preimage_udf_expr().gt(lit(500)); + let expected = col("x").gt_eq(lit(200)); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_gteq_rewrite() { + // Greater-than-or-equal comparison rewrites to the lower bound. let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::GtEq, lit(500)); - let expected = binary_expr(col("x"), Operator::GtEq, lit(100)); + let expr = preimage_udf_expr().gt_eq(lit(500)); + let expected = col("x").gt_eq(lit(100)); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_is_not_distinct_from_rewrite() { + // IS NOT DISTINCT FROM is treated like equality for non-null literal RHS. let schema = test_schema(); - let expr = - binary_expr(preimage_udf_expr(), Operator::IsNotDistinctFrom, lit(500)); - let expected = and( - binary_expr(col("x"), Operator::GtEq, lit(100)), - binary_expr(col("x"), Operator::Lt, lit(200)), - ); + let expr = is_not_distinct_from(preimage_udf_expr(), lit(500)); + let expected = and(col("x").gt_eq(lit(100)), col("x").lt(lit(200))); assert_eq!(optimize_test(expr, &schema), expected); } #[test] fn test_preimage_is_distinct_from_rewrite() { + // IS DISTINCT FROM adds an explicit NULL branch for the column. 
let schema = test_schema(); - let expr = binary_expr(preimage_udf_expr(), Operator::IsDistinctFrom, lit(500)); - let expected = binary_expr(col("x"), Operator::Lt, lit(100)) - .or(binary_expr(col("x"), Operator::GtEq, lit(200))) + let expr = is_distinct_from(preimage_udf_expr(), lit(500)); + let expected = col("x") + .lt(lit(100)) + .or(col("x").gt_eq(lit(200))) .or(col("x").is_null()); assert_eq!(optimize_test(expr, &schema), expected); } + + #[test] + fn test_preimage_non_literal_rhs_no_rewrite() { + // Non-literal RHS should not be rewritten. + let schema = test_schema_xy(); + let expr = preimage_udf_expr().eq(col("y")); + let expected = expr.clone(); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_null_literal_no_rewrite() { + // NULL literal RHS should not be rewritten. + let schema = test_schema(); + let expr = preimage_udf_expr().eq(lit(ScalarValue::Int32(None))); + let expected = expr.clone(); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_non_immutable_no_rewrite() { + // Non-immutable UDFs should not participate in preimage rewrites. + let schema = test_schema(); + let expr = non_immutable_udf_expr().eq(lit(500)); + let expected = expr.clone(); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_no_preimage_no_rewrite() { + // If the UDF provides no preimage, the expression should remain unchanged. 
+ let schema = test_schema(); + let expr = no_preimage_udf_expr().eq(lit(500)); + let expected = expr.clone(); + + assert_eq!(optimize_test(expr, &schema), expected); + } } From b9f5c2cac9ecaafea54586c21d3e2e2524b22850 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jan 2026 20:58:55 -0500 Subject: [PATCH 14/26] simplify --- .../src/simplify_expressions/udf_preimage.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 50959719da61c..0d64d75db4a16 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -85,26 +85,19 @@ mod test { use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::{ BinaryExpr, ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, - ScalarUDFImpl, Signature, Volatility, and, col, lit, simplify::SimplifyContext, + ScalarUDFImpl, Signature, Volatility, and, binary_expr, col, lit, + simplify::SimplifyContext, }; use super::Interval; use crate::simplify_expressions::ExprSimplifier; fn is_distinct_from(left: Expr, right: Expr) -> Expr { - Expr::BinaryExpr(BinaryExpr { - left: Box::new(left), - op: Operator::IsDistinctFrom, - right: Box::new(right), - }) + binary_expr(left, Operator::IsDistinctFrom, right) } fn is_not_distinct_from(left: Expr, right: Expr) -> Expr { - Expr::BinaryExpr(BinaryExpr { - left: Box::new(left), - op: Operator::IsNotDistinctFrom, - right: Box::new(right), - }) + binary_expr(left, Operator::IsNotDistinctFrom, right) } #[derive(Debug, PartialEq, Eq, Hash)] From ec8cc7e50f355cd3011341d3fb87c0d686a7fd17 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jan 2026 21:00:23 -0500 Subject: [PATCH 15/26] Simplfy --- .../src/simplify_expressions/expr_simplifier.rs | 9 ++------- 
.../optimizer/src/simplify_expressions/udf_preimage.rs | 8 +++----- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 38e78eda5c7f6..d16aadcc7a877 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2011,17 +2011,12 @@ impl TreeNodeRewriter for Simplifier<'_> { if let (Some(interval), Some(col_expr)) = get_preimage(left.as_ref(), right.as_ref(), info)? { - rewrite_with_preimage(info, interval, op, Box::new(col_expr))? + rewrite_with_preimage(info, interval, op, col_expr)? } else if let Some(swapped) = op.swap() { if let (Some(interval), Some(col_expr)) = get_preimage(right.as_ref(), left.as_ref(), info)? { - rewrite_with_preimage( - info, - interval, - swapped, - Box::new(col_expr), - )? + rewrite_with_preimage(info, interval, swapped, col_expr)? 
} else { Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) } diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 0d64d75db4a16..3637072a72f94 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -40,11 +40,10 @@ pub(super) fn rewrite_with_preimage( _info: &SimplifyContext, preimage_interval: Interval, op: Operator, - expr: Box, + expr: Expr, ) -> Result> { let (lower, upper) = preimage_interval.into_bounds(); let (lower, upper) = (lit(lower), lit(upper)); - let expr = *expr; let rewritten_expr = match op { // < x ==> < lower @@ -84,9 +83,8 @@ mod test { use arrow::datatypes::{DataType, Field}; use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::{ - BinaryExpr, ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, - ScalarUDFImpl, Signature, Volatility, and, binary_expr, col, lit, - simplify::SimplifyContext, + ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, + Signature, Volatility, and, binary_expr, col, lit, simplify::SimplifyContext, }; use super::Interval; From 01b254b0969841b71ca424bfae664f08e356b34d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 20 Jan 2026 11:40:20 -0500 Subject: [PATCH 16/26] Add rhs Null guard --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index d16aadcc7a877..5a891493fea40 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1986,7 +1986,7 @@ impl TreeNodeRewriter for Simplifier<'_> { // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. 
// Rewrites use inclusive lower and exclusive upper bounds when // translating an equality into a range. - // NOTE: we only consider immutable UDFs with literal RHS values and + // NOTE: we only consider immutable UDFs with non Null literal RHS values and // UDFs that provide both `preimage` and `column_expr`. Expr::BinaryExpr(BinaryExpr { left, op, right }) => { use datafusion_expr::Operator::*; @@ -2000,7 +2000,7 @@ impl TreeNodeRewriter for Simplifier<'_> { | IsDistinctFrom | IsNotDistinctFrom ); - if !is_preimage_op { + if !is_preimage_op || is_null(&right) { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, From d8b4f0fed5af8f149566e30f54396431982b1548 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 20 Jan 2026 11:46:02 -0500 Subject: [PATCH 17/26] Fix comment --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index a0ef621acfd5f..a235ed1fb6332 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -23,7 +23,7 @@ use datafusion_expr_common::interval_arithmetic::Interval; /// /// Specifically it rewrites expressions of the form ` OP x` (e.g. ` = /// x`) where `` is known to have a pre-image (aka the entire single -/// range for which it is valid) +/// range for which it is valid) and `x` is not `NULL` /// /// This rewrite is described in the [ClickHouse Paper] and is particularly /// useful for simplifying expressions `date_part` or equivalent functions. 
The From 116d6e28804231280d884cb1c1a5a93ae293c647 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 20 Jan 2026 12:50:08 -0500 Subject: [PATCH 18/26] Update API --- datafusion/expr/src/lib.rs | 1 + datafusion/expr/src/preimage.rs | 28 +++++++++++++++++++ datafusion/expr/src/udf.rs | 28 ++++--------------- .../simplify_expressions/expr_simplifier.rs | 23 +++++++-------- .../src/simplify_expressions/udf_preimage.rs | 28 ++++++++++--------- 5 files changed, 59 insertions(+), 49 deletions(-) create mode 100644 datafusion/expr/src/preimage.rs diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index 4fb78933d7a5c..978e9f627565c 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -77,6 +77,7 @@ pub mod statistics { pub use datafusion_expr_common::statistics::*; } mod predicate_bounds; +pub mod preimage; pub mod ptr_eq; pub mod test; pub mod tree_node; diff --git a/datafusion/expr/src/preimage.rs b/datafusion/expr/src/preimage.rs new file mode 100644 index 0000000000000..6662d7ae34776 --- /dev/null +++ b/datafusion/expr/src/preimage.rs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion_expr_common::interval_arithmetic::Interval; + +use crate::Expr; + +pub enum PreimageResult { + /// No preimage exists for the specified value + None, + /// The expression always evaluates to the specified constant + /// given that `expr` is within the interval + Range { expr: Expr, interval: Interval }, +} diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index affb439d81fd7..870e318a62c3d 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -19,6 +19,7 @@ use crate::async_udf::AsyncScalarUDF; use crate::expr::schema_name_from_exprs_comma_separated_without_space; +use crate::preimage::PreimageResult; use crate::simplify::{ExprSimplifyResult, SimplifyContext}; use crate::sort_properties::{ExprProperties, SortProperties}; use crate::udf_eq::UdfEq; @@ -240,17 +241,10 @@ impl ScalarUDF { args: &[Expr], lit_expr: &Expr, info: &SimplifyContext, - ) -> Result> { + ) -> Result { self.inner.preimage(args, lit_expr, info) } - /// Return inner column from function args - /// - /// See [`ScalarUDFImpl::column_expr`] - pub fn column_expr(&self, args: &[Expr]) -> Option { - self.inner.column_expr(args) - } - /// Invoke the function on `args`, returning the appropriate result. /// /// See [`ScalarUDFImpl::invoke_with_args`] for details. @@ -730,9 +724,6 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// covering the entire year of 2024. Thus, you can rewrite the expression to `k /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. /// - /// Implementations must also provide [`ScalarUDFImpl::column_expr`] so the - /// optimizer can identify which argument maps to the preimage interval. 
- /// /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image fn preimage( @@ -740,13 +731,8 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { _args: &[Expr], _lit_expr: &Expr, _info: &SimplifyContext, - ) -> Result> { - Ok(None) - } - - // Return the inner column expression from this function - fn column_expr(&self, _args: &[Expr]) -> Option { - None + ) -> Result { + Ok(PreimageResult::None) } /// Returns true if some of this `exprs` subexpressions may not be evaluated @@ -984,14 +970,10 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { args: &[Expr], lit_expr: &Expr, info: &SimplifyContext, - ) -> Result> { + ) -> Result { self.inner.preimage(args, lit_expr, info) } - fn column_expr(&self, args: &[Expr]) -> Option { - self.inner.column_expr(args) - } - fn conditional_arguments<'a>( &self, args: &'a [Expr], diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 5a891493fea40..70de2afb9e306 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -39,7 +39,7 @@ use datafusion_common::{ }; use datafusion_expr::{ BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, - binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or, + binary::BinaryTypeCoercer, lit, or, preimage::PreimageResult, }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -2008,15 +2008,15 @@ impl TreeNodeRewriter for Simplifier<'_> { }))); } - if let (Some(interval), Some(col_expr)) = + if let PreimageResult::Range { interval, expr } = get_preimage(left.as_ref(), right.as_ref(), info)? { - rewrite_with_preimage(info, interval, op, col_expr)? 
+ rewrite_with_preimage(info, interval, op, expr)? } else if let Some(swapped) = op.swap() { - if let (Some(interval), Some(col_expr)) = + if let PreimageResult::Range { interval, expr } = get_preimage(right.as_ref(), left.as_ref(), info)? { - rewrite_with_preimage(info, interval, swapped, col_expr)? + rewrite_with_preimage(info, interval, swapped, expr)? } else { Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) } @@ -2035,20 +2035,17 @@ fn get_preimage( left_expr: &Expr, right_expr: &Expr, info: &SimplifyContext, -) -> Result<(Option, Option)> { +) -> Result { let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { - return Ok((None, None)); + return Ok(PreimageResult::None); }; if !is_literal_or_literal_cast(right_expr) { - return Ok((None, None)); + return Ok(PreimageResult::None); } if func.signature().volatility != Volatility::Immutable { - return Ok((None, None)); + return Ok(PreimageResult::None); } - Ok(( - func.preimage(args, right_expr, info)?, - func.column_expr(args), - )) + func.preimage(args, right_expr, info) } fn is_literal_or_literal_cast(expr: &Expr) -> bool { diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index a235ed1fb6332..fb167b4ff0781 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -84,7 +84,8 @@ mod test { use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::{ ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, - Signature, Volatility, and, binary_expr, col, lit, simplify::SimplifyContext, + Signature, Volatility, and, binary_expr, col, lit, preimage::PreimageResult, + simplify::SimplifyContext, }; use super::Interval; @@ -153,27 +154,28 @@ mod test { args: &[Expr], lit_expr: &Expr, _info: &SimplifyContext, - ) -> Result> { + ) -> Result { if !self.enabled { 
- return Ok(None); + return Ok(PreimageResult::None); } if args.len() != 1 { - return Ok(None); + return Ok(PreimageResult::None); } + + let expr = args.first().cloned().expect("Should be column expression"); match lit_expr { Expr::Literal(ScalarValue::Int32(Some(500)), _) => { - Ok(Some(Interval::try_new( - ScalarValue::Int32(Some(100)), - ScalarValue::Int32(Some(200)), - )?)) + Ok(PreimageResult::Range { + expr, + interval: Interval::try_new( + ScalarValue::Int32(Some(100)), + ScalarValue::Int32(Some(200)), + )?, + }) } - _ => Ok(None), + _ => Ok(PreimageResult::None), } } - - fn column_expr(&self, args: &[Expr]) -> Option { - args.first().cloned() - } } fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { From c0ed63c571974ec212d96da3362c1735123fa813 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 20 Jan 2026 13:18:20 -0500 Subject: [PATCH 19/26] clippy --- datafusion/expr/src/preimage.rs | 2 +- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 4 ++-- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/expr/src/preimage.rs b/datafusion/expr/src/preimage.rs index 6662d7ae34776..4a2c918fa9989 100644 --- a/datafusion/expr/src/preimage.rs +++ b/datafusion/expr/src/preimage.rs @@ -24,5 +24,5 @@ pub enum PreimageResult { None, /// The expression always evaluates to the specified constant /// given that `expr` is within the interval - Range { expr: Expr, interval: Interval }, + Range { expr: Expr, interval: Box }, } diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 70de2afb9e306..a09dd423302ff 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2011,12 +2011,12 @@ impl TreeNodeRewriter for Simplifier<'_> { if let PreimageResult::Range { interval, expr } = 
get_preimage(left.as_ref(), right.as_ref(), info)? { - rewrite_with_preimage(info, interval, op, expr)? + rewrite_with_preimage(info, *interval, op, expr)? } else if let Some(swapped) = op.swap() { if let PreimageResult::Range { interval, expr } = get_preimage(right.as_ref(), left.as_ref(), info)? { - rewrite_with_preimage(info, interval, swapped, expr)? + rewrite_with_preimage(info, *interval, swapped, expr)? } else { Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) } diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index fb167b4ff0781..f8d89b9309662 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -167,10 +167,10 @@ mod test { Expr::Literal(ScalarValue::Int32(Some(500)), _) => { Ok(PreimageResult::Range { expr, - interval: Interval::try_new( + interval: Box::new(Interval::try_new( ScalarValue::Int32(Some(100)), ScalarValue::Int32(Some(200)), - )?, + )?), }) } _ => Ok(PreimageResult::None), From 58561501bfceef8962f13c8c8a0814fc2e8456d7 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 20 Jan 2026 13:32:19 -0500 Subject: [PATCH 20/26] Fix null handling unit test --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index f8d89b9309662..7f9f33b2709a5 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -330,7 +330,7 @@ mod test { fn test_preimage_null_literal_no_rewrite() { // NULL literal RHS should not be rewritten. 
let schema = test_schema(); - let expr = preimage_udf_expr().eq(lit(ScalarValue::Int32(None))); + let expr = is_distinct_from(preimage_udf_expr(), lit(ScalarValue::Int32(None))); let expected = expr.clone(); assert_eq!(optimize_test(expr, &schema), expected); From c53a9fcd754f737710e89a16e8dacb7960aea086 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 20 Jan 2026 13:44:45 -0500 Subject: [PATCH 21/26] Fix null handling test --- .../src/simplify_expressions/udf_preimage.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 7f9f33b2709a5..061c30de6d21e 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -327,13 +327,23 @@ mod test { } #[test] - fn test_preimage_null_literal_no_rewrite() { - // NULL literal RHS should not be rewritten. + fn test_preimage_null_literal_no_rewrite_distinct_ops() { + // NULL literal RHS should not be rewritten for DISTINCTness operators: + // - `expr IS DISTINCT FROM NULL` <=> `NOT (expr IS NULL)` + // - `expr IS NOT DISTINCT FROM NULL` <=> `expr IS NULL` + // + // For normal comparisons (=, !=, <, <=, >, >=), `expr OP NULL` evaluates to NULL + // under SQL tri-state logic, and DataFusion's simplifier constant-folds it. 
+ // https://docs.rs/datafusion/latest/datafusion/physical_optimizer/pruning/struct.PruningPredicate.html#boolean-tri-state-logic + let schema = test_schema(); + let expr = is_distinct_from(preimage_udf_expr(), lit(ScalarValue::Int32(None))); - let expected = expr.clone(); + assert_eq!(optimize_test(expr.clone(), &schema), expr); - assert_eq!(optimize_test(expr, &schema), expected); + let expr = + is_not_distinct_from(preimage_udf_expr(), lit(ScalarValue::Int32(None))); + assert_eq!(optimize_test(expr.clone(), &schema), expr); } #[test] From 9b32843b16d3d5e8017b5b569948f5c20ec146bd Mon Sep 17 00:00:00 2001 From: Kosta Tarasov <33369833+sdf-jkl@users.noreply.github.com> Date: Wed, 21 Jan 2026 09:27:52 -0500 Subject: [PATCH 22/26] Update datafusion/expr/src/preimage.rs Co-authored-by: Andrew Lamb --- datafusion/expr/src/preimage.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr/src/preimage.rs b/datafusion/expr/src/preimage.rs index 4a2c918fa9989..c2c866266ac30 100644 --- a/datafusion/expr/src/preimage.rs +++ b/datafusion/expr/src/preimage.rs @@ -19,6 +19,7 @@ use datafusion_expr_common::interval_arithmetic::Interval; use crate::Expr; +/// Return from [`ScalarUDFImpl::preimage`] pub enum PreimageResult { /// No preimage exists for the specified value None, From 53f72edd17a46565ae3ca08e600a37a364019d50 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 21 Jan 2026 11:18:10 -0500 Subject: [PATCH 23/26] Fix docs --- datafusion/expr/src/preimage.rs | 2 +- .../src/simplify_expressions/expr_simplifier.rs | 10 +--------- .../src/simplify_expressions/udf_preimage.rs | 11 +---------- 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/datafusion/expr/src/preimage.rs b/datafusion/expr/src/preimage.rs index c2c866266ac30..67ca7a91bbf38 100644 --- a/datafusion/expr/src/preimage.rs +++ b/datafusion/expr/src/preimage.rs @@ -19,7 +19,7 @@ use datafusion_expr_common::interval_arithmetic::Interval; use crate::Expr; -/// Return from 
[`ScalarUDFImpl::preimage`] +/// Return from [`crate::ScalarUDFImpl::preimage`] pub enum PreimageResult { /// No preimage exists for the specified value None, diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index a09dd423302ff..d9337d89c1605 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1979,15 +1979,7 @@ impl TreeNodeRewriter for Simplifier<'_> { // For case: // date_part('YEAR', expr) op literal // - // Background: - // Datasources such as Parquet can prune partitions using simple predicates, - // but they cannot do so for complex expressions. - // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. - // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. - // Rewrites use inclusive lower and exclusive upper bounds when - // translating an equality into a range. - // NOTE: we only consider immutable UDFs with non Null literal RHS values and - // UDFs that provide both `preimage` and `column_expr`. 
+ // For details see datafusion_expr::ScalarUDFImpl::preimage Expr::BinaryExpr(BinaryExpr { left, op, right }) => { use datafusion_expr::Operator::*; let is_preimage_op = matches!( diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 061c30de6d21e..65a98b5bb12bc 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -25,16 +25,7 @@ use datafusion_expr_common::interval_arithmetic::Interval; /// x`) where `<expr>` is known to have a pre-image (aka the entire single /// range for which it is valid) and `x` is not `NULL` /// -/// This rewrite is described in the [ClickHouse Paper] and is particularly -/// useful for simplifying expressions `date_part` or equivalent functions. The -/// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you -/// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates -/// covering the entire year of 2024. Thus, you can rewrite the expression to -/// `k >= '2024-01-01' AND k < '2025-01-01'`, which uses an inclusive lower bound -/// and exclusive upper bound and is often more optimizable.
-/// -/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf -/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image +/// For details see [`datafusion_expr::ScalarUDFImpl::preimage`] /// pub(super) fn rewrite_with_preimage( _info: &SimplifyContext, From ba5be8a92076671e36d5f228cd74ac61e1c81d96 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 21 Jan 2026 11:20:41 -0500 Subject: [PATCH 24/26] Fix comment --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 65a98b5bb12bc..4a44065b4ddab 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -38,8 +38,8 @@ pub(super) fn rewrite_with_preimage( let rewritten_expr = match op { // <expr> < x ==> <expr> < lower - // <expr> >= x ==> <expr> >= lower Operator::Lt => expr.lt(lower), + // <expr> >= x ==> <expr> >= lower Operator::GtEq => expr.gt_eq(lower), // <expr> > x ==> <expr> >= upper Operator::Gt => expr.gt_eq(upper), From 46a941f15fc5d789b40b363fe9023094630d48e0 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 21 Jan 2026 12:58:30 -0500 Subject: [PATCH 25/26] Fix is_not_distinct_from rewrite --- .../src/simplify_expressions/udf_preimage.rs | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 4a44065b4ddab..5dea18ccfc5e1 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -46,14 +46,16 @@ pub(super) fn rewrite_with_preimage( // <expr> <= x ==> <expr> < upper Operator::LtEq => expr.lt(upper), // <expr> = x ==> (<expr> >= lower) and (<expr> < upper) - // - // <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr>
>= lower) and (<expr> < upper)) - // but since x is always not NULL => (<expr> >= lower) and (<expr> < upper) - Operator::Eq | Operator::IsNotDistinctFrom => { - and(expr.clone().gt_eq(lower), expr.lt(upper)) - } + Operator::Eq => and(expr.clone().gt_eq(lower), expr.lt(upper)), // <expr> != x ==> (<expr> < lower) or (<expr> >= upper) Operator::NotEq => or(expr.clone().lt(lower), expr.gt_eq(upper)), + // <expr> is not distinct from x ==> (<expr> is NULL and x is NULL) or ((<expr> >= lower) and (<expr> < upper)) + // but since x is always not NULL => (<expr> is not NULL) and (<expr> >= lower) and (<expr> < upper) + Operator::IsNotDistinctFrom => expr + .clone() + .is_not_null() + .and(expr.clone().gt_eq(lower)) + .and(expr.lt(upper)), // <expr> is distinct from x ==> (<expr> < lower) or (<expr> >= upper) or (<expr> is NULL and x is not NULL) or (<expr> is not NULL and x is NULL) // but given that x is always not NULL => (<expr> < lower) or (<expr> >= upper) or (<expr> is NULL) Operator::IsDistinctFrom => expr @@ -286,10 +288,14 @@ mod test { #[test] fn test_preimage_is_not_distinct_from_rewrite() { - // IS NOT DISTINCT FROM is treated like equality for non-null literal RHS. + // IS NOT DISTINCT FROM rewrites to equality plus expression not-null check + // for non-null literal RHS.
let schema = test_schema(); let expr = is_not_distinct_from(preimage_udf_expr(), lit(500)); - let expected = and(col("x").gt_eq(lit(100)), col("x").lt(lit(200))); + let expected = col("x") + .is_not_null() + .and(col("x").gt_eq(lit(100))) + .and(col("x").lt(lit(200))); assert_eq!(optimize_test(expr, &schema), expected); } From fb155f6f5b339e5050e71b10489d2c8305f9be88 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 22 Jan 2026 16:45:11 -0500 Subject: [PATCH 26/26] Simplify the API --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 4 ++-- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index a09dd423302ff..b1aff800ca45a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2011,12 +2011,12 @@ impl TreeNodeRewriter for Simplifier<'_> { if let PreimageResult::Range { interval, expr } = get_preimage(left.as_ref(), right.as_ref(), info)? { - rewrite_with_preimage(info, *interval, op, expr)? + rewrite_with_preimage(*interval, op, expr)? } else if let Some(swapped) = op.swap() { if let PreimageResult::Range { interval, expr } = get_preimage(right.as_ref(), left.as_ref(), info)? { - rewrite_with_preimage(info, *interval, swapped, expr)? + rewrite_with_preimage(*interval, swapped, expr)? } else { Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) } diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 061c30de6d21e..57a28468fc46b 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion_common::{Result, internal_err, tree_node::Transformed}; -use datafusion_expr::{Expr, Operator, and, lit, or, simplify::SimplifyContext}; +use datafusion_expr::{Expr, Operator, and, lit, or}; use datafusion_expr_common::interval_arithmetic::Interval; /// Rewrites a binary expression using its "preimage" @@ -37,7 +37,6 @@ use datafusion_expr_common::interval_arithmetic::Interval; /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image /// pub(super) fn rewrite_with_preimage( - _info: &SimplifyContext, preimage_interval: Interval, op: Operator, expr: Expr,