From d94889a96a24d9f49be80a80de01e93edd4e0ac8 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 16:32:08 -0500 Subject: [PATCH 01/18] Add udf_preimage logic --- datafusion/expr/src/udf.rs | 63 ++++++++++ .../simplify_expressions/expr_simplifier.rs | 64 +++++++++- .../optimizer/src/simplify_expressions/mod.rs | 1 + .../src/simplify_expressions/udf_preimage.rs | 112 ++++++++++++++++++ 4 files changed, 237 insertions(+), 3 deletions(-) create mode 100644 datafusion/optimizer/src/simplify_expressions/udf_preimage.rs diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 0654370ac7ebf..09c62278c2b00 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -232,6 +232,25 @@ impl ScalarUDF { self.inner.is_nullable(args, schema) } + /// Return a preimage + /// + /// See [`ScalarUDFImpl::preimage`] for more details. + pub fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &SimplifyContext, + ) -> Result> { + self.inner.preimage(args, lit_expr, info) + } + + /// Return inner column from function args + /// + /// See [`ScalarUDFImpl::column_expr`] + pub fn column_expr(&self, args: &[Expr]) -> Option { + self.inner.column_expr(args) + } + /// Invoke the function on `args`, returning the appropriate result. /// /// See [`ScalarUDFImpl::invoke_with_args`] for details. @@ -696,6 +715,37 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { Ok(ExprSimplifyResult::Original(args)) } + /// Returns the [preimage] for this function and the specified scalar value, if any. + /// + /// A preimage is a single contiguous [`Interval`] of values where the function + /// will always return `lit_value` + /// + /// This rewrite is described in the [ClickHouse Paper] and is particularly + /// useful for simplifying expressions `date_part` or equivalent functions. The + /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you + /// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates + /// covering the entire year of 2024. Thus, you can rewrite the expression to `k + /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. + /// + /// This should only return a preimage if the function takes a single argument + /// + /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf + /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image + fn preimage( + &self, + _args: &[Expr], + _lit_expr: &Expr, + _info: &SimplifyContext, + ) -> Result> { + Ok(None) + } + + // Return the inner column expression from this function + fn column_expr(&self, _args: &[Expr]) -> Option { + None + } + + /// Returns true if some of this `exprs` subexpressions may not be evaluated /// and thus any side effects (like divide by zero) may not be encountered. 
/// @@ -926,6 +976,19 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.inner.simplify(args, info) } + fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &SimplifyContext, + ) -> Result> { + self.inner.preimage(args, lit_expr, info) + } + + fn column_expr(&self, args: &[Expr]) -> Option { + self.inner.column_expr(args) + } + fn conditional_arguments<'a>( &self, args: &'a [Expr], diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 55bff5849c5cb..15a58683a54b4 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -38,8 +38,7 @@ use datafusion_common::{ tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_expr::{ - BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, - binary::BinaryTypeCoercer, lit, or, + BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -51,7 +50,7 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::analyzer::type_coercion::TypeCoercionRewriter; +use crate::{analyzer::type_coercion::TypeCoercionRewriter, simplify_expressions::udf_preimage::rewrite_with_preimage}; use crate::simplify_expressions::SimplifyContext; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ @@ -1952,12 +1951,71 @@ impl TreeNodeRewriter for Simplifier<'_> { })) } + // ======================================= + // preimage_in_comparison + // ======================================= + // + // For case: + // date_part(expr as 'YEAR') op literal + // + // Background: + // Datasources such as Parquet can prune partitions using simple predicates, + // but they cannot do so for complex expressions. + // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. + // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if get_preimage(&left, &right, info)?.0.is_some() + && get_preimage(&left, &right, info)?.1.is_some() => + { + // todo use let binding (if let Some(interval) = ...) once stabilized to avoid computing this thrice😢 + let (Some(interval), Some(col_expr)) = + get_preimage(left.as_ref(), &right, info)? + else { + unreachable!( + "The above if statement insures interval and col_expr are Some" + ) + }; + rewrite_with_preimage(info, interval, op, Box::new(col_expr))? + } + // literal op date_part(literal, expression) + // --> + // date_part(literal, expression) op_swap literal + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if get_preimage(&right, &left, info)?.0.is_some() + && get_preimage(&right, &left, info)?.1.is_some() + && op.swap().is_some() => + { + let swapped = op.swap().unwrap(); + let (Some(interval), Some(col_expr)) = get_preimage(&right, &left, info)? + else { + unreachable!( + "The above if statement insures interval and col_expr are Some" + ) + }; + rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? 
+ } + + // no additional rewrites possible expr => Transformed::no(expr), }) } } +fn get_preimage( + left_expr: &Expr, + right_expr: &Expr, + info: &SimplifyContext, +) -> Result<(Option, Option)> { + let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { + return Ok((None, None)); + }; + Ok(( + func.preimage(args, right_expr, info)?, + func.column_expr(args), + )) +} + fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs index 3ab76119cca84..b85b000821ad8 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -24,6 +24,7 @@ mod regex; pub mod simplify_exprs; pub mod simplify_literal; mod simplify_predicates; +mod udf_preimage; mod unwrap_cast; mod utils; diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs new file mode 100644 index 0000000000000..61317f2a5dd76 --- /dev/null +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::{Result, internal_err, tree_node::Transformed}; +use datafusion_expr::{BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext}; +use datafusion_expr_common::interval_arithmetic::Interval; + +/// Rewrites a binary expression using its "preimage" +/// +/// Specifically it rewrites expressions of the form ` OP x` (e.g. ` = +/// x`) where `` is known to have a pre-image (aka the entire single +/// range for which it is valid) +/// +/// This rewrite is described in the [ClickHouse Paper] and is particularly +/// useful for simplifying expressions `date_part` or equivalent functions. The +/// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you +/// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates +/// covering the entire year of 2024. Thus, you can rewrite the expression to `k +/// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. 
+/// +/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf +/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image +/// +pub(super) fn rewrite_with_preimage( + _info: &SimplifyContext, + preimage_interval: Interval, + op: Operator, + expr: Box, +) -> Result> { + let (lower, upper) = preimage_interval.into_bounds(); + let (lower, upper) = (lit(lower), lit(upper)); + + let rewritten_expr = match op { + // < x ==> < upper + // >= x ==> >= lower + Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op, + right: Box::new(lower), + }), + // > x ==> >= upper + Operator::Gt => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(upper), + }), + // <= x ==> < upper + Operator::LtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(upper), + }), + // = x ==> ( >= lower) and ( < upper) + // + // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) + // but since x is always not NULL => ( >= lower) and ( < upper) + Operator::Eq | Operator::IsNotDistinctFrom => and( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(lower), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(upper), + }), + ), + // != x ==> ( < lower) or ( >= upper) + Operator::NotEq => or( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(upper), + }), + ), + // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is not NULL) or ( is not NULL and x is NULL) + // but given that x is always not NULL => ( < lower) or ( >= upper) or ( is NULL) + Operator::IsDistinctFrom => Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower.clone()), + }) + .or(Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(upper), + })) + .or(expr.is_null()), + _ => return internal_err!("Expect comparison operators"), + }; + Ok(Transformed::yes(rewritten_expr)) +} \ No newline at end of file From 4aa7f4e7d32688c01b00670ea8a54ed351937b9f Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 20:37:19 -0500 Subject: [PATCH 02/18] Cargo fmt --- datafusion/expr/src/udf.rs | 3 +-- .../src/simplify_expressions/expr_simplifier.rs | 9 ++++++--- .../optimizer/src/simplify_expressions/udf_preimage.rs | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 09c62278c2b00..55c50f6dd8567 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -232,7 +232,7 @@ impl ScalarUDF { self.inner.is_nullable(args, schema) } - /// Return a preimage + /// Return a preimage /// /// See [`ScalarUDFImpl::preimage`] for more details. pub fn preimage( @@ -745,7 +745,6 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { None } - /// Returns true if some of this `exprs` subexpressions may not be evaluated /// and thus any side effects (like divide by zero) may not be encountered. 
/// diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 15a58683a54b4..5dd19ef152c07 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -38,7 +38,8 @@ use datafusion_common::{ tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_expr::{ - BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or + BinaryExpr, Case, ColumnarValue, Expr, Like, Operator, Volatility, and, + binary::BinaryTypeCoercer, interval_arithmetic::Interval, lit, or, }; use datafusion_expr::{Cast, TryCast, simplify::ExprSimplifyResult}; use datafusion_expr::{expr::ScalarFunction, interval_arithmetic::NullableInterval}; @@ -50,7 +51,6 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::{analyzer::type_coercion::TypeCoercionRewriter, simplify_expressions::udf_preimage::rewrite_with_preimage}; use crate::simplify_expressions::SimplifyContext; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ @@ -58,6 +58,10 @@ use crate::simplify_expressions::unwrap_cast::{ is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist, unwrap_cast_in_comparison_for_binary, }; +use crate::{ + analyzer::type_coercion::TypeCoercionRewriter, + simplify_expressions::udf_preimage::rewrite_with_preimage, +}; use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map; use datafusion_expr_common::casts::try_cast_literal_to_type; use indexmap::IndexSet; @@ -1995,7 +1999,6 @@ impl TreeNodeRewriter for Simplifier<'_> { rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? } - // no additional rewrites possible expr => Transformed::no(expr), }) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 61317f2a5dd76..465a23318a372 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -16,7 +16,9 @@ // under the License. 
use datafusion_common::{Result, internal_err, tree_node::Transformed}; -use datafusion_expr::{BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext}; +use datafusion_expr::{ + BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext, +}; use datafusion_expr_common::interval_arithmetic::Interval; /// Rewrites a binary expression using its "preimage" @@ -109,4 +111,4 @@ pub(super) fn rewrite_with_preimage( _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) -} \ No newline at end of file +} From 2329c12966df305ffa779bef522990c0096b29d7 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:27:00 -0500 Subject: [PATCH 03/18] Fix err in rewrite_with_preimage --- .../src/simplify_expressions/udf_preimage.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 465a23318a372..f31a360d7c4d7 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -48,11 +48,10 @@ pub(super) fn rewrite_with_preimage( let rewritten_expr = match op { // < x ==> < upper - // >= x ==> >= lower - Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + Operator::Lt => Expr::BinaryExpr(BinaryExpr { left: expr, op, - right: Box::new(lower), + right: Box::new(upper), }), // > x ==> >= upper Operator::Gt => Expr::BinaryExpr(BinaryExpr { @@ -66,6 +65,12 @@ pub(super) fn rewrite_with_preimage( op: Operator::Lt, right: Box::new(upper), }), + // >= x ==> >= lower + Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(lower), + }), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) From 7ac83255cf90c4e17f64f6f17431084d240d8ab6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:28:25 -0500 Subject: [PATCH 04/18] Rewrite the preimage_in_comparison --- .../simplify_expressions/expr_simplifier.rs | 82 ++++++++++++------- 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 5dd19ef152c07..9ca6c2b2e42c6 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1967,36 +1967,43 @@ impl TreeNodeRewriter for Simplifier<'_> { // but they cannot do so for complex expressions. // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. - Expr::BinaryExpr(BinaryExpr { left, op, right }) - if get_preimage(&left, &right, info)?.0.is_some() - && get_preimage(&left, &right, info)?.1.is_some() => - { - // todo use let binding (if let Some(interval) = ...) once stabilized to avoid computing this thrice😢 - let (Some(interval), Some(col_expr)) = - get_preimage(left.as_ref(), &right, info)? - else { - unreachable!( - "The above if statement insures interval and col_expr are Some" - ) - }; - rewrite_with_preimage(info, interval, op, Box::new(col_expr))? 
- } - // literal op date_part(literal, expression) - // --> - // date_part(literal, expression) op_swap literal - Expr::BinaryExpr(BinaryExpr { left, op, right }) - if get_preimage(&right, &left, info)?.0.is_some() - && get_preimage(&right, &left, info)?.1.is_some() - && op.swap().is_some() => - { - let swapped = op.swap().unwrap(); - let (Some(interval), Some(col_expr)) = get_preimage(&right, &left, info)? - else { - unreachable!( - "The above if statement insures interval and col_expr are Some" - ) - }; - rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? + // NOTE: we only consider immutable UDFs with literal RHS values + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + let is_preimage_op = matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom + ); + if !is_preimage_op { + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + } + + if let (Some(interval), Some(col_expr)) = + get_preimage(left.as_ref(), right.as_ref(), info)? + { + rewrite_with_preimage(info, interval, op, Box::new(col_expr))? + } else if let Some(swapped) = op.swap() { + if let (Some(interval), Some(col_expr)) = + get_preimage(right.as_ref(), left.as_ref(), info)? + { + rewrite_with_preimage( + info, + interval, + swapped, + Box::new(col_expr), + )? + } else { + Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) + } + } else { + Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right })) + } } // no additional rewrites possible @@ -2013,12 +2020,27 @@ fn get_preimage( let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { return Ok((None, None)); }; + if !is_literal_or_literal_cast(right_expr) { + return Ok((None, None)); + } + if func.signature().volatility != Volatility::Immutable { + return Ok((None, None)); + } Ok(( func.preimage(args, right_expr, info)?, func.column_expr(args), )) } +fn is_literal_or_literal_cast(expr: &Expr) -> bool { + match expr { + Expr::Literal(_, _) => true, + Expr::Cast(Cast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), + Expr::TryCast(TryCast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), + _ => false, + } +} + fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), From 7a3e8b3a7ed100662ec2228fa47cce3a778128fb Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:30:45 -0500 Subject: [PATCH 05/18] cargo fmt --- .../src/simplify_expressions/expr_simplifier.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 9ca6c2b2e42c6..07e3edf026c53 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1981,7 +1981,11 @@ impl TreeNodeRewriter for Simplifier<'_> { | Operator::IsNotDistinctFrom ); if !is_preimage_op { - return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, right }))); + return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { + left, + op, + right, + }))); } if let (Some(interval), Some(col_expr)) = @@ -2036,7 +2040,9 @@ fn is_literal_or_literal_cast(expr: &Expr) -> bool { match expr { Expr::Literal(_, _) => true, Expr::Cast(Cast { expr, .. 
}) => matches!(expr.as_ref(), Expr::Literal(_, _)), - Expr::TryCast(TryCast { expr, .. }) => matches!(expr.as_ref(), Expr::Literal(_, _)), + Expr::TryCast(TryCast { expr, .. }) => { + matches!(expr.as_ref(), Expr::Literal(_, _)) + } _ => false, } } From fbd5dcce014d1b09d54556f38c33b435a8329baf Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 9 Jan 2026 21:53:40 -0500 Subject: [PATCH 06/18] Fix ci --- .../src/simplify_expressions/expr_simplifier.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 07e3edf026c53..c99978a1cc4c6 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1969,16 +1969,16 @@ impl TreeNodeRewriter for Simplifier<'_> { // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. // NOTE: we only consider immutable UDFs with literal RHS values Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + use datafusion_expr::Operator::*; let is_preimage_op = matches!( op, - Operator::Eq - | Operator::NotEq - | Operator::Lt - | Operator::LtEq - | Operator::Gt - | Operator::GtEq - | Operator::IsDistinctFrom - | Operator::IsNotDistinctFrom + Eq | NotEq + | Lt + | LtEq + | Gt + | GtEq + | IsDistinctFrom + | IsNotDistinctFrom ); if !is_preimage_op { return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr { From d9207352a327f08245a1d1716d6450d05bbfab84 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 10 Jan 2026 12:13:03 -0500 Subject: [PATCH 07/18] Fix GtEq, Lt logic --- .../src/simplify_expressions/udf_preimage.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index f31a360d7c4d7..960c8df322d15 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -47,11 +47,12 @@ pub(super) fn rewrite_with_preimage( let (lower, upper) = (lit(lower), lit(upper)); let rewritten_expr = match op { - // < x ==> < upper - Operator::Lt => Expr::BinaryExpr(BinaryExpr { + // < x ==> < lower + // >= x ==> >= lower + Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { left: expr, op, - right: Box::new(upper), + right: Box::new(lower), }), // > x ==> >= upper Operator::Gt => Expr::BinaryExpr(BinaryExpr { @@ -65,12 +66,6 @@ pub(super) fn rewrite_with_preimage( op: Operator::Lt, right: Box::new(upper), }), - // >= x ==> >= lower - Operator::GtEq => Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(lower), - }), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) From d3318ff281b0175e20d2351120601684119fdc78 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 10 Jan 2026 12:10:27 -0500 Subject: [PATCH 08/18] Add datepart preimage + tests --- datafusion/core/tests/optimizer/mod.rs | 202 ++++++- .../functions/src/datetime/date_part.rs | 99 +++- .../sqllogictest/test_files/udf_preimage.slt | 506 ++++++++++++++++++ 3 files changed, 802 insertions(+), 5 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/udf_preimage.slt diff --git a/datafusion/core/tests/optimizer/mod.rs b/datafusion/core/tests/optimizer/mod.rs index 6466e9ad96d17..bc8463a453dd2 100644 --- 
a/datafusion/core/tests/optimizer/mod.rs +++ b/datafusion/core/tests/optimizer/mod.rs @@ -18,6 +18,8 @@ //! Tests for the DataFusion SQL query planner that require functions from the //! datafusion-functions crate. +use datafusion_expr::simplify::SimplifyContext; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; use insta::assert_snapshot; use std::any::Any; use std::collections::HashMap; @@ -28,11 +30,13 @@ use arrow::datatypes::{ }; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::TransformedResult; -use datafusion_common::{DFSchema, Result, ScalarValue, TableReference, plan_err}; +use datafusion_common::{ + DFSchema, DFSchemaRef, Result, ScalarValue, TableReference, plan_err, +}; use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; use datafusion_expr::{ AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, Operator, ScalarUDF, - TableSource, WindowUDF, col, lit, + TableSource, WindowUDF, and, col, lit, or, }; use datafusion_functions::core::expr_ext::FieldAccessor; use datafusion_optimizer::analyzer::Analyzer; @@ -45,7 +49,7 @@ use datafusion_sql::sqlparser::parser::Parser; use chrono::DateTime; use datafusion_expr::expr_rewriter::rewrite_with_guarantees; -use datafusion_functions::datetime; +use datafusion_functions::datetime::{self, expr_fn}; #[cfg(test)] #[ctor::ctor] @@ -378,3 +382,195 @@ fn validate_unchanged_cases(guarantees: &[(Expr, NullableInterval)], cases: &[Ex ); } } + +// DatePart preimage tests +#[test] +fn test_preimage_date_part_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32)); + let expected = and( + col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_date64_not_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) <> 2024 -> c1 < 2024-01-01 AND c1 >= 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32)); + let expected = or( + col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))), + col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_nano_lt() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32)); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_nano_utc_gt() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32)); + let expected = col("ts_nano_utc").gt_eq(lit(ScalarValue::TimestampNanosecond( + Some(20089 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_sec_est_gt_eq() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32)); + let expected = col("ts_sec_est").gt_eq(lit(ScalarValue::TimestampSecond( + Some(19723 * 86_400), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), 
expected) +} + +#[test] +fn test_preimage_date_part_timestamp_sec_est_lt_eq() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32)); + let expected = col("ts_mic_pt").lt(lit(ScalarValue::TimestampMicrosecond( + Some(20089 * 86_400_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_nano_lt_swap() { + let schema = expr_test_schema(); + let expr_lt = lit(2024i32).gt(expr_fn::date_part(lit("year"), col("ts_nano_none"))); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_date32_is_not_distinct_from() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is not distinct from 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 (the null handling part is dropped since rhs is not null) + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date32"))), + op: Operator::IsNotDistinctFrom, + right: Box::new(lit(2024i32)), + }); + let expected = and( + col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +// Should not simplify - interval can't be calculated +fn test_preimage_date_part_date32_is_not_distinct_from_null() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is not distinct from Null -> unchanged + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date32"))), + op: Operator::IsNotDistinctFrom, + right: Box::new(lit(ScalarValue::Null)), + }); + assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt) +} + +#[test] +fn test_preimage_date_part_date64_is_distinct_from() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is distinct from 2024 -> c1 < 2024-01-01 OR c1 >= 2025-01-01 or c1 is NULL + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date64"))), + op: Operator::IsDistinctFrom, + right: Box::new(lit(2024i32)), + }); + let expected = col("date64") + .lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))) + .or(col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000))))) + .or(col("date64").is_null()); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +// Should not simplify - interval can't be calculated +fn test_preimage_date_part_date64_is_distinct_from_null() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is distinct from 2024 -> c1 < 2024-01-01 OR c1 >= unchanged + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date64"))), + op: Operator::IsDistinctFrom, + right: Box::new(lit(ScalarValue::Null)), + }); + assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt) +} + +#[test] +// Should not simplify +fn test_preimage_date_part_not_year_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt) +} + +fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { + let simplifier = + 
ExprSimplifier::new(SimplifyContext::default().with_schema(Arc::clone(schema))); + + simplifier.simplify(expr).unwrap() +} + +fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![ + Field::new("date32", DataType::Date32, true), + Field::new("date64", DataType::Date64, true), + Field::new("ts_nano_none", timestamp_nano_none_type(), true), + Field::new("ts_nano_utc", timestamp_nano_utc_type(), true), + Field::new("ts_sec_est", timestamp_sec_est_type(), true), + Field::new("ts_mic_pt", timestamp_mic_pt_type(), true), + ] + .into(), + HashMap::new(), + ) + .unwrap(), + ) +} + +fn timestamp_nano_none_type() -> DataType { + DataType::Timestamp(TimeUnit::Nanosecond, None) +} + +// this is the type that now() returns +fn timestamp_nano_utc_type() -> DataType { + let utc = Some("+0:00".into()); + DataType::Timestamp(TimeUnit::Nanosecond, utc) +} + +fn timestamp_sec_est_type() -> DataType { + let est = Some("-5:00".into()); + DataType::Timestamp(TimeUnit::Second, est) +} + +fn timestamp_mic_pt_type() -> DataType { + let pt = Some("-8::00".into()); + DataType::Timestamp(TimeUnit::Microsecond, pt) +} diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 375200d07280b..e47ad2939db8e 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -27,6 +27,10 @@ use arrow::datatypes::DataType::{ }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; +use arrow::temporal_conversions::{ + MICROSECONDS_IN_DAY, MILLISECONDS_IN_DAY, NANOSECONDS_IN_DAY, SECONDS_IN_DAY, +}; +use chrono::{Datelike, NaiveDate}; use datafusion_common::types::{NativeType, logical_date}; use datafusion_common::{ @@ -41,9 +45,10 @@ use datafusion_common::{ types::logical_string, utils::take_function_args, }; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ - ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, - TypeSignature, Volatility, + ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, interval_arithmetic, }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; @@ -231,6 +236,67 @@ impl ScalarUDFImpl for DatePartFunc { }) } + // Only casting the year is supported since pruning other IntervalUnit is not possible + // date_part(col, YEAR) = 2024 => col >= '2024-01-01' and col < '2025-01-01' + // But for anything less than YEAR simplifying is not possible without specifying the bigger interval + // date_part(col, MONTH) = 1 => col = '2023-01-01' or col = '2024-01-01' or ... or col = '3000-01-01' + fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &SimplifyContext, + ) -> Result> { + let [part, col_expr] = take_function_args(self.name(), args)?; + + // Get the interval unit from the part argument + let interval_unit = part + .as_literal() + .and_then(|sv| sv.try_as_str().flatten()) + .map(part_normalization) + .and_then(|s| IntervalUnit::from_str(s).ok()); + + // only support extracting year + match interval_unit { + Some(IntervalUnit::Year) => (), + _ => return Ok(None), + } + + // Check if the argument is a literal (e.g. 
date_part(YEAR, col) = 2024) + let Some(argument_literal) = lit_expr.as_literal() else { + return Ok(None); + }; + + // Extract i32 year from Scalar value + let year = match argument_literal { + ScalarValue::Int32(Some(y)) => *y, + _ => return Ok(None), + }; + + // Can only extract year from Date32/64 and Timestamp column + let target_type = match info.get_data_type(col_expr)? { + Date32 | Date64 | Timestamp(_, _) => &info.get_data_type(col_expr)?, + _ => return Ok(None), + }; + + // Compute the Interval bounds + let start_time = + NaiveDate::from_ymd_opt(year, 1, 1).expect("Expect computed start time"); + let end_time = start_time + .with_year(year + 1) + .expect("Expect computed end time"); + + // Convert to ScalarValues + let lower = date_to_scalar(start_time, target_type) + .expect("Expect preimage interval lower bound"); + let upper = date_to_scalar(end_time, target_type) + .expect("Expect preimage interval upper bound"); + Ok(Some(interval_arithmetic::Interval::try_new(lower, upper)?)) + } + + fn column_expr(&self, args: &[Expr]) -> Option { + Some(args[1].clone()) + } + fn aliases(&self) -> &[String] { &self.aliases } @@ -245,6 +311,35 @@ fn is_epoch(part: &str) -> bool { matches!(part.to_lowercase().as_str(), "epoch") } +fn date_to_scalar(date: NaiveDate, target_type: &DataType) -> Option { + let days = date + .signed_duration_since(NaiveDate::from_epoch_days(0)?) + .num_days(); + + Some(match target_type { + Date32 => ScalarValue::Date32(Some(days as i32)), + Date64 => ScalarValue::Date64(Some(days * MILLISECONDS_IN_DAY)), + Timestamp(unit, tz) => match unit { + Second => { + ScalarValue::TimestampSecond(Some(days * SECONDS_IN_DAY), tz.clone()) + } + Millisecond => ScalarValue::TimestampMillisecond( + Some(days * MILLISECONDS_IN_DAY), + tz.clone(), + ), + Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * MICROSECONDS_IN_DAY), + tz.clone(), + ), + Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * NANOSECONDS_IN_DAY), + tz.clone(), + ), + }, + _ => return None, + }) +} + // Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error fn part_normalization(part: &str) -> &str { part.strip_prefix(|c| c == '\'' || c == '\"') diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt new file mode 100644 index 0000000000000..544f082cc1d83 --- /dev/null +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -0,0 +1,506 @@ +# Licensed to the Apache Software Foundation (asF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The asF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "as IS" BasIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
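+#
+# The tests below exercise the preimage rewrite end to end: a predicate such as
+#   extract(year from c) = 2024
+# is expected to simplify to
+#   c >= '2024-01-01' AND c < '2025-01-01'
+# (inclusive lower bound, exclusive upper bound). The explain tests verify this
+# by checking that date_part no longer appears in the optimized plans, while
+# predicates on non-year fields (quarter, month, day, ...) are left unchanged.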
+ +############################ +# date_part(year, col) tests +############################ + +statement ok +create table t1(c DATE) as VALUES (NULL), ('1990-01-01'), ('2024-01-01'), ('2030-01-01'); + +# +# Simple optimizations, col on LHS +# +query D +select c from t1 where extract(year from c) = 2024; +---- +2024-01-01 + +query D +select c from t1 where extract(year from c) <> 2024; +---- +1990-01-01 +2030-01-01 + +query D +select c from t1 where extract(year from c) > 2024; +---- +2030-01-01 + +query D +select c from t1 where extract(year from c) < 2024; +---- +1990-01-01 + +query D +select c from t1 where extract(year from c) >= 2024; +---- +2024-01-01 +2030-01-01 + +query D +select c from t1 where extract(year from c) <= 2024; +---- +1990-01-01 +2024-01-01 + +query D +select c from t1 where extract(year from c) is not distinct from 2024 +---- +2024-01-01 + +query D +select c from t1 where extract(year from c) is distinct from 2024 +---- +NULL +1990-01-01 +2030-01-01 + +# +# Check that date_part is not in the explain statements +# +query TT +explain select c from t1 where extract (year from c) = 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) <> 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) > 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) < 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) >= 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) <= 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) is not distinct from 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) is distinct from 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") OR t1.c IS NULL +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 OR c@0 IS NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# +# Simple optimizations, column on 
RHS +# +query D +select c from t1 where 2024 = extract(year from c); +---- +2024-01-01 + +query D +select c from t1 where 2024 <> extract(year from c); +---- +1990-01-01 +2030-01-01 + +query D +select c from t1 where 2024 < extract(year from c); +---- +2030-01-01 + +query D +select c from t1 where 2024 > extract(year from c); +---- +1990-01-01 + +query D +select c from t1 where 2024 <= extract(year from c); +---- +2024-01-01 +2030-01-01 + +query D +select c from t1 where 2024 >= extract(year from c); +---- +1990-01-01 +2024-01-01 + +query D +select c from t1 where 2024 is not distinct from extract(year from c); +---- +2024-01-01 + +query D +select c from t1 where 2024 is distinct from extract(year from c); +---- +NULL +1990-01-01 +2030-01-01 + +# +# Check explain statements for optimizations for other interval types +# +query TT +explain select c from t1 where extract (quarter from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("QUARTER"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(QUARTER, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (month from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MONTH"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MONTH, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (week from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("WEEK"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(WEEK, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (day from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DAY"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DAY, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (hour from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("HOUR"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(HOUR, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (minute from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MINUTE"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MINUTE, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (second from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("SECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(SECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (millisecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MILLISECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MILLISECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (microsecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MICROSECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MICROSECOND, c@0) = 2024 +02)--DataSourceExec: 
partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (nanosecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("NANOSECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(NANOSECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (dow from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DOW"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DOW, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (doy from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DOY"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DOY, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (epoch from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("EPOCH"), t1.c) = Float64(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(EPOCH, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (isodow from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("ISODOW"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(ISODOW, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# +# Simple optimize different datatypes +# +statement ok +create table t2( + c1_date32 DATE, + c2_ts_sec timestamp, + c3_ts_mili timestamp, + c4_ts_micro timestamp, + c5_ts_nano timestamp +) as VALUES + (NULL, + NULL, + NULL, + NULL, + NULL), + ('1990-05-20', + '1990-05-20T00:00:10'::timestamp, + '1990-05-20T00:00:10.987'::timestamp, + '1990-05-20T00:00:10.987654'::timestamp, + '1990-05-20T00:00:10.987654321'::timestamp), + ('2024-01-01', + '2024-01-01T00:00:00'::timestamp, + '2024-01-01T00:00:00.123'::timestamp, + '2024-01-01T00:00:00.123456'::timestamp, + '2024-01-01T00:00:00.123456789'::timestamp), + ('2030-12-31', + '2030-12-31T23:59:59'::timestamp, + '2030-12-31T23:59:59.001'::timestamp, + '2030-12-31T23:59:59.001234'::timestamp, + '2030-12-31T23:59:59.001234567'::timestamp) +; + +query D +select c1_date32 from t2 where extract(year from c1_date32) = 2024; +---- +2024-01-01 + +query D +select c1_date32 from t2 where extract(year from c1_date32) <> 2024; +---- +1990-05-20 +2030-12-31 + +query P +select c2_ts_sec from t2 where extract(year from c2_ts_sec) > 2024; +---- +2030-12-31T23:59:59 + +query P +select c3_ts_mili from t2 where extract(year from c3_ts_mili) < 2024; +---- +1990-05-20T00:00:10.987 + +query P +select c4_ts_micro from t2 where extract(year from c4_ts_micro) >= 2024; +---- +2024-01-01T00:00:00.123456 +2030-12-31T23:59:59.001234 + +query P +select c5_ts_nano from t2 where extract(year from c5_ts_nano) <= 2024; +---- +1990-05-20T00:00:10.987654321 +2024-01-01T00:00:00.123456789 + +query D +select c1_date32 from t2 where extract(year from c1_date32) is not distinct from 2024 +---- +2024-01-01 + +query D +select c1_date32 from t2 where extract(year from c1_date32) is distinct from 2024 +---- +NULL +1990-05-20 +2030-12-31 + +# +# Check that date_part is not in the explain statements for other datatypes +# +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) = 2024 +---- +logical_plan +01)Filter: t2.c1_date32 >= 
Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) <> 2024 +---- +logical_plan +01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c2_ts_sec from t2 where extract (year from c2_ts_sec) > 2024 +---- +logical_plan +01)Filter: t2.c2_ts_sec >= TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t2 projection=[c2_ts_sec] +physical_plan +01)FilterExec: c2_ts_sec@0 >= 1735689600000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c3_ts_mili from t2 where extract (year from c3_ts_mili) < 2024 +---- +logical_plan +01)Filter: t2.c3_ts_mili < TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t2 projection=[c3_ts_mili] +physical_plan +01)FilterExec: c3_ts_mili@0 < 1704067200000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c4_ts_micro from t2 where extract (year from c4_ts_micro) >= 2024 +---- +logical_plan +01)Filter: t2.c4_ts_micro >= TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t2 projection=[c4_ts_micro] +physical_plan +01)FilterExec: c4_ts_micro@0 >= 1704067200000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c5_ts_nano from t2 where extract (year from c5_ts_nano) <= 2024 +---- +logical_plan +01)Filter: t2.c5_ts_nano < TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t2 projection=[c5_ts_nano] +physical_plan +01)FilterExec: c5_ts_nano@0 < 1735689600000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) is not distinct from 2024 +---- +logical_plan +01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) is distinct from 2024 +---- +logical_plan +01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") OR t2.c1_date32 IS NULL +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 OR c1_date32@0 IS NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] \ No newline at end of file From 9fb245bd1368d2e4ee64b93ca3a12a9f78c4ae96 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 10 Jan 2026 12:41:26 -0500 Subject: [PATCH 09/18] Fix asf header --- datafusion/sqllogictest/test_files/udf_preimage.slt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt index 544f082cc1d83..543893ea85e95 100644 --- a/datafusion/sqllogictest/test_files/udf_preimage.slt +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -1,16 +1,16 @@ -# Licensed to the Apache Software Foundation (asF) under one 
+# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information -# regarding copyright ownership. The asF licenses this file +# regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# + # http://www.apache.org/licenses/LICENSE-2.0 -# + # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an -# "as IS" BasIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. From c2b0cd39b5b1d75e853490b748f5e1faaa55ba06 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 18 Jan 2026 16:50:34 -0500 Subject: [PATCH 10/18] Replace BinaryExpression with binary_expr() fn --- .../src/simplify_expressions/udf_preimage.rs | 58 ++++--------------- 1 file changed, 11 insertions(+), 47 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 960c8df322d15..4bfa98d679292 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -17,7 +17,7 @@ use datafusion_common::{Result, internal_err, tree_node::Transformed}; use datafusion_expr::{ - BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyContext, + Expr, Operator, and, binary_expr, lit, or, simplify::SimplifyContext, }; use datafusion_expr_common::interval_arithmetic::Interval; @@ -49,65 +49,29 @@ pub(super) fn rewrite_with_preimage( let rewritten_expr = match op { // < x ==> < lower // >= x ==> >= lower - Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { - left: expr, - op, - right: Box::new(lower), - }), + Operator::Lt | Operator::GtEq => binary_expr(*expr, op, lower), // > x ==> >= upper - Operator::Gt => Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(upper), - }), + Operator::Gt => binary_expr(*expr, Operator::GtEq, upper), // <= x ==> < upper - Operator::LtEq => Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::Lt, - right: Box::new(upper), - }), + Operator::LtEq => binary_expr(*expr, Operator::Lt, upper), // = x ==> ( >= lower) and ( < upper) // // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) // but since x is always not NULL => ( >= lower) and ( < upper) Operator::Eq | Operator::IsNotDistinctFrom => and( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(lower), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::Lt, - right: Box::new(upper), - }), + binary_expr(*expr.clone(), Operator::GtEq, lower), + binary_expr(*expr, Operator::Lt, upper), ), // != x ==> ( < lower) or ( >= upper) Operator::NotEq => or( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::Lt, - right: Box::new(lower), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(upper), - }), + binary_expr(*expr.clone(), Operator::Lt, lower), + binary_expr(*expr, Operator::GtEq, upper), ), // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is 
not NULL) or ( is not NULL and x is NULL) // but given that x is always not NULL => ( < lower) or ( >= upper) or ( is NULL) - Operator::IsDistinctFrom => Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::Lt, - right: Box::new(lower.clone()), - }) - .or(Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(upper), - })) - .or(expr.is_null()), + Operator::IsDistinctFrom => binary_expr(*expr.clone(), Operator::Lt, lower) + .or(binary_expr(*expr.clone(), Operator::GtEq, upper)) + .or(expr.is_null()), _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) From a0b6564eb0403ec1780cc87baa7de2a4cfb670fc Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 18 Jan 2026 20:46:17 -0500 Subject: [PATCH 11/18] Add unit tests + add doc part about upper bound --- .../src/simplify_expressions/udf_preimage.rs | 196 +++++++++++++++++- 1 file changed, 194 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 4bfa98d679292..f03426ddf52c3 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -31,8 +31,9 @@ use datafusion_expr_common::interval_arithmetic::Interval; /// useful for simplifying expressions `date_part` or equivalent functions. The /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you /// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates -/// covering the entire year of 2024. Thus, you can rewrite the expression to `k -/// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. +/// covering the entire year of 2024. Thus, you can rewrite the expression to +/// `k >= '2024-01-01' AND k < '2025-01-01'`, which uses an inclusive lower bound +/// and exclusive upper bound and is often more optimizable. 
/// /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image @@ -76,3 +77,194 @@ pub(super) fn rewrite_with_preimage( }; Ok(Transformed::yes(rewritten_expr)) } + +#[cfg(test)] +mod test { + use std::any::Any; + use std::sync::Arc; + + use arrow::datatypes::{DataType, Field}; + use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; + use datafusion_expr::{ + ColumnarValue, Expr, Operator, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, + Signature, Volatility, and, binary_expr, col, expr::ScalarFunction, lit, + simplify::SimplifyContext, + }; + + use super::Interval; + use crate::simplify_expressions::ExprSimplifier; + + #[derive(Debug, PartialEq, Eq, Hash)] + struct PreimageUdf { + signature: Signature, + } + + impl ScalarUDFImpl for PreimageUdf { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "preimage_func" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(DataType::Int32) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + Ok(ColumnarValue::Scalar(ScalarValue::Int32(Some(500)))) + } + + fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + _info: &SimplifyContext, + ) -> Result<Option<Interval>> { + if args.len() != 1 { + return Ok(None); + } + match lit_expr { + Expr::Literal(ScalarValue::Int32(Some(500)), _) => { + Ok(Some(Interval::try_new( + ScalarValue::Int32(Some(100)), + ScalarValue::Int32(Some(200)), + )?)) + } + _ => Ok(None), + } + } + + fn column_expr(&self, args: &[Expr]) -> Option<Expr> { + args.get(0).cloned() + } + } + + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { + let simplifier = ExprSimplifier::new( + SimplifyContext::default().with_schema(Arc::clone(schema)), + ); + + simplifier.simplify(expr).unwrap() + } + + fn preimage_udf_expr() -> Expr { + let udf = ScalarUDF::new_from_impl(PreimageUdf { + signature: Signature::exact(vec![DataType::Int32], Volatility::Immutable), + }); + + Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(udf), vec![col("x")])) + } + + fn test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![Field::new("x", DataType::Int32, false)].into(), + Default::default(), + ) + .unwrap(), + ) + } + + #[test] + fn test_preimage_eq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::Eq, lit(500)); + let expected = and( + binary_expr(col("x"), Operator::GtEq, lit(100)), + binary_expr(col("x"), Operator::Lt, lit(200)), + ); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_noteq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::NotEq, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(100)).or(binary_expr( + col("x"), + Operator::GtEq, + lit(200), + )); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_eq_rewrite_swapped() { + let schema = test_schema(); + let expr = binary_expr(lit(500), Operator::Eq, preimage_udf_expr()); + let expected = and( + binary_expr(col("x"), Operator::GtEq, lit(100)), + binary_expr(col("x"), Operator::Lt, lit(200)), + ); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_lt_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::Lt, lit(500)); + let expected = binary_expr(col("x"),
Operator::Lt, lit(100)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_lteq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::LtEq, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(200)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_gt_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::Gt, lit(500)); + let expected = binary_expr(col("x"), Operator::GtEq, lit(200)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_gteq_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::GtEq, lit(500)); + let expected = binary_expr(col("x"), Operator::GtEq, lit(100)); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_is_not_distinct_from_rewrite() { + let schema = test_schema(); + let expr = + binary_expr(preimage_udf_expr(), Operator::IsNotDistinctFrom, lit(500)); + let expected = and( + binary_expr(col("x"), Operator::GtEq, lit(100)), + binary_expr(col("x"), Operator::Lt, lit(200)), + ); + + assert_eq!(optimize_test(expr, &schema), expected); + } + + #[test] + fn test_preimage_is_distinct_from_rewrite() { + let schema = test_schema(); + let expr = binary_expr(preimage_udf_expr(), Operator::IsDistinctFrom, lit(500)); + let expected = binary_expr(col("x"), Operator::Lt, lit(100)) + .or(binary_expr(col("x"), Operator::GtEq, lit(200))) + .or(col("x").is_null()); + + assert_eq!(optimize_test(expr, &schema), expected); + } +} From 0a24d6096938b67c8930025beb459d2263d63882 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sun, 18 Jan 2026 20:55:19 -0500 Subject: [PATCH 12/18] Fix docs --- datafusion/expr/src/udf.rs | 6 +++++- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 55c50f6dd8567..affb439d81fd7 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -720,6 +720,9 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// A preimage is a single contiguous [`Interval`] of values where the function /// will always return `lit_value` /// + /// Implementations should return intervals with an inclusive lower bound and + /// exclusive upper bound. + /// /// This rewrite is described in the [ClickHouse Paper] and is particularly /// useful for simplifying expressions `date_part` or equivalent functions. The /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you @@ -727,7 +730,8 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// covering the entire year of 2024. Thus, you can rewrite the expression to `k /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. /// - /// This should only return a preimage if the function takes a single argument + /// Implementations must also provide [`ScalarUDFImpl::column_expr`] so the + /// optimizer can identify which argument maps to the preimage interval. 
/// /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index d6aa3d9ccf341..38e78eda5c7f6 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1977,14 +1977,17 @@ impl TreeNodeRewriter for Simplifier<'_> { // ======================================= // // For case: - // date_part(expr as 'YEAR') op literal + // date_part('YEAR', expr) op literal // // Background: // Datasources such as Parquet can prune partitions using simple predicates, // but they cannot do so for complex expressions. // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. - // NOTE: we only consider immutable UDFs with literal RHS values + // Rewrites use inclusive lower and exclusive upper bounds when + // translating an equality into a range. + // NOTE: we only consider immutable UDFs with literal RHS values and + // UDFs that provide both `preimage` and `column_expr`. Expr::BinaryExpr(BinaryExpr { left, op, right }) => { use datafusion_expr::Operator::*; let is_preimage_op = matches!( From 86b7627306f7f300b18dcb39ded303d9140f0c5e Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 10 Jan 2026 12:10:27 -0500 Subject: [PATCH 13/18] Add datepart preimage + tests --- .../functions/src/datetime/date_part.rs | 99 +++- .../sqllogictest/test_files/udf_preimage.slt | 506 ++++++++++++++++++ 2 files changed, 603 insertions(+), 2 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/udf_preimage.slt diff --git a/datafusion/functions/src/datetime/date_part.rs index e9ae3188694bc..06fdc37ba7fde 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -29,6 +29,10 @@ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{ DataType, Field, FieldRef, IntervalUnit as ArrowIntervalUnit, TimeUnit, }; +use arrow::temporal_conversions::{ + MICROSECONDS_IN_DAY, MILLISECONDS_IN_DAY, NANOSECONDS_IN_DAY, SECONDS_IN_DAY, +}; +use chrono::{Datelike, NaiveDate}; use datafusion_common::types::{NativeType, logical_date}; use datafusion_common::{ @@ -44,9 +48,10 @@ use datafusion_common::{ types::logical_string, utils::take_function_args, }; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ - ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, - TypeSignature, Volatility, + ColumnarValue, Documentation, Expr, ReturnFieldArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, interval_arithmetic, }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; @@ -237,6 +242,67 @@ impl ScalarUDFImpl for DatePartFunc { }) } + // Only extracting the year is supported, since pruning on other IntervalUnits is not possible + // date_part(YEAR, col) = 2024 => col >= '2024-01-01' and col < '2025-01-01' + // But for units finer than YEAR this simplification is not possible, since the preimage is not a single contiguous interval, e.g. + // date_part(MONTH, col) = 1 => col = '2023-01-01' or col = '2024-01-01' or ...
or col = '3000-01-01' + fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &SimplifyContext, + ) -> Result<Option<interval_arithmetic::Interval>> { + let [part, col_expr] = take_function_args(self.name(), args)?; + + // Get the interval unit from the part argument + let interval_unit = part + .as_literal() + .and_then(|sv| sv.try_as_str().flatten()) + .map(part_normalization) + .and_then(|s| IntervalUnit::from_str(s).ok()); + + // only support extracting year + match interval_unit { + Some(IntervalUnit::Year) => (), + _ => return Ok(None), + } + + // Check if the argument is a literal (e.g. date_part(YEAR, col) = 2024) + let Some(argument_literal) = lit_expr.as_literal() else { + return Ok(None); + }; + + // Extract i32 year from Scalar value + let year = match argument_literal { + ScalarValue::Int32(Some(y)) => *y, + _ => return Ok(None), + }; + + // Can only extract year from Date32/64 and Timestamp column + let target_type = match info.get_data_type(col_expr)? { + Date32 | Date64 | Timestamp(_, _) => &info.get_data_type(col_expr)?, + _ => return Ok(None), + }; + + // Compute the Interval bounds + let start_time = + NaiveDate::from_ymd_opt(year, 1, 1).expect("Expect computed start time"); + let end_time = start_time + .with_year(year + 1) + .expect("Expect computed end time"); + + // Convert to ScalarValues + let lower = date_to_scalar(start_time, target_type) + .expect("Expect preimage interval lower bound"); + let upper = date_to_scalar(end_time, target_type) + .expect("Expect preimage interval upper bound"); + Ok(Some(interval_arithmetic::Interval::try_new(lower, upper)?)) + } + + fn column_expr(&self, args: &[Expr]) -> Option<Expr> { + Some(args[1].clone()) + } + fn aliases(&self) -> &[String] { &self.aliases } @@ -251,6 +317,35 @@ fn is_epoch(part: &str) -> bool { matches!(part.to_lowercase().as_str(), "epoch") } +fn date_to_scalar(date: NaiveDate, target_type: &DataType) -> Option<ScalarValue> { + let days = date + .signed_duration_since(NaiveDate::from_epoch_days(0)?) + .num_days(); + + Some(match target_type { + Date32 => ScalarValue::Date32(Some(days as i32)), + Date64 => ScalarValue::Date64(Some(days * MILLISECONDS_IN_DAY)), + Timestamp(unit, tz) => match unit { + Second => { + ScalarValue::TimestampSecond(Some(days * SECONDS_IN_DAY), tz.clone()) + } + Millisecond => ScalarValue::TimestampMillisecond( + Some(days * MILLISECONDS_IN_DAY), + tz.clone(), + ), + Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * MICROSECONDS_IN_DAY), + tz.clone(), + ), + Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * NANOSECONDS_IN_DAY), + tz.clone(), + ), + }, + _ => return None, + }) +} + // Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error fn part_normalization(part: &str) -> &str { part.strip_prefix(|c| c == '\'' || c == '\"') diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt new file mode 100644 index 0000000000000..544f082cc1d83 --- /dev/null +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -0,0 +1,506 @@ +# Licensed to the Apache Software Foundation (asF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The asF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "as IS" BasIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################ +# date_part(year, col) tests +############################ + +statement ok +create table t1(c DATE) as VALUES (NULL), ('1990-01-01'), ('2024-01-01'), ('2030-01-01'); + +# +# Simple optimizations, col on LHS +# +query D +select c from t1 where extract(year from c) = 2024; +---- +2024-01-01 + +query D +select c from t1 where extract(year from c) <> 2024; +---- +1990-01-01 +2030-01-01 + +query D +select c from t1 where extract(year from c) > 2024; +---- +2030-01-01 + +query D +select c from t1 where extract(year from c) < 2024; +---- +1990-01-01 + +query D +select c from t1 where extract(year from c) >= 2024; +---- +2024-01-01 +2030-01-01 + +query D +select c from t1 where extract(year from c) <= 2024; +---- +1990-01-01 +2024-01-01 + +query D +select c from t1 where extract(year from c) is not distinct from 2024 +---- +2024-01-01 + +query D +select c from t1 where extract(year from c) is distinct from 2024 +---- +NULL +1990-01-01 +2030-01-01 + +# +# Check that date_part is not in the explain statements +# +query TT +explain select c from t1 where extract (year from c) = 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) <> 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) > 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) < 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) >= 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) <= 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) is not distinct from 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, 
partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) is distinct from 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") OR t1.c IS NULL +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 OR c@0 IS NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# +# Simple optimizations, column on RHS +# +query D +select c from t1 where 2024 = extract(year from c); +---- +2024-01-01 + +query D +select c from t1 where 2024 <> extract(year from c); +---- +1990-01-01 +2030-01-01 + +query D +select c from t1 where 2024 < extract(year from c); +---- +2030-01-01 + +query D +select c from t1 where 2024 > extract(year from c); +---- +1990-01-01 + +query D +select c from t1 where 2024 <= extract(year from c); +---- +2024-01-01 +2030-01-01 + +query D +select c from t1 where 2024 >= extract(year from c); +---- +1990-01-01 +2024-01-01 + +query D +select c from t1 where 2024 is not distinct from extract(year from c); +---- +2024-01-01 + +query D +select c from t1 where 2024 is distinct from extract(year from c); +---- +NULL +1990-01-01 +2030-01-01 + +# +# Check explain statements for optimizations for other interval types +# +query TT +explain select c from t1 where extract (quarter from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("QUARTER"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(QUARTER, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (month from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MONTH"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MONTH, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (week from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("WEEK"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(WEEK, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (day from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DAY"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DAY, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (hour from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("HOUR"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(HOUR, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (minute from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MINUTE"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MINUTE, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (second from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("SECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(SECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (millisecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MILLISECOND"), t1.c) = Int32(2024) +02)--TableScan: 
t1 projection=[c] +physical_plan +01)FilterExec: date_part(MILLISECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (microsecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MICROSECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MICROSECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (nanosecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("NANOSECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(NANOSECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (dow from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DOW"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DOW, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (doy from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DOY"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DOY, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (epoch from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("EPOCH"), t1.c) = Float64(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(EPOCH, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (isodow from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("ISODOW"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(ISODOW, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# +# Simple optimize different datatypes +# +statement ok +create table t2( + c1_date32 DATE, + c2_ts_sec timestamp, + c3_ts_mili timestamp, + c4_ts_micro timestamp, + c5_ts_nano timestamp +) as VALUES + (NULL, + NULL, + NULL, + NULL, + NULL), + ('1990-05-20', + '1990-05-20T00:00:10'::timestamp, + '1990-05-20T00:00:10.987'::timestamp, + '1990-05-20T00:00:10.987654'::timestamp, + '1990-05-20T00:00:10.987654321'::timestamp), + ('2024-01-01', + '2024-01-01T00:00:00'::timestamp, + '2024-01-01T00:00:00.123'::timestamp, + '2024-01-01T00:00:00.123456'::timestamp, + '2024-01-01T00:00:00.123456789'::timestamp), + ('2030-12-31', + '2030-12-31T23:59:59'::timestamp, + '2030-12-31T23:59:59.001'::timestamp, + '2030-12-31T23:59:59.001234'::timestamp, + '2030-12-31T23:59:59.001234567'::timestamp) +; + +query D +select c1_date32 from t2 where extract(year from c1_date32) = 2024; +---- +2024-01-01 + +query D +select c1_date32 from t2 where extract(year from c1_date32) <> 2024; +---- +1990-05-20 +2030-12-31 + +query P +select c2_ts_sec from t2 where extract(year from c2_ts_sec) > 2024; +---- +2030-12-31T23:59:59 + +query P +select c3_ts_mili from t2 where extract(year from c3_ts_mili) < 2024; +---- +1990-05-20T00:00:10.987 + +query P +select c4_ts_micro from t2 where extract(year from c4_ts_micro) >= 2024; +---- +2024-01-01T00:00:00.123456 +2030-12-31T23:59:59.001234 + +query P +select c5_ts_nano from t2 where extract(year from c5_ts_nano) <= 2024; +---- +1990-05-20T00:00:10.987654321 +2024-01-01T00:00:00.123456789 + +query D +select c1_date32 from t2 where 
extract(year from c1_date32) is not distinct from 2024 +---- +2024-01-01 + +query D +select c1_date32 from t2 where extract(year from c1_date32) is distinct from 2024 +---- +NULL +1990-05-20 +2030-12-31 + +# +# Check that date_part is not in the explain statements for other datatypes +# +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) = 2024 +---- +logical_plan +01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) <> 2024 +---- +logical_plan +01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c2_ts_sec from t2 where extract (year from c2_ts_sec) > 2024 +---- +logical_plan +01)Filter: t2.c2_ts_sec >= TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t2 projection=[c2_ts_sec] +physical_plan +01)FilterExec: c2_ts_sec@0 >= 1735689600000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c3_ts_mili from t2 where extract (year from c3_ts_mili) < 2024 +---- +logical_plan +01)Filter: t2.c3_ts_mili < TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t2 projection=[c3_ts_mili] +physical_plan +01)FilterExec: c3_ts_mili@0 < 1704067200000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c4_ts_micro from t2 where extract (year from c4_ts_micro) >= 2024 +---- +logical_plan +01)Filter: t2.c4_ts_micro >= TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t2 projection=[c4_ts_micro] +physical_plan +01)FilterExec: c4_ts_micro@0 >= 1704067200000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c5_ts_nano from t2 where extract (year from c5_ts_nano) <= 2024 +---- +logical_plan +01)Filter: t2.c5_ts_nano < TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t2 projection=[c5_ts_nano] +physical_plan +01)FilterExec: c5_ts_nano@0 < 1735689600000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) is not distinct from 2024 +---- +logical_plan +01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) is distinct from 2024 +---- +logical_plan +01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") OR t2.c1_date32 IS NULL +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 OR c1_date32@0 IS NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] \ No newline at end of file From 015866293cbaf433665081a19f126cb2e3f8016f Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Sat, 10 Jan 2026 12:41:26 -0500 Subject: [PATCH 14/18] Fix asf header --- datafusion/sqllogictest/test_files/udf_preimage.slt 
| 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt index 544f082cc1d83..543893ea85e95 100644 --- a/datafusion/sqllogictest/test_files/udf_preimage.slt +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -1,16 +1,16 @@ -# Licensed to the Apache Software Foundation (asF) under one +# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information -# regarding copyright ownership. The asF licenses this file +# regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# + # http://www.apache.org/licenses/LICENSE-2.0 -# + # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an -# "as IS" BasIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. From 59235de4b398aa275165880e1515725a1299302d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Jan 2026 13:34:32 -0500 Subject: [PATCH 15/18] clippy --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index f03426ddf52c3..a4ec0bf6aeafe 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -141,7 +141,7 @@ mod test { } fn column_expr(&self, args: &[Expr]) -> Option<Expr> { - args.get(0).cloned() + args.first().cloned() } } From 9f845e74245c1903abd49d034d10316623b321a2 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 19 Jan 2026 14:40:17 -0500 Subject: [PATCH 16/18] Make test field nullable --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index a4ec0bf6aeafe..8e7278cb0fb45 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -164,7 +164,7 @@ mod test { fn test_schema() -> DFSchemaRef { Arc::new( DFSchema::from_unqualified_fields( - vec![Field::new("x", DataType::Int32, false)].into(), + vec![Field::new("x", DataType::Int32, true)].into(), Default::default(), ) .unwrap(),