From de30cb0d0dad6b3164431bbf33dc74d2c6f0a604 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 6 Nov 2025 16:19:27 -0500 Subject: [PATCH 01/27] create first test --- Cargo.lock | 1 + datafusion/optimizer/Cargo.toml | 1 + .../optimizer/src/simplify_expressions/mod.rs | 1 + .../simplify_expressions/unwrap_date_part.rs | 85 +++++++++++++++++++ 4 files changed, 88 insertions(+) create mode 100644 datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs diff --git a/Cargo.lock b/Cargo.lock index 1c516277c38a2..046109e0577ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2455,6 +2455,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-expr-common", + "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-window", "datafusion-functions-window-common", diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index f10510e0973c3..bc982989f6ac7 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -46,6 +46,7 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } +datafusion-functions = { workspace = true} datafusion-physical-expr = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs index 7ae38eec9a3ad..3ea75bd2e9d86 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -25,6 +25,7 @@ mod regex; pub mod simplify_exprs; mod simplify_predicates; mod unwrap_cast; +mod unwrap_date_part; mod utils; // backwards compatibility diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs new file mode 100644 index 0000000000000..a9e758e9a3c61 --- /dev/null +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[cfg(test)] +mod tests { + use crate::simplify_expressions::ExprSimplifier; + use arrow::datatypes::{DataType, Field, TimeUnit}; + use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; + use datafusion_expr::expr_fn::col; + use datafusion_expr::{ + and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, Expr, + }; + use datafusion_functions::datetime::expr_fn; + use std::{collections::HashMap, sync::Arc}; + + #[test] + fn test_unwrap_date_part_comparison() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("c1")).eq(lit(2024i32)); + let expected = and( + col("c1").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("c1").lt(lit(ScalarValue::Date32(Some(20088)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { + let props = ExecutionProps::new(); + let simplifier = ExprSimplifier::new( + SimplifyContext::new(&props).with_schema(Arc::clone(schema)), + ); + + simplifier.simplify(expr).unwrap() + } + + fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![ + Field::new("c1", DataType::Date32, false), + Field::new("c2", DataType::Date64, false), + Field::new("ts_nano_none", timestamp_nano_none_type(), false), + Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), + ] + .into(), + HashMap::new(), + ) + .unwrap(), + ) + } + + // fn lit_timestamp_nano_none(ts: i64) -> Expr { + // lit(ScalarValue::TimestampNanosecond(Some(ts), None)) + // } + + // fn lit_timestamp_nano_utc(ts: i64) -> Expr { + // let utc = Some("+0:00".into()); + // lit(ScalarValue::TimestampNanosecond(Some(ts), utc)) + // } + + fn timestamp_nano_none_type() -> DataType { + DataType::Timestamp(TimeUnit::Nanosecond, None) + } + + // this is the type that now() returns + fn timestamp_nano_utc_type() -> DataType { + let utc = Some("+0:00".into()); + DataType::Timestamp(TimeUnit::Nanosecond, utc) + } +} From 052ab95fe10d530b3cf8c8292ad1cf80b0407b46 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 10 Nov 2025 14:04:05 -0500 Subject: [PATCH 02/27] year_literal_to_type_with_op function --- .../simplify_expressions/unwrap_date_part.rs | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs index a9e758e9a3c61..3680e8f50a5ea 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs @@ -15,6 +15,109 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::{DataType, TimeUnit}; +use chrono::NaiveDate; +use datafusion_common::{Result, ScalarValue, internal_err, tree_node::Transformed}; +use datafusion_expr::{BinaryExpr, Expr, Operator, expr::ScalarFunction, lit, simplify::SimplifyInfo}; +use datafusion_expr_common::casts::try_cast_literal_to_type; +use datafusion_functions::datetime::{date_part, expr_fn}; + +pub(super) fn unwrap_date_part_in_comparison_for_binary( + info: &S, + cast_expr: Expr, + literal: Expr, + op: Operator, +) -> Result> { + match (cast_expr, literal) { + ( + Expr::ScalarFunction(ScalarFunction { + func, + args + }), + Expr::Literal(lit_value, _), + ) if func.name() == "date_part" => { + if let Some(value) = year_literal_to_type_with_op(&lit_value, &expr_type, op) + { + return Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { + left: expr, + op, + right: Box::new(lit(value)), + }))); + }; + + // if the lit_value can be casted to the type of internal_left_expr + // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal + let Some(value) = try_cast_literal_to_type(&lit_value, &expr_type) else { + return internal_err!( + "Can't cast the literal expr {:?} to type {}", + &lit_value, + &expr_type + ); + }; + Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { + left: expr, + op, + right: Box::new(lit(value)), + }))) + } + _ => internal_err!("Expect date_part expr and literal"), + } +} + +/// This is just to extract cast the year to the right datatype +fn year_literal_to_type_with_op( + lit_value: &ScalarValue, + target_type: &DataType, + op: Operator, +) -> Option { + match (op, lit_value) { + ( + Operator::Eq | Operator::NotEq, + ScalarValue::Int32(Some(year)), + ) => { + // Can only extract year from Date32/64 and Timestamp + use DataType::*; + if matches!( + target_type, + Date32 | Date64 | Timestamp(_,_) + ) { + let naive_date = NaiveDate::from_ymd_opt(*year, 1, 1).expect("Invalid year"); + + let casted = match target_type { + Date32 => { + let days = naive_date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?).num_days() as i32; + ScalarValue::Date32(Some(days)) + }, + Date64 => { + let milis = naive_date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?).num_milliseconds(); + ScalarValue::Date64(Some(milis)) + }, + Timestamp(unit, tz) => { + let days = naive_date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?).num_days(); + match unit { + TimeUnit::Second => ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()), + TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(Some(days * 86_400_000), tz.clone()), + TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(Some(days * 86_400_000_000), tz.clone()), + TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(Some(days * 86_400_000_000_000), tz.clone()), + } + }, + _ => return None + }; + + return Some(casted) + + } + else { + None + } + } + _ => None, + } +} + + + + #[cfg(test)] mod tests { use crate::simplify_expressions::ExprSimplifier; From d9a425331677753b7b3f75c9cd12387ac749b955 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 10 Nov 2025 17:09:20 -0500 Subject: [PATCH 03/27] kinda works --- .../simplify_expressions/expr_simplifier.rs | 38 ++++- .../simplify_expressions/unwrap_date_part.rs | 145 ++++++++++++------ 2 files changed, 137 insertions(+), 46 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 85e9d9b6a0ed8..c73c3e7ed9464 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -49,7 +49,6 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::guarantees::GuaranteeRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ @@ -58,6 +57,13 @@ use crate::simplify_expressions::unwrap_cast::{ unwrap_cast_in_comparison_for_binary, }; use crate::simplify_expressions::SimplifyInfo; +use crate::{ + analyzer::type_coercion::TypeCoercionRewriter, + simplify_expressions::unwrap_date_part::{ + is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary, + unwrap_date_part_in_comparison_for_binary, + }, +}; use datafusion_expr_common::casts::try_cast_literal_to_type; use indexmap::IndexSet; use regex::Regex; @@ -1968,6 +1974,36 @@ impl TreeNodeRewriter for Simplifier<'_, S> { })) } + // ======================================= + // unwrap_date_part_in_comparison + // ======================================= + // + // For case: + // try_cast/cast(expr as data_type) op literal + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary( + info, &left, op, &right, + ) && op.supports_propagation() => + { + unwrap_date_part_in_comparison_for_binary(info, *left, *right, op)? + } + // literal op try_cast/cast(expr as data_type) + // --> + // try_cast/cast(expr as data_type) op_swap literal + Expr::BinaryExpr(BinaryExpr { left, op, right }) + if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary( + info, &right, op, &left, + ) && op.supports_propagation() + && op.swap().is_some() => + { + unwrap_date_part_in_comparison_for_binary( + info, + *right, + *left, + op.swap().unwrap(), + )? + } + // no additional rewrites possible expr => Transformed::no(expr), }) diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs index 3680e8f50a5ea..df2a6815a476f 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs @@ -17,10 +17,11 @@ use arrow::datatypes::{DataType, TimeUnit}; use chrono::NaiveDate; -use datafusion_common::{Result, ScalarValue, internal_err, tree_node::Transformed}; -use datafusion_expr::{BinaryExpr, Expr, Operator, expr::ScalarFunction, lit, simplify::SimplifyInfo}; -use datafusion_expr_common::casts::try_cast_literal_to_type; -use datafusion_functions::datetime::{date_part, expr_fn}; +use datafusion_common::{internal_err, tree_node::Transformed, Result, ScalarValue}; +use datafusion_expr::{ + expr::ScalarFunction, lit, simplify::SimplifyInfo, BinaryExpr, Expr, Operator, +}; +use datafusion_expr_common::casts::{is_supported_type, try_cast_literal_to_type}; pub(super) fn unwrap_date_part_in_comparison_for_binary( info: &S, @@ -28,14 +29,21 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( literal: Expr, op: Operator, ) -> Result> { + dbg!(&cast_expr, &literal, op); // <-- log inputs + match (cast_expr, literal) { ( - Expr::ScalarFunction(ScalarFunction { - func, - args - }), + Expr::ScalarFunction(ScalarFunction { func, args }), Expr::Literal(lit_value, _), ) if func.name() == "date_part" => { + let expr = Box::new(args[1].clone()); + + let Ok(expr_type) = info.get_data_type(&expr) else { + return internal_err!("Can't get the data type of the expr {:?}", &expr); + }; + + dbg!(&expr_type, &lit_value); // <-- log types and literal + if let Some(value) = year_literal_to_type_with_op(&lit_value, &expr_type, op) { return Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { @@ -64,50 +72,100 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( } } +pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary< + S: SimplifyInfo, +>( + info: &S, + expr: &Expr, + op: Operator, + literal: &Expr, +) -> bool { + dbg!(expr, literal, op); // <-- log inputs + + match (expr, literal) { + ( + Expr::ScalarFunction(ScalarFunction { func, args }), + Expr::Literal(lit_val, _), + ) if func.name() == "date_part" => { + let left_expr = Box::new(args[1].clone()); + + let Ok(expr_type) = info.get_data_type(&left_expr) else { + return false; + }; + + let Ok(lit_type) = info.get_data_type(literal) else { + return false; + }; + + if year_literal_to_type_with_op(lit_val, &expr_type, op).is_some() { + return true; + } + + dbg!(&expr_type, &lit_type); // <-- log types and result + + try_cast_literal_to_type(lit_val, &expr_type).is_some() + && is_supported_type(&expr_type) + && is_supported_type(&lit_type) + } + _ => false, + } +} + /// This is just to extract cast the year to the right datatype fn year_literal_to_type_with_op( - lit_value: &ScalarValue, + lit_value: &ScalarValue, target_type: &DataType, op: Operator, ) -> Option { match (op, lit_value) { - ( - Operator::Eq | Operator::NotEq, - ScalarValue::Int32(Some(year)), - ) => { + (Operator::Eq | Operator::NotEq, ScalarValue::Int32(Some(year))) => { // Can only extract year from Date32/64 and Timestamp use DataType::*; - if matches!( - target_type, - Date32 | Date64 | Timestamp(_,_) - ) { - let naive_date = NaiveDate::from_ymd_opt(*year, 1, 1).expect("Invalid year"); - - let casted = match target_type { - Date32 => { - let days = naive_date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?).num_days() as i32; - ScalarValue::Date32(Some(days)) - }, - Date64 => { - let milis = naive_date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?).num_milliseconds(); - ScalarValue::Date64(Some(milis)) - }, - Timestamp(unit, tz) => { - let days = naive_date.signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?).num_days(); - match unit { - TimeUnit::Second => ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()), - TimeUnit::Millisecond => ScalarValue::TimestampMillisecond(Some(days * 86_400_000), tz.clone()), - TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond(Some(days * 86_400_000_000), tz.clone()), - TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond(Some(days * 86_400_000_000_000), tz.clone()), + if matches!(target_type, Date32 | Date64 | Timestamp(_, _)) { + let naive_date = + NaiveDate::from_ymd_opt(*year, 1, 1).expect("Invalid year"); + + let casted = match target_type { + Date32 => { + let days = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days() as i32; + ScalarValue::Date32(Some(days)) } - }, - _ => return None - }; - - return Some(casted) + Date64 => { + let milis = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_milliseconds(); + ScalarValue::Date64(Some(milis)) + } + Timestamp(unit, tz) => { + let days = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days(); + match unit { + TimeUnit::Second => ScalarValue::TimestampSecond( + Some(days * 86_400), + tz.clone(), + ), + TimeUnit::Millisecond => ScalarValue::TimestampMillisecond( + Some(days * 86_400_000), + tz.clone(), + ), + TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * 86_400_000_000), + tz.clone(), + ), + TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * 86_400_000_000_000), + tz.clone(), + ), + } + } + _ => return None, + }; - } - else { + Some(casted) + } else { None } } @@ -115,9 +173,6 @@ fn year_literal_to_type_with_op( } } - - - #[cfg(test)] mod tests { use crate::simplify_expressions::ExprSimplifier; From 83bae0206fb9993e4b36fba2c6900fde97b7773a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 17 Nov 2025 16:36:43 -0500 Subject: [PATCH 04/27] getting_there --- .../simplify_expressions/unwrap_date_part.rs | 356 ++++++++++++------ 1 file changed, 236 insertions(+), 120 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs index df2a6815a476f..07f1e11d3a90f 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs @@ -17,11 +17,13 @@ use arrow::datatypes::{DataType, TimeUnit}; use chrono::NaiveDate; -use datafusion_common::{internal_err, tree_node::Transformed, Result, ScalarValue}; +use datafusion_common::{ + internal_err, tree_node::Transformed, DataFusionError, Result, ScalarValue, +}; use datafusion_expr::{ - expr::ScalarFunction, lit, simplify::SimplifyInfo, BinaryExpr, Expr, Operator, + and, expr::ScalarFunction, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, + Operator, }; -use datafusion_expr_common::casts::{is_supported_type, try_cast_literal_to_type}; pub(super) fn unwrap_date_part_in_comparison_for_binary( info: &S, @@ -29,47 +31,108 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( literal: Expr, op: Operator, ) -> Result> { - dbg!(&cast_expr, &literal, op); // <-- log inputs - - match (cast_expr, literal) { + let (args, lit_value) = match (cast_expr, literal) { ( Expr::ScalarFunction(ScalarFunction { func, args }), Expr::Literal(lit_value, _), - ) if func.name() == "date_part" => { - let expr = Box::new(args[1].clone()); + ) if func.name() == "date_part" => (args, lit_value), + _ => return internal_err!("Expect date_part expr and literal"), + }; + let expr = Box::new(args[1].clone()); - let Ok(expr_type) = info.get_data_type(&expr) else { - return internal_err!("Can't get the data type of the expr {:?}", &expr); - }; + let Ok(expr_type) = info.get_data_type(&expr) else { + return internal_err!("Can't get the data type of the expr {:?}", &expr); + }; - dbg!(&expr_type, &lit_value); // <-- log types and literal + // Helper to cast literal + let cast_year = |updated_year: &ScalarValue| -> Result { + year_literal_to_type(updated_year, &expr_type).ok_or_else(|| { + DataFusionError::Internal(format!( + "Can't cast {lit_value} to type {expr_type}" + )) + }) + }; - if let Some(value) = year_literal_to_type_with_op(&lit_value, &expr_type, op) - { - return Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { - left: expr, - op, - right: Box::new(lit(value)), - }))); + let rewritten_expr = match op { + Operator::Lt | Operator::GtEq => { + let v = cast_year(&lit_value)?; + Expr::BinaryExpr(BinaryExpr { + left: expr, + op, + right: Box::new(lit(v)), + }) + } + Operator::Gt => { + let year = match lit_value { + ScalarValue::Int32(Some(y)) => y + 1, + _ => return internal_err!("Expected Int32 Literal"), }; - - // if the lit_value can be casted to the type of internal_left_expr - // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal - let Some(value) = try_cast_literal_to_type(&lit_value, &expr_type) else { - return internal_err!( - "Can't cast the literal expr {:?} to type {}", - &lit_value, - &expr_type - ); + let updated_year = ScalarValue::Int32(Some(year)); + let v = cast_year(&updated_year)?; + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(lit(v)), + }) + } + Operator::LtEq => { + let year = match lit_value { + ScalarValue::Int32(Some(y)) => y + 1, + _ => return internal_err!("Expected Int32 Literal"), }; - Ok(Transformed::yes(Expr::BinaryExpr(BinaryExpr { + let updated_year = ScalarValue::Int32(Some(year)); + let v = cast_year(&updated_year)?; + Expr::BinaryExpr(BinaryExpr { left: expr, - op, - right: Box::new(lit(value)), - }))) + op: Operator::Lt, + right: Box::new(lit(v)), + }) } - _ => internal_err!("Expect date_part expr and literal"), - } + Operator::Eq => { + let year = match lit_value { + ScalarValue::Int32(Some(y)) => y + 1, + _ => return internal_err!("Expected Int32 Literal"), + }; + let updated_year = ScalarValue::Int32(Some(year)); + let lower = cast_year(&lit_value)?; + let upper = cast_year(&updated_year)?; + and( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(lit(lower)), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(lit(upper)), + }), + ) + } + Operator::NotEq => { + let year = match lit_value { + ScalarValue::Int32(Some(y)) => y + 1, + _ => return internal_err!("Expected Int32 Literal"), + }; + let updated_year = ScalarValue::Int32(Some(year)); + let lower = cast_year(&lit_value)?; + let upper = cast_year(&updated_year)?; + or( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lit(lower)), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(lit(upper)), + }), + ) + } + _ => return internal_err!("Expect comparison operators"), + }; + Ok(Transformed::yes(rewritten_expr)) } pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary< @@ -80,97 +143,86 @@ pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_b op: Operator, literal: &Expr, ) -> bool { - dbg!(expr, literal, op); // <-- log inputs - - match (expr, literal) { + match (expr, op, literal) { ( Expr::ScalarFunction(ScalarFunction { func, args }), + Operator::Eq + | Operator::NotEq + | Operator::Gt + | Operator::Lt + | Operator::GtEq + | Operator::LtEq, Expr::Literal(lit_val, _), ) if func.name() == "date_part" => { let left_expr = Box::new(args[1].clone()); - let Ok(expr_type) = info.get_data_type(&left_expr) else { return false; }; - - let Ok(lit_type) = info.get_data_type(literal) else { + let Ok(_lit_type) = info.get_data_type(literal) else { return false; }; - if year_literal_to_type_with_op(lit_val, &expr_type, op).is_some() { - return true; - } - - dbg!(&expr_type, &lit_type); // <-- log types and result - - try_cast_literal_to_type(lit_val, &expr_type).is_some() - && is_supported_type(&expr_type) - && is_supported_type(&lit_type) + year_literal_to_type(lit_val, &expr_type).is_some() } _ => false, } } -/// This is just to extract cast the year to the right datatype -fn year_literal_to_type_with_op( +/// Cast the year to the right datatype +fn year_literal_to_type( lit_value: &ScalarValue, target_type: &DataType, - op: Operator, ) -> Option { - match (op, lit_value) { - (Operator::Eq | Operator::NotEq, ScalarValue::Int32(Some(year))) => { - // Can only extract year from Date32/64 and Timestamp - use DataType::*; - if matches!(target_type, Date32 | Date64 | Timestamp(_, _)) { - let naive_date = - NaiveDate::from_ymd_opt(*year, 1, 1).expect("Invalid year"); - - let casted = match target_type { - Date32 => { - let days = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days() as i32; - ScalarValue::Date32(Some(days)) - } - Date64 => { - let milis = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_milliseconds(); - ScalarValue::Date64(Some(milis)) - } - Timestamp(unit, tz) => { - let days = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days(); - match unit { - TimeUnit::Second => ScalarValue::TimestampSecond( - Some(days * 86_400), - tz.clone(), - ), - TimeUnit::Millisecond => ScalarValue::TimestampMillisecond( - Some(days * 86_400_000), - tz.clone(), - ), - TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond( - Some(days * 86_400_000_000), - tz.clone(), - ), - TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond( - Some(days * 86_400_000_000_000), - tz.clone(), - ), - } - } - _ => return None, - }; - - Some(casted) - } else { - None + let year = match lit_value { + ScalarValue::Int32(Some(y)) => *y, + _ => return None, + }; + // Can only extract year from Date32/64 and Timestamp + match target_type { + DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => {} + _ => return None, + } + + let naive_date = NaiveDate::from_ymd_opt(year, 1, 1).expect("Invalid year"); + + let casted = match target_type { + DataType::Date32 => { + let days = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days() as i32; + ScalarValue::Date32(Some(days)) + } + DataType::Date64 => { + let milis = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_milliseconds(); + ScalarValue::Date64(Some(milis)) + } + DataType::Timestamp(unit, tz) => { + let days = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days(); + match unit { + TimeUnit::Second => { + ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()) + } + TimeUnit::Millisecond => { + ScalarValue::TimestampMillisecond(Some(days * 86_400_000), tz.clone()) + } + TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * 86_400_000_000), + tz.clone(), + ), + TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * 86_400_000_000_000), + tz.clone(), + ), } } - _ => None, - } + _ => return None, + }; + + Some(casted) } #[cfg(test)] @@ -179,6 +231,7 @@ mod tests { use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; use datafusion_expr::expr_fn::col; + use datafusion_expr::or; use datafusion_expr::{ and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, Expr, }; @@ -186,17 +239,77 @@ mod tests { use std::{collections::HashMap, sync::Arc}; #[test] - fn test_unwrap_date_part_comparison() { + fn test_preimage_date_part_date32_eq() { let schema = expr_test_schema(); // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 - let expr_lt = expr_fn::date_part(lit("year"), col("c1")).eq(lit(2024i32)); + let expr_lt = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32)); let expected = and( - col("c1").gt_eq(lit(ScalarValue::Date32(Some(19723)))), - col("c1").lt(lit(ScalarValue::Date32(Some(20088)))), + col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_date64_not_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) <> 2024 -> c1 < 2024-01-01 AND c1 >= 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32)); + let expected = or( + col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))), + col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000)))), ); assert_eq!(optimize_test(expr_lt, &schema), expected) } + #[test] + fn test_preimage_date_part_timestamp_nano_lt() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32)); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_nano_utc_gt() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32)); + let expected = col("ts_nano_utc").gt_eq(lit(ScalarValue::TimestampNanosecond( + Some(20089 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_sec_est_gt_eq() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32)); + let expected = col("ts_sec_est").gt_eq(lit(ScalarValue::TimestampSecond( + Some(19723 * 86_400), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_sec_est_lt_eq() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32)); + let expected = col("ts_mic_pt").lt(lit(ScalarValue::TimestampMicrosecond( + Some(20089 * 86_400_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { let props = ExecutionProps::new(); let simplifier = ExprSimplifier::new( @@ -210,10 +323,12 @@ mod tests { Arc::new( DFSchema::from_unqualified_fields( vec![ - Field::new("c1", DataType::Date32, false), - Field::new("c2", DataType::Date64, false), + Field::new("date32", DataType::Date32, false), + Field::new("date64", DataType::Date64, false), Field::new("ts_nano_none", timestamp_nano_none_type(), false), - Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), + Field::new("ts_nano_utc", timestamp_nano_utc_type(), false), + Field::new("ts_sec_est", timestamp_sec_est_type(), false), + Field::new("ts_mic_pt", timestamp_mic_pt_type(), false), ] .into(), HashMap::new(), @@ -222,15 +337,6 @@ mod tests { ) } - // fn lit_timestamp_nano_none(ts: i64) -> Expr { - // lit(ScalarValue::TimestampNanosecond(Some(ts), None)) - // } - - // fn lit_timestamp_nano_utc(ts: i64) -> Expr { - // let utc = Some("+0:00".into()); - // lit(ScalarValue::TimestampNanosecond(Some(ts), utc)) - // } - fn timestamp_nano_none_type() -> DataType { DataType::Timestamp(TimeUnit::Nanosecond, None) } @@ -240,4 +346,14 @@ mod tests { let utc = Some("+0:00".into()); DataType::Timestamp(TimeUnit::Nanosecond, utc) } + + fn timestamp_sec_est_type() -> DataType { + let est = Some("-5:00".into()); + DataType::Timestamp(TimeUnit::Second, est) + } + + fn timestamp_mic_pt_type() -> DataType { + let pt = Some("-8::00".into()); + DataType::Timestamp(TimeUnit::Microsecond, pt) + } } From 1e794c3344bc333de6ece9e46fd7bcb55fbdf8af Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 19 Nov 2025 11:08:32 -0500 Subject: [PATCH 05/27] taplo format --- datafusion/optimizer/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 9ce482d5094b1..d3ab799b0c9b8 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -49,7 +49,7 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } -datafusion-functions = { workspace = true} +datafusion-functions = { workspace = true } datafusion-physical-expr = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } From 2ec0b9a9bf9c5d1f5cd21b9c4395aeb1dbd880e3 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 19 Nov 2025 11:09:49 -0500 Subject: [PATCH 06/27] add op_swap test --- .../src/simplify_expressions/unwrap_date_part.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs index 07f1e11d3a90f..ab4f11dd39881 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs @@ -310,6 +310,18 @@ mod tests { assert_eq!(optimize_test(expr_lt, &schema), expected) } + #[test] + fn test_preimage_date_part_timestamp_nano_lt_swap() { + let schema = expr_test_schema(); + let expr_lt = + lit(2024i32).gt(expr_fn::date_part(lit("year"), col("ts_nano_none"))); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { let props = ExecutionProps::new(); let simplifier = ExprSimplifier::new( From 630f5c5b20218efa2f9f11ebdfd91a2756d7c6c6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 19 Nov 2025 11:10:30 -0500 Subject: [PATCH 07/27] Fix comment --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 8e2e93c029a22..8579abe69dfcc 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1967,7 +1967,7 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // ======================================= // // For case: - // try_cast/cast(expr as data_type) op literal + // date_part(expr as data_type) op literal Expr::BinaryExpr(BinaryExpr { left, op, right }) if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary( info, &left, op, &right, @@ -1975,9 +1975,9 @@ impl TreeNodeRewriter for Simplifier<'_, S> { { unwrap_date_part_in_comparison_for_binary(info, *left, *right, op)? } - // literal op try_cast/cast(expr as data_type) + // literal op date_part(literal, expression) // --> - // try_cast/cast(expr as data_type) op_swap literal + // date_part(literal, expression) op_swap literal Expr::BinaryExpr(BinaryExpr { left, op, right }) if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary( info, &right, op, &left, From fbcfb9704a03bba43fd48aec28449ac291c828f6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 19 Nov 2025 16:13:23 -0500 Subject: [PATCH 08/27] commented out attempt for inlist support --- .../simplify_expressions/expr_simplifier.rs | 50 ++++++++++++++++++- .../simplify_expressions/unwrap_date_part.rs | 42 ++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 8579abe69dfcc..9c5e2ab36f256 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -49,7 +49,7 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::simplify_expressions::guarantees::GuaranteeRewriter; +use crate::simplify_expressions::{guarantees::GuaranteeRewriter}; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary, @@ -1992,6 +1992,54 @@ impl TreeNodeRewriter for Simplifier<'_, S> { )? } + // // For case: + // // try_cast/cast(expr as left_type) in (expr1,expr2,expr3) + // Expr::InList(InList { + // expr: mut left, + // list, + // negated, + // }) if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_inlist( + // info, &left, &list, + // ) => + // { + // let Expr::ScalarFunction(ScalarFunction { + // func, args + // }) = left.as_mut() + // else { + // return internal_err!("Expect scalar function expression, but got {:?}", left)?; + // }; + // let left_expr = Box::new(args[1].clone()); + // let expr_type = info.get_data_type(&left_expr)?; + // let right_exprs = list + // .into_iter() + // .map(|right| { + // match right { + // Expr::Literal(right_lit_value, _) => { + // // if the right_lit_value can be casted to the type of internal_left_expr + // // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal + // let Some(value) = try_cast_literal_to_type(&right_lit_value, &expr_type) else { + // internal_err!( + // "Can't cast the list expr {:?} to type {}", + // right_lit_value, &expr_type + // )? + // }; + // Ok(lit(value)) + // } + // other_expr => internal_err!( + // "Only support literal expr to optimize, but the expr is {:?}", + // &other_expr + // ), + // } + // }) + // .collect::>>()?; + + // Transformed::yes(Expr::InList(InList { + // expr: std::mem::take(&mut: left_expr), + // list: right_exprs, + // negated, + // })) + // } + // no additional rewrites possible expr => Transformed::no(expr), }) diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs index ab4f11dd39881..d9baac50c5a2f 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs +++ b/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs @@ -168,6 +168,34 @@ pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_b } } +// pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_inlist< +// S: SimplifyInfo, +// >( +// info: &S, +// expr: &Expr, +// list: &[Expr], +// ) -> bool { +// match expr { +// Expr::ScalarFunction(ScalarFunction { func, args }) +// if func.name() == "date_part" => +// { +// let left_expr = Box::new(args[1].clone()); +// let Ok(expr_type) = info.get_data_type(&left_expr) else { +// return false; +// }; +// for right in list { +// match right { +// Expr::Literal(lit_val, _) +// if year_literal_to_type(lit_val, &expr_type).is_some() => {} +// _ => return false, +// } +// } +// true +// } +// _ => false, +// } +// } + /// Cast the year to the right datatype fn year_literal_to_type( lit_value: &ScalarValue, @@ -322,6 +350,20 @@ mod tests { assert_eq!(optimize_test(expr_lt, &schema), expected) } + // #[test] + // fn test_preimage_date_part_date32_in_list() { + // let schema = expr_test_schema(); + // let expr_lt = expr_fn::date_part(lit("year"), col("date32")) + // .in_list(vec![lit(2024i32), lit(1984i32)], false); + // let expected = (col("date32") + // .gt_eq(lit(ScalarValue::Date32(Some(19723)))) + // .or(col("date32").lt(lit(ScalarValue::Date32(Some(20089)))))) + // .or(col("date32") + // .gt_eq(lit(ScalarValue::Date32(Some(5113)))) + // .or(col("date32").lt(lit(ScalarValue::Date32(Some(5480)))))); + // assert_eq!(optimize_test(expr_lt, &schema), expected) + // } + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { let props = ExecutionProps::new(); let simplifier = ExprSimplifier::new( From 2b54409a9fab3ee63bac1e210836a64c571ce803 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 20 Nov 2025 11:41:16 -0500 Subject: [PATCH 09/27] restructure the changes --- datafusion/expr/src/udf.rs | 20 +++ .../functions/src/datetime/date_part.rs | 71 +++++++- .../simplify_expressions/expr_simplifier.rs | 25 ++- .../optimizer/src/simplify_expressions/mod.rs | 2 +- .../{unwrap_date_part.rs => udf_preimage.rs} | 163 ++++++------------ 5 files changed, 159 insertions(+), 122 deletions(-) rename datafusion/optimizer/src/simplify_expressions/{unwrap_date_part.rs => udf_preimage.rs} (71%) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 449ddf59094a0..bf46c35f07980 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -31,6 +31,7 @@ use datafusion_common::{ }; use datafusion_expr_common::dyn_eq::{DynEq, DynHash}; use datafusion_expr_common::interval_arithmetic::Interval; +use datafusion_expr_common::operator::Operator; use std::any::Any; use std::cmp::Ordering; use std::fmt::Debug; @@ -696,6 +697,16 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { Ok(ExprSimplifyResult::Original(args)) } + /// Applies simplification on a predicate expression to get a preimage expression + fn preimage_cast( + &self, + _lit_value: &ScalarValue, + _target_type: &DataType, + _op: Operator, + ) -> Option { + None + } + /// Returns true if some of this `exprs` subexpressions may not be evaluated /// and thus any side effects (like divide by zero) may not be encountered. /// @@ -926,6 +937,15 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.inner.simplify(args, info) } + fn preimage_cast( + &self, + lit_value: &ScalarValue, + target_type: &DataType, + op: Operator, + ) -> Option { + self.inner.preimage_cast(lit_value, target_type, op) + } + fn conditional_arguments<'a>( &self, args: &'a [Expr], diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index aa23a5028dd81..ec6923cb63227 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -27,6 +27,7 @@ use arrow::datatypes::DataType::{ }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; +use chrono::NaiveDate; use datafusion_common::types::{logical_date, NativeType}; use datafusion_common::{ @@ -42,7 +43,7 @@ use datafusion_common::{ Result, ScalarValue, }; use datafusion_expr::{ - ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, + ColumnarValue, Documentation, Operator, ReturnFieldArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; @@ -231,6 +232,74 @@ impl ScalarUDFImpl for DatePartFunc { }) } + /// Cast the year to the right datatype + fn preimage_cast( + &self, + lit_value: &ScalarValue, + target_type: &DataType, + op: Operator, + ) -> Option { + let year = match lit_value { + ScalarValue::Int32(Some(y)) => *y, + _ => return None, + }; + // Can only extract year from Date32/64 and Timestamp + match target_type { + Date32 | Date64 | Timestamp(_, _) => {} + _ => return None, + } + + let updated_year = match op { + Operator::Gt | Operator::LtEq => year + 1, + Operator::Lt | Operator::GtEq => year, + Operator::Eq | Operator::NotEq => year, // This is to pass the is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary + _ => return None, + }; + + let naive_date = + NaiveDate::from_ymd_opt(updated_year, 1, 1).expect("Invalid year"); + + let casted = match target_type { + Date32 => { + let days = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days() as i32; + ScalarValue::Date32(Some(days)) + } + Date64 => { + let milis = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_milliseconds(); + ScalarValue::Date64(Some(milis)) + } + Timestamp(unit, tz) => { + let days = naive_date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days(); + match unit { + Second => { + ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()) + } + Millisecond => ScalarValue::TimestampMillisecond( + Some(days * 86_400_000), + tz.clone(), + ), + Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * 86_400_000_000), + tz.clone(), + ), + Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * 86_400_000_000_000), + tz.clone(), + ), + } + } + _ => return None, + }; + + Some(casted) + } + fn aliases(&self) -> &[String] { &self.aliases } diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 9c5e2ab36f256..dff3e5ab9a26c 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -49,7 +49,7 @@ use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionP use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; -use crate::simplify_expressions::{guarantees::GuaranteeRewriter}; +use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::unwrap_cast::{ is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary, @@ -57,11 +57,11 @@ use crate::simplify_expressions::unwrap_cast::{ unwrap_cast_in_comparison_for_binary, }; use crate::simplify_expressions::SimplifyInfo; -use crate::{ - analyzer::type_coercion::TypeCoercionRewriter, - simplify_expressions::unwrap_date_part::{ - is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary, - unwrap_date_part_in_comparison_for_binary, +use crate::simplify_expressions::{ + guarantees::GuaranteeRewriter, + udf_preimage::{ + is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary, + preimage_in_comparison_for_binary, }, }; use datafusion_expr_common::casts::try_cast_literal_to_type; @@ -1969,22 +1969,21 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // For case: // date_part(expr as data_type) op literal Expr::BinaryExpr(BinaryExpr { left, op, right }) - if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary( + if is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary( info, &left, op, &right, - ) && op.supports_propagation() => + ) => { - unwrap_date_part_in_comparison_for_binary(info, *left, *right, op)? + preimage_in_comparison_for_binary(info, *left, *right, op)? } // literal op date_part(literal, expression) // --> // date_part(literal, expression) op_swap literal Expr::BinaryExpr(BinaryExpr { left, op, right }) - if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary( + if is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary( info, &right, op, &left, - ) && op.supports_propagation() - && op.swap().is_some() => + ) && op.swap().is_some() => { - unwrap_date_part_in_comparison_for_binary( + preimage_in_comparison_for_binary( info, *right, *left, diff --git a/datafusion/optimizer/src/simplify_expressions/mod.rs b/datafusion/optimizer/src/simplify_expressions/mod.rs index 3ea75bd2e9d86..c5d7ff14f379d 100644 --- a/datafusion/optimizer/src/simplify_expressions/mod.rs +++ b/datafusion/optimizer/src/simplify_expressions/mod.rs @@ -24,8 +24,8 @@ mod inlist_simplifier; mod regex; pub mod simplify_exprs; mod simplify_predicates; +mod udf_preimage; mod unwrap_cast; -mod unwrap_date_part; mod utils; // backwards compatibility diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs similarity index 71% rename from datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs rename to datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index d9baac50c5a2f..01d64a15c911f 100644 --- a/datafusion/optimizer/src/simplify_expressions/unwrap_date_part.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -15,27 +15,24 @@ // specific language governing permissions and limitations // under the License. -use arrow::datatypes::{DataType, TimeUnit}; -use chrono::NaiveDate; -use datafusion_common::{ - internal_err, tree_node::Transformed, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{internal_err, tree_node::Transformed, Result}; use datafusion_expr::{ and, expr::ScalarFunction, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, - Operator, + Operator, ScalarUDFImpl, }; +use datafusion_functions::datetime::date_part::DatePartFunc; -pub(super) fn unwrap_date_part_in_comparison_for_binary( - info: &S, - cast_expr: Expr, +pub(super) fn preimage_in_comparison_for_binary( + info: &dyn SimplifyInfo, + udf_expr: Expr, literal: Expr, op: Operator, ) -> Result> { - let (args, lit_value) = match (cast_expr, literal) { + let (func, args, lit_value) = match (udf_expr, literal) { ( Expr::ScalarFunction(ScalarFunction { func, args }), Expr::Literal(lit_value, _), - ) if func.name() == "date_part" => (args, lit_value), + ) => (func, args, lit_value), _ => return internal_err!("Expect date_part expr and literal"), }; let expr = Box::new(args[1].clone()); @@ -44,18 +41,17 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( return internal_err!("Can't get the data type of the expr {:?}", &expr); }; - // Helper to cast literal - let cast_year = |updated_year: &ScalarValue| -> Result { - year_literal_to_type(updated_year, &expr_type).ok_or_else(|| { - DataFusionError::Internal(format!( - "Can't cast {lit_value} to type {expr_type}" - )) - }) + let preimage_func = match func.name() { + "date_part" => DatePartFunc::new(), + _ => return internal_err!("Preimage is not supported for {:?}", func.name()), }; let rewritten_expr = match op { Operator::Lt | Operator::GtEq => { - let v = cast_year(&lit_value)?; + let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), + }; Expr::BinaryExpr(BinaryExpr { left: expr, op, @@ -63,12 +59,10 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( }) } Operator::Gt => { - let year = match lit_value { - ScalarValue::Int32(Some(y)) => y + 1, - _ => return internal_err!("Expected Int32 Literal"), + let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), }; - let updated_year = ScalarValue::Int32(Some(year)); - let v = cast_year(&updated_year)?; Expr::BinaryExpr(BinaryExpr { left: expr, op: Operator::GtEq, @@ -76,12 +70,10 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( }) } Operator::LtEq => { - let year = match lit_value { - ScalarValue::Int32(Some(y)) => y + 1, - _ => return internal_err!("Expected Int32 Literal"), + let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), }; - let updated_year = ScalarValue::Int32(Some(year)); - let v = cast_year(&updated_year)?; Expr::BinaryExpr(BinaryExpr { left: expr, op: Operator::Lt, @@ -89,13 +81,18 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( }) } Operator::Eq => { - let year = match lit_value { - ScalarValue::Int32(Some(y)) => y + 1, - _ => return internal_err!("Expected Int32 Literal"), - }; - let updated_year = ScalarValue::Int32(Some(year)); - let lower = cast_year(&lit_value)?; - let upper = cast_year(&updated_year)?; + let lower = + match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::GtEq) + { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), + }; + let upper = + match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::LtEq) + { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), + }; and( Expr::BinaryExpr(BinaryExpr { left: expr.clone(), @@ -110,13 +107,16 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( ) } Operator::NotEq => { - let year = match lit_value { - ScalarValue::Int32(Some(y)) => y + 1, - _ => return internal_err!("Expected Int32 Literal"), - }; - let updated_year = ScalarValue::Int32(Some(year)); - let lower = cast_year(&lit_value)?; - let upper = cast_year(&updated_year)?; + let lower = + match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::Lt) { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), + }; + let upper = + match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::Gt) { + Some(v) => v, + None => return internal_err!("Preimage cast did work"), + }; or( Expr::BinaryExpr(BinaryExpr { left: expr.clone(), @@ -135,7 +135,7 @@ pub(super) fn unwrap_date_part_in_comparison_for_binary( Ok(Transformed::yes(rewritten_expr)) } -pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_binary< +pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< S: SimplifyInfo, >( info: &S, @@ -143,7 +143,7 @@ pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_b op: Operator, literal: &Expr, ) -> bool { - match (expr, op, literal) { + let (func, args, lit_value) = match (expr, op, literal) { ( Expr::ScalarFunction(ScalarFunction { func, args }), Operator::Eq @@ -152,17 +152,23 @@ pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_b | Operator::Lt | Operator::GtEq | Operator::LtEq, - Expr::Literal(lit_val, _), - ) if func.name() == "date_part" => { - let left_expr = Box::new(args[1].clone()); + Expr::Literal(lit_value, _), + ) => (func, args, lit_value), + _ => return false, + }; + + match func.name() { + "date_part" => { + let left_expr = Box::new(args[1].clone()); // len args is variable and the position of args can vary too let Ok(expr_type) = info.get_data_type(&left_expr) else { return false; }; let Ok(_lit_type) = info.get_data_type(literal) else { return false; }; - - year_literal_to_type(lit_val, &expr_type).is_some() + DatePartFunc::new() + .preimage_cast(lit_value, &expr_type, op) + .is_some() } _ => false, } @@ -196,63 +202,6 @@ pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_b // } // } -/// Cast the year to the right datatype -fn year_literal_to_type( - lit_value: &ScalarValue, - target_type: &DataType, -) -> Option { - let year = match lit_value { - ScalarValue::Int32(Some(y)) => *y, - _ => return None, - }; - // Can only extract year from Date32/64 and Timestamp - match target_type { - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) => {} - _ => return None, - } - - let naive_date = NaiveDate::from_ymd_opt(year, 1, 1).expect("Invalid year"); - - let casted = match target_type { - DataType::Date32 => { - let days = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days() as i32; - ScalarValue::Date32(Some(days)) - } - DataType::Date64 => { - let milis = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_milliseconds(); - ScalarValue::Date64(Some(milis)) - } - DataType::Timestamp(unit, tz) => { - let days = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days(); - match unit { - TimeUnit::Second => { - ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()) - } - TimeUnit::Millisecond => { - ScalarValue::TimestampMillisecond(Some(days * 86_400_000), tz.clone()) - } - TimeUnit::Microsecond => ScalarValue::TimestampMicrosecond( - Some(days * 86_400_000_000), - tz.clone(), - ), - TimeUnit::Nanosecond => ScalarValue::TimestampNanosecond( - Some(days * 86_400_000_000_000), - tz.clone(), - ), - } - } - _ => return None, - }; - - Some(casted) -} - #[cfg(test)] mod tests { use crate::simplify_expressions::ExprSimplifier; From 3260ee9313f317645306f9ae0e959ac7f18a17e2 Mon Sep 17 00:00:00 2001 From: Kosta Tarasov <33369833+sdf-jkl@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:50:52 -0500 Subject: [PATCH 10/27] Update datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs Co-authored-by: Yongting You <2010youy01@gmail.com> --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index dff3e5ab9a26c..0a6e449ab37ec 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1968,6 +1968,12 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // // For case: // date_part(expr as data_type) op literal + // + // Background: + // Datasources such as Parquet can prune partitions using simple predicates, + // but they cannot do so for complex expressions. + // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. + // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. Expr::BinaryExpr(BinaryExpr { left, op, right }) if is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary( info, &left, op, &right, From d011b5115085b6b6bf0e146b263595c23af0f010 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Thu, 20 Nov 2025 13:28:26 -0500 Subject: [PATCH 11/27] Fix merge error --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index a441241b5609f..8a8e8c379595a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -58,12 +58,9 @@ use crate::simplify_expressions::unwrap_cast::{ unwrap_cast_in_comparison_for_binary, }; use crate::simplify_expressions::SimplifyInfo; -use crate::simplify_expressions::{ - guarantees::GuaranteeRewriter, - udf_preimage::{ +use crate::simplify_expressions::udf_preimage::{ is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary, preimage_in_comparison_for_binary, - }, }; use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map; use datafusion_expr_common::casts::try_cast_literal_to_type; From cd390df379b89ba53e218fd429fd5b13d9337d0d Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 21 Nov 2025 12:04:57 -0500 Subject: [PATCH 12/27] cargo fmt plus some fixes --- .../simplify_expressions/expr_simplifier.rs | 8 +++--- .../src/simplify_expressions/udf_preimage.rs | 28 ++++++++++++++----- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 8a8e8c379595a..89338f6971b53 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -52,16 +52,16 @@ use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; +use crate::simplify_expressions::udf_preimage::{ + is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary, + preimage_in_comparison_for_binary, +}; use crate::simplify_expressions::unwrap_cast::{ is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary, is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist, unwrap_cast_in_comparison_for_binary, }; use crate::simplify_expressions::SimplifyInfo; -use crate::simplify_expressions::udf_preimage::{ - is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary, - preimage_in_comparison_for_binary, -}; use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map; use datafusion_expr_common::casts::try_cast_literal_to_type; use indexmap::IndexSet; diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 01d64a15c911f..efd52a19a8b7f 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -50,7 +50,9 @@ pub(super) fn preimage_in_comparison_for_binary( Operator::Lt | Operator::GtEq => { let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; Expr::BinaryExpr(BinaryExpr { left: expr, @@ -61,7 +63,9 @@ pub(super) fn preimage_in_comparison_for_binary( Operator::Gt => { let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; Expr::BinaryExpr(BinaryExpr { left: expr, @@ -72,7 +76,9 @@ pub(super) fn preimage_in_comparison_for_binary( Operator::LtEq => { let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; Expr::BinaryExpr(BinaryExpr { left: expr, @@ -85,13 +91,17 @@ pub(super) fn preimage_in_comparison_for_binary( match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::GtEq) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; let upper = match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::LtEq) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; and( Expr::BinaryExpr(BinaryExpr { @@ -110,12 +120,16 @@ pub(super) fn preimage_in_comparison_for_binary( let lower = match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::Lt) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; let upper = match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::Gt) { Some(v) => v, - None => return internal_err!("Preimage cast did work"), + None => { + return internal_err!("Could not cast literal to the column type") + } }; or( Expr::BinaryExpr(BinaryExpr { From 36bb529c21051d7134bb87ef64046be1d0c42492 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 21 Nov 2025 12:06:37 -0500 Subject: [PATCH 13/27] Add sqllogictest --- .../sqllogictest/test_files/udf_preimage.slt | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/udf_preimage.slt diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt new file mode 100644 index 0000000000000..db91913638b8a --- /dev/null +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -0,0 +1,159 @@ +# Licensed to the Apache Software Foundation (asF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The asF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "as IS" BasIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +############################################### +# ScalarUDF predicate expression simplification +############################################### + +############################ +# date_part(year, col) tests +############################ +statement ok +create table t( + c1_date32 DATE, + c2_ts_sec timestamp, + c3_ts_mili timestamp, + c4_ts_micro timestamp, + c5_ts_nano timestamp +) as VALUES + ('2024-01-01', + '2024-01-01T00:00:00'::timestamp, + '2024-01-01T00:00:00.123'::timestamp, + '2024-01-01T00:00:00.123456'::timestamp, + '2024-01-01T00:00:00.123456789'::timestamp), + ('1990-05-20', + '1990-05-20T00:00:10'::timestamp, + '1990-05-20T00:00:10.987'::timestamp, + '1990-05-20T00:00:10.987654'::timestamp, + '1990-05-20T00:00:10.987654321'::timestamp), + ('2030-12-31', + '2030-12-31T23:59:59'::timestamp, + '2030-12-31T23:59:59.001'::timestamp, + '2030-12-31T23:59:59.001234'::timestamp, + '2030-12-31T23:59:59.001234567'::timestamp) +; + + +# Explain eq +query TT +explain select c1_date32 from t where extract (year from c1_date32) = 2024 +---- +logical_plan +01)Filter: t.c1_date32 >= Date32("2024-01-01") AND t.c1_date32 < Date32("2025-01-01") +02)--TableScan: t projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +## eq +query DPPPP +select + (select c1_date32 from t where extract(year from c1_date32) = 2024), + (select c2_ts_sec from t where extract(year from c2_ts_sec) = 2024), + (select c3_ts_mili from t where extract(year from c3_ts_mili) = 2024), + (select c4_ts_micro from t where extract(year from c4_ts_micro) = 2024), + (select c5_ts_nano from t where extract(year from c5_ts_nano) = 2024); +---- +2024-01-01 2024-01-01T00:00:00 2024-01-01T00:00:00.123 2024-01-01T00:00:00.123456 2024-01-01T00:00:00.123456789 + +# Explain not_eq +query TT +explain select c1_date32 from t where extract (year from c1_date32) <> 2024 +---- +logical_plan +01)Filter: t.c1_date32 < Date32("2024-01-01") OR t.c1_date32 >= Date32("2025-01-01") +02)--TableScan: t projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +## not_eq +query D +select c1_date32 from t where extract(year from c1_date32) <> 2024; +---- +1990-05-20 +2030-12-31 + +# Explain gt +query TT +explain select c2_ts_sec from t where extract (year from c2_ts_sec) > 2024 +---- +logical_plan +01)Filter: t.c2_ts_sec >= TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t projection=[c2_ts_sec] +physical_plan +01)FilterExec: c2_ts_sec@0 >= 1735689600000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +## gt +query P +select c2_ts_sec from t where extract(year from c2_ts_sec) > 2024; +---- +2030-12-31T23:59:59 + +# Explain lt +query TT +explain select c3_ts_mili from t where extract (year from c3_ts_mili) < 2024 +---- +logical_plan +01)Filter: t.c3_ts_mili < TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t projection=[c3_ts_mili] +physical_plan +01)FilterExec: c3_ts_mili@0 < 1704067200000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +## lt +query P +select c3_ts_mili from t where extract(year from c3_ts_mili) < 2024; +---- +1990-05-20T00:00:10.987 + +# Explain gt_eq +query TT +explain select c4_ts_micro from t where extract (year from c4_ts_micro) >= 2024 +---- +logical_plan +01)Filter: t.c4_ts_micro >= TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t projection=[c4_ts_micro] +physical_plan +01)FilterExec: c4_ts_micro@0 >= 1704067200000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +## gt_eq +query P +select c4_ts_micro from t where extract(year from c4_ts_micro) >= 2024; +---- +2024-01-01T00:00:00.123456 +2030-12-31T23:59:59.001234 + +# Explain lt_eq +query TT +explain select c5_ts_nano from t where extract (year from c5_ts_nano) <= 2024 +---- +logical_plan +01)Filter: t.c5_ts_nano < TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t projection=[c5_ts_nano] +physical_plan +01)FilterExec: c5_ts_nano@0 < 1735689600000000000 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +## lt_eq +query P +select c5_ts_nano from t where extract(year from c5_ts_nano) <= 2024; +---- +2024-01-01T00:00:00.123456789 +1990-05-20T00:00:10.987654321 From 7c4dd9c66c43f16c8f7067794944cd4cd0624d55 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 21 Nov 2025 13:46:44 -0500 Subject: [PATCH 14/27] Add other than year date_part tests --- .../src/simplify_expressions/udf_preimage.rs | 24 +++++++++++++++++-- .../sqllogictest/test_files/udf_preimage.slt | 11 +++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index efd52a19a8b7f..36536030ac485 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -use datafusion_common::{internal_err, tree_node::Transformed, Result}; +use std::str::FromStr; + +use arrow::compute::kernels::cast_utils::IntervalUnit; +use datafusion_common::{Result, ScalarValue, internal_err, tree_node::Transformed}; use datafusion_expr::{ and, expr::ScalarFunction, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, Operator, ScalarUDFImpl, @@ -173,7 +176,14 @@ pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< match func.name() { "date_part" => { - let left_expr = Box::new(args[1].clone()); // len args is variable and the position of args can vary too + let left_expr = Box::new(args[1].clone()); + let Some(ScalarValue::Utf8(Some(part))) = args[0].as_literal() else { + return false; + }; + match IntervalUnit::from_str(part) { + Ok(IntervalUnit::Year) => {}, + _ => return false + }; let Ok(expr_type) = info.get_data_type(&left_expr) else { return false; }; @@ -313,6 +323,16 @@ mod tests { assert_eq!(optimize_test(expr_lt, &schema), expected) } + #[test] + // Should not try to simplify + fn test_preimage_date_part_not_year_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + let expected = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + // #[test] // fn test_preimage_date_part_date32_in_list() { // let schema = expr_test_schema(); diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt index db91913638b8a..f630167196d87 100644 --- a/datafusion/sqllogictest/test_files/udf_preimage.slt +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -70,6 +70,17 @@ select ---- 2024-01-01 2024-01-01T00:00:00 2024-01-01T00:00:00.123 2024-01-01T00:00:00.123456 2024-01-01T00:00:00.123456789 +# Explain eq not year +query TT +explain select c1_date32 from t where extract (month from c1_date32) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MONTH"), t.c1_date32) = Int32(2024) +02)--TableScan: t projection=[c1_date32] +physical_plan +01)FilterExec: date_part(MONTH, c1_date32@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + # Explain not_eq query TT explain select c1_date32 from t where extract (year from c1_date32) <> 2024 From cdb9cb617775161af6b561d60939f7e27fcda414 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 21 Nov 2025 13:48:03 -0500 Subject: [PATCH 15/27] Fix docs --- datafusion/expr/src/udf.rs | 21 ++++++++++++++++++- .../functions/src/datetime/date_part.rs | 7 +++++-- .../simplify_expressions/expr_simplifier.rs | 4 ++-- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index bf46c35f07980..ee3ae0fc6f1bf 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -697,7 +697,26 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { Ok(ExprSimplifyResult::Original(args)) } - /// Applies simplification on a predicate expression to get a preimage expression + /// Attempts to convert a literal value to the corresponding datatype + /// of a column expression so that a **preimage** can be computed for + /// pruning comparison predicates. + /// + /// This is used during predicate-pushdown optimization + /// (see `datafusion-optimizer-udf_preimage::preimage_in_comparison_for_binary`) + /// + /// Currently is only implemented by: + /// - `date_part(YEAR, expr)` + /// + /// # Arguments: + /// * `lit_value`: The literal `&ScalarValue` used in comparison + /// * `target_type`: The datatype of the column expression inside the function + /// * `op`: The comparison `Operator` (e.g. `=`, `<`, `>=`). + /// + /// # Returns + /// + /// Returns a `ScalarValue` converted to the appropriate target type if a + /// preimage cast is supported for the given function/operator combination; + /// otherwise returns `None`. fn preimage_cast( &self, _lit_value: &ScalarValue, diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index ec6923cb63227..88f7806e5e3a8 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -232,7 +232,10 @@ impl ScalarUDFImpl for DatePartFunc { }) } - /// Cast the year to the right datatype + // Only casting the year is supported since pruning other IntervalUnit is not possible + // date_part(col, YEAR) = 2024 => col >= '2024-01-01' and col < '2025-01-01' + // But for anything less than YEAR simplifying is not possible without specifying the bigger interval + // date_part(col, MONTH) = 1 => col = '2023-01-01' or col = '2024-01-01' or ... or col = '3000-01-01' fn preimage_cast( &self, lit_value: &ScalarValue, @@ -257,7 +260,7 @@ impl ScalarUDFImpl for DatePartFunc { }; let naive_date = - NaiveDate::from_ymd_opt(updated_year, 1, 1).expect("Invalid year"); + NaiveDate::from_ymd_opt(updated_year, 1, 1)?; let casted = match target_type { Date32 => { diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 89338f6971b53..8c00571f66b1a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1965,11 +1965,11 @@ impl TreeNodeRewriter for Simplifier<'_, S> { } // ======================================= - // unwrap_date_part_in_comparison + // preimage_in_comparison // ======================================= // // For case: - // date_part(expr as data_type) op literal + // date_part(expr as 'YEAR') op literal // // Background: // Datasources such as Parquet can prune partitions using simple predicates, From 57ee667f665fed07b7b9c52aa47426bcbf7e2272 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 21 Nov 2025 13:54:21 -0500 Subject: [PATCH 16/27] cargo fmt --- datafusion/expr/src/udf.rs | 8 ++++---- datafusion/functions/src/datetime/date_part.rs | 3 +-- .../optimizer/src/simplify_expressions/udf_preimage.rs | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index ee3ae0fc6f1bf..66240d90d6fb7 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -700,18 +700,18 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// Attempts to convert a literal value to the corresponding datatype /// of a column expression so that a **preimage** can be computed for /// pruning comparison predicates. - /// + /// /// This is used during predicate-pushdown optimization /// (see `datafusion-optimizer-udf_preimage::preimage_in_comparison_for_binary`) - /// + /// /// Currently is only implemented by: /// - `date_part(YEAR, expr)` - /// + /// /// # Arguments: /// * `lit_value`: The literal `&ScalarValue` used in comparison /// * `target_type`: The datatype of the column expression inside the function /// * `op`: The comparison `Operator` (e.g. `=`, `<`, `>=`). - /// + /// /// # Returns /// /// Returns a `ScalarValue` converted to the appropriate target type if a diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 88f7806e5e3a8..9d07248a2a982 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -259,8 +259,7 @@ impl ScalarUDFImpl for DatePartFunc { _ => return None, }; - let naive_date = - NaiveDate::from_ymd_opt(updated_year, 1, 1)?; + let naive_date = NaiveDate::from_ymd_opt(updated_year, 1, 1)?; let casted = match target_type { Date32 => { diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 36536030ac485..fc1c54a01f764 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -18,7 +18,7 @@ use std::str::FromStr; use arrow::compute::kernels::cast_utils::IntervalUnit; -use datafusion_common::{Result, ScalarValue, internal_err, tree_node::Transformed}; +use datafusion_common::{internal_err, tree_node::Transformed, Result, ScalarValue}; use datafusion_expr::{ and, expr::ScalarFunction, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, Operator, ScalarUDFImpl, @@ -181,8 +181,8 @@ pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< return false; }; match IntervalUnit::from_str(part) { - Ok(IntervalUnit::Year) => {}, - _ => return false + Ok(IntervalUnit::Year) => {} + _ => return false, }; let Ok(expr_type) = info.get_data_type(&left_expr) else { return false; From bb507924997b87171699f0603da9e226bc4953eb Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 21 Nov 2025 15:44:07 -0500 Subject: [PATCH 17/27] Cargo fmt + doc changes --- .../simplify_expressions/expr_simplifier.rs | 4 +-- .../src/simplify_expressions/udf_preimage.rs | 27 +++++++++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 8c00571f66b1a..f43cb8ed2eafa 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -2000,12 +2000,12 @@ impl TreeNodeRewriter for Simplifier<'_, S> { } // // For case: - // // try_cast/cast(expr as left_type) in (expr1,expr2,expr3) + // // try_cast/cast(expr as left_type) in (lit1, lit2, lit3) // Expr::InList(InList { // expr: mut left, // list, // negated, - // }) if is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_inlist( + // }) if is_scalar_udf_expr_and_support_preimage_in_comparison_for_inlist( // info, &left, &list, // ) => // { diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index fc1c54a01f764..1a915327720e7 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -198,25 +198,36 @@ pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< } } -// pub(super) fn is_date_part_expr_and_support_unwrap_date_part_in_comparison_for_inlist< +// pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_inlist< // S: SimplifyInfo, // >( // info: &S, // expr: &Expr, // list: &[Expr], // ) -> bool { -// match expr { -// Expr::ScalarFunction(ScalarFunction { func, args }) -// if func.name() == "date_part" => -// { +// let (func, args) = match expr { +// Expr::ScalarFunction(ScalarFunction { func, args }) => (func, args), +// _ => return false, +// }; +// match func.name() { +// "date_part" => { // let left_expr = Box::new(args[1].clone()); +// let Some(ScalarValue::Utf8(Some(part))) = args[0].as_literal() else { +// return false; +// }; +// match IntervalUnit::from_str(part) { +// Ok(IntervalUnit::Year) => {} +// _ => return false, +// }; // let Ok(expr_type) = info.get_data_type(&left_expr) else { // return false; // }; // for right in list { // match right { -// Expr::Literal(lit_val, _) -// if year_literal_to_type(lit_val, &expr_type).is_some() => {} +// Expr::Literal(lit_value, _) +// if DatePartFunc::new() +// .preimage_cast(lit_value, &expr_type, Operator::Eq) +// .is_some() => {} // _ => return false, // } // } @@ -324,7 +335,7 @@ mod tests { } #[test] - // Should not try to simplify + // Should not simplify fn test_preimage_date_part_not_year_date32_eq() { let schema = expr_test_schema(); // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 From 729951bff5e0b1f21cad69a0eef1c82bc3faa339 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 1 Dec 2025 13:02:24 -0500 Subject: [PATCH 18/27] Rewrite logic --- datafusion/expr/src/udf.rs | 18 +-- .../functions/src/datetime/date_part.rs | 100 ++++++------- .../src/simplify_expressions/udf_preimage.rs | 139 ++++++------------ 3 files changed, 98 insertions(+), 159 deletions(-) diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 66240d90d6fb7..220fe7b823404 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -31,7 +31,6 @@ use datafusion_common::{ }; use datafusion_expr_common::dyn_eq::{DynEq, DynHash}; use datafusion_expr_common::interval_arithmetic::Interval; -use datafusion_expr_common::operator::Operator; use std::any::Any; use std::cmp::Ordering; use std::fmt::Debug; @@ -697,7 +696,7 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { Ok(ExprSimplifyResult::Original(args)) } - /// Attempts to convert a literal value to the corresponding datatype + /// Attempts to convert a literal value to in interval of the corresponding datatype /// of a column expression so that a **preimage** can be computed for /// pruning comparison predicates. /// @@ -710,19 +709,17 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// # Arguments: /// * `lit_value`: The literal `&ScalarValue` used in comparison /// * `target_type`: The datatype of the column expression inside the function - /// * `op`: The comparison `Operator` (e.g. `=`, `<`, `>=`). /// /// # Returns /// - /// Returns a `ScalarValue` converted to the appropriate target type if a + /// Returns an `Interval` of the appropriate target type if a /// preimage cast is supported for the given function/operator combination; /// otherwise returns `None`. - fn preimage_cast( + fn preimage( &self, _lit_value: &ScalarValue, _target_type: &DataType, - _op: Operator, - ) -> Option { + ) -> Option { None } @@ -956,13 +953,12 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.inner.simplify(args, info) } - fn preimage_cast( + fn preimage( &self, lit_value: &ScalarValue, target_type: &DataType, - op: Operator, - ) -> Option { - self.inner.preimage_cast(lit_value, target_type, op) + ) -> Option { + self.inner.preimage(lit_value, target_type) } fn conditional_arguments<'a>( diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 9d07248a2a982..5a776ff8e771e 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -27,7 +27,7 @@ use arrow::datatypes::DataType::{ }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; -use chrono::NaiveDate; +use chrono::{Datelike, NaiveDate}; use datafusion_common::types::{logical_date, NativeType}; use datafusion_common::{ @@ -42,8 +42,9 @@ use datafusion_common::{ utils::take_function_args, Result, ScalarValue, }; +use datafusion_expr::interval_arithmetic; use datafusion_expr::{ - ColumnarValue, Documentation, Operator, ReturnFieldArgs, ScalarUDFImpl, Signature, + ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; @@ -236,12 +237,11 @@ impl ScalarUDFImpl for DatePartFunc { // date_part(col, YEAR) = 2024 => col >= '2024-01-01' and col < '2025-01-01' // But for anything less than YEAR simplifying is not possible without specifying the bigger interval // date_part(col, MONTH) = 1 => col = '2023-01-01' or col = '2024-01-01' or ... or col = '3000-01-01' - fn preimage_cast( + fn preimage( &self, lit_value: &ScalarValue, target_type: &DataType, - op: Operator, - ) -> Option { + ) -> Option { let year = match lit_value { ScalarValue::Int32(Some(y)) => *y, _ => return None, @@ -250,56 +250,14 @@ impl ScalarUDFImpl for DatePartFunc { match target_type { Date32 | Date64 | Timestamp(_, _) => {} _ => return None, - } - - let updated_year = match op { - Operator::Gt | Operator::LtEq => year + 1, - Operator::Lt | Operator::GtEq => year, - Operator::Eq | Operator::NotEq => year, // This is to pass the is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary - _ => return None, }; - let naive_date = NaiveDate::from_ymd_opt(updated_year, 1, 1)?; - - let casted = match target_type { - Date32 => { - let days = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days() as i32; - ScalarValue::Date32(Some(days)) - } - Date64 => { - let milis = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_milliseconds(); - ScalarValue::Date64(Some(milis)) - } - Timestamp(unit, tz) => { - let days = naive_date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days(); - match unit { - Second => { - ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()) - } - Millisecond => ScalarValue::TimestampMillisecond( - Some(days * 86_400_000), - tz.clone(), - ), - Microsecond => ScalarValue::TimestampMicrosecond( - Some(days * 86_400_000_000), - tz.clone(), - ), - Nanosecond => ScalarValue::TimestampNanosecond( - Some(days * 86_400_000_000_000), - tz.clone(), - ), - } - } - _ => return None, - }; + let start_time = NaiveDate::from_ymd_opt(year, 1, 1)?; + let end_time = start_time.with_year(year + 1)?; + let lower = date_to_scalar(start_time, target_type)?; + let upper = date_to_scalar(end_time, target_type)?; - Some(casted) + interval_arithmetic::Interval::try_new(lower, upper).ok() } fn aliases(&self) -> &[String] { @@ -311,6 +269,44 @@ impl ScalarUDFImpl for DatePartFunc { } } +fn date_to_scalar(date: NaiveDate, target_type: &DataType) -> Option { + let scalar = match target_type { + Date32 => { + let days = date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days() as i32; + ScalarValue::Date32(Some(days)) + } + Date64 => { + let milis = date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_milliseconds(); + ScalarValue::Date64(Some(milis)) + } + Timestamp(unit, tz) => { + let days = date + .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) + .num_days(); + match unit { + Second => ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()), + Millisecond => { + ScalarValue::TimestampMillisecond(Some(days * 86_400_000), tz.clone()) + } + Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * 86_400_000_000), + tz.clone(), + ), + Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * 86_400_000_000_000), + tz.clone(), + ), + } + } + _ => return None, + }; + Some(scalar) +} + fn is_epoch(part: &str) -> bool { let part = part_normalization(part); matches!(part.to_lowercase().as_str(), "epoch") diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 1a915327720e7..bdc3fcaf8387b 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -36,7 +36,7 @@ pub(super) fn preimage_in_comparison_for_binary( Expr::ScalarFunction(ScalarFunction { func, args }), Expr::Literal(lit_value, _), ) => (func, args, lit_value), - _ => return internal_err!("Expect date_part expr and literal"), + _ => return internal_err!("Expect scalar function expr and literal"), }; let expr = Box::new(args[1].clone()); @@ -44,109 +44,56 @@ pub(super) fn preimage_in_comparison_for_binary( return internal_err!("Can't get the data type of the expr {:?}", &expr); }; - let preimage_func = match func.name() { - "date_part" => DatePartFunc::new(), + let preimage_interval = match func.name() { + "date_part" => DatePartFunc::new() + .preimage(&lit_value, &expr_type) + .expect("Preimage interval should be created"), _ => return internal_err!("Preimage is not supported for {:?}", func.name()), }; + let lower = lit(preimage_interval.lower().clone()); + let upper = lit(preimage_interval.upper().clone()); + let rewritten_expr = match op { - Operator::Lt | Operator::GtEq => { - let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; - Expr::BinaryExpr(BinaryExpr { - left: expr, - op, - right: Box::new(lit(v)), - }) - } - Operator::Gt => { - let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; + Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op, + right: Box::new(lower), + }), + Operator::Gt => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(upper), + }), + Operator::LtEq => Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(upper), + }), + Operator::Eq => and( Expr::BinaryExpr(BinaryExpr { - left: expr, + left: expr.clone(), op: Operator::GtEq, - right: Box::new(lit(v)), - }) - } - Operator::LtEq => { - let v = match preimage_func.preimage_cast(&lit_value, &expr_type, op) { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; + right: Box::new(lower), + }), Expr::BinaryExpr(BinaryExpr { left: expr, op: Operator::Lt, - right: Box::new(lit(v)), - }) - } - Operator::Eq => { - let lower = - match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::GtEq) - { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; - let upper = - match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::LtEq) - { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; - and( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(lit(lower)), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::Lt, - right: Box::new(lit(upper)), - }), - ) - } - Operator::NotEq => { - let lower = - match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::Lt) { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; - let upper = - match preimage_func.preimage_cast(&lit_value, &expr_type, Operator::Gt) { - Some(v) => v, - None => { - return internal_err!("Could not cast literal to the column type") - } - }; - or( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::Lt, - right: Box::new(lit(lower)), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::GtEq, - right: Box::new(lit(upper)), - }), - ) - } + right: Box::new(upper), + }), + ), + Operator::NotEq => or( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::GtEq, + right: Box::new(upper), + }), + ), _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) @@ -191,7 +138,7 @@ pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< return false; }; DatePartFunc::new() - .preimage_cast(lit_value, &expr_type, op) + .preimage(lit_value, &expr_type) .is_some() } _ => false, From ab9b444edce669583654589be48841746abb6359 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 1 Dec 2025 19:16:27 -0500 Subject: [PATCH 19/27] support IsDistinctFrom and IsNotDistinctFrom --- .../src/simplify_expressions/udf_preimage.rs | 41 +++++- .../sqllogictest/test_files/udf_preimage.slt | 139 ++++++++++-------- 2 files changed, 121 insertions(+), 59 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index bdc3fcaf8387b..fab1e699ef4cb 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -94,6 +94,43 @@ pub(super) fn preimage_in_comparison_for_binary( right: Box::new(upper), }), ), + Operator::IsDistinctFrom => or( + or( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower.clone()), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(upper), + }), + ), + or( + and(expr.clone().is_null(), lower.clone().is_not_null()), + and(expr.is_not_null(), lower.is_null()), + ), + ), + Operator::IsNotDistinctFrom => or( + Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr.clone().is_null()), + op: Operator::And, + right: Box::new(lower.clone().is_null()), + }), + and( + Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(lower.clone()), + }), + Expr::BinaryExpr(BinaryExpr { + left: expr, + op: Operator::Lt, + right: Box::new(upper), + }), + ), + ), _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) @@ -115,7 +152,9 @@ pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< | Operator::Gt | Operator::Lt | Operator::GtEq - | Operator::LtEq, + | Operator::LtEq + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom, Expr::Literal(lit_value, _), ) => (func, args, lit_value), _ => return false, diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt index f630167196d87..dff7683fab15f 100644 --- a/datafusion/sqllogictest/test_files/udf_preimage.slt +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -22,6 +22,7 @@ ############################ # date_part(year, col) tests ############################ + statement ok create table t( c1_date32 DATE, @@ -47,41 +48,67 @@ create table t( '2030-12-31T23:59:59.001234567'::timestamp) ; +# +# Simple optimizations +# +query D +select c1_date32 from t where extract(year from c1_date32) = 2024; +---- +2024-01-01 -# Explain eq -query TT -explain select c1_date32 from t where extract (year from c1_date32) = 2024 +query D +select c1_date32 from t where extract(year from c1_date32) <> 2024; ---- -logical_plan -01)Filter: t.c1_date32 >= Date32("2024-01-01") AND t.c1_date32 < Date32("2025-01-01") -02)--TableScan: t projection=[c1_date32] -physical_plan -01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 -02)--DataSourceExec: partitions=1, partition_sizes=[1] +1990-05-20 +2030-12-31 -## eq -query DPPPP -select - (select c1_date32 from t where extract(year from c1_date32) = 2024), - (select c2_ts_sec from t where extract(year from c2_ts_sec) = 2024), - (select c3_ts_mili from t where extract(year from c3_ts_mili) = 2024), - (select c4_ts_micro from t where extract(year from c4_ts_micro) = 2024), - (select c5_ts_nano from t where extract(year from c5_ts_nano) = 2024); +query P +select c2_ts_sec from t where extract(year from c2_ts_sec) > 2024; ---- -2024-01-01 2024-01-01T00:00:00 2024-01-01T00:00:00.123 2024-01-01T00:00:00.123456 2024-01-01T00:00:00.123456789 +2030-12-31T23:59:59 + +query P +select c3_ts_mili from t where extract(year from c3_ts_mili) < 2024; +---- +1990-05-20T00:00:10.987 + +query P +select c4_ts_micro from t where extract(year from c4_ts_micro) >= 2024; +---- +2024-01-01T00:00:00.123456 +2030-12-31T23:59:59.001234 + +query P +select c5_ts_nano from t where extract(year from c5_ts_nano) <= 2024; +---- +2024-01-01T00:00:00.123456789 +1990-05-20T00:00:10.987654321 + +query D +select c1_date32 from t where extract(year from c1_date32) is not distinct from 2024 +---- +2024-01-01 + +query D +select c1_date32 from t where extract(year from c1_date32) is distinct from 2024 +---- +1990-05-20 +2030-12-31 + +# +# Explain statements +# -# Explain eq not year query TT -explain select c1_date32 from t where extract (month from c1_date32) = 2024 +explain select c1_date32 from t where extract (year from c1_date32) = 2024 ---- logical_plan -01)Filter: date_part(Utf8("MONTH"), t.c1_date32) = Int32(2024) +01)Filter: t.c1_date32 >= Date32("2024-01-01") AND t.c1_date32 < Date32("2025-01-01") 02)--TableScan: t projection=[c1_date32] physical_plan -01)FilterExec: date_part(MONTH, c1_date32@0) = 2024 +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 02)--DataSourceExec: partitions=1, partition_sizes=[1] -# Explain not_eq query TT explain select c1_date32 from t where extract (year from c1_date32) <> 2024 ---- @@ -92,14 +119,6 @@ physical_plan 01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 02)--DataSourceExec: partitions=1, partition_sizes=[1] -## not_eq -query D -select c1_date32 from t where extract(year from c1_date32) <> 2024; ----- -1990-05-20 -2030-12-31 - -# Explain gt query TT explain select c2_ts_sec from t where extract (year from c2_ts_sec) > 2024 ---- @@ -110,13 +129,6 @@ physical_plan 01)FilterExec: c2_ts_sec@0 >= 1735689600000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] -## gt -query P -select c2_ts_sec from t where extract(year from c2_ts_sec) > 2024; ----- -2030-12-31T23:59:59 - -# Explain lt query TT explain select c3_ts_mili from t where extract (year from c3_ts_mili) < 2024 ---- @@ -127,13 +139,6 @@ physical_plan 01)FilterExec: c3_ts_mili@0 < 1704067200000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] -## lt -query P -select c3_ts_mili from t where extract(year from c3_ts_mili) < 2024; ----- -1990-05-20T00:00:10.987 - -# Explain gt_eq query TT explain select c4_ts_micro from t where extract (year from c4_ts_micro) >= 2024 ---- @@ -144,14 +149,6 @@ physical_plan 01)FilterExec: c4_ts_micro@0 >= 1704067200000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] -## gt_eq -query P -select c4_ts_micro from t where extract(year from c4_ts_micro) >= 2024; ----- -2024-01-01T00:00:00.123456 -2030-12-31T23:59:59.001234 - -# Explain lt_eq query TT explain select c5_ts_nano from t where extract (year from c5_ts_nano) <= 2024 ---- @@ -162,9 +159,35 @@ physical_plan 01)FilterExec: c5_ts_nano@0 < 1735689600000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] -## lt_eq -query P -select c5_ts_nano from t where extract(year from c5_ts_nano) <= 2024; +# This one doesn't pass due to a ParseError +#External error: task 20343 panicked with message "called `Result::unwrap()` on an `Err` value: ParseError { kind: InvalidLine(\"explain select c1_date32 from t where extract(year from c1_date32) is not distinct from 2024\"), loc: Location { file: \"test_files/udf_preimage.slt\", line: 162, upper: None } }" +# +#explain select c1_date32 from t where extract (year from c1_date32) is not distinct from 2024 +#---- +#logical_plan +#01)Filter: t.c1_date32 >= Date32("2024-01-01") AND t.c1_date32 < Date32("2025-01-01") +#02)--TableScan: t projection=[c1_date32] +#physical_plan +#01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +#02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c1_date32 from t where extract (year from c1_date32) is distinct from 2024 ---- -2024-01-01T00:00:00.123456789 -1990-05-20T00:00:10.987654321 +logical_plan +01)Filter: t.c1_date32 < Date32("2024-01-01") OR t.c1_date32 >= Date32("2025-01-01") OR t.c1_date32 IS NULL +02)--TableScan: t projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 OR c1_date32@0 IS NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# Explain eq not year +query TT +explain select c1_date32 from t where extract (month from c1_date32) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MONTH"), t.c1_date32) = Int32(2024) +02)--TableScan: t projection=[c1_date32] +physical_plan +01)FilterExec: date_part(MONTH, c1_date32@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] \ No newline at end of file From 5286626eea7df86894195b2c337b7a6221d2cfa8 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 2 Dec 2025 14:48:56 -0500 Subject: [PATCH 20/27] Update sqllogictests --- .../sqllogictest/test_files/udf_preimage.slt | 437 +++++++++++++++--- 1 file changed, 377 insertions(+), 60 deletions(-) diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt index dff7683fab15f..3779af0dc665f 100644 --- a/datafusion/sqllogictest/test_files/udf_preimage.slt +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -15,32 +15,363 @@ # specific language governing permissions and limitations # under the License. -############################################### -# ScalarUDF predicate expression simplification -############################################### - ############################ # date_part(year, col) tests ############################ statement ok -create table t( +create table t1(c DATE) as VALUES (NULL), ('1990-01-01'), ('2024-01-01'), ('2030-01-01'); + +# +# Simple optimizations, col on LHS +# +query D +select c from t1 where extract(year from c) = 2024; +---- +2024-01-01 + +query D +select c from t1 where extract(year from c) <> 2024; +---- +1990-01-01 +2030-01-01 + +query D +select c from t1 where extract(year from c) > 2024; +---- +2030-01-01 + +query D +select c from t1 where extract(year from c) < 2024; +---- +1990-01-01 + +query D +select c from t1 where extract(year from c) >= 2024; +---- +2024-01-01 +2030-01-01 + +query D +select c from t1 where extract(year from c) <= 2024; +---- +1990-01-01 +2024-01-01 + +query D +select c from t1 where extract(year from c) is not distinct from 2024 +---- +2024-01-01 + +query D +select c from t1 where extract(year from c) is distinct from 2024 +---- +NULL +1990-01-01 +2030-01-01 + +# +# Check that date_part is not in the explain statements +# +query TT +explain select c from t1 where extract (year from c) = 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) <> 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) > 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) < 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) >= 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) <= 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# This one doesn't pass due to a ParseError +#External error: task 20233 panicked with message "called `Result::unwrap()` on an `Err` value: ParseError { kind: InvalidLine(\"explain select c from t1 where extract (year from c) is not distinct from 2024\"), loc: Location { file: \"test_files/udf_preimage.slt\", line: 144, upper: None } }" +# +#explain select c from t1 where extract (year from c) is not distinct from 2024 +#---- +#logical_plan +#01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +#02)--TableScan: t1 projection=[c1_date32] +#physical_plan +#01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +#02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (year from c) is distinct from 2024 +---- +logical_plan +01)Filter: t1.c < Date32("2024-01-01") OR t1.c >= Date32("2025-01-01") OR t1.c IS NULL +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 < 2024-01-01 OR c@0 >= 2025-01-01 OR c@0 IS NULL +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# +# Simple optimizations, column on RHS +# +query D +select c from t1 where 2024 = extract(year from c); +---- +2024-01-01 + +query D +select c from t1 where 2024 <> extract(year from c); +---- +1990-01-01 +2030-01-01 + +query D +select c from t1 where 2024 < extract(year from c); +---- +2030-01-01 + +query D +select c from t1 where 2024 > extract(year from c); +---- +1990-01-01 + +query D +select c from t1 where 2024 <= extract(year from c); +---- +2024-01-01 +2030-01-01 + +query D +select c from t1 where 2024 >= extract(year from c); +---- +1990-01-01 +2024-01-01 + +query D +select c from t1 where 2024 is not distinct from extract(year from c); +---- +2024-01-01 + +query D +select c from t1 where 2024 is distinct from extract(year from c); +---- +NULL +1990-01-01 +2030-01-01 + +# +# Check explain statements for optimizations for other interval types +# +query TT +explain select c from t1 where extract (quarter from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("QUARTER"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(QUARTER, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (month from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MONTH"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MONTH, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (week from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("WEEK"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(WEEK, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (day from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DAY"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DAY, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (hour from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("HOUR"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(HOUR, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (minute from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MINUTE"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MINUTE, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (second from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("SECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(SECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (millisecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MILLISECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MILLISECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (microsecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("MICROSECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(MICROSECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (nanosecond from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("NANOSECOND"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(NANOSECOND, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (dow from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DOW"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DOW, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (doy from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("DOY"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(DOY, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (epoch from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("EPOCH"), t1.c) = Float64(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(EPOCH, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +explain select c from t1 where extract (isodow from c) = 2024 +---- +logical_plan +01)Filter: date_part(Utf8("ISODOW"), t1.c) = Int32(2024) +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: date_part(ISODOW, c@0) = 2024 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# +# Simple optimize different datatypes +# +statement ok +create table t2( c1_date32 DATE, c2_ts_sec timestamp, c3_ts_mili timestamp, c4_ts_micro timestamp, c5_ts_nano timestamp ) as VALUES - ('2024-01-01', - '2024-01-01T00:00:00'::timestamp, - '2024-01-01T00:00:00.123'::timestamp, - '2024-01-01T00:00:00.123456'::timestamp, - '2024-01-01T00:00:00.123456789'::timestamp), + (NULL, + NULL, + NULL, + NULL, + NULL), ('1990-05-20', '1990-05-20T00:00:10'::timestamp, '1990-05-20T00:00:10.987'::timestamp, '1990-05-20T00:00:10.987654'::timestamp, '1990-05-20T00:00:10.987654321'::timestamp), + ('2024-01-01', + '2024-01-01T00:00:00'::timestamp, + '2024-01-01T00:00:00.123'::timestamp, + '2024-01-01T00:00:00.123456'::timestamp, + '2024-01-01T00:00:00.123456789'::timestamp), ('2030-12-31', '2030-12-31T23:59:59'::timestamp, '2030-12-31T23:59:59.001'::timestamp, @@ -48,146 +379,132 @@ create table t( '2030-12-31T23:59:59.001234567'::timestamp) ; -# -# Simple optimizations -# query D -select c1_date32 from t where extract(year from c1_date32) = 2024; +select c1_date32 from t2 where extract(year from c1_date32) = 2024; ---- 2024-01-01 query D -select c1_date32 from t where extract(year from c1_date32) <> 2024; +select c1_date32 from t2 where extract(year from c1_date32) <> 2024; ---- 1990-05-20 2030-12-31 query P -select c2_ts_sec from t where extract(year from c2_ts_sec) > 2024; +select c2_ts_sec from t2 where extract(year from c2_ts_sec) > 2024; ---- 2030-12-31T23:59:59 query P -select c3_ts_mili from t where extract(year from c3_ts_mili) < 2024; +select c3_ts_mili from t2 where extract(year from c3_ts_mili) < 2024; ---- 1990-05-20T00:00:10.987 query P -select c4_ts_micro from t where extract(year from c4_ts_micro) >= 2024; +select c4_ts_micro from t2 where extract(year from c4_ts_micro) >= 2024; ---- 2024-01-01T00:00:00.123456 2030-12-31T23:59:59.001234 query P -select c5_ts_nano from t where extract(year from c5_ts_nano) <= 2024; +select c5_ts_nano from t2 where extract(year from c5_ts_nano) <= 2024; ---- -2024-01-01T00:00:00.123456789 1990-05-20T00:00:10.987654321 +2024-01-01T00:00:00.123456789 query D -select c1_date32 from t where extract(year from c1_date32) is not distinct from 2024 +select c1_date32 from t2 where extract(year from c1_date32) is not distinct from 2024 ---- 2024-01-01 query D -select c1_date32 from t where extract(year from c1_date32) is distinct from 2024 +select c1_date32 from t2 where extract(year from c1_date32) is distinct from 2024 ---- +NULL 1990-05-20 2030-12-31 # -# Explain statements +# Check that date_part is not in the explain statements for other datatypes # - query TT -explain select c1_date32 from t where extract (year from c1_date32) = 2024 +explain select c1_date32 from t2 where extract (year from c1_date32) = 2024 ---- logical_plan -01)Filter: t.c1_date32 >= Date32("2024-01-01") AND t.c1_date32 < Date32("2025-01-01") -02)--TableScan: t projection=[c1_date32] +01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] physical_plan 01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT -explain select c1_date32 from t where extract (year from c1_date32) <> 2024 +explain select c1_date32 from t2 where extract (year from c1_date32) <> 2024 ---- logical_plan -01)Filter: t.c1_date32 < Date32("2024-01-01") OR t.c1_date32 >= Date32("2025-01-01") -02)--TableScan: t projection=[c1_date32] +01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] physical_plan 01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT -explain select c2_ts_sec from t where extract (year from c2_ts_sec) > 2024 +explain select c2_ts_sec from t2 where extract (year from c2_ts_sec) > 2024 ---- logical_plan -01)Filter: t.c2_ts_sec >= TimestampNanosecond(1735689600000000000, None) -02)--TableScan: t projection=[c2_ts_sec] +01)Filter: t2.c2_ts_sec >= TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t2 projection=[c2_ts_sec] physical_plan 01)FilterExec: c2_ts_sec@0 >= 1735689600000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT -explain select c3_ts_mili from t where extract (year from c3_ts_mili) < 2024 +explain select c3_ts_mili from t2 where extract (year from c3_ts_mili) < 2024 ---- logical_plan -01)Filter: t.c3_ts_mili < TimestampNanosecond(1704067200000000000, None) -02)--TableScan: t projection=[c3_ts_mili] +01)Filter: t2.c3_ts_mili < TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t2 projection=[c3_ts_mili] physical_plan 01)FilterExec: c3_ts_mili@0 < 1704067200000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT -explain select c4_ts_micro from t where extract (year from c4_ts_micro) >= 2024 +explain select c4_ts_micro from t2 where extract (year from c4_ts_micro) >= 2024 ---- logical_plan -01)Filter: t.c4_ts_micro >= TimestampNanosecond(1704067200000000000, None) -02)--TableScan: t projection=[c4_ts_micro] +01)Filter: t2.c4_ts_micro >= TimestampNanosecond(1704067200000000000, None) +02)--TableScan: t2 projection=[c4_ts_micro] physical_plan 01)FilterExec: c4_ts_micro@0 >= 1704067200000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT -explain select c5_ts_nano from t where extract (year from c5_ts_nano) <= 2024 +explain select c5_ts_nano from t2 where extract (year from c5_ts_nano) <= 2024 ---- logical_plan -01)Filter: t.c5_ts_nano < TimestampNanosecond(1735689600000000000, None) -02)--TableScan: t projection=[c5_ts_nano] +01)Filter: t2.c5_ts_nano < TimestampNanosecond(1735689600000000000, None) +02)--TableScan: t2 projection=[c5_ts_nano] physical_plan 01)FilterExec: c5_ts_nano@0 < 1735689600000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] # This one doesn't pass due to a ParseError -#External error: task 20343 panicked with message "called `Result::unwrap()` on an `Err` value: ParseError { kind: InvalidLine(\"explain select c1_date32 from t where extract(year from c1_date32) is not distinct from 2024\"), loc: Location { file: \"test_files/udf_preimage.slt\", line: 162, upper: None } }" +#External error: task 20343 panicked with message "called `Result::unwrap()` on an `Err` value: ParseError { kind: InvalidLine(\"explain select c1_date32 from t2 where extract(year from c1_date32) is not distinct from 2024\"), loc: Location { file: \"test_files/udf_preimage.slt\", line: 162, upper: None } }" # -#explain select c1_date32 from t where extract (year from c1_date32) is not distinct from 2024 +#explain select c1_date32 from t2 where extract (year from c1_date32) is not distinct from 2024 #---- #logical_plan -#01)Filter: t.c1_date32 >= Date32("2024-01-01") AND t.c1_date32 < Date32("2025-01-01") -#02)--TableScan: t projection=[c1_date32] +#01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +#02)--TableScan: t2 projection=[c1_date32] #physical_plan #01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 #02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT -explain select c1_date32 from t where extract (year from c1_date32) is distinct from 2024 +explain select c1_date32 from t2 where extract (year from c1_date32) is distinct from 2024 ---- logical_plan -01)Filter: t.c1_date32 < Date32("2024-01-01") OR t.c1_date32 >= Date32("2025-01-01") OR t.c1_date32 IS NULL -02)--TableScan: t projection=[c1_date32] +01)Filter: t2.c1_date32 < Date32("2024-01-01") OR t2.c1_date32 >= Date32("2025-01-01") OR t2.c1_date32 IS NULL +02)--TableScan: t2 projection=[c1_date32] physical_plan 01)FilterExec: c1_date32@0 < 2024-01-01 OR c1_date32@0 >= 2025-01-01 OR c1_date32@0 IS NULL -02)--DataSourceExec: partitions=1, partition_sizes=[1] - -# Explain eq not year -query TT -explain select c1_date32 from t where extract (month from c1_date32) = 2024 ----- -logical_plan -01)Filter: date_part(Utf8("MONTH"), t.c1_date32) = Int32(2024) -02)--TableScan: t projection=[c1_date32] -physical_plan -01)FilterExec: date_part(MONTH, c1_date32@0) = 2024 02)--DataSourceExec: partitions=1, partition_sizes=[1] \ No newline at end of file From 51060490fb9670747a77e19c2252e45dd182850b Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Wed, 3 Dec 2025 09:29:11 -0500 Subject: [PATCH 21/27] Remove commented out changes --- .../simplify_expressions/expr_simplifier.rs | 48 ----------------- .../src/simplify_expressions/udf_preimage.rs | 53 ------------------- 2 files changed, 101 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index f43cb8ed2eafa..75db248dd3928 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1999,54 +1999,6 @@ impl TreeNodeRewriter for Simplifier<'_, S> { )? } - // // For case: - // // try_cast/cast(expr as left_type) in (lit1, lit2, lit3) - // Expr::InList(InList { - // expr: mut left, - // list, - // negated, - // }) if is_scalar_udf_expr_and_support_preimage_in_comparison_for_inlist( - // info, &left, &list, - // ) => - // { - // let Expr::ScalarFunction(ScalarFunction { - // func, args - // }) = left.as_mut() - // else { - // return internal_err!("Expect scalar function expression, but got {:?}", left)?; - // }; - // let left_expr = Box::new(args[1].clone()); - // let expr_type = info.get_data_type(&left_expr)?; - // let right_exprs = list - // .into_iter() - // .map(|right| { - // match right { - // Expr::Literal(right_lit_value, _) => { - // // if the right_lit_value can be casted to the type of internal_left_expr - // // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal - // let Some(value) = try_cast_literal_to_type(&right_lit_value, &expr_type) else { - // internal_err!( - // "Can't cast the list expr {:?} to type {}", - // right_lit_value, &expr_type - // )? - // }; - // Ok(lit(value)) - // } - // other_expr => internal_err!( - // "Only support literal expr to optimize, but the expr is {:?}", - // &other_expr - // ), - // } - // }) - // .collect::>>()?; - - // Transformed::yes(Expr::InList(InList { - // expr: std::mem::take(&mut: left_expr), - // list: right_exprs, - // negated, - // })) - // } - // no additional rewrites possible expr => Transformed::no(expr), }) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index fab1e699ef4cb..9ea9fad5fdfdb 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -184,45 +184,6 @@ pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< } } -// pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_inlist< -// S: SimplifyInfo, -// >( -// info: &S, -// expr: &Expr, -// list: &[Expr], -// ) -> bool { -// let (func, args) = match expr { -// Expr::ScalarFunction(ScalarFunction { func, args }) => (func, args), -// _ => return false, -// }; -// match func.name() { -// "date_part" => { -// let left_expr = Box::new(args[1].clone()); -// let Some(ScalarValue::Utf8(Some(part))) = args[0].as_literal() else { -// return false; -// }; -// match IntervalUnit::from_str(part) { -// Ok(IntervalUnit::Year) => {} -// _ => return false, -// }; -// let Ok(expr_type) = info.get_data_type(&left_expr) else { -// return false; -// }; -// for right in list { -// match right { -// Expr::Literal(lit_value, _) -// if DatePartFunc::new() -// .preimage_cast(lit_value, &expr_type, Operator::Eq) -// .is_some() => {} -// _ => return false, -// } -// } -// true -// } -// _ => false, -// } -// } - #[cfg(test)] mod tests { use crate::simplify_expressions::ExprSimplifier; @@ -330,20 +291,6 @@ mod tests { assert_eq!(optimize_test(expr_lt, &schema), expected) } - // #[test] - // fn test_preimage_date_part_date32_in_list() { - // let schema = expr_test_schema(); - // let expr_lt = expr_fn::date_part(lit("year"), col("date32")) - // .in_list(vec![lit(2024i32), lit(1984i32)], false); - // let expected = (col("date32") - // .gt_eq(lit(ScalarValue::Date32(Some(19723)))) - // .or(col("date32").lt(lit(ScalarValue::Date32(Some(20089)))))) - // .or(col("date32") - // .gt_eq(lit(ScalarValue::Date32(Some(5113)))) - // .or(col("date32").lt(lit(ScalarValue::Date32(Some(5480)))))); - // assert_eq!(optimize_test(expr_lt, &schema), expected) - // } - fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { let props = ExecutionProps::new(); let simplifier = ExprSimplifier::new( From e1c358f6fe793f540410e331a540381453b6e922 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 12 Dec 2025 15:30:30 -0500 Subject: [PATCH 22/27] Implementing suggestions --- Cargo.lock | 2 +- datafusion/expr/src/udf.rs | 71 +++-- datafusion/functions/Cargo.toml | 1 + .../functions/src/datetime/date_part.rs | 280 +++++++++++++++--- datafusion/optimizer/Cargo.toml | 1 - .../simplify_expressions/expr_simplifier.rs | 55 ++-- .../src/simplify_expressions/udf_preimage.rs | 273 +++-------------- 7 files changed, 355 insertions(+), 328 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a2c1faaa149c..ac6e2f4a84909 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2326,6 +2326,7 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", + "datafusion-optimizer", "env_logger", "hex", "itertools 0.14.0", @@ -2456,7 +2457,6 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-expr-common", - "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-window", "datafusion-functions-window-common", diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 220fe7b823404..2dfbef7844a9f 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -226,6 +226,25 @@ impl ScalarUDF { self.inner.simplify(args, info) } + /// Return a preimage + /// + /// See [`ScalarUDFImpl::preimage`] for more details. + pub fn preimage( + &self, + args: &[Expr], + lit_expr: &Expr, + info: &dyn SimplifyInfo, + ) -> Result> { + self.inner.preimage(args, lit_expr, info) + } + + /// Return inner column from function args + /// + /// See [`ScalarUDFImpl::column_expr`] + pub fn column_expr(&self, args: &[Expr]) -> Option { + self.inner.column_expr(args) + } + #[deprecated(since = "50.0.0", note = "Use `return_field_from_args` instead.")] pub fn is_nullable(&self, args: &[Expr], schema: &dyn ExprSchema) -> bool { #[allow(deprecated)] @@ -696,30 +715,33 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync { Ok(ExprSimplifyResult::Original(args)) } - /// Attempts to convert a literal value to in interval of the corresponding datatype - /// of a column expression so that a **preimage** can be computed for - /// pruning comparison predicates. + /// Returns the [preimage] for this function and the specified scalar value, if any. /// - /// This is used during predicate-pushdown optimization - /// (see `datafusion-optimizer-udf_preimage::preimage_in_comparison_for_binary`) + /// A preimage is a single contiguous [`Interval`] of values where the function + /// will always return `lit_value` /// - /// Currently is only implemented by: - /// - `date_part(YEAR, expr)` + /// This rewrite is described in the [ClickHouse Paper] and is particularly + /// useful for simplifying expressions `date_part` or equivalent functions. The + /// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you + /// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates + /// covering the entire year of 2024. Thus, you can rewrite the expression to `k + /// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. /// - /// # Arguments: - /// * `lit_value`: The literal `&ScalarValue` used in comparison - /// * `target_type`: The datatype of the column expression inside the function + /// This should only return a preimage if the function takes a single argument /// - /// # Returns - /// - /// Returns an `Interval` of the appropriate target type if a - /// preimage cast is supported for the given function/operator combination; - /// otherwise returns `None`. + /// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf + /// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image fn preimage( &self, - _lit_value: &ScalarValue, - _target_type: &DataType, - ) -> Option { + _args: &[Expr], + _lit_expr: &Expr, + _info: &dyn SimplifyInfo, + ) -> Result> { + Ok(None) + } + + // Return the inner column expression from this function + fn column_expr(&self, _args: &[Expr]) -> Option { None } @@ -955,10 +977,15 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { fn preimage( &self, - lit_value: &ScalarValue, - target_type: &DataType, - ) -> Option { - self.inner.preimage(lit_value, target_type) + args: &[Expr], + lit_expr: &Expr, + info: &dyn SimplifyInfo, + ) -> Result> { + self.inner.preimage(args, lit_expr, info) + } + + fn column_expr(&self, args: &[Expr]) -> Option { + self.inner.column_expr(args) } fn conditional_arguments<'a>( diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index ad52a551a7c17..b665037b9fc4e 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -76,6 +76,7 @@ datafusion-doc = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } +datafusion-optimizer = { workspace = true } datafusion-macros = { workspace = true } hex = { version = "0.4", optional = true } itertools = { workspace = true } diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 5a776ff8e771e..998d601a8fe38 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -27,6 +27,9 @@ use arrow::datatypes::DataType::{ }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; +use arrow::temporal_conversions::{ + MICROSECONDS_IN_DAY, MILLISECONDS_IN_DAY, NANOSECONDS_IN_DAY, SECONDS_IN_DAY, +}; use chrono::{Datelike, NaiveDate}; use datafusion_common::types::{logical_date, NativeType}; @@ -42,7 +45,8 @@ use datafusion_common::{ utils::take_function_args, Result, ScalarValue, }; -use datafusion_expr::interval_arithmetic; +use datafusion_expr::simplify::SimplifyInfo; +use datafusion_expr::{interval_arithmetic, Expr}; use datafusion_expr::{ ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, @@ -239,25 +243,59 @@ impl ScalarUDFImpl for DatePartFunc { // date_part(col, MONTH) = 1 => col = '2023-01-01' or col = '2024-01-01' or ... or col = '3000-01-01' fn preimage( &self, - lit_value: &ScalarValue, - target_type: &DataType, - ) -> Option { - let year = match lit_value { + args: &[Expr], + lit_expr: &Expr, + info: &dyn SimplifyInfo, + ) -> Result> { + let [part, col_expr] = take_function_args(self.name(), args)?; + + // Get the interval unit from the part argument + let interval_unit = part + .as_literal() + .and_then(|sv| sv.try_as_str().flatten()) + .map(part_normalization) + .and_then(|s| IntervalUnit::from_str(s).ok()); + + // only support extracting year + match interval_unit { + Some(IntervalUnit::Year) => (), + _ => return Ok(None), + } + + // Check if the argument is a literal (e.g. date_part(YEAR, col) = 2024) + let Some(argument_literal) = lit_expr.as_literal() else { + return Ok(None); + }; + + // Extract i32 year from Scalar value + let year = match argument_literal { ScalarValue::Int32(Some(y)) => *y, - _ => return None, + _ => return Ok(None), }; - // Can only extract year from Date32/64 and Timestamp - match target_type { - Date32 | Date64 | Timestamp(_, _) => {} - _ => return None, + + // Can only extract year from Date32/64 and Timestamp column + let target_type = match info.get_data_type(col_expr)? { + Date32 | Date64 | Timestamp(_, _) => &info.get_data_type(col_expr)?, + _ => return Ok(None), }; - let start_time = NaiveDate::from_ymd_opt(year, 1, 1)?; - let end_time = start_time.with_year(year + 1)?; - let lower = date_to_scalar(start_time, target_type)?; - let upper = date_to_scalar(end_time, target_type)?; + // Compute the Interval bounds + let start_time = + NaiveDate::from_ymd_opt(year, 1, 1).expect("Expect computed start time"); + let end_time = start_time + .with_year(year + 1) + .expect("Expect computed end time"); + + // Convert to ScalarValues + let lower = date_to_scalar(start_time, target_type) + .expect("Expect preimage interval lower bound"); + let upper = date_to_scalar(end_time, target_type) + .expect("Expect preimage interval upper bound"); + Ok(Some(interval_arithmetic::Interval::try_new(lower, upper)?)) + } - interval_arithmetic::Interval::try_new(lower, upper).ok() + fn column_expr(&self, args: &[Expr]) -> Option { + Some(args[1].clone()) } fn aliases(&self) -> &[String] { @@ -270,41 +308,32 @@ impl ScalarUDFImpl for DatePartFunc { } fn date_to_scalar(date: NaiveDate, target_type: &DataType) -> Option { - let scalar = match target_type { - Date32 => { - let days = date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days() as i32; - ScalarValue::Date32(Some(days)) - } - Date64 => { - let milis = date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_milliseconds(); - ScalarValue::Date64(Some(milis)) - } - Timestamp(unit, tz) => { - let days = date - .signed_duration_since(NaiveDate::from_ymd_opt(1970, 1, 1)?) - .num_days(); - match unit { - Second => ScalarValue::TimestampSecond(Some(days * 86_400), tz.clone()), - Millisecond => { - ScalarValue::TimestampMillisecond(Some(days * 86_400_000), tz.clone()) - } - Microsecond => ScalarValue::TimestampMicrosecond( - Some(days * 86_400_000_000), - tz.clone(), - ), - Nanosecond => ScalarValue::TimestampNanosecond( - Some(days * 86_400_000_000_000), - tz.clone(), - ), + let days = date + .signed_duration_since(NaiveDate::from_epoch_days(0)?) + .num_days(); + + Some(match target_type { + Date32 => ScalarValue::Date32(Some(days as i32)), + Date64 => ScalarValue::Date64(Some(days * MILLISECONDS_IN_DAY)), + Timestamp(unit, tz) => match unit { + Second => { + ScalarValue::TimestampSecond(Some(days * SECONDS_IN_DAY), tz.clone()) } - } + Millisecond => ScalarValue::TimestampMillisecond( + Some(days * MILLISECONDS_IN_DAY), + tz.clone(), + ), + Microsecond => ScalarValue::TimestampMicrosecond( + Some(days * MICROSECONDS_IN_DAY), + tz.clone(), + ), + Nanosecond => ScalarValue::TimestampNanosecond( + Some(days * NANOSECONDS_IN_DAY), + tz.clone(), + ), + }, _ => return None, - }; - Some(scalar) + }) } fn is_epoch(part: &str) -> bool { @@ -445,3 +474,158 @@ fn epoch(array: &dyn Array) -> Result { }; Ok(Arc::new(f)) } + +#[cfg(test)] +mod tests { + use crate::datetime::expr_fn; + use arrow::datatypes::{DataType, Field, TimeUnit}; + use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; + use datafusion_expr::expr_fn::col; + use datafusion_expr::or; + use datafusion_expr::{ + and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, Expr, + }; + use datafusion_optimizer::simplify_expressions::ExprSimplifier; + use std::{collections::HashMap, sync::Arc}; + + #[test] + fn test_preimage_date_part_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32)); + let expected = and( + col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_date64_not_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) <> 2024 -> c1 < 2024-01-01 AND c1 >= 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32)); + let expected = or( + col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))), + col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_nano_lt() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32)); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_nano_utc_gt() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32)); + let expected = col("ts_nano_utc").gt_eq(lit(ScalarValue::TimestampNanosecond( + Some(20089 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_sec_est_gt_eq() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32)); + let expected = col("ts_sec_est").gt_eq(lit(ScalarValue::TimestampSecond( + Some(19723 * 86_400), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_sec_est_lt_eq() { + let schema = expr_test_schema(); + let expr_lt = + expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32)); + let expected = col("ts_mic_pt").lt(lit(ScalarValue::TimestampMicrosecond( + Some(20089 * 86_400_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + fn test_preimage_date_part_timestamp_nano_lt_swap() { + let schema = expr_test_schema(); + let expr_lt = + lit(2024i32).gt(expr_fn::date_part(lit("year"), col("ts_nano_none"))); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + #[test] + // Should not simplify + fn test_preimage_date_part_not_year_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + let expected = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + assert_eq!(optimize_test(expr_lt, &schema), expected) + } + + fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { + let props = ExecutionProps::new(); + let simplifier = ExprSimplifier::new( + SimplifyContext::new(&props).with_schema(Arc::clone(schema)), + ); + + simplifier.simplify(expr).unwrap() + } + + fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![ + Field::new("date32", DataType::Date32, false), + Field::new("date64", DataType::Date64, false), + Field::new("ts_nano_none", timestamp_nano_none_type(), false), + Field::new("ts_nano_utc", timestamp_nano_utc_type(), false), + Field::new("ts_sec_est", timestamp_sec_est_type(), false), + Field::new("ts_mic_pt", timestamp_mic_pt_type(), false), + ] + .into(), + HashMap::new(), + ) + .unwrap(), + ) + } + + fn timestamp_nano_none_type() -> DataType { + DataType::Timestamp(TimeUnit::Nanosecond, None) + } + + // this is the type that now() returns + fn timestamp_nano_utc_type() -> DataType { + let utc = Some("+0:00".into()); + DataType::Timestamp(TimeUnit::Nanosecond, utc) + } + + fn timestamp_sec_est_type() -> DataType { + let est = Some("-5:00".into()); + DataType::Timestamp(TimeUnit::Second, est) + } + + fn timestamp_mic_pt_type() -> DataType { + let pt = Some("-8::00".into()); + DataType::Timestamp(TimeUnit::Microsecond, pt) + } +} diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index d3ab799b0c9b8..15d3261ca5132 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -49,7 +49,6 @@ chrono = { workspace = true } datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } -datafusion-functions = { workspace = true } datafusion-physical-expr = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 75db248dd3928..9c123ae40ece4 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -52,10 +52,7 @@ use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; -use crate::simplify_expressions::udf_preimage::{ - is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary, - preimage_in_comparison_for_binary, -}; +use crate::simplify_expressions::udf_preimage::rewrite_with_preimage; use crate::simplify_expressions::unwrap_cast::{ is_cast_expr_and_support_unwrap_cast_in_comparison_for_binary, is_cast_expr_and_support_unwrap_cast_in_comparison_for_inlist, @@ -64,6 +61,7 @@ use crate::simplify_expressions::unwrap_cast::{ use crate::simplify_expressions::SimplifyInfo; use datafusion_expr::expr_rewriter::rewrite_with_guarantees_map; use datafusion_expr_common::casts::try_cast_literal_to_type; +use datafusion_expr_common::interval_arithmetic::Interval; use indexmap::IndexSet; use regex::Regex; @@ -1977,26 +1975,35 @@ impl TreeNodeRewriter for Simplifier<'_, S> { // For a complex predicate like `date_part('YEAR', c1) < 2000`, pruning is not possible. // After rewriting it to `c1 < 2000-01-01`, pruning becomes feasible. Expr::BinaryExpr(BinaryExpr { left, op, right }) - if is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary( - info, &left, op, &right, - ) => + if get_preimage(&left, &right, info)?.0.is_some() + && get_preimage(&left, &right, info)?.1.is_some() => { - preimage_in_comparison_for_binary(info, *left, *right, op)? + // todo use let binding (if let Some(interval) = ...) once stabilized to avoid computing this thrice😢 + let (Some(interval), Some(col_expr)) = + get_preimage(left.as_ref(), &right, info)? + else { + unreachable!( + "The above if statement insures interval and col_expr are Some" + ) + }; + rewrite_with_preimage(info, interval, op, Box::new(col_expr))? } // literal op date_part(literal, expression) // --> // date_part(literal, expression) op_swap literal Expr::BinaryExpr(BinaryExpr { left, op, right }) - if is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary( - info, &right, op, &left, - ) && op.swap().is_some() => + if get_preimage(&right, &left, info)?.0.is_some() + && get_preimage(&right, &left, info)?.1.is_some() + && op.swap().is_some() => { - preimage_in_comparison_for_binary( - info, - *right, - *left, - op.swap().unwrap(), - )? + let swapped = op.swap().unwrap(); + let (Some(interval), Some(col_expr)) = get_preimage(&right, &left, info)? + else { + unreachable!( + "The above if statement insures interval and col_expr are Some" + ) + }; + rewrite_with_preimage(info, interval, swapped, Box::new(col_expr))? } // no additional rewrites possible @@ -2005,6 +2012,20 @@ impl TreeNodeRewriter for Simplifier<'_, S> { } } +fn get_preimage( + left_expr: &Expr, + right_expr: &Expr, + info: &dyn SimplifyInfo, +) -> Result<(Option, Option)> { + let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else { + return Ok((None, None)); + }; + Ok(( + func.preimage(args, right_expr, info)?, + func.column_expr(args), + )) +} + fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option)> { match expr { Expr::Literal(ScalarValue::Utf8(s), _) => Some((DataType::Utf8, s)), diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 9ea9fad5fdfdb..822830a659e9b 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -15,61 +15,56 @@ // specific language governing permissions and limitations // under the License. -use std::str::FromStr; - -use arrow::compute::kernels::cast_utils::IntervalUnit; -use datafusion_common::{internal_err, tree_node::Transformed, Result, ScalarValue}; -use datafusion_expr::{ - and, expr::ScalarFunction, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, - Operator, ScalarUDFImpl, -}; -use datafusion_functions::datetime::date_part::DatePartFunc; - -pub(super) fn preimage_in_comparison_for_binary( - info: &dyn SimplifyInfo, - udf_expr: Expr, - literal: Expr, +use datafusion_common::{internal_err, tree_node::Transformed, Result}; +use datafusion_expr::{and, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, Operator}; +use datafusion_expr_common::interval_arithmetic::Interval; + +/// Rewrites a binary expression using its "preimage" +/// +/// Specifically it rewrites expressions of the form ` OP x` (e.g. ` = +/// x`) where `` is known to have a pre-image (aka the entire single +/// range for which it is valid) +/// +/// This rewrite is described in the [ClickHouse Paper] and is particularly +/// useful for simplifying expressions `date_part` or equivalent functions. The +/// idea is that if you have an expression like `date_part(YEAR, k) = 2024` and you +/// can find a [preimage] for `date_part(YEAR, k)`, which is the range of dates +/// covering the entire year of 2024. Thus, you can rewrite the expression to `k +/// >= '2024-01-01' AND k < '2025-01-01' which is often more optimizable. +/// +/// [ClickHouse Paper]: https://www.vldb.org/pvldb/vol17/p3731-schulze.pdf +/// [preimage]: https://en.wikipedia.org/wiki/Image_(mathematics)#Inverse_image +/// +pub(super) fn rewrite_with_preimage( + _info: &dyn SimplifyInfo, + preimage_interval: Interval, op: Operator, + expr: Box, ) -> Result> { - let (func, args, lit_value) = match (udf_expr, literal) { - ( - Expr::ScalarFunction(ScalarFunction { func, args }), - Expr::Literal(lit_value, _), - ) => (func, args, lit_value), - _ => return internal_err!("Expect scalar function expr and literal"), - }; - let expr = Box::new(args[1].clone()); - - let Ok(expr_type) = info.get_data_type(&expr) else { - return internal_err!("Can't get the data type of the expr {:?}", &expr); - }; - - let preimage_interval = match func.name() { - "date_part" => DatePartFunc::new() - .preimage(&lit_value, &expr_type) - .expect("Preimage interval should be created"), - _ => return internal_err!("Preimage is not supported for {:?}", func.name()), - }; - - let lower = lit(preimage_interval.lower().clone()); - let upper = lit(preimage_interval.upper().clone()); + let (lower, upper) = preimage_interval.into_bounds(); + let (lower, upper) = (lit(lower), lit(upper)); let rewritten_expr = match op { + // < x ==> < upper + // >= x ==> >= lower Operator::Lt | Operator::GtEq => Expr::BinaryExpr(BinaryExpr { left: expr, op, right: Box::new(lower), }), + // > x ==> >= upper Operator::Gt => Expr::BinaryExpr(BinaryExpr { left: expr, op: Operator::GtEq, right: Box::new(upper), }), + // <= x ==> < upper Operator::LtEq => Expr::BinaryExpr(BinaryExpr { left: expr, op: Operator::Lt, right: Box::new(upper), }), + // = x ==> ( >= lower) and ( < upper) Operator::Eq => and( Expr::BinaryExpr(BinaryExpr { left: expr.clone(), @@ -82,6 +77,7 @@ pub(super) fn preimage_in_comparison_for_binary( right: Box::new(upper), }), ), + // != x ==> ( < lower) or ( >= upper) Operator::NotEq => or( Expr::BinaryExpr(BinaryExpr { left: expr.clone(), @@ -94,6 +90,7 @@ pub(super) fn preimage_in_comparison_for_binary( right: Box::new(upper), }), ), + // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is not NULL) or ( is not NULL and x is NULL) Operator::IsDistinctFrom => or( or( Expr::BinaryExpr(BinaryExpr { @@ -112,6 +109,7 @@ pub(super) fn preimage_in_comparison_for_binary( and(expr.is_not_null(), lower.is_null()), ), ), + // is distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) Operator::IsNotDistinctFrom => or( Expr::BinaryExpr(BinaryExpr { left: Box::new(expr.clone().is_null()), @@ -135,206 +133,3 @@ pub(super) fn preimage_in_comparison_for_binary( }; Ok(Transformed::yes(rewritten_expr)) } - -pub(super) fn is_scalar_udf_expr_and_support_preimage_in_comparison_for_binary< - S: SimplifyInfo, ->( - info: &S, - expr: &Expr, - op: Operator, - literal: &Expr, -) -> bool { - let (func, args, lit_value) = match (expr, op, literal) { - ( - Expr::ScalarFunction(ScalarFunction { func, args }), - Operator::Eq - | Operator::NotEq - | Operator::Gt - | Operator::Lt - | Operator::GtEq - | Operator::LtEq - | Operator::IsDistinctFrom - | Operator::IsNotDistinctFrom, - Expr::Literal(lit_value, _), - ) => (func, args, lit_value), - _ => return false, - }; - - match func.name() { - "date_part" => { - let left_expr = Box::new(args[1].clone()); - let Some(ScalarValue::Utf8(Some(part))) = args[0].as_literal() else { - return false; - }; - match IntervalUnit::from_str(part) { - Ok(IntervalUnit::Year) => {} - _ => return false, - }; - let Ok(expr_type) = info.get_data_type(&left_expr) else { - return false; - }; - let Ok(_lit_type) = info.get_data_type(literal) else { - return false; - }; - DatePartFunc::new() - .preimage(lit_value, &expr_type) - .is_some() - } - _ => false, - } -} - -#[cfg(test)] -mod tests { - use crate::simplify_expressions::ExprSimplifier; - use arrow::datatypes::{DataType, Field, TimeUnit}; - use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; - use datafusion_expr::expr_fn::col; - use datafusion_expr::or; - use datafusion_expr::{ - and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, Expr, - }; - use datafusion_functions::datetime::expr_fn; - use std::{collections::HashMap, sync::Arc}; - - #[test] - fn test_preimage_date_part_date32_eq() { - let schema = expr_test_schema(); - // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 - let expr_lt = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32)); - let expected = and( - col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), - col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), - ); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_date64_not_eq() { - let schema = expr_test_schema(); - // date_part(c1, DatePart::Year) <> 2024 -> c1 < 2024-01-01 AND c1 >= 2025-01-01 - let expr_lt = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32)); - let expected = or( - col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))), - col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000)))), - ); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_nano_lt() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32)); - let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( - Some(19723 * 86_400_000_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_nano_utc_gt() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32)); - let expected = col("ts_nano_utc").gt_eq(lit(ScalarValue::TimestampNanosecond( - Some(20089 * 86_400_000_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_sec_est_gt_eq() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32)); - let expected = col("ts_sec_est").gt_eq(lit(ScalarValue::TimestampSecond( - Some(19723 * 86_400), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_sec_est_lt_eq() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32)); - let expected = col("ts_mic_pt").lt(lit(ScalarValue::TimestampMicrosecond( - Some(20089 * 86_400_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_nano_lt_swap() { - let schema = expr_test_schema(); - let expr_lt = - lit(2024i32).gt(expr_fn::date_part(lit("year"), col("ts_nano_none"))); - let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( - Some(19723 * 86_400_000_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - // Should not simplify - fn test_preimage_date_part_not_year_date32_eq() { - let schema = expr_test_schema(); - // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 - let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); - let expected = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { - let props = ExecutionProps::new(); - let simplifier = ExprSimplifier::new( - SimplifyContext::new(&props).with_schema(Arc::clone(schema)), - ); - - simplifier.simplify(expr).unwrap() - } - - fn expr_test_schema() -> DFSchemaRef { - Arc::new( - DFSchema::from_unqualified_fields( - vec![ - Field::new("date32", DataType::Date32, false), - Field::new("date64", DataType::Date64, false), - Field::new("ts_nano_none", timestamp_nano_none_type(), false), - Field::new("ts_nano_utc", timestamp_nano_utc_type(), false), - Field::new("ts_sec_est", timestamp_sec_est_type(), false), - Field::new("ts_mic_pt", timestamp_mic_pt_type(), false), - ] - .into(), - HashMap::new(), - ) - .unwrap(), - ) - } - - fn timestamp_nano_none_type() -> DataType { - DataType::Timestamp(TimeUnit::Nanosecond, None) - } - - // this is the type that now() returns - fn timestamp_nano_utc_type() -> DataType { - let utc = Some("+0:00".into()); - DataType::Timestamp(TimeUnit::Nanosecond, utc) - } - - fn timestamp_sec_est_type() -> DataType { - let est = Some("-5:00".into()); - DataType::Timestamp(TimeUnit::Second, est) - } - - fn timestamp_mic_pt_type() -> DataType { - let pt = Some("-8::00".into()); - DataType::Timestamp(TimeUnit::Microsecond, pt) - } -} From c79e937a07f7f1ff0481f023e4f2786f2dd6e0f6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 12 Dec 2025 16:36:20 -0500 Subject: [PATCH 23/27] fix ci --- datafusion/functions/Cargo.toml | 2 +- datafusion/functions/src/datetime/date_part.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 91aeb0f91857c..2c837570ed4aa 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -76,8 +76,8 @@ datafusion-doc = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } -datafusion-optimizer = { workspace = true } datafusion-macros = { workspace = true } +datafusion-optimizer = { workspace = true } hex = { workspace = true, optional = true } itertools = { workspace = true } log = { workspace = true } diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 44263dfd61822..0631f9e6a8ea1 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -46,11 +46,11 @@ use datafusion_common::{ utils::take_function_args, }; use datafusion_expr::simplify::SimplifyInfo; -use datafusion_expr::{interval_arithmetic, Expr}; use datafusion_expr::{ ColumnarValue, Documentation, ReturnFieldArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; +use datafusion_expr::{Expr, interval_arithmetic}; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; use datafusion_macros::user_doc; @@ -483,7 +483,7 @@ mod tests { use datafusion_expr::expr_fn::col; use datafusion_expr::or; use datafusion_expr::{ - and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, Expr, + Expr, and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, }; use datafusion_optimizer::simplify_expressions::ExprSimplifier; use std::{collections::HashMap, sync::Arc}; From 352b3ff6e0255ed29a189768418edddbe1fa833a Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Fri, 12 Dec 2025 16:37:01 -0500 Subject: [PATCH 24/27] cargo fmt --- datafusion/optimizer/src/simplify_expressions/udf_preimage.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 822830a659e9b..6123910e07db5 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use datafusion_common::{internal_err, tree_node::Transformed, Result}; -use datafusion_expr::{and, lit, or, simplify::SimplifyInfo, BinaryExpr, Expr, Operator}; +use datafusion_common::{Result, internal_err, tree_node::Transformed}; +use datafusion_expr::{BinaryExpr, Expr, Operator, and, lit, or, simplify::SimplifyInfo}; use datafusion_expr_common::interval_arithmetic::Interval; /// Rewrites a binary expression using its "preimage" From 7dbdc00c1b1a661e5185b500aeff4099454809b6 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Mon, 15 Dec 2025 12:21:53 -0500 Subject: [PATCH 25/27] Move tests to core/optimizer/tests --- Cargo.lock | 1 - datafusion/core/tests/optimizer/mod.rs | 148 ++++++++++++++++- datafusion/functions/Cargo.toml | 1 - .../functions/src/datetime/date_part.rs | 155 ------------------ 4 files changed, 145 insertions(+), 160 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e069189570233..77df82639272c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2334,7 +2334,6 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "datafusion-optimizer", "env_logger", "hex", "itertools 0.14.0", diff --git a/datafusion/core/tests/optimizer/mod.rs b/datafusion/core/tests/optimizer/mod.rs index b288706a54c9d..fb8a150859dbd 100644 --- a/datafusion/core/tests/optimizer/mod.rs +++ b/datafusion/core/tests/optimizer/mod.rs @@ -18,6 +18,9 @@ //! Tests for the DataFusion SQL query planner that require functions from the //! datafusion-functions crate. +use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::simplify::SimplifyContext; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; use insta::assert_snapshot; use std::any::Any; use std::collections::HashMap; @@ -26,13 +29,16 @@ use std::sync::Arc; use arrow::datatypes::{ DataType, Field, Fields, Schema, SchemaBuilder, SchemaRef, TimeUnit, }; +use datafusion::functions::datetime::expr_fn; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::TransformedResult; -use datafusion_common::{plan_err, DFSchema, Result, ScalarValue, TableReference}; +use datafusion_common::{ + plan_err, DFSchema, DFSchemaRef, Result, ScalarValue, TableReference, +}; use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; use datafusion_expr::{ - col, lit, AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, Operator, - ScalarUDF, TableSource, WindowUDF, + and, col, lit, or, AggregateUDF, BinaryExpr, Expr, ExprSchemable, LogicalPlan, + Operator, ScalarUDF, TableSource, WindowUDF, }; use datafusion_functions::core::expr_ext::FieldAccessor; use datafusion_optimizer::analyzer::Analyzer; @@ -378,3 +384,139 @@ fn validate_unchanged_cases(guarantees: &[(Expr, NullableInterval)], cases: &[Ex ); } } + +// DatePart preimage tests +#[test] +fn test_preimage_date_part_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32)); + let expected = and( + col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_date64_not_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) <> 2024 -> c1 < 2024-01-01 AND c1 >= 2025-01-01 + let expr_lt = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32)); + let expected = or( + col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))), + col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_nano_lt() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32)); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_nano_utc_gt() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32)); + let expected = col("ts_nano_utc").gt_eq(lit(ScalarValue::TimestampNanosecond( + Some(20089 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_sec_est_gt_eq() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32)); + let expected = col("ts_sec_est").gt_eq(lit(ScalarValue::TimestampSecond( + Some(19723 * 86_400), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_sec_est_lt_eq() { + let schema = expr_test_schema(); + let expr_lt = expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32)); + let expected = col("ts_mic_pt").lt(lit(ScalarValue::TimestampMicrosecond( + Some(20089 * 86_400_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +fn test_preimage_date_part_timestamp_nano_lt_swap() { + let schema = expr_test_schema(); + let expr_lt = lit(2024i32).gt(expr_fn::date_part(lit("year"), col("ts_nano_none"))); + let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( + Some(19723 * 86_400_000_000_000), + None, + ))); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +// Should not simplify +fn test_preimage_date_part_not_year_date32_eq() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 + let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + let expected = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { + let props = ExecutionProps::new(); + let simplifier = + ExprSimplifier::new(SimplifyContext::new(&props).with_schema(Arc::clone(schema))); + + simplifier.simplify(expr).unwrap() +} + +fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::from_unqualified_fields( + vec![ + Field::new("date32", DataType::Date32, false), + Field::new("date64", DataType::Date64, false), + Field::new("ts_nano_none", timestamp_nano_none_type(), false), + Field::new("ts_nano_utc", timestamp_nano_utc_type(), false), + Field::new("ts_sec_est", timestamp_sec_est_type(), false), + Field::new("ts_mic_pt", timestamp_mic_pt_type(), false), + ] + .into(), + HashMap::new(), + ) + .unwrap(), + ) +} + +fn timestamp_nano_none_type() -> DataType { + DataType::Timestamp(TimeUnit::Nanosecond, None) +} + +// this is the type that now() returns +fn timestamp_nano_utc_type() -> DataType { + let utc = Some("+0:00".into()); + DataType::Timestamp(TimeUnit::Nanosecond, utc) +} + +fn timestamp_sec_est_type() -> DataType { + let est = Some("-5:00".into()); + DataType::Timestamp(TimeUnit::Second, est) +} + +fn timestamp_mic_pt_type() -> DataType { + let pt = Some("-8::00".into()); + DataType::Timestamp(TimeUnit::Microsecond, pt) +} diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 2489edc9eb920..443dca898beaf 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -77,7 +77,6 @@ datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } datafusion-macros = { workspace = true } -datafusion-optimizer = { workspace = true } hex = { workspace = true, optional = true } itertools = { workspace = true } log = { workspace = true } diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 0631f9e6a8ea1..a8af0fb61c2af 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -474,158 +474,3 @@ fn epoch(array: &dyn Array) -> Result { }; Ok(Arc::new(f)) } - -#[cfg(test)] -mod tests { - use crate::datetime::expr_fn; - use arrow::datatypes::{DataType, Field, TimeUnit}; - use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; - use datafusion_expr::expr_fn::col; - use datafusion_expr::or; - use datafusion_expr::{ - Expr, and, execution_props::ExecutionProps, lit, simplify::SimplifyContext, - }; - use datafusion_optimizer::simplify_expressions::ExprSimplifier; - use std::{collections::HashMap, sync::Arc}; - - #[test] - fn test_preimage_date_part_date32_eq() { - let schema = expr_test_schema(); - // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 - let expr_lt = expr_fn::date_part(lit("year"), col("date32")).eq(lit(2024i32)); - let expected = and( - col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), - col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), - ); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_date64_not_eq() { - let schema = expr_test_schema(); - // date_part(c1, DatePart::Year) <> 2024 -> c1 < 2024-01-01 AND c1 >= 2025-01-01 - let expr_lt = expr_fn::date_part(lit("year"), col("date64")).not_eq(lit(2024i32)); - let expected = or( - col("date64").lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))), - col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000)))), - ); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_nano_lt() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_nano_none")).lt(lit(2024i32)); - let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( - Some(19723 * 86_400_000_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_nano_utc_gt() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_nano_utc")).gt(lit(2024i32)); - let expected = col("ts_nano_utc").gt_eq(lit(ScalarValue::TimestampNanosecond( - Some(20089 * 86_400_000_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_sec_est_gt_eq() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_sec_est")).gt_eq(lit(2024i32)); - let expected = col("ts_sec_est").gt_eq(lit(ScalarValue::TimestampSecond( - Some(19723 * 86_400), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_sec_est_lt_eq() { - let schema = expr_test_schema(); - let expr_lt = - expr_fn::date_part(lit("year"), col("ts_mic_pt")).lt_eq(lit(2024i32)); - let expected = col("ts_mic_pt").lt(lit(ScalarValue::TimestampMicrosecond( - Some(20089 * 86_400_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - fn test_preimage_date_part_timestamp_nano_lt_swap() { - let schema = expr_test_schema(); - let expr_lt = - lit(2024i32).gt(expr_fn::date_part(lit("year"), col("ts_nano_none"))); - let expected = col("ts_nano_none").lt(lit(ScalarValue::TimestampNanosecond( - Some(19723 * 86_400_000_000_000), - None, - ))); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - #[test] - // Should not simplify - fn test_preimage_date_part_not_year_date32_eq() { - let schema = expr_test_schema(); - // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 - let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); - let expected = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); - assert_eq!(optimize_test(expr_lt, &schema), expected) - } - - fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { - let props = ExecutionProps::new(); - let simplifier = ExprSimplifier::new( - SimplifyContext::new(&props).with_schema(Arc::clone(schema)), - ); - - simplifier.simplify(expr).unwrap() - } - - fn expr_test_schema() -> DFSchemaRef { - Arc::new( - DFSchema::from_unqualified_fields( - vec![ - Field::new("date32", DataType::Date32, false), - Field::new("date64", DataType::Date64, false), - Field::new("ts_nano_none", timestamp_nano_none_type(), false), - Field::new("ts_nano_utc", timestamp_nano_utc_type(), false), - Field::new("ts_sec_est", timestamp_sec_est_type(), false), - Field::new("ts_mic_pt", timestamp_mic_pt_type(), false), - ] - .into(), - HashMap::new(), - ) - .unwrap(), - ) - } - - fn timestamp_nano_none_type() -> DataType { - DataType::Timestamp(TimeUnit::Nanosecond, None) - } - - // this is the type that now() returns - fn timestamp_nano_utc_type() -> DataType { - let utc = Some("+0:00".into()); - DataType::Timestamp(TimeUnit::Nanosecond, utc) - } - - fn timestamp_sec_est_type() -> DataType { - let est = Some("-5:00".into()); - DataType::Timestamp(TimeUnit::Second, est) - } - - fn timestamp_mic_pt_type() -> DataType { - let pt = Some("-8::00".into()); - DataType::Timestamp(TimeUnit::Microsecond, pt) - } -} From 130413dc549da35ebd516768e41bd307e8528eb4 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 23 Dec 2025 11:32:44 -0500 Subject: [PATCH 26/27] Improve IsDistinctFrom, IsNotDistinctFrom logic and add unit tests for it --- .../src/simplify_expressions/udf_preimage.rs | 55 ++++++------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs index 6123910e07db5..980a1ea42e0e3 100644 --- a/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs +++ b/datafusion/optimizer/src/simplify_expressions/udf_preimage.rs @@ -65,7 +65,10 @@ pub(super) fn rewrite_with_preimage( right: Box::new(upper), }), // = x ==> ( >= lower) and ( < upper) - Operator::Eq => and( + // + // is not distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) + // but since x is always not NULL => ( >= lower) and ( < upper) + Operator::Eq | Operator::IsNotDistinctFrom => and( Expr::BinaryExpr(BinaryExpr { left: expr.clone(), op: Operator::GtEq, @@ -91,44 +94,18 @@ pub(super) fn rewrite_with_preimage( }), ), // is distinct from x ==> ( < lower) or ( >= upper) or ( is NULL and x is not NULL) or ( is not NULL and x is NULL) - Operator::IsDistinctFrom => or( - or( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::Lt, - right: Box::new(lower.clone()), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(upper), - }), - ), - or( - and(expr.clone().is_null(), lower.clone().is_not_null()), - and(expr.is_not_null(), lower.is_null()), - ), - ), - // is distinct from x ==> ( is NULL and x is NULL) or (( >= lower) and ( < upper)) - Operator::IsNotDistinctFrom => or( - Expr::BinaryExpr(BinaryExpr { - left: Box::new(expr.clone().is_null()), - op: Operator::And, - right: Box::new(lower.clone().is_null()), - }), - and( - Expr::BinaryExpr(BinaryExpr { - left: expr.clone(), - op: Operator::GtEq, - right: Box::new(lower.clone()), - }), - Expr::BinaryExpr(BinaryExpr { - left: expr, - op: Operator::Lt, - right: Box::new(upper), - }), - ), - ), + // but given that x is always not NULL => ( < lower) or ( >= upper) or ( is NULL) + Operator::IsDistinctFrom => Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::Lt, + right: Box::new(lower.clone()), + }) + .or(Expr::BinaryExpr(BinaryExpr { + left: expr.clone(), + op: Operator::GtEq, + right: Box::new(upper), + })) + .or(expr.is_null()), _ => return internal_err!("Expect comparison operators"), }; Ok(Transformed::yes(rewritten_expr)) From ae052935a6fa009153ca563ca27609ec6cfd9487 Mon Sep 17 00:00:00 2001 From: sdf-jkl Date: Tue, 23 Dec 2025 11:33:49 -0500 Subject: [PATCH 27/27] Return sqllogictests + add unit tests --- datafusion/core/tests/optimizer/mod.rs | 73 +++++++++++++++++-- .../sqllogictest/test_files/udf_preimage.slt | 40 +++++----- 2 files changed, 83 insertions(+), 30 deletions(-) diff --git a/datafusion/core/tests/optimizer/mod.rs b/datafusion/core/tests/optimizer/mod.rs index 6c29c0f6032e0..25369bd2b742e 100644 --- a/datafusion/core/tests/optimizer/mod.rs +++ b/datafusion/core/tests/optimizer/mod.rs @@ -465,14 +465,71 @@ fn test_preimage_date_part_timestamp_nano_lt_swap() { assert_eq!(optimize_test(expr_lt, &schema), expected) } +#[test] +fn test_preimage_date_part_date32_is_not_distinct_from() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is not distinct from 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 (the null handling part is dropped since rhs is not null) + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date32"))), + op: Operator::IsNotDistinctFrom, + right: Box::new(lit(2024i32)), + }); + let expected = and( + col("date32").gt_eq(lit(ScalarValue::Date32(Some(19723)))), + col("date32").lt(lit(ScalarValue::Date32(Some(20089)))), + ); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +// Should not simplify - interval can't be calculated +fn test_preimage_date_part_date32_is_not_distinct_from_null() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is not distinct from Null -> unchanged + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date32"))), + op: Operator::IsNotDistinctFrom, + right: Box::new(lit(ScalarValue::Null)), + }); + assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt) +} + +#[test] +fn test_preimage_date_part_date64_is_distinct_from() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is distinct from 2024 -> c1 < 2024-01-01 OR c1 >= 2025-01-01 or c1 is NULL + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date64"))), + op: Operator::IsDistinctFrom, + right: Box::new(lit(2024i32)), + }); + let expected = col("date64") + .lt(lit(ScalarValue::Date64(Some(19723 * 86_400_000)))) + .or(col("date64").gt_eq(lit(ScalarValue::Date64(Some(20089 * 86_400_000))))) + .or(col("date64").is_null()); + assert_eq!(optimize_test(expr_lt, &schema), expected) +} + +#[test] +// Should not simplify - interval can't be calculated +fn test_preimage_date_part_date64_is_distinct_from_null() { + let schema = expr_test_schema(); + // date_part(c1, DatePart::Year) is distinct from 2024 -> c1 < 2024-01-01 OR c1 >= unchanged + let expr_lt = Expr::BinaryExpr(BinaryExpr { + left: Box::new(expr_fn::date_part(lit("year"), col("date64"))), + op: Operator::IsDistinctFrom, + right: Box::new(lit(ScalarValue::Null)), + }); + assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt) +} + #[test] // Should not simplify fn test_preimage_date_part_not_year_date32_eq() { let schema = expr_test_schema(); // date_part(c1, DatePart::Year) = 2024 -> c1 >= 2024-01-01 AND c1 < 2025-01-01 let expr_lt = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); - let expected = expr_fn::date_part(lit("month"), col("date32")).eq(lit(1i32)); - assert_eq!(optimize_test(expr_lt, &schema), expected) + assert_eq!(optimize_test(expr_lt.clone(), &schema), expr_lt) } fn optimize_test(expr: Expr, schema: &DFSchemaRef) -> Expr { @@ -487,12 +544,12 @@ fn expr_test_schema() -> DFSchemaRef { Arc::new( DFSchema::from_unqualified_fields( vec![ - Field::new("date32", DataType::Date32, false), - Field::new("date64", DataType::Date64, false), - Field::new("ts_nano_none", timestamp_nano_none_type(), false), - Field::new("ts_nano_utc", timestamp_nano_utc_type(), false), - Field::new("ts_sec_est", timestamp_sec_est_type(), false), - Field::new("ts_mic_pt", timestamp_mic_pt_type(), false), + Field::new("date32", DataType::Date32, true), + Field::new("date64", DataType::Date64, true), + Field::new("ts_nano_none", timestamp_nano_none_type(), true), + Field::new("ts_nano_utc", timestamp_nano_utc_type(), true), + Field::new("ts_sec_est", timestamp_sec_est_type(), true), + Field::new("ts_mic_pt", timestamp_mic_pt_type(), true), ] .into(), HashMap::new(), diff --git a/datafusion/sqllogictest/test_files/udf_preimage.slt b/datafusion/sqllogictest/test_files/udf_preimage.slt index 3779af0dc665f..544f082cc1d83 100644 --- a/datafusion/sqllogictest/test_files/udf_preimage.slt +++ b/datafusion/sqllogictest/test_files/udf_preimage.slt @@ -133,17 +133,15 @@ physical_plan 01)FilterExec: c@0 < 2025-01-01 02)--DataSourceExec: partitions=1, partition_sizes=[1] -# This one doesn't pass due to a ParseError -#External error: task 20233 panicked with message "called `Result::unwrap()` on an `Err` value: ParseError { kind: InvalidLine(\"explain select c from t1 where extract (year from c) is not distinct from 2024\"), loc: Location { file: \"test_files/udf_preimage.slt\", line: 144, upper: None } }" -# -#explain select c from t1 where extract (year from c) is not distinct from 2024 -#---- -#logical_plan -#01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") -#02)--TableScan: t1 projection=[c1_date32] -#physical_plan -#01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 -#02)--DataSourceExec: partitions=1, partition_sizes=[1] +query TT +explain select c from t1 where extract (year from c) is not distinct from 2024 +---- +logical_plan +01)Filter: t1.c >= Date32("2024-01-01") AND t1.c < Date32("2025-01-01") +02)--TableScan: t1 projection=[c] +physical_plan +01)FilterExec: c@0 >= 2024-01-01 AND c@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT explain select c from t1 where extract (year from c) is distinct from 2024 @@ -487,17 +485,15 @@ physical_plan 01)FilterExec: c5_ts_nano@0 < 1735689600000000000 02)--DataSourceExec: partitions=1, partition_sizes=[1] -# This one doesn't pass due to a ParseError -#External error: task 20343 panicked with message "called `Result::unwrap()` on an `Err` value: ParseError { kind: InvalidLine(\"explain select c1_date32 from t2 where extract(year from c1_date32) is not distinct from 2024\"), loc: Location { file: \"test_files/udf_preimage.slt\", line: 162, upper: None } }" -# -#explain select c1_date32 from t2 where extract (year from c1_date32) is not distinct from 2024 -#---- -#logical_plan -#01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") -#02)--TableScan: t2 projection=[c1_date32] -#physical_plan -#01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 -#02)--DataSourceExec: partitions=1, partition_sizes=[1] +query TT +explain select c1_date32 from t2 where extract (year from c1_date32) is not distinct from 2024 +---- +logical_plan +01)Filter: t2.c1_date32 >= Date32("2024-01-01") AND t2.c1_date32 < Date32("2025-01-01") +02)--TableScan: t2 projection=[c1_date32] +physical_plan +01)FilterExec: c1_date32@0 >= 2024-01-01 AND c1_date32@0 < 2025-01-01 +02)--DataSourceExec: partitions=1, partition_sizes=[1] query TT explain select c1_date32 from t2 where extract (year from c1_date32) is distinct from 2024