From dc0d8bf4988465e3d07acabf2f8ecf564456808f Mon Sep 17 00:00:00 2001
From: Yoav Cohen
Date: Mon, 11 Nov 2024 14:55:13 +0200
Subject: [PATCH 1/6] Add the option for the parser to try to parse an
 expression as an identifier if the expression parsing fails

---
 src/ast/mod.rs              |   3 +
 src/dialect/mod.rs          |   6 +
 src/dialect/snowflake.rs    |  12 ++
 src/keywords.rs             |  10 +
 src/parser/mod.rs           | 375 ++++++++++++++++++++----------------
 tests/sqlparser_bigquery.rs |  10 +-
 tests/sqlparser_common.rs   |   6 +-
 tests/sqlparser_postgres.rs |  15 +-
 8 files changed, 248 insertions(+), 189 deletions(-)

diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 9185c9df4..8a5123971 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -695,6 +695,8 @@ pub enum Expr {
         // https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#formatting_syntax
         format: Option<CastFormat>,
     },
+    /// `DEFAULT` value of a column e.g. INSERT INTO tbl (a, b) VALUES ('foo', DEFAULT)
+    Default,
     /// AT a timestamp to a different timezone e.g. `FROM_UNIXTIME(0) AT TIME ZONE 'UTC-06:00'`
     AtTimeZone {
         timestamp: Box<Expr>,
@@ -1449,6 +1451,7 @@ impl fmt::Display for Expr {
                 write!(f, "{expr}::{data_type}")
             }
         },
+        Expr::Default => write!(f, "DEFAULT"),
         Expr::Extract {
             field,
             syntax,
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 159e14717..7d857c697 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -681,6 +681,12 @@ pub trait Dialect: Debug + Any {
     fn supports_partiql(&self) -> bool {
         false
     }
+
+    /// Returns true if the specified keyword is reserved and cannot be
+    /// used as an identifier without special handling like quoting.
+    fn is_reserved_for_identifier(&self, kw: Keyword) -> bool {
+        keywords::RESERVED_FOR_IDENTIFIER.contains(&kw)
+    }
 }
 
 /// This represents the operators for which precedence must be defined
diff --git a/src/dialect/snowflake.rs b/src/dialect/snowflake.rs
index b584ed9b4..56919fb31 100644
--- a/src/dialect/snowflake.rs
+++ b/src/dialect/snowflake.rs
@@ -38,6 +38,8 @@ use alloc::vec::Vec;
 #[cfg(not(feature = "std"))]
 use alloc::{format, vec};
 
+use super::keywords::RESERVED_FOR_IDENTIFIER;
+
 /// A [`Dialect`] for [Snowflake](https://www.snowflake.com/)
 #[derive(Debug, Default)]
 pub struct SnowflakeDialect;
@@ -214,6 +216,16 @@ impl Dialect for SnowflakeDialect {
     fn supports_show_like_before_in(&self) -> bool {
         true
     }
+
+    fn is_reserved_for_identifier(&self, kw: Keyword) -> bool {
+        // Unreserve some keywords that Snowflake accepts as identifiers
+        // See: https://docs.snowflake.com/en/sql-reference/reserved-keywords
+        if matches!(kw, Keyword::INTERVAL) {
+            false
+        } else {
+            RESERVED_FOR_IDENTIFIER.contains(&kw)
+        }
+    }
 }
 
 /// Parse snowflake create table statement.
diff --git a/src/keywords.rs b/src/keywords.rs
index fc2a2927c..8c0ed588f 100644
--- a/src/keywords.rs
+++ b/src/keywords.rs
@@ -948,3 +948,13 @@ pub const RESERVED_FOR_COLUMN_ALIAS: &[Keyword] = &[
     Keyword::INTO,
     Keyword::END,
 ];
+
+/// Global list of reserved keywords that cannot be parsed as identifiers
+/// without special handling like quoting. The parser should call
+/// `Dialect::is_reserved_for_identifier` to allow each dialect to customize the list.
+pub const RESERVED_FOR_IDENTIFIER: &[Keyword] = &[
+    Keyword::EXISTS,
+    Keyword::INTERVAL,
+    Keyword::STRUCT,
+    Keyword::TRIM,
+];
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 1bf173169..2b2efc2be 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -45,6 +45,9 @@ pub enum ParserError {
     TokenizerError(String),
     ParserError(String),
     RecursionLimitExceeded,
+    /// Error indicating that the parsing branch taken
+    /// did not yield a meaningful result
+    BranchAbandoned,
 }
 
 // avoid clippy type_complexity warnings
@@ -174,6 +177,7 @@ impl fmt::Display for ParserError {
                 ParserError::TokenizerError(s) => s,
                 ParserError::ParserError(s) => s,
                 ParserError::RecursionLimitExceeded => "recursion limit exceeded",
+                ParserError::BranchAbandoned => "branch abandoned",
             }
         )
     }
@@ -1025,6 +1029,178 @@ impl<'a> Parser<'a> {
         Ok(Statement::NOTIFY { channel, payload })
     }
 
+    fn parse_expr_by_keyword(&mut self, w: &Word) -> Result<Expr, ParserError> {
+        match w.keyword {
+            Keyword::TRUE | Keyword::FALSE if self.dialect.supports_boolean_literals() => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Keyword::NULL => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Keyword::CURRENT_CATALOG
+            | Keyword::CURRENT_USER
+            | Keyword::SESSION_USER
+            | Keyword::USER
+                if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
+            {
+                Ok(Expr::Function(Function {
+                    name: ObjectName(vec![w.to_ident()]),
+                    parameters: FunctionArguments::None,
+                    args: FunctionArguments::None,
+                    null_treatment: None,
+                    filter: None,
+                    over: None,
+                    within_group: vec![],
+                }))
+            }
+            Keyword::CURRENT_TIMESTAMP
+            | Keyword::CURRENT_TIME
+            | Keyword::CURRENT_DATE
+            | Keyword::LOCALTIME
+            | Keyword::LOCALTIMESTAMP => {
+                self.parse_time_functions(ObjectName(vec![w.to_ident()]))
+            }
+            Keyword::CASE => self.parse_case_expr(),
+            Keyword::CONVERT => self.parse_convert_expr(false),
+            Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => self.parse_convert_expr(true),
+            Keyword::CAST => self.parse_cast_expr(CastKind::Cast),
+            Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast),
+            Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast),
+            Keyword::EXISTS
+                // Databricks has a function named `exists`, so only parse EXISTS as an expression if followed by a subquery.
+                if !dialect_of!(self is DatabricksDialect)
+                    || matches!(
+                        self.peek_nth_token(1).token,
+                        Token::Word(Word {
+                            keyword: Keyword::SELECT | Keyword::WITH,
+                            ..
+                        })
+                    ) =>
+            {
+                self.parse_exists_expr(false)
+            }
+            Keyword::EXTRACT => self.parse_extract_expr(),
+            Keyword::CEIL => self.parse_ceil_floor_expr(true),
+            Keyword::FLOOR => self.parse_ceil_floor_expr(false),
+            Keyword::POSITION if self.peek_token().token == Token::LParen => {
+                self.parse_position_expr(w.to_ident())
+            }
+            Keyword::SUBSTRING => self.parse_substring_expr(),
+            Keyword::OVERLAY => self.parse_overlay_expr(),
+            Keyword::TRIM => self.parse_trim_expr(),
+            Keyword::INTERVAL => self.parse_interval(),
+            // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as subquery or a function call
+            Keyword::ARRAY if self.peek_token() == Token::LBracket => {
+                self.expect_token(&Token::LBracket)?;
+                self.parse_array_expr(true)
+            }
+            Keyword::ARRAY
+                if self.peek_token() == Token::LParen
+                    && !dialect_of!(self is ClickHouseDialect | DatabricksDialect) =>
+            {
+                self.expect_token(&Token::LParen)?;
+                let query = self.parse_query()?;
+                self.expect_token(&Token::RParen)?;
+                Ok(Expr::Function(Function {
+                    name: ObjectName(vec![w.to_ident()]),
+                    parameters: FunctionArguments::None,
+                    args: FunctionArguments::Subquery(query),
+                    filter: None,
+                    null_treatment: None,
+                    over: None,
+                    within_group: vec![],
+                }))
+            }
+            Keyword::NOT => self.parse_not(),
+            Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => {
+                self.parse_match_against()
+            }
+            Keyword::STRUCT if dialect_of!(self is BigQueryDialect | GenericDialect) => {
+                self.prev_token();
+                self.parse_bigquery_struct_literal()
+            }
+            Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => {
+                let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
+                Ok(Expr::Prior(Box::new(expr)))
+            }
+            Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => {
+                self.parse_duckdb_map_literal()
+            }
+            Keyword::DEFAULT => Ok(Expr::Default),
+            _ => Err(ParserError::BranchAbandoned)
+        }
+    }
+
+    fn parse_ident_expr(&mut self, w: &Word) -> Result<Expr, ParserError> {
+        match self.peek_token().token {
+            Token::LParen | Token::Period => {
+                let mut id_parts: Vec<Ident> = vec![w.to_ident()];
+                let mut ends_with_wildcard = false;
+                while self.consume_token(&Token::Period) {
+                    let next_token = self.next_token();
+                    match next_token.token {
+                        Token::Word(w) => id_parts.push(w.to_ident()),
+                        Token::Mul => {
+                            // Postgres explicitly allows funcnm(tablenm.*) and the
+                            // function array_agg traverses this control flow
+                            if dialect_of!(self is PostgreSqlDialect) {
+                                ends_with_wildcard = true;
+                                break;
+                            } else {
+                                return self.expected("an identifier after '.'", next_token);
+                            }
+                        }
+                        Token::SingleQuotedString(s) => id_parts.push(Ident::with_quote('\'', s)),
+                        _ => {
+                            return self.expected("an identifier or a '*' after '.'", next_token);
+                        }
+                    }
+                }
+
+                if ends_with_wildcard {
+                    Ok(Expr::QualifiedWildcard(ObjectName(id_parts)))
+                } else if self.consume_token(&Token::LParen) {
+                    if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
+                        && self.consume_tokens(&[Token::Plus, Token::RParen])
+                    {
+                        Ok(Expr::OuterJoin(Box::new(
+                            match <[Ident; 1]>::try_from(id_parts) {
+                                Ok([ident]) => Expr::Identifier(ident),
+                                Err(parts) => Expr::CompoundIdentifier(parts),
+                            },
+                        )))
+                    } else {
+                        self.prev_token();
+                        self.parse_function(ObjectName(id_parts))
+                    }
+                } else {
+                    Ok(Expr::CompoundIdentifier(id_parts))
+                }
+            }
+            // string introducer https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html
+            Token::SingleQuotedString(_)
+            | Token::DoubleQuotedString(_)
+            | Token::HexStringLiteral(_)
+                if w.value.starts_with('_') =>
+            {
+                Ok(Expr::IntroducedString {
+                    introducer: w.value.clone(),
+                    value: self.parse_introduced_string_value()?,
+                })
+            }
+            Token::Arrow if self.dialect.supports_lambda_functions() => {
+                self.expect_token(&Token::Arrow)?;
+                Ok(Expr::Lambda(LambdaFunction {
+                    params: OneOrManyWithParens::One(w.to_ident()),
+                    body: Box::new(self.parse_expr()?),
+                }))
+            }
+            _ => Ok(Expr::Identifier(w.to_ident())),
+        }
+    }
+
     /// Parse an expression prefix.
     pub fn parse_prefix(&mut self) -> Result<Expr, ParserError> {
         // allow the dialect to override prefix parsing
@@ -1073,175 +1249,22 @@ impl<'a> Parser<'a> {
 
         let next_token = self.next_token();
         let expr = match next_token.token {
-            Token::Word(w) => match w.keyword {
-                Keyword::TRUE | Keyword::FALSE if self.dialect.supports_boolean_literals() => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::NULL => {
-                    self.prev_token();
-                    Ok(Expr::Value(self.parse_value()?))
-                }
-                Keyword::CURRENT_CATALOG
-                | Keyword::CURRENT_USER
-                | Keyword::SESSION_USER
-                | Keyword::USER
-                    if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
-                {
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::None,
-                        null_treatment: None,
-                        filter: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::CURRENT_TIMESTAMP
-                | Keyword::CURRENT_TIME
-                | Keyword::CURRENT_DATE
-                | Keyword::LOCALTIME
-                | Keyword::LOCALTIMESTAMP => {
-                    self.parse_time_functions(ObjectName(vec![w.to_ident()]))
-                }
-                Keyword::CASE => self.parse_case_expr(),
-                Keyword::CONVERT => self.parse_convert_expr(false),
-                Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => self.parse_convert_expr(true),
-                Keyword::CAST => self.parse_cast_expr(CastKind::Cast),
-                Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast),
-                Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast),
-                Keyword::EXISTS
-                    // Support parsing Databricks has a function named `exists`.
-                    if !dialect_of!(self is DatabricksDialect)
-                        || matches!(
-                            self.peek_nth_token(1).token,
-                            Token::Word(Word {
-                                keyword: Keyword::SELECT | Keyword::WITH,
-                                ..
-                            })
-                        ) =>
-                {
-                    self.parse_exists_expr(false)
-                }
-                Keyword::EXTRACT => self.parse_extract_expr(),
-                Keyword::CEIL => self.parse_ceil_floor_expr(true),
-                Keyword::FLOOR => self.parse_ceil_floor_expr(false),
-                Keyword::POSITION if self.peek_token().token == Token::LParen => {
-                    self.parse_position_expr(w.to_ident())
-                }
-                Keyword::SUBSTRING => self.parse_substring_expr(),
-                Keyword::OVERLAY => self.parse_overlay_expr(),
-                Keyword::TRIM => self.parse_trim_expr(),
-                Keyword::INTERVAL => self.parse_interval(),
-                // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as subquery or a function call
-                Keyword::ARRAY if self.peek_token() == Token::LBracket => {
-                    self.expect_token(&Token::LBracket)?;
-                    self.parse_array_expr(true)
-                }
-                Keyword::ARRAY
-                    if self.peek_token() == Token::LParen
-                        && !dialect_of!(self is ClickHouseDialect | DatabricksDialect) =>
-                {
-                    self.expect_token(&Token::LParen)?;
-                    let query = self.parse_query()?;
-                    self.expect_token(&Token::RParen)?;
-                    Ok(Expr::Function(Function {
-                        name: ObjectName(vec![w.to_ident()]),
-                        parameters: FunctionArguments::None,
-                        args: FunctionArguments::Subquery(query),
-                        filter: None,
-                        null_treatment: None,
-                        over: None,
-                        within_group: vec![],
-                    }))
-                }
-                Keyword::NOT => self.parse_not(),
-                Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => {
-                    self.parse_match_against()
-                }
-                Keyword::STRUCT if dialect_of!(self is BigQueryDialect | GenericDialect) => {
-                    self.prev_token();
-                    self.parse_bigquery_struct_literal()
-                }
-                Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => {
-                    let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
-                    Ok(Expr::Prior(Box::new(expr)))
-                }
-                Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => {
-                    self.parse_duckdb_map_literal()
-                }
-                // Here `w` is a word, check if it's a part of a multipart
-                // identifier, a function call, or a simple identifier:
-                _ => match self.peek_token().token {
-                    Token::LParen | Token::Period => {
-                        let mut id_parts: Vec<Ident> = vec![w.to_ident()];
-                        let mut ends_with_wildcard = false;
-                        while self.consume_token(&Token::Period) {
-                            let next_token = self.next_token();
-                            match next_token.token {
-                                Token::Word(w) => id_parts.push(w.to_ident()),
-                                Token::Mul => {
-                                    // Postgres explicitly allows funcnm(tablenm.*) and the
-                                    // function array_agg traverses this control flow
-                                    if dialect_of!(self is PostgreSqlDialect) {
-                                        ends_with_wildcard = true;
-                                        break;
-                                    } else {
-                                        return self
-                                            .expected("an identifier after '.'", next_token);
-                                    }
-                                }
-                                Token::SingleQuotedString(s) => {
-                                    id_parts.push(Ident::with_quote('\'', s))
-                                }
-                                _ => {
-                                    return self
-                                        .expected("an identifier or a '*' after '.'", next_token);
-                                }
-                            }
+            // We first try to parse the word as the prefix of an expression.
+            // For example, the word INTERVAL in: SELECT INTERVAL '7' DAY
+            Token::Word(w) => match self.try_parse(|parser| parser.parse_expr_by_keyword(&w)) {
+                Ok(expr) => Ok(expr),
+                // Word does not indicate the start of a complex expression, try to parse as identifier
+                Err(ParserError::BranchAbandoned) => Ok(self.parse_ident_expr(&w)?),
+                // Word indicates the start of a complex expression, try to parse as identifier if the
+                // dialect does not reserve it, otherwise return the original error
+                Err(e) => {
+                    if !self.dialect.is_reserved_for_identifier(w.keyword) {
+                        if let Ok(expr) = self.try_parse(|parser| parser.parse_ident_expr(&w)) {
+                            return Ok(expr);
                         }
-
-                        if ends_with_wildcard {
-                            Ok(Expr::QualifiedWildcard(ObjectName(id_parts)))
-                        } else if self.consume_token(&Token::LParen) {
-                            if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
-                                && self.consume_tokens(&[Token::Plus, Token::RParen])
-                            {
-                                Ok(Expr::OuterJoin(Box::new(
-                                    match <[Ident; 1]>::try_from(id_parts) {
-                                        Ok([ident]) => Expr::Identifier(ident),
-                                        Err(parts) => Expr::CompoundIdentifier(parts),
-                                    },
-                                )))
-                            } else {
-                                self.prev_token();
-                                self.parse_function(ObjectName(id_parts))
-                            }
-                        } else {
-                            Ok(Expr::CompoundIdentifier(id_parts))
-                        }
-                    }
-                    // string introducer https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html
-                    Token::SingleQuotedString(_)
-                    | Token::DoubleQuotedString(_)
-                    | Token::HexStringLiteral(_)
-                        if w.value.starts_with('_') =>
-                    {
-                        Ok(Expr::IntroducedString {
-                            introducer: w.value,
-                            value: self.parse_introduced_string_value()?,
-                        })
                     }
-                    Token::Arrow if self.dialect.supports_lambda_functions() => {
-                        self.expect_token(&Token::Arrow)?;
-                        return Ok(Expr::Lambda(LambdaFunction {
-                            params: OneOrManyWithParens::One(w.to_ident()),
-                            body: Box::new(self.parse_expr()?),
-                        }));
-                    }
-                    _ => Ok(Expr::Identifier(w.to_ident())),
-                },
+                    return Err(e);
+                }
+            }, // End of Token::Word
             // array `[1, 2, 3]`
             Token::LBracket => self.parse_array_expr(false),
@@ -3693,6 +3716,24 @@ impl<'a> Parser<'a> {
         }
     }
 
+    /// Run a parser method `f`, reverting back to the current position if unsuccessful
+    /// but retaining the error message if one was raised by `f`
+    pub fn try_parse<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
+    where
+        F: FnMut(&mut Parser) -> Result<T, ParserError>,
+    {
+        let index = self.index;
+        match f(self) {
+            Ok(t) => Ok(t),
+            // Unwind stack if limit exceeded
+            Err(ParserError::RecursionLimitExceeded) => Err(ParserError::RecursionLimitExceeded),
+            Err(e) => {
+                self.index = index;
+                Err(e)
+            }
+        }
+    }
+
     /// Parse either `ALL`, `DISTINCT` or `DISTINCT ON (...)`. Returns [`None`] if `ALL` is parsed
     /// and results in a [`ParserError`] if both `ALL` and `DISTINCT` are found.
     pub fn parse_all_or_distinct(&mut self) -> Result<Option<Distinct>, ParserError> {
diff --git a/tests/sqlparser_bigquery.rs b/tests/sqlparser_bigquery.rs
index d4c178bbf..550133c33 100644
--- a/tests/sqlparser_bigquery.rs
+++ b/tests/sqlparser_bigquery.rs
@@ -1749,10 +1749,7 @@ fn parse_merge() {
                     columns: vec![Ident::new("a"), Ident::new("b"),],
                     kind: MergeInsertKind::Values(Values {
                         explicit_row: false,
-                        rows: vec![vec![
-                            Expr::Value(number("1")),
-                            Expr::Identifier(Ident::new("DEFAULT")),
-                        ]]
+                        rows: vec![vec![Expr::Value(number("1")), Expr::Default,]]
                     })
                 })
             },
@@ -1763,10 +1760,7 @@ fn parse_merge() {
                     columns: vec![],
                     kind: MergeInsertKind::Values(Values {
                         explicit_row: false,
-                        rows: vec![vec![
-                            Expr::Value(number("1")),
-                            Expr::Identifier(Ident::new("DEFAULT")),
-                        ]]
+                        rows: vec![vec![Expr::Value(number("1")), Expr::Default,]]
                     })
                 })
             },
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index b41063859..26e90405f 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -34,7 +34,7 @@ use sqlparser::dialect::{
     GenericDialect, HiveDialect, MsSqlDialect, MySqlDialect, PostgreSqlDialect, RedshiftSqlDialect,
     SQLiteDialect, SnowflakeDialect,
 };
-use sqlparser::keywords::ALL_KEYWORDS;
+use sqlparser::keywords::{Keyword, ALL_KEYWORDS};
 use sqlparser::parser::{Parser, ParserError, ParserOptions};
 use sqlparser::tokenizer::Tokenizer;
 use test_utils::{
@@ -5112,7 +5112,9 @@ fn parse_interval_dont_require_unit() {
 
 #[test]
 fn parse_interval_require_unit() {
-    let dialects = all_dialects_where(|d| d.require_interval_qualifier());
+    let dialects = all_dialects_where(|d| {
+        d.require_interval_qualifier() && d.is_reserved_for_identifier(Keyword::INTERVAL)
+    });
 
     let sql = "SELECT INTERVAL '1 DAY'";
     let err = dialects.parse_sql_statements(sql).unwrap_err();
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index 098a3464c..0f93937d6 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -1352,10 +1352,7 @@ fn parse_set() {
             local: false,
             hivevar: false,
             variables: OneOrManyWithParens::One(ObjectName(vec![Ident::new("a")])),
-            value: vec![Expr::Identifier(Ident {
-                value: "DEFAULT".into(),
-                quote_style: None
-            })],
+            value: vec![Expr::Default],
         }
     );
 
@@ -4229,10 +4226,7 @@ fn test_simple_postgres_insert_with_alias() {
                 body: Box::new(SetExpr::Values(Values {
                     explicit_row: false,
                     rows: vec![vec![
-                        Expr::Identifier(Ident {
-                            value: "DEFAULT".to_string(),
-                            quote_style: None
-                        }),
+                        Expr::Default,
                         Expr::Value(Value::Number("123".to_string(), false))
                     ]]
                 })),
@@ -4363,10 +4357,7 @@ fn test_simple_insert_with_quoted_alias() {
                 body: Box::new(SetExpr::Values(Values {
                     explicit_row: false,
                     rows: vec![vec![
-                        Expr::Identifier(Ident {
-                            value: "DEFAULT".to_string(),
-                            quote_style: None
-                        }),
+                        Expr::Default,
                         Expr::Value(Value::SingleQuotedString("0123".to_string()))
                     ]]
                 })),

From dc339eb3c1857de580b81c188b78e13dabc56e34 Mon Sep 17 00:00:00 2001
From: Yoav Cohen
Date: Wed, 13 Nov 2024 13:48:20 +0200
Subject: [PATCH 2/6] Fix unit test

---
 tests/sqlparser_postgres.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index 0f93937d6..6296beba4 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -4289,10 +4289,7 @@ fn test_simple_postgres_insert_with_alias() {
                 body: Box::new(SetExpr::Values(Values {
                     explicit_row: false,
                     rows: vec![vec![
-                        Expr::Identifier(Ident {
-                            value: "DEFAULT".to_string(),
-                            quote_style: None
-                        }),
+                        Expr::Default,
                         Expr::Value(Value::Number(
                             bigdecimal::BigDecimal::new(123.into(), 0),
                             false

From 83416aececcdd1ea14c5139a5afc65bf6176b92d Mon Sep 17 00:00:00 2001
From: Yoav Cohen
Date: Mon, 18 Nov 2024 10:19:19 +0100
Subject: [PATCH 3/6] Code review comments

---
 src/ast/mod.rs              |   3 -
 src/parser/mod.rs           | 128 +++++++++++++++++-------------------
 tests/sqlparser_bigquery.rs |  10 ++-
 tests/sqlparser_common.rs   |   7 +-
 tests/sqlparser_postgres.rs |   8 +--
 5 files changed, 73 insertions(+), 83 deletions(-)

diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 8a5123971..9185c9df4 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -695,8 +695,6 @@ pub enum Expr {
         // https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#formatting_syntax
         format: Option<CastFormat>,
     },
-    /// `DEFAULT` value of a column e.g. INSERT INTO tbl (a, b) VALUES ('foo', DEFAULT)
-    Default,
     /// AT a timestamp to a different timezone e.g. `FROM_UNIXTIME(0) AT TIME ZONE 'UTC-06:00'`
     AtTimeZone {
         timestamp: Box<Expr>,
@@ -1451,7 +1449,6 @@ impl fmt::Display for Expr {
                 write!(f, "{expr}::{data_type}")
             }
         },
-        Expr::Default => write!(f, "DEFAULT"),
         Expr::Extract {
             field,
             syntax,
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 2b2efc2be..1d67456c2 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -45,9 +45,6 @@ pub enum ParserError {
     TokenizerError(String),
     ParserError(String),
     RecursionLimitExceeded,
-    /// Error indicating that the parsing branch taken
-    /// did not yield a meaningful result
-    BranchAbandoned,
 }
 
 // avoid clippy type_complexity warnings
@@ -177,7 +174,6 @@ impl fmt::Display for ParserError {
                 ParserError::TokenizerError(s) => s,
                 ParserError::ParserError(s) => s,
                 ParserError::RecursionLimitExceeded => "recursion limit exceeded",
-                ParserError::BranchAbandoned => "branch abandoned",
             }
         )
     }
@@ -1029,15 +1025,18 @@ impl<'a> Parser<'a> {
         Ok(Statement::NOTIFY { channel, payload })
     }
 
-    fn parse_expr_by_keyword(&mut self, w: &Word) -> Result<Expr, ParserError> {
+    fn parse_expr_prefix_by_reserved_word(
+        &mut self,
+        w: &Word,
+    ) -> Result<Option<Expr>, ParserError> {
         match w.keyword {
             Keyword::TRUE | Keyword::FALSE if self.dialect.supports_boolean_literals() => {
                 self.prev_token();
-                Ok(Expr::Value(self.parse_value()?))
+                Ok(Some(Expr::Value(self.parse_value()?)))
             }
             Keyword::NULL => {
                 self.prev_token();
-                Ok(Expr::Value(self.parse_value()?))
+                Ok(Some(Expr::Value(self.parse_value()?)))
             }
             Keyword::CURRENT_CATALOG
             | Keyword::CURRENT_USER
             | Keyword::SESSION_USER
             | Keyword::USER
                 if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
             {
-                Ok(Expr::Function(Function {
+                Ok(Some(Expr::Function(Function {
                     name: ObjectName(vec![w.to_ident()]),
                     parameters: FunctionArguments::None,
                     args: FunctionArguments::None,
                     null_treatment: None,
                     filter: None,
                     over: None,
                     within_group: vec![],
-                }))
+                })))
             }
             Keyword::CURRENT_TIMESTAMP
             | Keyword::CURRENT_TIME
             | Keyword::CURRENT_DATE
             | Keyword::LOCALTIME
             | Keyword::LOCALTIMESTAMP => {
-                self.parse_time_functions(ObjectName(vec![w.to_ident()]))
-            }
-            Keyword::CASE => self.parse_case_expr(),
-            Keyword::CONVERT => self.parse_convert_expr(false),
-            Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => self.parse_convert_expr(true),
-            Keyword::CAST => self.parse_cast_expr(CastKind::Cast),
-            Keyword::TRY_CAST => self.parse_cast_expr(CastKind::TryCast),
-            Keyword::SAFE_CAST => self.parse_cast_expr(CastKind::SafeCast),
+                Ok(Some(self.parse_time_functions(ObjectName(vec![w.to_ident()]))?))
+            }
+            Keyword::CASE => Ok(Some(self.parse_case_expr()?)),
+            Keyword::CONVERT => Ok(Some(self.parse_convert_expr(false)?)),
+            Keyword::TRY_CONVERT if self.dialect.supports_try_convert() => Ok(Some(self.parse_convert_expr(true)?)),
+            Keyword::CAST => Ok(Some(self.parse_cast_expr(CastKind::Cast)?)),
+            Keyword::TRY_CAST => Ok(Some(self.parse_cast_expr(CastKind::TryCast)?)),
+            Keyword::SAFE_CAST => Ok(Some(self.parse_cast_expr(CastKind::SafeCast)?)),
             Keyword::EXISTS
                 // Databricks has a function named `exists`, so only parse EXISTS as an expression if followed by a subquery.
                 if !dialect_of!(self is DatabricksDialect)
                     || matches!(
                         self.peek_nth_token(1).token,
                         Token::Word(Word {
                             keyword: Keyword::SELECT | Keyword::WITH,
                             ..
                         })
                     ) =>
             {
-                self.parse_exists_expr(false)
+                Ok(Some(self.parse_exists_expr(false)?))
             }
-            Keyword::EXTRACT => self.parse_extract_expr(),
-            Keyword::CEIL => self.parse_ceil_floor_expr(true),
-            Keyword::FLOOR => self.parse_ceil_floor_expr(false),
+            Keyword::EXTRACT => Ok(Some(self.parse_extract_expr()?)),
+            Keyword::CEIL => Ok(Some(self.parse_ceil_floor_expr(true)?)),
+            Keyword::FLOOR => Ok(Some(self.parse_ceil_floor_expr(false)?)),
             Keyword::POSITION if self.peek_token().token == Token::LParen => {
-                self.parse_position_expr(w.to_ident())
+                Ok(Some(self.parse_position_expr(w.to_ident())?))
             }
-            Keyword::SUBSTRING => self.parse_substring_expr(),
-            Keyword::OVERLAY => self.parse_overlay_expr(),
-            Keyword::TRIM => self.parse_trim_expr(),
-            Keyword::INTERVAL => self.parse_interval(),
+            Keyword::SUBSTRING => Ok(Some(self.parse_substring_expr()?)),
+            Keyword::OVERLAY => Ok(Some(self.parse_overlay_expr()?)),
+            Keyword::TRIM => Ok(Some(self.parse_trim_expr()?)),
+            Keyword::INTERVAL => Ok(Some(self.parse_interval()?)),
             // Treat ARRAY[1,2,3] as an array [1,2,3], otherwise try as subquery or a function call
             Keyword::ARRAY if self.peek_token() == Token::LBracket => {
                 self.expect_token(&Token::LBracket)?;
-                self.parse_array_expr(true)
+                Ok(Some(self.parse_array_expr(true)?))
             }
             Keyword::ARRAY
                 if self.peek_token() == Token::LParen
                     && !dialect_of!(self is ClickHouseDialect | DatabricksDialect) =>
             {
                 self.expect_token(&Token::LParen)?;
                 let query = self.parse_query()?;
                 self.expect_token(&Token::RParen)?;
-                Ok(Expr::Function(Function {
+                Ok(Some(Expr::Function(Function {
                     name: ObjectName(vec![w.to_ident()]),
                     parameters: FunctionArguments::None,
                     args: FunctionArguments::Subquery(query),
                     filter: None,
                     null_treatment: None,
                     over: None,
                     within_group: vec![],
-                }))
+                })))
             }
-            Keyword::NOT => self.parse_not(),
+            Keyword::NOT => Ok(Some(self.parse_not()?)),
             Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => {
-                self.parse_match_against()
+                Ok(Some(self.parse_match_against()?))
             }
             Keyword::STRUCT if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                 self.prev_token();
-                self.parse_bigquery_struct_literal()
+                Ok(Some(self.parse_bigquery_struct_literal()?))
             }
             Keyword::PRIOR if matches!(self.state, ParserState::ConnectBy) => {
                 let expr = self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?;
-                Ok(Expr::Prior(Box::new(expr)))
+                Ok(Some(Expr::Prior(Box::new(expr))))
             }
             Keyword::MAP if self.peek_token() == Token::LBrace && self.dialect.support_map_literal_syntax() => {
-                self.parse_duckdb_map_literal()
+                Ok(Some(self.parse_duckdb_map_literal()?))
             }
-            Keyword::DEFAULT => Ok(Expr::Default),
-            _ => Err(ParserError::BranchAbandoned)
+            _ => Ok(None)
         }
     }
 
-    fn parse_ident_expr(&mut self, w: &Word) -> Result<Expr, ParserError> {
+    fn parse_expr_prefix_by_nonreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
         match self.peek_token().token {
             Token::LParen | Token::Period => {
                 let mut id_parts: Vec<Ident> = vec![w.to_ident()];
@@ -1249,23 +1247,33 @@ impl<'a> Parser<'a> {
         let next_token = self.next_token();
         let expr = match next_token.token {
-            // We first try to parse the word as the prefix of an expression.
-            // For example, the word INTERVAL in: SELECT INTERVAL '7' DAY
-            Token::Word(w) => match self.try_parse(|parser| parser.parse_expr_by_keyword(&w)) {
-                Ok(expr) => Ok(expr),
-                // Word does not indicate the start of a complex expression, try to parse as identifier
-                Err(ParserError::BranchAbandoned) => Ok(self.parse_ident_expr(&w)?),
-                // Word indicates the start of a complex expression, try to parse as identifier if the
-                // dialect does not reserve it, otherwise return the original error
-                Err(e) => {
-                    if !self.dialect.is_reserved_for_identifier(w.keyword) {
-                        if let Ok(expr) = self.try_parse(|parser| parser.parse_ident_expr(&w)) {
-                            return Ok(expr);
+            Token::Word(w) => {
+                // Save the parser index so we can rollback
+                let index_before = self.index;
+                // The word we consumed may fall into one of two cases: it's a reserved word in the dialect
+                // and has a special meaning, or not. For example, in Snowflake, the word `interval` may have
+                // two meanings depending on the context:
+                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM test;`
+                // In its first occurrence it's part of an interval expression and in the second it's an identifier.
 
+                // We first try to parse the word and following tokens as a special expression, and if that fails,
+                // we rollback and try to parse it as an identifier.
+                match self.parse_expr_prefix_by_reserved_word(&w) {
+                    // No expression prefix associated with this word
+                    Ok(None) => Ok(self.parse_expr_prefix_by_nonreserved_word(&w)?),
+                    // This word indicated an expression prefix and parsing was successful
+                    Ok(Some(expr)) => Ok(expr),
+                    // This word indicated an expression prefix but parsing failed. Two options:
+                    // 1. Malformed statement
+                    // 2. The dialect may allow this word as identifier as well as indicating an expression
+                    Err(e) => {
+                        let index_after_error = self.index;
+                        if !self.dialect.is_reserved_for_identifier(w.keyword) {
+                            // Rollback before trying to parse using a different approach
+                            self.index = index_before;
+                            if let Ok(expr) = self.parse_expr_prefix_by_nonreserved_word(&w) {
+                                return Ok(expr);
+                            }
                         }
+                        self.index = index_after_error;
+                        return Err(e);
                     }
-                    return Err(e);
                 }
-            }, // End of Token::Word
+            } // End of Token::Word
             // array `[1, 2, 3]`
             Token::LBracket => self.parse_array_expr(false),
@@ -3716,24 +3724,6 @@ impl<'a> Parser<'a> {
         }
     }
 
-    /// Run a parser method `f`, reverting back to the current position if unsuccessful
-    /// but retaining the error message if one was raised by `f`
-    pub fn try_parse<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
-    where
-        F: FnMut(&mut Parser) -> Result<T, ParserError>,
-    {
-        let index = self.index;
-        match f(self) {
-            Ok(t) => Ok(t),
-            // Unwind stack if limit exceeded
-            Err(ParserError::RecursionLimitExceeded) => Err(ParserError::RecursionLimitExceeded),
-            Err(e) => {
-                self.index = index;
-                Err(e)
-            }
-        }
-    }
-
     /// Parse either `ALL`, `DISTINCT` or `DISTINCT ON (...)`. Returns [`None`] if `ALL` is parsed
     /// and results in a [`ParserError`] if both `ALL` and `DISTINCT` are found.
     pub fn parse_all_or_distinct(&mut self) -> Result<Option<Distinct>, ParserError> {
diff --git a/tests/sqlparser_bigquery.rs b/tests/sqlparser_bigquery.rs
index 550133c33..d4c178bbf 100644
--- a/tests/sqlparser_bigquery.rs
+++ b/tests/sqlparser_bigquery.rs
@@ -1749,7 +1749,10 @@ fn parse_merge() {
                     columns: vec![Ident::new("a"), Ident::new("b"),],
                     kind: MergeInsertKind::Values(Values {
                         explicit_row: false,
-                        rows: vec![vec![Expr::Value(number("1")), Expr::Default,]]
+                        rows: vec![vec![
+                            Expr::Value(number("1")),
+                            Expr::Identifier(Ident::new("DEFAULT")),
+                        ]]
                     })
                 })
             },
@@ -1760,7 +1763,10 @@ fn parse_merge() {
                     columns: vec![],
                     kind: MergeInsertKind::Values(Values {
                         explicit_row: false,
-                        rows: vec![vec![Expr::Value(number("1")), Expr::Default,]]
+                        rows: vec![vec![
+                            Expr::Value(number("1")),
+                            Expr::Identifier(Ident::new("DEFAULT")),
+                        ]]
                     })
                 })
             },
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index 26e90405f..bb9c335a3 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -34,7 +34,7 @@ use sqlparser::dialect::{
     GenericDialect, HiveDialect, MsSqlDialect, MySqlDialect, PostgreSqlDialect, RedshiftSqlDialect,
     SQLiteDialect, SnowflakeDialect,
 };
-use sqlparser::keywords::{Keyword, ALL_KEYWORDS};
+use sqlparser::keywords::ALL_KEYWORDS;
 use sqlparser::parser::{Parser, ParserError, ParserOptions};
 use sqlparser::tokenizer::Tokenizer;
 use test_utils::{
@@ -5112,10 +5112,7 @@ fn parse_interval_dont_require_unit() {
 
 #[test]
 fn parse_interval_require_unit() {
-    let dialects = all_dialects_where(|d| {
-        d.require_interval_qualifier() && d.is_reserved_for_identifier(Keyword::INTERVAL)
-    });
-
+    let dialects = all_dialects_where(|d| d.require_interval_qualifier());
     let sql = "SELECT INTERVAL '1 DAY'";
     let err = dialects.parse_sql_statements(sql).unwrap_err();
     assert_eq!(
diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs
index 6296beba4..d27569e03 100644
--- a/tests/sqlparser_postgres.rs
+++ b/tests/sqlparser_postgres.rs
@@ -1352,7 +1352,7 @@ fn parse_set() {
             local: false,
             hivevar: false,
             variables: OneOrManyWithParens::One(ObjectName(vec![Ident::new("a")])),
-            value: vec![Expr::Default],
+            value: vec![Expr::Identifier(Ident::new("DEFAULT"))],
         }
     );
 
@@ -4226,7 +4226,7 @@ fn test_simple_postgres_insert_with_alias() {
                 body: Box::new(SetExpr::Values(Values {
                     explicit_row: false,
                     rows: vec![vec![
-                        Expr::Default,
+                        Expr::Identifier(Ident::new("DEFAULT")),
                         Expr::Value(Value::Number("123".to_string(), false))
                     ]]
                 })),
@@ -4289,7 +4289,7 @@ fn test_simple_postgres_insert_with_alias() {
                 body: Box::new(SetExpr::Values(Values {
                     explicit_row: false,
                     rows: vec![vec![
-                        Expr::Default,
+                        Expr::Identifier(Ident::new("DEFAULT")),
                         Expr::Value(Value::Number(
                             bigdecimal::BigDecimal::new(123.into(), 0),
                             false
@@ -4354,7 +4354,7 @@ fn test_simple_insert_with_quoted_alias() {
                 body: Box::new(SetExpr::Values(Values {
                     explicit_row: false,
                     rows: vec![vec![
-                        Expr::Default,
+                        Expr::Identifier(Ident::new("DEFAULT")),
                         Expr::Value(Value::SingleQuotedString("0123".to_string()))
                     ]]
                 })),

From 8c9b23d20be4ef4dfba4aa7eae81e1bddf18e491 Mon Sep 17 00:00:00 2001
From: Yoav Cohen
Date: Mon, 18 Nov 2024 20:44:36 +0100
Subject: [PATCH 4/6] Code review comments

---
 src/parser/mod.rs | 268 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 254 insertions(+), 14 deletions(-)

diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 1d67456c2..ec8e52cf4 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -1025,6 +1025,8 @@ impl<'a> Parser<'a> {
         Ok(Statement::NOTIFY { channel, payload })
     }
 
+    // Tries to parse an expression by matching the specified word to known keywords that have a special meaning in the dialect.
+    // Returns `None` if no match is found.
     fn parse_expr_prefix_by_reserved_word(
         &mut self,
         w: &Word,
@@ -1133,7 +1135,8 @@ impl<'a> Parser<'a> {
         }
     }
 
-    fn parse_expr_prefix_by_nonreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
+    // Tries to parse an expression by a word that is not known to have a special meaning in the dialect.
+    fn parse_expr_prefix_by_unnreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
         match self.peek_token().token {
             Token::LParen | Token::Period => {
                 let mut id_parts: Vec<Ident> = vec![w.to_ident()];
@@ -1245,27 +1248,252 @@ impl<'a> Parser<'a> {
             return Ok(expr);
         }
 
+        let next_token = self.next_token();
+        let expr = match next_token.token {
+            Token::Word(w) => {
+                // The word we consumed may fall into one of two cases: it has a special meaning, or not.
+                // For example, in Snowflake, the word `interval` may have two meanings depending on the context:
+                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM tbl;`
+                //                         ^^^^^^^^^^^^^^^^        ^^^^^^^^
+                //                         interval expression     identifier
+                //
+                // We first try to parse the word and following tokens as a special expression, and if that fails,
+                // we rollback and try to parse it as an identifier.
+                match self
+                    .maybe_parse_internal(|parser| parser.parse_expr_prefix_by_reserved_word(&w))
+                {
+                    // This word indicated an expression prefix and parsing was successful
+                    Ok(Some(expr)) => Ok(expr),
+
+                    // No expression prefix associated with this word
+                    Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
+
+                    // If parsing of the word as a special expression failed, we are facing two options:
+                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
+                    // 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
+                    // We first try to parse the word as an identifier and if that fails
+                    // we rollback and return the parsing error we got from trying to parse a
+                    // special expression (to maintain backwards compatibility of parsing errors).
+                    Err(e) => {
+                        if !self.dialect.is_reserved_for_identifier(w.keyword) {
+                            if let Ok(expr) = self.maybe_parse_internal(|parser| {
+                                parser.parse_expr_prefix_by_unnreserved_word(&w)
+                            }) {
+                                return Ok(expr);
+                            }
+                        }
+                        return Err(e);
+                    }
+                }
+            } // End of Token::Word
+            // array `[1, 2, 3]`
+            Token::LBracket => self.parse_array_expr(false),
+            tok @ Token::Minus | tok @ Token::Plus => {
+                let op = if tok == Token::Plus {
+                    UnaryOperator::Plus
+                } else {
+                    UnaryOperator::Minus
+                };
+                Ok(Expr::UnaryOp {
+                    op,
+                    expr: Box::new(
+                        self.parse_subexpr(self.dialect.prec_value(Precedence::MulDivModOp))?,
+                    ),
+                })
+            }
+            Token::ExclamationMark if self.dialect.supports_bang_not_operator() => {
+                Ok(Expr::UnaryOp {
+                    op: UnaryOperator::BangNot,
+                    expr: Box::new(
+                        self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?,
+                    ),
+                })
+            }
+            tok @ Token::DoubleExclamationMark
+            | tok @ Token::PGSquareRoot
+            | tok @ Token::PGCubeRoot
+            | tok @ Token::AtSign
+            | tok @ Token::Tilde
+                if dialect_of!(self is PostgreSqlDialect) =>
+            {
+                let op = match tok {
+                    Token::DoubleExclamationMark => UnaryOperator::PGPrefixFactorial,
+                    Token::PGSquareRoot => UnaryOperator::PGSquareRoot,
+                    Token::PGCubeRoot => UnaryOperator::PGCubeRoot,
+                    Token::AtSign => UnaryOperator::PGAbs,
+                    Token::Tilde => UnaryOperator::PGBitwiseNot,
+                    _ => unreachable!(),
+                };
+                Ok(Expr::UnaryOp {
+                    op,
+                    expr: Box::new(
+                        self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?,
+                    ),
+                })
+            }
+            Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
+            {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::UnicodeStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::Number(_, _)
+            | Token::SingleQuotedString(_)
+            | Token::DoubleQuotedString(_)
+            | Token::TripleSingleQuotedString(_)
+            | Token::TripleDoubleQuotedString(_)
+            | Token::DollarQuotedString(_)
+            | Token::SingleQuotedByteStringLiteral(_)
+            | Token::DoubleQuotedByteStringLiteral(_)
+            | Token::TripleSingleQuotedByteStringLiteral(_)
+            | Token::TripleDoubleQuotedByteStringLiteral(_)
+            | Token::SingleQuotedRawStringLiteral(_)
+            | Token::DoubleQuotedRawStringLiteral(_)
+            | Token::TripleSingleQuotedRawStringLiteral(_)
+            | Token::TripleDoubleQuotedRawStringLiteral(_)
+            | Token::NationalStringLiteral(_)
+            | Token::HexStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::LParen => {
+                let expr = if let Some(expr) = self.try_parse_expr_sub_query()? {
+                    expr
+                } else if let Some(lambda) = self.try_parse_lambda()? {
+                    return Ok(lambda);
+                } else {
+                    let exprs = self.parse_comma_separated(Parser::parse_expr)?;
+                    match exprs.len() {
+                        0 => unreachable!(), // parse_comma_separated ensures 1 or more
+                        1 => Expr::Nested(Box::new(exprs.into_iter().next().unwrap())),
+                        _ => Expr::Tuple(exprs),
+                    }
+                };
+                self.expect_token(&Token::RParen)?;
+                let expr = self.try_parse_method(expr)?;
+                if !self.consume_token(&Token::Period) {
+                    Ok(expr)
+                } else {
+                    let tok = self.next_token();
+                    let key = match tok.token {
+                        Token::Word(word) => word.to_ident(),
+                        _ => {
+                            return parser_err!(
+                                format!("Expected identifier, found: {tok}"),
+                                tok.location
+                            )
+                        }
+                    };
+                    Ok(Expr::CompositeAccess {
+                        expr: Box::new(expr),
+                        key,
+                    })
+                }
+            }
+            Token::Placeholder(_) | Token::Colon | Token::AtSign => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
+            Token::LBrace if self.dialect.supports_dictionary_syntax() => {
+                self.prev_token();
+                self.parse_duckdb_struct_literal()
+            }
+            _ => self.expected("an expression", next_token),
+        }?;
+
+        let expr = self.try_parse_method(expr)?;
+
+        if self.parse_keyword(Keyword::COLLATE) {
+            Ok(Expr::Collate {
+                expr: Box::new(expr),
+                collation: self.parse_object_name(false)?,
+            })
+        } else {
+            Ok(expr)
+        }
+    }
+
+    /// Parse an expression prefix.
+    pub fn parse_prefix2(&mut self) -> Result<Expr, ParserError> {
+        // allow the dialect to override prefix parsing
+        if let Some(prefix) = self.dialect.parse_prefix(self) {
+            return prefix;
+        }
+
+        // PostgreSQL allows any string literal to be preceded by a type name, indicating that the
+        // string literal represents a literal of that type. Some examples:
+        //
+        // DATE '2020-05-20'
+        // TIMESTAMP WITH TIME ZONE '2020-05-20 7:43:54'
+        // BOOL 'true'
+        //
+        // The first two are standard SQL, while the latter is a PostgreSQL extension. Complicating
+        // matters is the fact that INTERVAL string literals may optionally be followed by special
+        // keywords, e.g.:
+        //
+        // INTERVAL '7' DAY
+        //
+        // Note also that naively `SELECT date` looks like a syntax error because the `date` type
+        // name is not followed by a string literal, but in fact in PostgreSQL it is a valid
+        // expression that should parse as the column name "date".
+        let loc = self.peek_token().location;
+        let opt_expr = self.maybe_parse(|parser| {
+            match parser.parse_data_type()? {
+                DataType::Interval => parser.parse_interval(),
+                // PostgreSQL allows almost any identifier to be used as custom data type name,
+                // and we support that in `parse_data_type()`. But unlike Postgres we don't
+                // have a list of globally reserved keywords (since they vary across dialects),
+                // so given `NOT 'a' LIKE 'b'`, we'd accept `NOT` as a possible custom data type
+                // name, resulting in `NOT 'a'` being recognized as a `TypedString` instead of
+                // an unary negation `NOT ('a' LIKE 'b')`. To solve this, we don't accept the
+                // `type 'string'` syntax for the custom data types at all.
+                DataType::Custom(..) => parser_err!("dummy", loc),
+                data_type => Ok(Expr::TypedString {
+                    data_type,
+                    value: parser.parse_literal_string()?,
+                }),
+            }
+        })?;
+
+        if let Some(expr) = opt_expr {
+            return Ok(expr);
+        }
+
         let next_token = self.next_token();
         let expr = match next_token.token {
             Token::Word(w) => {
                 // Save the parser index so we can rollback
                 let index_before = self.index;
-                // We first try to parse the word as the prefix of an expression.
-                // For example, the word INTERVAL in: SELECT INTERVAL '7' DAY
+                // The word we consumed may fall into one of two cases: it's a reserved word in the dialect
+                // and has a special meaning, or not. For example, in Snowflake, the word `interval` may have
+                // two meanings depending on the context:
+                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM test;`
+                // In its first occurrence it's part of an interval expression and in the second it's an identifier.
+
+                // We first try to parse the word and following tokens as a special expression, and if that fails,
+                // we rollback and try to parse it as an identifier.
                 match self.parse_expr_prefix_by_reserved_word(&w) {
                     // No expression prefix associated with this word
-                    Ok(None) => Ok(self.parse_expr_prefix_by_nonreserved_word(&w)?),
+                    Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
                     // This word indicated an expression prefix and parsing was successful
                     Ok(Some(expr)) => Ok(expr),
-                    // This word indicated an expression prefix but parsing failed. Two options:
-                    // 1. Malformed statement
-                    // 2. The dialect may allow this word as identifier as well as indicating an expression
+                    // If parsing of the word as a special expression failed, we are facing two options:
+                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
+                    // 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
+
+                    // We first try to parse the word as an identifier and if that fails
+                    // we rollback to the original position in the token stream and return parsing error
+                    // we got from trying to parse a special expression (to maintain backwards
+                    // compatibility of parsing errors).
                     Err(e) => {
                         let index_after_error = self.index;
                         if !self.dialect.is_reserved_for_identifier(w.keyword) {
                             // Rollback before trying to parse using a different approach
                             self.index = index_before;
-                            if let Ok(expr) = self.parse_expr_prefix_by_nonreserved_word(&w) {
+                            if let Ok(expr) = self.parse_expr_prefix_by_unnreserved_word(&w) {
                                 return Ok(expr);
                             }
                         }
@@ -3693,18 +3936,30 @@ impl<'a> Parser<'a> {
         }
     }
 
     /// Run a parser method `f`, reverting back to the current position if unsuccessful.
-    pub fn maybe_parse<T, F>(&mut self, mut f: F) -> Result<Option<T>, ParserError>
+    /// Returns `None` if `f` returns an error
+    pub fn maybe_parse<T, F>(&mut self, f: F) -> Result<Option<T>, ParserError>
     where
         F: FnMut(&mut Parser) -> Result<T, ParserError>,
     {
-        let index = self.index;
-        match f(self) {
+        match self.maybe_parse_internal(f) {
             Ok(t) => Ok(Some(t)),
-            // Unwind stack if limit exceeded
             Err(ParserError::RecursionLimitExceeded) => Err(ParserError::RecursionLimitExceeded),
-            Err(_) => {
+            _ => Ok(None),
+        }
+    }
+
+    /// Run a parser method `f`, reverting back to the current position if unsuccessful.
+    pub fn maybe_parse_internal<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
+    where
+        F: FnMut(&mut Parser) -> Result<T, ParserError>,
+    {
+        let index = self.index;
+        match f(self) {
+            Ok(t) => Ok(t),
+            Err(e) => {
+                // Unwind stack if limit exceeded
                 self.index = index;
-                Ok(None)
+                Err(e)
             }
         }
     }

From 4486c209c9f4b4fd615057d037c0d3017b061c32 Mon Sep 17 00:00:00 2001
From: Yoav Cohen
Date: Wed, 20 Nov 2024 18:16:21 +0100
Subject: [PATCH 5/6] Code review fixes

---
 src/parser/mod.rs         | 234 ++------------------------------------
 tests/sqlparser_common.rs |  20 +++-
 2 files changed, 27 insertions(+), 227 deletions(-)

diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index ec8e52cf4..6767f358a 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -1134,7 +1134,7 @@ impl<'a> Parser<'a> {
     }
 
     // Tries to parse an expression by a word that is not known to have a special meaning in the dialect.
-    fn parse_expr_prefix_by_unnreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
+    fn parse_expr_prefix_by_unreserved_word(&mut self, w: &Word) -> Result<Expr, ParserError> {
         match self.peek_token().token {
             Token::LParen | Token::Period => {
                 let mut id_parts: Vec<Ident> = vec![w.to_ident()];
@@ -1259,25 +1259,23 @@ impl<'a> Parser<'a> {
                 //
                 // We first try to parse the word and following tokens as a special expression, and if that fails,
                 // we rollback and try to parse it as an identifier.
-                match self
-                    .maybe_parse_internal(|parser| parser.parse_expr_prefix_by_reserved_word(&w))
-                {
+                match self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w)) {
                     // This word indicated an expression prefix and parsing was successful
                     Ok(Some(expr)) => Ok(expr),
 
                     // No expression prefix associated with this word
-                    Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
+                    Ok(None) => Ok(self.parse_expr_prefix_by_unreserved_word(&w)?),
 
                     // If parsing of the word as a special expression failed, we are facing two options:
-                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
+                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI` (`DAI` instead of `DAY`)
                     // 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
                     // We first try to parse the word as an identifier and if that fails
                     // we rollback and return the parsing error we got from trying to parse a
                     // special expression (to maintain backwards compatibility of parsing errors).
                     Err(e) => {
                         if !self.dialect.is_reserved_for_identifier(w.keyword) {
-                            if let Ok(expr) = self.maybe_parse_internal(|parser| {
-                                parser.parse_expr_prefix_by_unnreserved_word(&w)
+                            if let Ok(Some(expr)) = self.maybe_parse(|parser| {
+                                parser.parse_expr_prefix_by_unreserved_word(&w)
                             }) {
                                 return Ok(expr);
                             }
@@ -1416,222 +1414,6 @@ impl<'a> Parser<'a> {
         }
     }
 
-    /// Parse an expression prefix.
-    pub fn parse_prefix2(&mut self) -> Result<Expr, ParserError> {
-        // allow the dialect to override prefix parsing
-        if let Some(prefix) = self.dialect.parse_prefix(self) {
-            return prefix;
-        }
-
-        // PostgreSQL allows any string literal to be preceded by a type name, indicating that the
-        // string literal represents a literal of that type. Some examples:
-        //
-        // DATE '2020-05-20'
-        // TIMESTAMP WITH TIME ZONE '2020-05-20 7:43:54'
-        // BOOL 'true'
-        //
-        // The first two are standard SQL, while the latter is a PostgreSQL extension. Complicating
-        // matters is the fact that INTERVAL string literals may optionally be followed by special
-        // keywords, e.g.:
-        //
-        // INTERVAL '7' DAY
-        //
-        // Note also that naively `SELECT date` looks like a syntax error because the `date` type
-        // name is not followed by a string literal, but in fact in PostgreSQL it is a valid
-        // expression that should parse as the column name "date".
-        let loc = self.peek_token().location;
-        let opt_expr = self.maybe_parse(|parser| {
-            match parser.parse_data_type()? {
-                DataType::Interval => parser.parse_interval(),
-                // PostgreSQL allows almost any identifier to be used as custom data type name,
-                // and we support that in `parse_data_type()`. But unlike Postgres we don't
-                // have a list of globally reserved keywords (since they vary across dialects),
-                // so given `NOT 'a' LIKE 'b'`, we'd accept `NOT` as a possible custom data type
-                // name, resulting in `NOT 'a'` being recognized as a `TypedString` instead of
-                // an unary negation `NOT ('a' LIKE 'b')`. To solve this, we don't accept the
-                // `type 'string'` syntax for the custom data types at all.
-                DataType::Custom(..) => parser_err!("dummy", loc),
-                data_type => Ok(Expr::TypedString {
-                    data_type,
-                    value: parser.parse_literal_string()?,
-                }),
-            }
-        })?;
-
-        if let Some(expr) = opt_expr {
-            return Ok(expr);
-        }
-
-        let next_token = self.next_token();
-        let expr = match next_token.token {
-            Token::Word(w) => {
-                // Save the parser index so we can rollback
-                let index_before = self.index;
-                // The word we consumed may fall into one of two cases: it's a reserved word in the dialect
-                // and has a special meaning, or not. For example, in Snowflake, the word `interval` may have
-                // two meanings depending on the context:
-                // `SELECT CURRENT_DATE() + INTERVAL '1 DAY', MAX(interval) FROM test;`
-                // In its first occurrence it's part of an interval expression and in the second it's an identifier.
-
-                // We first try to parse the word and following tokens as a special expression, and if that fails,
-                // we rollback and try to parse it as an identifier.
-                match self.parse_expr_prefix_by_reserved_word(&w) {
-                    // No expression prefix associated with this word
-                    Ok(None) => Ok(self.parse_expr_prefix_by_unnreserved_word(&w)?),
-                    // This word indicated an expression prefix and parsing was successful
-                    Ok(Some(expr)) => Ok(expr),
-                    // If parsing of the word as a special expression failed, we are facing two options:
-                    // 1. The statement is malformed, e.g. `SELECT INTERVAL '1 DAI`
-                    // 2. The word is used as an identifier, e.g. `SELECT MAX(interval) FROM tbl`
-
-                    // We first try to parse the word as an identifier and if that fails
-                    // we rollback to the original position in the token stream and return parsing error
-                    // we got from trying to parse a special expression (to maintain backwards
-                    // compatibility of parsing errors).
-                    Err(e) => {
-                        let index_after_error = self.index;
-                        if !self.dialect.is_reserved_for_identifier(w.keyword) {
-                            // Rollback before trying to parse using a different approach
-                            self.index = index_before;
-                            if let Ok(expr) = self.parse_expr_prefix_by_unnreserved_word(&w) {
-                                return Ok(expr);
-                            }
-                        }
-                        self.index = index_after_error;
-                        return Err(e);
-                    }
-                }
-            } // End of Token::Word
-            // array `[1, 2, 3]`
-            Token::LBracket => self.parse_array_expr(false),
-            tok @ Token::Minus | tok @ Token::Plus => {
-                let op = if tok == Token::Plus {
-                    UnaryOperator::Plus
-                } else {
-                    UnaryOperator::Minus
-                };
-                Ok(Expr::UnaryOp {
-                    op,
-                    expr: Box::new(
-                        self.parse_subexpr(self.dialect.prec_value(Precedence::MulDivModOp))?,
-                    ),
-                })
-            }
-            Token::ExclamationMark if self.dialect.supports_bang_not_operator() => {
-                Ok(Expr::UnaryOp {
-                    op: UnaryOperator::BangNot,
-                    expr: Box::new(
-                        self.parse_subexpr(self.dialect.prec_value(Precedence::UnaryNot))?,
-                    ),
-                })
-            }
-            tok @ Token::DoubleExclamationMark
-            | tok @ Token::PGSquareRoot
-            | tok @ Token::PGCubeRoot
-            | tok @ Token::AtSign
-            | tok @ Token::Tilde
-                if dialect_of!(self is PostgreSqlDialect) =>
-            {
-                let op = match tok {
-                    Token::DoubleExclamationMark => UnaryOperator::PGPrefixFactorial,
-                    Token::PGSquareRoot => UnaryOperator::PGSquareRoot,
-                    Token::PGCubeRoot => UnaryOperator::PGCubeRoot,
-                    Token::AtSign => UnaryOperator::PGAbs,
-                    Token::Tilde => UnaryOperator::PGBitwiseNot,
-                    _ => unreachable!(),
-                };
-                Ok(Expr::UnaryOp {
-                    op,
-                    expr: Box::new(
-                        self.parse_subexpr(self.dialect.prec_value(Precedence::PlusMinus))?,
-                    ),
-                })
-            }
-            Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
-            {
-                self.prev_token();
-                Ok(Expr::Value(self.parse_value()?))
-            }
-            Token::UnicodeStringLiteral(_) => {
-                self.prev_token();
-                Ok(Expr::Value(self.parse_value()?))
-            }
-            Token::Number(_, _)
-            | Token::SingleQuotedString(_)
-            | Token::DoubleQuotedString(_)
-            | Token::TripleSingleQuotedString(_)
-            | Token::TripleDoubleQuotedString(_)
-            | Token::DollarQuotedString(_)
-            | Token::SingleQuotedByteStringLiteral(_)
-            | Token::DoubleQuotedByteStringLiteral(_)
-            | Token::TripleSingleQuotedByteStringLiteral(_)
-            | Token::TripleDoubleQuotedByteStringLiteral(_)
-            | Token::SingleQuotedRawStringLiteral(_)
-            | Token::DoubleQuotedRawStringLiteral(_)
-            | Token::TripleSingleQuotedRawStringLiteral(_)
-            | Token::TripleDoubleQuotedRawStringLiteral(_)
-            | Token::NationalStringLiteral(_)
-            | Token::HexStringLiteral(_) => {
-                self.prev_token();
-                Ok(Expr::Value(self.parse_value()?))
-            }
-            Token::LParen => {
-                let expr = if let Some(expr) = self.try_parse_expr_sub_query()? {
-                    expr
-                } else if let Some(lambda) = self.try_parse_lambda()? {
-                    return Ok(lambda);
-                } else {
-                    let exprs = self.parse_comma_separated(Parser::parse_expr)?;
-                    match exprs.len() {
-                        0 => unreachable!(), // parse_comma_separated ensures 1 or more
-                        1 => Expr::Nested(Box::new(exprs.into_iter().next().unwrap())),
-                        _ => Expr::Tuple(exprs),
-                    }
-                };
-                self.expect_token(&Token::RParen)?;
-                let expr = self.try_parse_method(expr)?;
-                if !self.consume_token(&Token::Period) {
-                    Ok(expr)
-                } else {
-                    let tok = self.next_token();
-                    let key = match tok.token {
-                        Token::Word(word) => word.to_ident(),
-                        _ => {
-                            return parser_err!(
-                                format!("Expected identifier, found: {tok}"),
-                                tok.location
-                            )
-                        }
-                    };
-                    Ok(Expr::CompositeAccess {
-                        expr: Box::new(expr),
-                        key,
-                    })
-                }
-            }
-            Token::Placeholder(_) | Token::Colon | Token::AtSign => {
-                self.prev_token();
-                Ok(Expr::Value(self.parse_value()?))
-            }
-            Token::LBrace if self.dialect.supports_dictionary_syntax() => {
-                self.prev_token();
-                self.parse_duckdb_struct_literal()
-            }
-            _ => self.expected("an expression", next_token),
-        }?;
-
-        let expr = self.try_parse_method(expr)?;
-
-        if self.parse_keyword(Keyword::COLLATE) {
-            Ok(Expr::Collate {
-                expr: Box::new(expr),
-                collation: self.parse_object_name(false)?,
-            })
-        } else {
-            Ok(expr)
-        }
-    }
-
     pub fn parse_utility_options(&mut self) -> Result<Vec<UtilityOption>, ParserError> {
         self.expect_token(&Token::LParen)?;
         let options = self.parse_comma_separated(Self::parse_utility_option)?;
@@ -3941,18 +3723,18 @@ impl<'a> Parser<'a> {
     where
         F: FnMut(&mut Parser) -> Result<T, ParserError>,
     {
-        match self.maybe_parse_internal(f) {
+        match self.try_parse(f) {
             Ok(t) => Ok(Some(t)),
             Err(ParserError::RecursionLimitExceeded) => Err(ParserError::RecursionLimitExceeded),
             _ => Ok(None),
         }
     }
 
     /// Run a parser method `f`, reverting back to the current position if unsuccessful.
-    pub fn maybe_parse_internal<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
+    pub fn try_parse<T, F>(&mut self, mut f: F) -> Result<T, ParserError>
     where
         F: FnMut(&mut Parser) -> Result<T, ParserError>,
     {
         let index = self.index;
         match f(self) {
             Ok(t) => Ok(t),
             Err(e) => {
                 // Unwind stack if limit exceeded
                 self.index = index;
                 Err(e)
             }
         }
     }
diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs
index bb9c335a3..c03370892 100644
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@@ -34,7 +34,7 @@ use sqlparser::dialect::{
     GenericDialect, HiveDialect, MsSqlDialect, MySqlDialect, PostgreSqlDialect, RedshiftSqlDialect,
     SQLiteDialect, SnowflakeDialect,
 };
-use sqlparser::keywords::ALL_KEYWORDS;
+use sqlparser::keywords::{Keyword, ALL_KEYWORDS};
 use sqlparser::parser::{Parser, ParserError, ParserOptions};
 use sqlparser::tokenizer::Tokenizer;
 use test_utils::{
@@ -5112,10 +5112,7 @@ fn parse_interval_dont_require_unit() {
 
 #[test]
 fn parse_interval_require_unit() {
-    let dialects = all_dialects_where(|d| {
-        d.require_interval_qualifier() && d.is_reserved_for_identifier(Keyword::INTERVAL)
-    });
-
+    let dialects = all_dialects_where(|d| d.require_interval_qualifier());
     let sql = "SELECT INTERVAL '1 DAY'";
     let err = dialects.parse_sql_statements(sql).unwrap_err();
     assert_eq!(
@@ -12197,3 +12197,21 @@ fn parse_create_table_select() {
         );
     }
 }
+
+#[test]
+fn test_reserved_keywords_for_identifiers() {
+    let dialects = all_dialects_where(|d| d.is_reserved_for_identifier(Keyword::INTERVAL));
+    // Dialects that reserve the word INTERVAL will not allow it as an unquoted identifier
+    let sql = "SELECT MAX(interval) FROM tbl";
+    assert_eq!(
+        dialects.parse_sql_statements(sql),
+        Err(ParserError::ParserError(
+            "Expected: an expression, found: )".to_string()
+        ))
+    );
+
+    // Dialects that do not reserve the word INTERVAL will allow it
+    let dialects = all_dialects_where(|d| !d.is_reserved_for_identifier(Keyword::INTERVAL));
+    let sql = "SELECT MAX(interval) FROM tbl";
+    dialects.parse_sql_statements(sql).unwrap();
+}

From c50a7918f176a0fed94a627b1e619ec1c22336a4 Mon Sep 17 00:00:00 2001
From: Yoav Cohen
Date: Sat, 23 Nov 2024 19:15:48 +0100
Subject: [PATCH 6/6] code format

---
 src/dialect/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
index 7d857c697..b622c1da3 100644
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@@ -681,7 +681,7 @@ pub trait Dialect: Debug + Any {
     fn supports_partiql(&self) -> bool {
         false
     }
-    
+
     /// Returns true if the specified keyword is reserved and cannot be
     /// used as an identifier without special handling like quoting.
     fn is_reserved_for_identifier(&self, kw: Keyword) -> bool {
         keywords::RESERVED_FOR_IDENTIFIER.contains(&kw)
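
Usage note (not part of the patch series): a minimal sketch of the behavior this series enables, assuming the sqlparser-rs API at this revision. The table name `tbl` and the custom `LooseDialect` below are illustrative only; `SnowflakeDialect` and `RESERVED_FOR_IDENTIFIER` come from the patches above.

    use sqlparser::dialect::{Dialect, SnowflakeDialect};
    use sqlparser::keywords::{Keyword, RESERVED_FOR_IDENTIFIER};
    use sqlparser::parser::Parser;

    // A hypothetical dialect that unreserves STRUCT on top of the global
    // RESERVED_FOR_IDENTIFIER defaults by overriding the new trait method.
    #[derive(Debug)]
    struct LooseDialect;

    impl Dialect for LooseDialect {
        fn is_identifier_start(&self, ch: char) -> bool {
            ch.is_ascii_alphabetic() || ch == '_'
        }
        fn is_identifier_part(&self, ch: char) -> bool {
            ch.is_ascii_alphanumeric() || ch == '_'
        }
        fn is_reserved_for_identifier(&self, kw: Keyword) -> bool {
            !matches!(kw, Keyword::STRUCT) && RESERVED_FOR_IDENTIFIER.contains(&kw)
        }
    }

    fn main() {
        // Snowflake unreserves INTERVAL, so the same word parses both as an
        // interval expression prefix and as a plain column identifier:
        Parser::parse_sql(&SnowflakeDialect {}, "SELECT CURRENT_DATE() + INTERVAL '1 DAY'").unwrap();
        Parser::parse_sql(&SnowflakeDialect {}, "SELECT MAX(interval) FROM tbl").unwrap();

        // The custom dialect additionally accepts STRUCT as an identifier:
        Parser::parse_sql(&LooseDialect {}, "SELECT MAX(struct) FROM tbl").unwrap();
    }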