diff --git a/README.md b/README.md index 70c62e8..af5167d 100644 --- a/README.md +++ b/README.md @@ -14,16 +14,18 @@ PyMongoSQL is a Python [DB API 2.0 (PEP 249)](https://www.python.org/dev/peps/pe ## Objectives -PyMongoSQL implements the DB API 2.0 interfaces to provide SQL-like access to MongoDB. The project aims to: +PyMongoSQL implements the DB API 2.0 interfaces to provide SQL-like access to MongoDB, built on PartiQL syntax for querying semi-structured data. The project aims to: -- Bridge the gap between SQL and NoSQL by providing SQL capabilities for MongoDB -- Support standard SQL DQL (Data Query Language) operations including SELECT statements with WHERE, ORDER BY, and LIMIT clauses +- Bridge the gap between SQL and NoSQL by providing SQL capabilities for MongoDB's nested document structures +- Support standard SQL DQL (Data Query Language) operations including SELECT statements with WHERE, ORDER BY, and LIMIT clauses on nested and hierarchical data - Provide seamless integration with existing Python applications that expect DB API 2.0 compliance -- Enable easy migration from traditional SQL databases to MongoDB +- Enable easy migration from traditional SQL databases to MongoDB without rewriting queries for document traversal ## Features - **DB API 2.0 Compliant**: Full compatibility with Python Database API 2.0 specification +- **PartiQL-based SQL Syntax**: Built on [PartiQL](https://partiql.org/tutorial.html) (SQL for semi-structured data), enabling seamless SQL querying of nested and hierarchical MongoDB documents +- **Nested Structure Support**: Query and filter deeply nested fields and arrays within MongoDB documents using standard SQL syntax - **SQLAlchemy Integration**: Complete ORM and Core support with dedicated MongoDB dialect - **SQL Query Support**: SELECT statements with WHERE conditions, field selection, and aliases - **Connection String Support**: MongoDB URI format for easy configuration @@ -140,6 +142,7 @@ while users: ### SELECT Statements - Field selection: `SELECT name, age FROM users` - Wildcards: `SELECT * FROM products` +- **Field aliases**: `SELECT name as user_name, age as user_age FROM users` - **Nested fields**: `SELECT profile.name, profile.age FROM users` - **Array access**: `SELECT items[0], items[1].name FROM orders` @@ -176,6 +179,27 @@ while users: These features are on our development roadmap and contributions are welcome! +## Apache Superset Integration + +PyMongoSQL can be used as a database driver in Apache Superset for querying and visualizing MongoDB data: + +1. **Install PyMongoSQL**: Install PyMongoSQL on the Superset app server: + ```bash + pip install pymongosql + ``` +2. **Create Connection**: Connect to your MongoDB instance using the connection URI with superset mode: + ``` + mongodb://username:password@host:port/database?mode=superset + ``` + or for MongoDB Atlas: + ``` + mongodb+srv://username:password@host/database?mode=superset + ``` +3. **Use SQL Lab**: Write and execute SQL queries against MongoDB collections directly in Superset's SQL Lab +4. **Create Visualizations**: Build charts and dashboards from your MongoDB queries using Superset's visualization tools + +This allows seamless integration between MongoDB data and Superset's BI capabilities without requiring data migration to traditional SQL databases. + ## Contributing Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change. 
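For readers reviewing the Superset-oriented README changes above, here is a minimal DB API 2.0 sketch of the same flow driven directly from Python, assuming a reachable MongoDB with a `users` collection (the URI, credentials, and collection name are placeholders; the `Connection` import path and cursor calls follow the test suite in this diff):

```python
from pymongosql.connection import Connection

# Placeholder URI; ?mode=superset selects the Superset execution path added in this change.
conn = Connection(host="mongodb://username:password@localhost:27017/test_db?mode=superset")
cursor = conn.cursor()

# Field aliases surface as the column names in cursor.description (see tests/test_cursor.py).
cursor.execute("SELECT name AS user_name, email AS user_email FROM users LIMIT 5")
print([col[0] for col in cursor.description])  # e.g. ['user_name', 'user_email']
for row in cursor.fetchall():
    print(row)

cursor.close()
conn.close()
```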
diff --git a/pymongosql/__init__.py b/pymongosql/__init__.py index 0732d1a..dfb892d 100644 --- a/pymongosql/__init__.py +++ b/pymongosql/__init__.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from .connection import Connection -__version__: str = "0.2.3" +__version__: str = "0.2.4" # Globals https://www.python.org/dev/peps/pep-0249/#globals apilevel: str = "2.0" diff --git a/pymongosql/connection.py b/pymongosql/connection.py index f547e96..a83a832 100644 --- a/pymongosql/connection.py +++ b/pymongosql/connection.py @@ -38,14 +38,18 @@ def __init__( Supports connection string patterns: - mongodb://host:port/database - Core driver (no subquery support) - - mongodb+superset://host:port/database - Superset driver with subquery support + - mongodb+srv://host:port/database - Cloud/SRV connection string + - mongodb://host:port/database?mode=superset - Superset driver with subquery support + - mongodb+srv://host:port/database?mode=superset - Cloud SRV with superset mode + + Mode is specified via the ?mode= query parameter. If not specified, defaults to "standard". See PyMongo MongoClient documentation for full parameter details. https://www.mongodb.com/docs/languages/python/pymongo-driver/current/connect/mongoclient/ """ # Check if connection string specifies mode connection_string = host if isinstance(host, str) else None - mode, host = ConnectionHelper.parse_connection_string(connection_string) + mode, db_from_uri, host = ConnectionHelper.parse_connection_string(connection_string) self._mode = kwargs.pop("mode", None) if not self._mode and mode: @@ -56,7 +60,10 @@ def __init__( self._port = port or 27017 # Handle database parameter separately (not a MongoClient parameter) + # Explicit 'database' parameter takes precedence over database in URI self._database_name = kwargs.pop("database", None) + if not self._database_name and db_from_uri: + self._database_name = db_from_uri # Store all PyMongo parameters to pass through directly self._pymongo_params = kwargs.copy() diff --git a/pymongosql/executor.py b/pymongosql/executor.py index ad36af3..cbdb256 100644 --- a/pymongosql/executor.py +++ b/pymongosql/executor.py @@ -1,14 +1,4 @@ # -*- coding: utf-8 -*- -""" -Query execution strategies for handling both simple and subquery-based SQL operations. - -This module provides different execution strategies: -- StandardExecution: Direct MongoDB query for simple SELECT statements - -The intermediate database is configurable - any backend implementing QueryDatabase -interface can be used (SQLite3, PostgreSQL, MySQL, etc.). -""" - import logging from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/pymongosql/helper.py b/pymongosql/helper.py index 68344a6..38a3610 100644 --- a/pymongosql/helper.py +++ b/pymongosql/helper.py @@ -7,7 +7,7 @@ import logging from typing import Optional, Tuple -from urllib.parse import urlparse +from urllib.parse import parse_qs, urlparse _logger = logging.getLogger(__name__) @@ -17,52 +17,80 @@ class ConnectionHelper: Supports connection string patterns: - mongodb://host:port/database - Core driver (no subquery support) - - mongodb+superset://host:port/database - Superset driver with subquery support + - mongodb+srv://host:port/database - Cloud/SRV connection string + - mongodb://host:port/database?mode=superset - Superset driver with subquery support + - mongodb+srv://host:port/database?mode=superset - Cloud SRV with superset mode + + Mode is specified via query parameter (?mode=superset) and defaults to "standard" if not specified. 
""" @staticmethod - def parse_connection_string(connection_string: str) -> Tuple[str, str, Optional[str], int, Optional[str]]: + def parse_connection_string(connection_string: Optional[str]) -> Tuple[Optional[str], Optional[str], Optional[str]]: """ - Parse PyMongoSQL connection string and determine driver mode. + Parse MongoDB connection string and extract driver mode from query parameters. + + Mode is extracted from the 'mode' query parameter and removed from the normalized + connection string. Database name is extracted from the path. If mode is not specified, + it defaults to "standard". + + Supports all standard MongoDB connection string patterns: + mongodb://[username:password@]host1[:port1][,host2[:port2]...][/[defaultauthdb]?options] + + Args: + connection_string: MongoDB connection string + + Returns: + Tuple of (mode, database_name, normalized_connection_string) + - mode: "standard" (default) or other mode values specified via ?mode= parameter + - database_name: extracted database name from path, or None if not specified + - normalized_connection_string: connection string without the mode parameter """ try: if not connection_string: - return "standard", None + return "standard", None, None parsed = urlparse(connection_string) - scheme = parsed.scheme if not parsed.scheme: - return "standard", connection_string - - base_scheme = "mongodb" - mode = "standard" - - # Determine mode from scheme - if "+" in scheme: - base_scheme = scheme.split("+")[0].lower() - mode = scheme.split("+")[-1].lower() - - host = parsed.hostname or "localhost" - port = parsed.port or 27017 - database = parsed.path.lstrip("/") if parsed.path else None - - # Build normalized connection string with mongodb scheme (removing any +mode) - # Reconstruct netloc with credentials if present - netloc = host - if parsed.username: - creds = parsed.username - if parsed.password: - creds += f":{parsed.password}" - netloc = f"{creds}@{host}" - netloc += f":{port}" - - query_part = f"?{parsed.query}" if parsed.query else "" - normalized_connection_string = f"{base_scheme}://{netloc}/{database or ''}{query_part}" - - _logger.debug(f"Parsed connection string - Mode: {mode}, Host: {host}, Port: {port}, Database: {database}") - - return mode, normalized_connection_string + return "standard", None, connection_string + + # Extract mode from query parameters (defaults to "standard" if not specified) + query_params = parse_qs(parsed.query, keep_blank_values=True) if parsed.query else {} + mode = query_params.get("mode", ["standard"])[0] + + # Extract database name from path + database_name = None + if parsed.path: + # Remove leading slash and trailing slashes + path_parts = parsed.path.strip("/").split("/") + if path_parts and path_parts[0]: # Get the first path segment as database name + database_name = path_parts[0] + + # Remove mode from query parameters + query_params.pop("mode", None) + + # Rebuild query string without mode parameter + query_string = ( + "&".join(f"{k}={v}" if v else k for k, v_list in query_params.items() for v in v_list) + if query_params + else "" + ) + + # Reconstruct the connection string without mode parameter + if query_string: + if parsed.path: + normalized_connection_string = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{query_string}" + else: + normalized_connection_string = f"{parsed.scheme}://{parsed.netloc}?{query_string}" + else: + if parsed.path: + normalized_connection_string = f"{parsed.scheme}://{parsed.netloc}{parsed.path}" + else: + normalized_connection_string = 
f"{parsed.scheme}://{parsed.netloc}" + + _logger.debug(f"Parsed connection string - Mode: {mode}, Database: {database_name}") + + return mode, database_name, normalized_connection_string except Exception as e: _logger.error(f"Failed to parse connection string: {e}") diff --git a/pymongosql/result_set.py b/pymongosql/result_set.py index 20597b7..abb3583 100644 --- a/pymongosql/result_set.py +++ b/pymongosql/result_set.py @@ -42,7 +42,7 @@ def __init__( self._is_closed = False self._cache_exhausted = False self._total_fetched = 0 - self._description: Optional[List[Tuple[str, str, None, None, None, None, None]]] = None + self._description: Optional[List[Tuple[str, Any, None, None, None, None, None]]] = None self._column_names: Optional[List[str]] = None # Track column order for sequences self._errors: List[Dict[str, str]] = [] @@ -69,9 +69,7 @@ def _build_description(self) -> None: if not self._execution_plan.projection_stage: # No projection specified, build description from column names if available if self._column_names: - self._description = [ - (col_name, "VARCHAR", None, None, None, None, None) for col_name in self._column_names - ] + self._description = [(col_name, str, None, None, None, None, None) for col_name in self._column_names] else: # Will be built dynamically when columns are established self._description = None @@ -79,10 +77,14 @@ def _build_description(self) -> None: # Build description from projection (now in MongoDB format {field: 1}) description = [] + column_aliases = getattr(self._execution_plan, "column_aliases", {}) + for field_name, include_flag in self._execution_plan.projection_stage.items(): # SQL cursor description format: (name, type_code, display_size, internal_size, precision, scale, null_ok) if include_flag == 1: # Field is included in projection - description.append((field_name, "VARCHAR", None, None, None, None, None)) + # Use alias if available, otherwise use field name + display_name = column_aliases.get(field_name, field_name) + description.append((display_name, str, None, None, None, None, None)) self._description = description @@ -202,19 +204,15 @@ def rowcount(self) -> int: @property def description( self, - ) -> Optional[List[Tuple[str, str, None, None, None, None, None]]]: + ) -> Optional[List[Tuple[str, Any, None, None, None, None, None]]]: """Return column description""" if self._description is None: # Try to build description from established column names try: - if not self._cache_exhausted: - # Fetch one result to establish column names if needed - self._ensure_results_available(1) - if self._column_names: # Build description from established column names self._description = [ - (col_name, "VARCHAR", None, None, None, None, None) for col_name in self._column_names + (col_name, str, None, None, None, None, None) for col_name in self._column_names ] except Exception as e: _logger.warning(f"Could not build dynamic description: {e}") @@ -272,7 +270,6 @@ def fetchall(self) -> List[Sequence[Any]]: # Now get everything from cache all_results.extend(self._cached_results) - self._total_fetched += len(self._cached_results) self._cached_results.clear() self._cache_exhausted = True diff --git a/pymongosql/sql/ast.py b/pymongosql/sql/ast.py index 9c20a3f..fa0d97d 100644 --- a/pymongosql/sql/ast.py +++ b/pymongosql/sql/ast.py @@ -50,9 +50,11 @@ def parse_to_execution_plan(self) -> ExecutionPlan: """Convert the parse result to an ExecutionPlan using BuilderFactory""" builder = BuilderFactory.create_query_builder().collection(self._parse_result.collection) - 
builder.filter(self._parse_result.filter_conditions).project(self._parse_result.projection).sort( - self._parse_result.sort_fields - ).limit(self._parse_result.limit_value).skip(self._parse_result.offset_value) + builder.filter(self._parse_result.filter_conditions).project(self._parse_result.projection).column_aliases( + self._parse_result.column_aliases + ).sort(self._parse_result.sort_fields).limit(self._parse_result.limit_value).skip( + self._parse_result.offset_value + ) return builder.build() diff --git a/pymongosql/sql/builder.py b/pymongosql/sql/builder.py index 66c4f60..839bc41 100644 --- a/pymongosql/sql/builder.py +++ b/pymongosql/sql/builder.py @@ -1,7 +1,4 @@ # -*- coding: utf-8 -*- -""" -Query builder for constructing MongoDB queries in a fluent, readable way -""" import logging from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Union @@ -16,6 +13,7 @@ class ExecutionPlan: collection: Optional[str] = None filter_stage: Dict[str, Any] = field(default_factory=dict) projection_stage: Dict[str, Any] = field(default_factory=dict) + column_aliases: Dict[str, str] = field(default_factory=dict) # Maps field_name -> alias sort_stage: List[Dict[str, int]] = field(default_factory=list) limit_stage: Optional[int] = None skip_stage: Optional[int] = None @@ -56,6 +54,7 @@ def copy(self) -> "ExecutionPlan": collection=self.collection, filter_stage=self.filter_stage.copy(), projection_stage=self.projection_stage.copy(), + column_aliases=self.column_aliases.copy(), sort_stage=self.sort_stage.copy(), limit_stage=self.limit_stage, skip_stage=self.skip_stage, @@ -156,6 +155,16 @@ def skip(self, count: int) -> "MongoQueryBuilder": _logger.debug(f"Set skip to: {count}") return self + def column_aliases(self, aliases: Dict[str, str]) -> "MongoQueryBuilder": + """Set column aliases mapping (field_name -> alias)""" + if not isinstance(aliases, dict): + self._add_error("Column aliases must be a dictionary") + return self + + self._execution_plan.column_aliases = aliases + _logger.debug(f"Set column aliases to: {aliases}") + return self + def where(self, field: str, operator: str, value: Any) -> "MongoQueryBuilder": """Add a where condition in a readable format""" condition = self._build_condition(field, operator, value) diff --git a/pymongosql/sql/handler.py b/pymongosql/sql/handler.py index 67bfdc0..086dba8 100644 --- a/pymongosql/sql/handler.py +++ b/pymongosql/sql/handler.py @@ -1,7 +1,4 @@ # -*- coding: utf-8 -*- -""" -Expression handlers for converting SQL expressions to MongoDB query format -""" import logging import re from abc import ABC, abstractmethod @@ -42,6 +39,7 @@ class ParseResult: # Visitor parsing state fields collection: Optional[str] = None projection: Dict[str, Any] = field(default_factory=dict) + column_aliases: Dict[str, str] = field(default_factory=dict) # Maps field_name -> alias sort_fields: List[Dict[str, int]] = field(default_factory=list) limit_value: Optional[int] = None offset_value: Optional[int] = None @@ -894,14 +892,19 @@ def can_handle(self, ctx: Any) -> bool: def handle_visitor(self, ctx: PartiQLParser.SelectItemsContext, parse_result: "ParseResult") -> Any: projection = {} + column_aliases = {} if hasattr(ctx, "projectionItems") and ctx.projectionItems(): for item in ctx.projectionItems().projectionItem(): field_name, alias = self._extract_field_and_alias(item) # Use MongoDB standard projection format: {field: 1} to include field projection[field_name] = 1 + # Store alias if present + if alias: + column_aliases[field_name] = 
alias parse_result.projection = projection + parse_result.column_aliases = column_aliases return projection def _extract_field_and_alias(self, item) -> Tuple[str, Optional[str]]: diff --git a/pymongosql/sqlalchemy_mongodb/__init__.py b/pymongosql/sqlalchemy_mongodb/__init__.py index 94c3078..5e10658 100644 --- a/pymongosql/sqlalchemy_mongodb/__init__.py +++ b/pymongosql/sqlalchemy_mongodb/__init__.py @@ -1,14 +1,4 @@ # -*- coding: utf-8 -*- -""" -SQLAlchemy MongoDB dialect and integration for PyMongoSQL. - -This package provides SQLAlchemy integration including: -- MongoDB-specific dialect -- Version compatibility utilities -- Engine creation helpers -- MongoDB URI handling -""" - # SQLAlchemy integration try: # Import and register the dialect automatically @@ -52,12 +42,16 @@ def create_engine_url( >>> url = create_engine_url("localhost", 27017, "mydb", mode="superset") >>> engine = sqlalchemy.create_engine(url) """ - scheme = "mongodb+superset" if mode == "superset" else "mongodb" + scheme = "mongodb" params = [] for key, value in kwargs.items(): params.append(f"{key}={value}") + # Add mode parameter if not standard + if mode != "standard": + params.append(f"mode={mode}") + param_str = "&".join(params) if param_str: param_str = "?" + param_str @@ -65,87 +59,6 @@ def create_engine_url( return f"{scheme}://{host}:{port}/{database}{param_str}" -def create_mongodb_url(mongodb_uri: str) -> str: - """Convert a standard MongoDB URI to work with PyMongoSQL SQLAlchemy dialect. - - Args: - mongodb_uri: Standard MongoDB connection string - (e.g., 'mongodb://localhost:27017/mydb' or 'mongodb+srv://...') - - Returns: - SQLAlchemy-compatible URL for PyMongoSQL - - Example: - >>> url = create_mongodb_url("mongodb://user:pass@localhost:27017/mydb") - >>> engine = sqlalchemy.create_engine(url) - """ - # Return the MongoDB URI as-is since the dialect now handles MongoDB URLs directly - return mongodb_uri - - -def create_engine_from_mongodb_uri(mongodb_uri: str, **engine_kwargs): - """Create a SQLAlchemy engine from any MongoDB connection string. - - This function handles mongodb://, mongodb+srv://, and mongodb+superset:// URIs properly. - Use this instead of create_engine() directly for special URI schemes. 
- - Args: - mongodb_uri: MongoDB connection string (supports standard, SRV, and superset modes) - **engine_kwargs: Additional arguments passed to create_engine - - Returns: - SQLAlchemy Engine object - - Example: - >>> # For SRV records (Atlas/Cloud) - >>> engine = create_engine_from_mongodb_uri("mongodb+srv://user:pass@cluster.net/db") - >>> # For standard MongoDB - >>> engine = create_engine_from_mongodb_uri("mongodb://localhost:27017/mydb") - >>> # For superset mode (with subquery support) - >>> engine = create_engine_from_mongodb_uri("mongodb+superset://localhost:27017/mydb") - """ - try: - from sqlalchemy import create_engine - - if mongodb_uri.startswith("mongodb+srv://"): - # For MongoDB+SRV, convert to standard mongodb:// for SQLAlchemy compatibility - # SQLAlchemy doesn't handle the + character in scheme names well - converted_uri = mongodb_uri.replace("mongodb+srv://", "mongodb://") - - # Create engine with converted URI - engine = create_engine(converted_uri, **engine_kwargs) - - def custom_create_connect_args(url): - # Use original SRV URI for actual MongoDB connection - opts = {"host": mongodb_uri} - return [], opts - - engine.dialect.create_connect_args = custom_create_connect_args - return engine - elif mongodb_uri.startswith("mongodb+superset://"): - # For MongoDB+Superset, convert to standard mongodb:// for SQLAlchemy compatibility - # but preserve the superset mode by passing it through connection options - converted_uri = mongodb_uri.replace("mongodb+superset://", "mongodb://") - - # Create engine with converted URI - engine = create_engine(converted_uri, **engine_kwargs) - - def custom_create_connect_args(url): - # Use original superset URI for actual MongoDB connection - # This preserves the superset mode for subquery support - opts = {"host": mongodb_uri} - return [], opts - - engine.dialect.create_connect_args = custom_create_connect_args - return engine - else: - # Standard mongodb:// URLs work fine with SQLAlchemy - return create_engine(mongodb_uri, **engine_kwargs) - - except ImportError: - raise ImportError("SQLAlchemy is required for engine creation") - - def register_dialect(): """Register the PyMongoSQL dialect with SQLAlchemy. @@ -157,29 +70,12 @@ def register_dialect(): # Register for standard MongoDB URLs registry.register("mongodb", "pymongosql.sqlalchemy_mongodb.sqlalchemy_dialect", "PyMongoSQLDialect") - - # Try to register SRV and Superset forms so SQLAlchemy can resolve these URL patterns - # (either with '+' or dotted notation for compatibility with different SQLAlchemy versions). - # Some SQLAlchemy versions accept '+' in scheme names; others import the dotted plugin name. - # Attempt all registrations but don't fail if some are not supported. + # Register for MongoDB SRV URLs try: registry.register("mongodb+srv", "pymongosql.sqlalchemy_mongodb.sqlalchemy_dialect", "PyMongoSQLDialect") registry.register("mongodb.srv", "pymongosql.sqlalchemy_mongodb.sqlalchemy_dialect", "PyMongoSQLDialect") except Exception: - # If registration fails we fall back to handling SRV URIs in - # create_engine_from_mongodb_uri by converting 'mongodb+srv' to 'mongodb'. 
- pass - - try: - registry.register( - "mongodb+superset", "pymongosql.sqlalchemy_mongodb.sqlalchemy_dialect", "PyMongoSQLDialect" - ) - registry.register( - "mongodb.superset", "pymongosql.sqlalchemy_mongodb.sqlalchemy_dialect", "PyMongoSQLDialect" - ) - except Exception: - # If registration fails we fall back to handling Superset URIs in - # create_engine_from_mongodb_uri by converting 'mongodb+superset' to 'mongodb'. + # If registration fails, users can convert URIs to standard mongodb:// format pass return True @@ -197,8 +93,6 @@ def register_dialect(): # Export all SQLAlchemy-related functionality __all__ = [ "create_engine_url", - "create_mongodb_url", - "create_engine_from_mongodb_uri", "register_dialect", "__sqlalchemy_version__", "__supports_sqlalchemy__", diff --git a/pymongosql/sqlalchemy_mongodb/sqlalchemy_compat.py b/pymongosql/sqlalchemy_mongodb/sqlalchemy_compat.py index 14d76d5..770e774 100644 --- a/pymongosql/sqlalchemy_mongodb/sqlalchemy_compat.py +++ b/pymongosql/sqlalchemy_mongodb/sqlalchemy_compat.py @@ -1,9 +1,4 @@ # -*- coding: utf-8 -*- -""" -SQLAlchemy version compatibility utilities for PyMongoSQL. - -This module provides utilities to work with different SQLAlchemy versions. -""" import warnings from typing import Any, Dict, Optional diff --git a/pymongosql/sqlalchemy_mongodb/sqlalchemy_dialect.py b/pymongosql/sqlalchemy_mongodb/sqlalchemy_dialect.py index f137eb4..d5a3029 100644 --- a/pymongosql/sqlalchemy_mongodb/sqlalchemy_dialect.py +++ b/pymongosql/sqlalchemy_mongodb/sqlalchemy_dialect.py @@ -1,12 +1,4 @@ # -*- coding: utf-8 -*- -""" -SQLAlchemy dialect for PyMongoSQL. - -This module provides a SQLAlchemy dialect that allows PyMongoSQL to work -seamlessly with SQLAlchemy's ORM and core query functionality. - -Supports both SQLAlchemy 1.x and 2.x versions. -""" import logging from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/pymongosql/superset_mongodb/detector.py b/pymongosql/superset_mongodb/detector.py index 9a6eb6b..ed25294 100644 --- a/pymongosql/superset_mongodb/detector.py +++ b/pymongosql/superset_mongodb/detector.py @@ -1,12 +1,4 @@ # -*- coding: utf-8 -*- -""" -Subquery detection and execution context management for handling Superset-style queries. - -This module provides utilities to detect and manage the execution context for SQL queries -that contain subqueries, enabling the use of SQLite3 as an intermediate database for -complex query operations that MongoDB cannot handle natively. -""" - import re from dataclasses import dataclass from typing import Optional, Tuple diff --git a/pymongosql/superset_mongodb/executor.py b/pymongosql/superset_mongodb/executor.py index 920c3fb..0f2da7e 100644 --- a/pymongosql/superset_mongodb/executor.py +++ b/pymongosql/superset_mongodb/executor.py @@ -1,15 +1,4 @@ # -*- coding: utf-8 -*- -""" -Query execution strategies for handling both simple and subquery-based SQL operations. - -This module provides different execution strategies: -- StandardExecution: Direct MongoDB query for simple SELECT statements -- SubqueryExecution: Two-stage execution using intermediate RDBMS (SQLite3 by default) - -The intermediate database is configurable - any backend implementing QueryDatabase -interface can be used (SQLite3, PostgreSQL, MySQL, etc.). 
-""" - import logging from typing import Any, Dict, List, Optional diff --git a/pymongosql/superset_mongodb/query_db.py b/pymongosql/superset_mongodb/query_db.py index abfe8a6..61d9cb6 100644 --- a/pymongosql/superset_mongodb/query_db.py +++ b/pymongosql/superset_mongodb/query_db.py @@ -1,14 +1,4 @@ # -*- coding: utf-8 -*- -""" -Query database abstraction layer. - -This module provides an abstract interface for databases used -during subquery execution. Allows plugging in different RDBMS backends -(SQLite3, PostgreSQL, MySQL, etc.) while maintaining a unified interface. - -Default implementation uses SQLite3 for in-memory processing. -""" - import logging from abc import ABC, abstractmethod from typing import Any, Dict, List diff --git a/pymongosql/superset_mongodb/query_db_sqlite.py b/pymongosql/superset_mongodb/query_db_sqlite.py index a05e291..5e3d356 100644 --- a/pymongosql/superset_mongodb/query_db_sqlite.py +++ b/pymongosql/superset_mongodb/query_db_sqlite.py @@ -1,14 +1,4 @@ # -*- coding: utf-8 -*- -""" -SQLite3 bridge for handling query database operations. - -This module manages the creation, population, and querying of in-memory SQLite3 -databases that serve as an intermediate layer between MongoDB and Superset, -enabling support for complex SQL operations that MongoDB cannot handle natively. - -SQLiteBridge is the default implementation of the QueryDatabase interface. -""" - import logging import sqlite3 from typing import Any, Dict, List, Optional diff --git a/tests/conftest.py b/tests/conftest.py index 2c033bb..1008428 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,15 +28,18 @@ def make_conn(**kwargs): def make_superset_conn(**kwargs): """Create a superset-mode Connection using TEST_URI if provided, otherwise use a local default.""" if TEST_URI: - # Convert test URI to superset mode by replacing mongodb:// with mongodb+superset:// - superset_uri = TEST_URI.replace("mongodb://", "mongodb+superset://", 1) + # Convert test URI to superset mode by adding ?mode=superset query parameter + if "?" 
in TEST_URI: + superset_uri = TEST_URI + "&mode=superset" + else: + superset_uri = TEST_URI + "?mode=superset" if "database" not in kwargs: kwargs["database"] = TEST_DB return Connection(host=superset_uri, **kwargs) # Default local connection parameters with superset mode defaults = { - "host": "mongodb+superset://testuser:testpass@localhost:27017/test_db?authSource=test_db", + "host": "mongodb://testuser:testpass@localhost:27017/test_db?authSource=test_db&mode=superset", "database": "test_db", } for k, v in defaults.items(): diff --git a/tests/test_connection.py b/tests/test_connection.py index e3a94ec..3cc8788 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -156,7 +156,7 @@ def test_explicit_database_param_overrides_uri_default(self): # Test that explicit database parameter overrides URI default if TEST_URI: # Construct a URI with an explicit database path - conn = Connection(host=f"{TEST_URI.rstrip('/')}/uri_db", database="explicit_db") + conn = Connection(host=f"{TEST_URI}", database="explicit_db") else: conn = Connection(host="mongodb://localhost:27017/uri_db", database="explicit_db") assert conn.database is not None @@ -166,9 +166,22 @@ def test_explicit_database_param_overrides_uri_default(self): def test_no_database_param_uses_client_default_database(self): """When no explicit database parameter is passed, use client's default from URI if present""" if TEST_URI: - conn = Connection(host=f"{TEST_URI.rstrip('/')}/test_db") + conn = Connection(host=f"{TEST_URI}") else: conn = Connection(host="mongodb://localhost:27017/test_db") assert conn.database is not None assert conn.database.name == "test_db" conn.close() + + def test_connection_string_with_mode_query_param(self): + """Test that connection string with ?mode parameter is parsed correctly""" + if TEST_URI: + # Test with mode parameter in query string + test_url = f"{TEST_URI.rstrip('&')}&mode=superset" + else: + test_url = "mongodb://localhost:27017/test_db?mode=superset" + + conn = Connection(host=test_url) + assert conn.mode == "superset" + assert conn.database_name == "test_db" + conn.close() diff --git a/tests/test_cursor.py b/tests/test_cursor.py index d00b5c0..6879ac4 100644 --- a/tests/test_cursor.py +++ b/tests/test_cursor.py @@ -261,6 +261,79 @@ def test_fetchall_with_result(self, conn): names = [row[name_idx] for row in rows] assert "John Doe" in names # First user from dataset + def test_description_type_and_shape(self, conn): + """Ensure cursor.description returns a list of DB-API description tuples""" + cursor = conn.cursor() + cursor.execute("SELECT * FROM users") + desc = cursor.description + assert isinstance(desc, list) + assert all(isinstance(d, tuple) and len(d) == 7 and isinstance(d[0], str) for d in desc) + # type_code should be a type object (e.g., str) or None when unknown + assert all((isinstance(d[1], type) or d[1] is None) for d in desc) + + def test_description_projection(self, conn): + """Ensure projection via SQL reflects in the description names and types""" + cursor = conn.cursor() + cursor.execute("SELECT name, email FROM users") + desc = cursor.description + assert isinstance(desc, list) + col_names = [d[0] for d in desc] + assert "name" in col_names + assert "email" in col_names + for d in desc: + if d[0] in ("name", "email"): + assert isinstance(d[1], type) or d[1] is None + + def test_cursor_pagination_fetchmany_triggers_getmore(self, conn, monkeypatch): + """Test that cursor.fetchmany triggers getMore when executing SQL that yields a paginated cursor + + We monkeypatch 
the underlying database.command to force a small server batch size + so that pagination/getMore behaviour is triggered while still using SQL via cursor.execute. + """ + db = conn.database + original_cmd = db.command + + def wrapper(cmd, *args, **kwargs): + # Force small batchSize for find on users to simulate pagination + if isinstance(cmd, dict) and cmd.get("find") == "users" and "batchSize" not in cmd: + cmd = dict(cmd) + cmd["batchSize"] = 3 + return original_cmd(cmd, *args, **kwargs) + + monkeypatch.setattr(db, "command", wrapper) + + cursor = conn.cursor() + cursor.execute("SELECT * FROM users") + + # Fetch many rows through cursor - should span multiple batches + rows = cursor.fetchmany(10) + assert len(rows) == 10 + assert cursor.rowcount >= 10 + + def test_cursor_pagination_fetchall_triggers_getmore(self, conn, monkeypatch): + """Test that cursor.fetchall retrieves all rows across multiple batches using SQL + + Same approach: monkeypatch to force a small server batch size while using cursor.execute. + """ + db = conn.database + original_cmd = db.command + + def wrapper(cmd, *args, **kwargs): + if isinstance(cmd, dict) and cmd.get("find") == "users" and "batchSize" not in cmd: + cmd = dict(cmd) + cmd["batchSize"] = 4 + return original_cmd(cmd, *args, **kwargs) + + monkeypatch.setattr(db, "command", wrapper) + + cursor = conn.cursor() + cursor.execute("SELECT * FROM users") + + rows = cursor.fetchall() + # There are 22 users in test dataset + assert len(rows) == 22 + assert cursor.rowcount == 22 + def test_close(self, conn): """Test cursor close""" # Should not raise any exception @@ -281,3 +354,47 @@ def test_cursor_properties(self, conn): # Test rowcount property (should be -1 when no query executed) assert cursor.rowcount == -1 + + def test_execute_with_field_alias(self, conn): + """Test executing SELECT with field aliases""" + sql = "SELECT name AS user_name, email AS user_email FROM users LIMIT 5" + cursor = conn.cursor() + result = cursor.execute(sql) + + assert result == cursor # execute returns self + assert isinstance(cursor.result_set, ResultSet) + + # Check that aliases appear in cursor description + assert cursor.result_set.description is not None + col_names = [desc[0] for desc in cursor.result_set.description] + + # Aliases should appear in the description instead of original field names + assert "user_name" in col_names + assert "user_email" in col_names + assert "name" not in col_names + assert "email" not in col_names + + rows = cursor.result_set.fetchall() + assert len(rows) == 5 + assert len(rows[0]) == 2 # Should have 2 columns + + def test_execute_with_nested_field_alias(self, conn): + """Test executing SELECT with nested field alias""" + sql = "SELECT products.name AS product_name, products.price AS product_price FROM products LIMIT 3" + cursor = conn.cursor() + result = cursor.execute(sql) + + assert result == cursor # execute returns self + assert isinstance(cursor.result_set, ResultSet) + + # Check that aliases appear in cursor description + assert cursor.result_set.description is not None + col_names = [desc[0] for desc in cursor.result_set.description] + + # Aliases should appear in the description + assert "product_name" in col_names + assert "product_price" in col_names + + rows = cursor.result_set.fetchall() + assert len(rows) == 3 + assert len(rows[0]) == 2 # Should have 2 columns diff --git a/tests/test_sql_parser_general.py b/tests/test_sql_parser_general.py index 4b7e223..2f390d6 100644 --- a/tests/test_sql_parser_general.py +++ 
b/tests/test_sql_parser_general.py @@ -344,3 +344,92 @@ def test_complex_mixed_operators(self): # Verify ORDER BY and LIMIT assert execution_plan.sort_stage == [{"age": -1}] # DESC = -1 assert execution_plan.limit_stage == 5 + + def test_select_with_simple_alias(self): + """Test SELECT with a simple field alias""" + sql = "SELECT name AS user_name FROM users" + parser = SQLParser(sql) + + assert not parser.has_errors, f"Parser errors: {parser.errors}" + + execution_plan = parser.get_execution_plan() + assert execution_plan.collection == "users" + assert execution_plan.projection_stage == {"name": 1} + assert execution_plan.column_aliases == {"name": "user_name"} + + def test_select_with_multiple_aliases(self): + """Test SELECT with multiple field aliases""" + sql = "SELECT name AS user_name, email AS user_email, age AS user_age FROM users" + parser = SQLParser(sql) + + assert not parser.has_errors, f"Parser errors: {parser.errors}" + + execution_plan = parser.get_execution_plan() + assert execution_plan.collection == "users" + assert execution_plan.projection_stage == {"name": 1, "email": 1, "age": 1} + assert execution_plan.column_aliases == { + "name": "user_name", + "email": "user_email", + "age": "user_age", + } + + def test_select_with_nested_field_alias(self): + """Test SELECT with nested field alias like field.idx[0] as a""" + sql = "SELECT field.idx[0] AS a FROM users" + parser = SQLParser(sql) + + assert not parser.has_errors, f"Parser errors: {parser.errors}" + + execution_plan = parser.get_execution_plan() + assert execution_plan.collection == "users" + # Nested field should be normalized to mongo dot notation + assert "field.idx.0" in execution_plan.projection_stage + assert execution_plan.projection_stage["field.idx.0"] == 1 + assert execution_plan.column_aliases.get("field.idx.0") == "a" + + def test_select_mixed_with_and_without_aliases(self): + """Test SELECT with some fields having aliases and some not""" + sql = "SELECT name AS user_name, email, age AS user_age FROM users" + parser = SQLParser(sql) + + assert not parser.has_errors, f"Parser errors: {parser.errors}" + + execution_plan = parser.get_execution_plan() + assert execution_plan.collection == "users" + assert execution_plan.projection_stage == {"name": 1, "email": 1, "age": 1} + # Only fields with aliases should be in column_aliases + assert execution_plan.column_aliases == { + "name": "user_name", + "age": "user_age", + } + + def test_select_alias_without_as_keyword(self): + """Test SELECT with implicit alias (without AS keyword)""" + sql = "SELECT name user_name, email user_email FROM users" + parser = SQLParser(sql) + + assert not parser.has_errors, f"Parser errors: {parser.errors}" + + execution_plan = parser.get_execution_plan() + assert execution_plan.collection == "users" + assert execution_plan.projection_stage == {"name": 1, "email": 1} + assert execution_plan.column_aliases == { + "name": "user_name", + "email": "user_email", + } + + def test_select_with_alias_and_where_clause(self): + """Test SELECT with aliases and WHERE clause""" + sql = "SELECT name AS user_name, age AS user_age FROM users WHERE status = 'active'" + parser = SQLParser(sql) + + assert not parser.has_errors, f"Parser errors: {parser.errors}" + + execution_plan = parser.get_execution_plan() + assert execution_plan.collection == "users" + assert execution_plan.projection_stage == {"name": 1, "age": 1} + assert execution_plan.column_aliases == { + "name": "user_name", + "age": "user_age", + } + assert execution_plan.filter_stage == 
{"status": "active"} diff --git a/tests/test_sql_parser_nested_fields.py b/tests/test_sql_parser_nested_fields.py index 3d223ac..b92cb2d 100644 --- a/tests/test_sql_parser_nested_fields.py +++ b/tests/test_sql_parser_nested_fields.py @@ -1,7 +1,4 @@ # -*- coding: utf-8 -*- -""" -Comprehensive tests for nested field support in PyMongoSQL -""" import pytest from pymongosql.error import SqlSyntaxError diff --git a/tests/test_sqlalchemy_dialect.py b/tests/test_sqlalchemy_dialect.py index 2a4e8e8..52d84b5 100644 --- a/tests/test_sqlalchemy_dialect.py +++ b/tests/test_sqlalchemy_dialect.py @@ -1,9 +1,4 @@ #!/usr/bin/env python3 -""" -Tests for PyMongoSQL SQLAlchemy dialect. - -This test suite validates the SQLAlchemy integration functionality. -""" import unittest from typing import Callable from unittest.mock import Mock, patch diff --git a/tests/test_sqlalchemy_integration.py b/tests/test_sqlalchemy_integration.py index 2519d39..bc8d05b 100644 --- a/tests/test_sqlalchemy_integration.py +++ b/tests/test_sqlalchemy_integration.py @@ -1,14 +1,4 @@ #!/usr/bin/env python3 -""" -Real Integration Tests for PyMongoSQL SQLAlchemy Dialect - -This test suite validates the SQLAlchemy dialect integration by: -1. Using real MongoDB connections (same as other tests) -2. Creating SQLAlchemy ORM models -3. Testing query operations with actual data -4. Validating object creation from query results -""" - import os import pytest diff --git a/tests/test_superset_connection.py b/tests/test_superset_connection.py index 3e89a05..3bb9907 100644 --- a/tests/test_superset_connection.py +++ b/tests/test_superset_connection.py @@ -1,15 +1,4 @@ # -*- coding: utf-8 -*- -""" -Tests for superset subquery connection mode. - -Tests the mongodb+superset:// connection pattern and verifies that: -1. Superset mode is correctly detected from connection strings -2. SubqueryExecution strategy is registered and used -3. Subqueries are supported in superset mode -4. 
Subqueries are rejected in core mode -""" - - from pymongosql.executor import ExecutionContext, ExecutionPlanFactory from pymongosql.helper import ConnectionHelper from pymongosql.superset_mongodb.executor import SupersetExecution @@ -18,45 +7,64 @@ class TestSupersetConnectionString: """Test parsing of superset connection strings""" - def test_parse_superset_mode(self): - """Test parsing mongodb+superset:// connection string""" - mode, normalized = ConnectionHelper.parse_connection_string("mongodb+superset://localhost:27017/testdb") + def test_parse_superset_mode_with_query_param(self): + """Test parsing connection string with ?mode=superset query parameter""" + mode, db, normalized = ConnectionHelper.parse_connection_string( + "mongodb://localhost:27017/testdb?mode=superset" + ) assert mode == "superset" + assert db == "testdb" + assert "mode" not in normalized assert normalized == "mongodb://localhost:27017/testdb" + def test_parse_srv_with_superset_mode(self): + """Test parsing mongodb+srv with superset mode""" + mode, db, normalized = ConnectionHelper.parse_connection_string("mongodb+srv://localhost/testdb?mode=superset") + assert mode == "superset" + assert db == "testdb" + assert "mongodb+srv" in normalized + assert "mode" not in normalized + def test_parse_core_mode(self): """Test parsing standard mongodb:// connection string""" - mode, normalized = ConnectionHelper.parse_connection_string("mongodb://localhost:27017/testdb") + mode, db, normalized = ConnectionHelper.parse_connection_string("mongodb://localhost:27017/testdb") assert mode == "standard" + assert db == "testdb" assert normalized == "mongodb://localhost:27017/testdb" def test_parse_with_credentials(self): """Test parsing connection string with username and password""" - mode, normalized = ConnectionHelper.parse_connection_string( - "mongodb+superset://user:pass@localhost:27017/testdb" + mode, db, normalized = ConnectionHelper.parse_connection_string( + "mongodb://user:pass@localhost:27017/testdb?mode=superset" ) assert mode == "superset" + assert db == "testdb" assert "user:pass@localhost" in normalized + assert "mode" not in normalized def test_parse_with_query_params(self): - """Test parsing connection string with query parameters""" - mode, normalized = ConnectionHelper.parse_connection_string( - "mongodb+superset://localhost:27017/testdb?retryWrites=true&w=majority" + """Test parsing connection string with multiple query parameters""" + mode, db, normalized = ConnectionHelper.parse_connection_string( + "mongodb://localhost:27017/testdb?mode=superset&retryWrites=true&w=majority" ) assert mode == "superset" + assert db == "testdb" assert "retryWrites=true" in normalized assert "w=majority" in normalized + assert "mode" not in normalized def test_parse_none_connection_string(self): """Test parsing None connection string returns defaults""" - mode, normalized = ConnectionHelper.parse_connection_string(None) + mode, db, normalized = ConnectionHelper.parse_connection_string(None) assert mode == "standard" + assert db is None assert normalized is None def test_parse_empty_connection_string(self): """Test parsing empty connection string returns defaults""" - mode, normalized = ConnectionHelper.parse_connection_string("") + mode, db, normalized = ConnectionHelper.parse_connection_string("") assert mode == "standard" + assert db is None assert normalized is None @@ -113,15 +121,15 @@ def test_superset_mode_detection(self): """Test that superset mode is correctly detected""" from pymongosql.helper import ConnectionHelper - 
is_superset, _ = ConnectionHelper.parse_connection_string("mongodb+superset://localhost:27017/testdb") - assert is_superset == "superset" + mode, db, _ = ConnectionHelper.parse_connection_string("mongodb://localhost:27017/testdb?mode=superset") + assert mode == "superset" def test_core_mode_detection(self): """Test that core mode is correctly detected""" from pymongosql.helper import ConnectionHelper - is_core, _ = ConnectionHelper.parse_connection_string("mongodb://localhost:27017/testdb") - assert is_core == "standard" + mode, db, _ = ConnectionHelper.parse_connection_string("mongodb://localhost:27017/testdb") + assert mode == "standard" class TestSubqueryExecutionIntegration: @@ -433,6 +441,43 @@ def test_empty_result_with_valid_description(self, superset_conn): assert "name" in col_names assert "age" in col_names + def test_subquery_with_field_aliases_in_inner_query(self, superset_conn): + """Test superset execution with field aliases in the inner MongoDB query""" + cursor = superset_conn.cursor() + + # Inner query has aliases + subquery_sql = "SELECT * FROM (SELECT name AS user_name, email AS user_email FROM users) AS u LIMIT 3" + + cursor.execute(subquery_sql) + rows = cursor.fetchall() + + # Check that aliases appear in cursor description + assert cursor.result_set.description is not None + col_names = [desc[0] for desc in cursor.result_set.description] + + # Aliases from inner query should be preserved + assert "user_name" in col_names + assert "user_email" in col_names + assert len(rows) <= 3 + + def test_subquery_with_nested_field_alias(self, superset_conn): + """Test superset execution with nested field alias in inner query""" + cursor = superset_conn.cursor() + + # Inner query has nested field with alias + subquery_sql = "SELECT * FROM (SELECT _id AS id, name FROM users) AS u WHERE u.id > 0 LIMIT 2" + + cursor.execute(subquery_sql) + rows = cursor.fetchall() + + # Check that aliases appear in cursor description + assert cursor.result_set.description is not None + col_names = [desc[0] for desc in cursor.result_set.description] + + # Alias from inner query should be preserved + assert "id" in col_names + assert len(rows) <= 2 + class TestSubqueryDetector: """Test subquery detection and outer query extraction"""
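As a companion to the parser tests above, a small sketch of the new three-tuple contract of `ConnectionHelper.parse_connection_string` (the values mirror the cases in `TestSupersetConnectionString`; treat it as illustrative rather than exhaustive):

```python
from pymongosql.helper import ConnectionHelper

# ?mode=superset is consumed by the parser and stripped from the normalized URI.
mode, db, normalized = ConnectionHelper.parse_connection_string(
    "mongodb://user:pass@localhost:27017/testdb?mode=superset&retryWrites=true"
)
assert mode == "superset"
assert db == "testdb"
assert "mode=" not in normalized  # other query options such as retryWrites are preserved

# Without a mode parameter the mode defaults to "standard";
# the database name still comes from the URI path.
mode, db, normalized = ConnectionHelper.parse_connection_string("mongodb://localhost:27017/testdb")
assert (mode, db, normalized) == ("standard", "testdb", "mongodb://localhost:27017/testdb")
```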