diff --git a/pyproject.toml b/pyproject.toml index 3bd54543..5898e177 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,7 +234,8 @@ stubdeps = [ # dependencies used for typehints in the stubs "typing-extensions", ] test = [ # dependencies used for running tests - "adbc-driver-manager; sys_platform != 'win32' or platform_machine != 'ARM64'", + "adbc-driver-manager>=1.10.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", + "adbc-driver-manager>=1.7.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", "pytest", "pytest-reraise", "pytest-timeout", @@ -252,8 +253,10 @@ test = [ # dependencies used for running tests "requests", "urllib3", "fsspec>=2022.11.0; sys_platform != 'win32' or platform_machine != 'ARM64'", - "pandas>=2.0.0", - "pyarrow>=18.0.0; sys_platform != 'win32' or platform_machine != 'ARM64'", + "pandas>=3.0.0; python_version > '3.10'", + "pandas<3.0.0; python_version < '3.11'", + "pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", + "pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", "torch>=2.2.2; python_version < '3.14' and ( sys_platform != 'darwin' or platform_machine != 'x86_64' or python_version < '3.13' ) and ( sys_platform != 'win32' or platform_machine != 'ARM64' or python_version > '3.11' )", "tensorflow==2.14.0; sys_platform == 'darwin' and python_version < '3.12'", "tensorflow-cpu>=2.14.0; sys_platform == 'linux' and platform_machine != 'aarch64' and python_version < '3.12'", diff --git a/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp b/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp index 982f00ec..d58bc139 100644 --- a/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp +++ b/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp @@ -18,25 +18,28 @@ namespace duckdb { // Pandas Specific Types (e.g., categorical, datetime_tz,...) enum class NumpyNullableType : uint8_t { //! NumPy dtypes - BOOL, //! bool_, bool8 - INT_8, //! byte, int8 - UINT_8, //! ubyte, uint8 - INT_16, //! int16, short - UINT_16, //! uint16, ushort - INT_32, //! int32, intc - UINT_32, //! uint32, uintc, - INT_64, //! int64, int0, int_, intp, matrix - UINT_64, //! uint64, uint, uint0, uintp - FLOAT_16, //! float16, half - FLOAT_32, //! float32, single - FLOAT_64, //! float64, float_, double - OBJECT, //! object - UNICODE, //! 
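(Illustrative, not part of the diff: a minimal sketch of the end-to-end behavior these interval/timedelta changes target, assuming a local build with this patch applied; `con` and `src` are hypothetical names.)

```python
import duckdb
import numpy as np
import pandas as pd

con = duckdb.connect()

# With this patch, INTERVAL results should materialize as timedelta64[us]
# (previously timedelta64[ns] via Interval::GetNanoseconds).
out = con.sql("SELECT INTERVAL 90 MINUTE AS delta").df()
assert out["delta"].dtype == np.dtype("timedelta64[us]")

# timedelta64 columns at s/ms/us/ns resolution should all scan to INTERVAL,
# not just timedelta64[ns] as before.
src = pd.DataFrame({"delta": pd.Series([pd.Timedelta(minutes=90)], dtype="timedelta64[ms]")})
assert con.sql("SELECT typeof(delta) FROM src").fetchone()[0] == "INTERVAL"
```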
static int64_t ConvertValue(interval_t val, NumpyAppendData &append_data) { (void)append_data; - return Interval::GetNanoseconds(val); + return Interval::GetMicro(val); } template diff --git a/src/duckdb_py/numpy/numpy_scan.cpp b/src/duckdb_py/numpy/numpy_scan.cpp index 0117eaae..b1cd6e60 100644 --- a/src/duckdb_py/numpy/numpy_scan.cpp +++ b/src/duckdb_py/numpy/numpy_scan.cpp @@ -302,7 +302,10 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset, } break; } - case NumpyNullableType::TIMEDELTA: { + case NumpyNullableType::TIMEDELTA_NS: + case NumpyNullableType::TIMEDELTA_US: + case NumpyNullableType::TIMEDELTA_MS: + case NumpyNullableType::TIMEDELTA_S: { auto src_ptr = reinterpret_cast<const int64_t *>(array.data()); auto tgt_ptr = FlatVector::GetData<interval_t>(out); auto &mask = FlatVector::Validity(out); @@ -314,7 +317,25 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset, mask.SetInvalid(row); continue; } - int64_t micro = src_ptr[source_idx] / 1000; + + int64_t micro; + switch (bind_data.numpy_type.type) { + case NumpyNullableType::TIMEDELTA_NS: + micro = src_ptr[source_idx] / 1000; // ns -> us + break; + case NumpyNullableType::TIMEDELTA_US: + micro = src_ptr[source_idx]; // already us + break; + case NumpyNullableType::TIMEDELTA_MS: + micro = src_ptr[source_idx] * 1000; // ms -> us + break; + case NumpyNullableType::TIMEDELTA_S: + micro = src_ptr[source_idx] * 1000000; // s -> us + break; + default: + throw InternalException("Unexpected timedelta type"); + } + int64_t days = micro / Interval::MICROS_PER_DAY; micro = micro % Interval::MICROS_PER_DAY; int64_t months = days / Interval::DAYS_PER_MONTH; diff --git a/src/duckdb_py/numpy/raw_array_wrapper.cpp b/src/duckdb_py/numpy/raw_array_wrapper.cpp index 5d73685b..0b3f8d14 100644 --- a/src/duckdb_py/numpy/raw_array_wrapper.cpp +++ b/src/duckdb_py/numpy/raw_array_wrapper.cpp @@ -108,7 +108,7 @@ string RawArrayWrapper::DuckDBToNumpyDtype(const LogicalType &type) { case LogicalTypeId::DATE: return "datetime64[us]"; case LogicalTypeId::INTERVAL: - return "timedelta64[ns]"; + return "timedelta64[us]"; case LogicalTypeId::TIME: case LogicalTypeId::TIME_TZ: case LogicalTypeId::VARCHAR: diff --git a/src/duckdb_py/numpy/type.cpp b/src/duckdb_py/numpy/type.cpp index 92ac4785..3d8d9096 100644 --- a/src/duckdb_py/numpy/type.cpp +++ b/src/duckdb_py/numpy/type.cpp @@ -58,11 +58,23 @@ static NumpyNullableType ConvertNumpyTypeInternal(const string &col_type_str) { if (col_type_str == "string") { return NumpyNullableType::STRING; } + if (col_type_str == "str") { + return NumpyNullableType::STRING; + } if (col_type_str == "object") { return NumpyNullableType::OBJECT; } if (col_type_str == "timedelta64[ns]") { - return NumpyNullableType::TIMEDELTA; + return NumpyNullableType::TIMEDELTA_NS; + } + if (col_type_str == "timedelta64[us]") { + return NumpyNullableType::TIMEDELTA_US; + } + if (col_type_str == "timedelta64[ms]") { + return NumpyNullableType::TIMEDELTA_MS; + } + if (col_type_str == "timedelta64[s]") { + return NumpyNullableType::TIMEDELTA_S; } // We use 'StartsWith' because it might have ', tz' at the end, indicating timezone if (StringUtil::StartsWith(col_type_str, "datetime64[ns")) { @@ -140,7 +152,10 @@ LogicalType NumpyToLogicalType(const NumpyType &col_type) { return LogicalType::VARCHAR; case NumpyNullableType::OBJECT: return LogicalType::VARCHAR; - case NumpyNullableType::TIMEDELTA: + case NumpyNullableType::TIMEDELTA_NS: + case NumpyNullableType::TIMEDELTA_US: + case NumpyNullableType::TIMEDELTA_MS: + case 
NumpyNullableType::TIMEDELTA_S: return LogicalType::INTERVAL; case NumpyNullableType::DATETIME_MS: { if (col_type.has_timezone) { diff --git a/src/duckdb_py/pyresult.cpp b/src/duckdb_py/pyresult.cpp index e92f6abe..cc6224c2 100644 --- a/src/duckdb_py/pyresult.cpp +++ b/src/duckdb_py/pyresult.cpp @@ -304,7 +304,7 @@ void DuckDBPyResult::ConvertDateTimeTypes(PandasDataFrame &df, bool date_as_obje // We need to create the column anew because the exact dt changed to a new timezone ReplaceDFColumn(df, names[i].c_str(), i, new_value); } else if (date_as_object && result->types[i] == LogicalType::DATE) { - auto new_value = df[names[i].c_str()].attr("dt").attr("date"); + py::object new_value = df[names[i].c_str()].attr("dt").attr("date"); ReplaceDFColumn(df, names[i].c_str(), i, new_value); } } diff --git a/tests/conftest.py b/tests/conftest.py index bfb458a5..ed7c359a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,7 @@ import warnings from importlib import import_module from pathlib import Path -from typing import Any, Union +from typing import Union import pytest @@ -19,13 +19,27 @@ pandas = None pyarrow_dtype = None -# Check if pandas has arrow dtypes enabled -try: - from pandas.compat import pa_version_under7p0 - pyarrow_dtypes_enabled = not pa_version_under7p0 -except ImportError: - pyarrow_dtypes_enabled = False +# Version-aware helpers for Pandas 2.x vs 3.0 compatibility +def _get_pandas_ge_3(): + if pandas is None: + return False + from packaging.version import Version + + return Version(pandas.__version__) >= Version("3.0.0") + + +PANDAS_GE_3 = _get_pandas_ge_3() + + +def is_string_dtype(dtype): + """Check if a dtype is a string dtype (works across Pandas 2.x and 3.0). + + Uses pd.api.types.is_string_dtype() which handles: + - Pandas 2.x: object dtype for strings + - Pandas 3.0+: str (StringDtype) for strings + """ + return pandas.api.types.is_string_dtype(dtype) def import_pandas(): @@ -113,78 +127,6 @@ def pandas_supports_arrow_backend(): return pandas_2_or_higher() -def numpy_pandas_df(*args, **kwargs): - return import_pandas().DataFrame(*args, **kwargs) - - -def arrow_pandas_df(*args, **kwargs): - df = numpy_pandas_df(*args, **kwargs) - return df.convert_dtypes(dtype_backend="pyarrow") - - -class NumpyPandas: - def __init__(self) -> None: - self.backend = "numpy_nullable" - self.DataFrame = numpy_pandas_df - self.pandas = import_pandas() - - def __getattr__(self, name: str) -> Any: # noqa: ANN401 - return getattr(self.pandas, name) - - -def convert_arrow_to_numpy_backend(df): - names = df.columns - df_content = {} - for name in names: - df_content[name] = df[name].array.__arrow_array__() - # This should convert the pyarrow chunked arrays into numpy arrays - return import_pandas().DataFrame(df_content) - - -def convert_to_numpy(df): - if ( - pyarrow_dtypes_enabled - and pyarrow_dtype is not None - and any(True for x in df.dtypes if isinstance(x, pyarrow_dtype)) - ): - return convert_arrow_to_numpy_backend(df) - return df - - -def convert_and_equal(df1, df2, **kwargs): - df1 = convert_to_numpy(df1) - df2 = convert_to_numpy(df2) - import_pandas().testing.assert_frame_equal(df1, df2, **kwargs) - - -class ArrowMockTesting: - def __init__(self) -> None: - self.testing = import_pandas().testing - self.assert_frame_equal = convert_and_equal - - def __getattr__(self, name: str) -> Any: # noqa: ANN401 - return getattr(self.testing, name) - - -# This converts dataframes constructed with 'DataFrame(...)' to pyarrow backed dataframes -# Assert equal does the opposite, turning 
all pyarrow backed dataframes into numpy backed ones -# this is done because we don't produce pyarrow backed dataframes yet -class ArrowPandas: - def __init__(self) -> None: - self.pandas = import_pandas() - if pandas_2_or_higher() and pyarrow_dtypes_enabled: - self.backend = "pyarrow" - self.DataFrame = arrow_pandas_df - else: - # For backwards compatible reasons, just mock regular pandas - self.backend = "numpy_nullable" - self.DataFrame = self.pandas.DataFrame - self.testing = ArrowMockTesting() - - def __getattr__(self, name: str) -> Any: # noqa: ANN401 - return getattr(self.pandas, name) - - @pytest.fixture def require(): def _require(extension_name, db_name="") -> Union[duckdb.DuckDBPyConnection, None]: diff --git a/tests/coverage/test_pandas_categorical_coverage.py b/tests/coverage/test_pandas_categorical_coverage.py index 7b0645e0..6155138a 100644 --- a/tests/coverage/test_pandas_categorical_coverage.py +++ b/tests/coverage/test_pandas_categorical_coverage.py @@ -1,5 +1,4 @@ -import pytest -from conftest import NumpyPandas +import pandas as pd import duckdb @@ -9,23 +8,23 @@ def check_result_list(res): assert res_item[0] == res_item[1] -def check_create_table(category, pandas): +def check_create_table(category): conn = duckdb.connect() conn.execute("PRAGMA enable_verification") - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { - "x": pandas.Categorical(category, ordered=True), - "y": pandas.Categorical(category, ordered=True), + "x": pd.Categorical(category, ordered=True), + "y": pd.Categorical(category, ordered=True), "z": category, } ) category.append("bla") - df_in_diff = pandas.DataFrame( # noqa: F841 + df_in_diff = pd.DataFrame( # noqa: F841 { - "k": pandas.Categorical(category, ordered=True), + "k": pd.Categorical(category, ordered=True), } ) @@ -68,14 +67,11 @@ def check_create_table(category, pandas): conn.execute("DROP TABLE t1") -# TODO: extend tests with ArrowPandas # noqa: TD002, TD003 class TestCategory: - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_category_string_uint16(self, duckdb_cursor, pandas): + def test_category_string_uint16(self, duckdb_cursor): category = [str(i) for i in range(300)] - check_create_table(category, pandas) + check_create_table(category) - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_category_string_uint32(self, duckdb_cursor, pandas): + def test_category_string_uint32(self, duckdb_cursor): category = [str(i) for i in range(70000)] - check_create_table(category, pandas) + check_create_table(category) diff --git a/tests/extensions/test_httpfs.py b/tests/extensions/test_httpfs.py index 26ce917c..b8335814 100644 --- a/tests/extensions/test_httpfs.py +++ b/tests/extensions/test_httpfs.py @@ -1,8 +1,8 @@ import datetime import os +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -34,8 +34,7 @@ def test_s3fs(self, require): res = rel.fetchone() assert res == (1, 0, datetime.date(1965, 2, 28), 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 6, 0, 0, 0, 0) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_httpfs(self, require, pandas): + def test_httpfs(self, require): connection = require("httpfs") try: connection.execute(""" @@ -51,14 +50,14 @@ def test_httpfs(self, require, pandas): raise result_df = connection.fetchdf() - exp_result = pandas.DataFrame( + exp_result = pd.DataFrame( { - "id": pandas.Series([1, 2, 3], dtype="int32"), + "id": pd.Series([1, 2, 3], dtype="int32"), "first_name": ["Amanda", "Albert", "Evelyn"], "last_name": ["Jordan", 
"Freeman", "Morgan"], } ) - pandas.testing.assert_frame_equal(result_df, exp_result) + pd.testing.assert_frame_equal(result_df, exp_result, check_dtype=False) def test_http_exception(self, require): connection = require("httpfs") diff --git a/tests/fast/api/test_3654.py b/tests/fast/api/test_3654.py index a6b01dd5..11f37946 100644 --- a/tests/fast/api/test_3654.py +++ b/tests/fast/api/test_3654.py @@ -1,4 +1,4 @@ -import pytest +import pandas as pd import duckdb @@ -8,13 +8,11 @@ can_run = True except Exception: can_run = False -from conftest import ArrowPandas, NumpyPandas class Test3654: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_3654_pandas(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_3654_pandas(self, duckdb_cursor): + df1 = pd.DataFrame( { "id": [1, 1, 2], } @@ -25,12 +23,11 @@ def test_3654_pandas(self, duckdb_cursor, pandas): print(rel.execute().fetchall()) assert rel.execute().fetchall() == [(1,), (1,), (2,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_3654_arrow(self, duckdb_cursor, pandas): + def test_3654_arrow(self, duckdb_cursor): if not can_run: return - df1 = pandas.DataFrame( + df1 = pd.DataFrame( { "id": [1, 1, 2], } diff --git a/tests/fast/api/test_config.py b/tests/fast/api/test_config.py index aaec24c4..7d1370eb 100644 --- a/tests/fast/api/test_config.py +++ b/tests/fast/api/test_config.py @@ -2,37 +2,32 @@ import os import re -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestDBConfig: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_default_order(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3]}) + def test_default_order(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3]}) con = duckdb.connect(":memory:", config={"default_order": "desc"}) result = con.execute("select * from df order by a").fetchall() assert result == [(3,), (2,), (1,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_null_order(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3, None]}) + def test_null_order(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3, None]}) con = duckdb.connect(":memory:", config={"default_null_order": "nulls_last"}) result = con.execute("select * from df order by a").fetchall() assert result == [(1,), (2,), (3,), (None,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_multiple_options(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3, None]}) + def test_multiple_options(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3, None]}) con = duckdb.connect(":memory:", config={"default_null_order": "nulls_last", "default_order": "desc"}) result = con.execute("select * from df order by a").fetchall() assert result == [(3,), (2,), (1,), (None,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_external_access(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3]}) + def test_external_access(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3]}) # this works (replacement scan) con_regular = duckdb.connect(":memory:", config={}) con_regular.execute("select * from df") diff --git a/tests/fast/api/test_dbapi00.py b/tests/fast/api/test_dbapi00.py index 425cb7e1..4a942128 100644 --- a/tests/fast/api/test_dbapi00.py +++ b/tests/fast/api/test_dbapi00.py @@ -1,8 +1,8 @@ # simple DB API testcase import numpy +import pandas as 
pd import pytest -from conftest import ArrowPandas, NumpyPandas def assert_result_equal(result): @@ -83,30 +83,29 @@ def test_numpy_selection(self, duckdb_cursor, integers, timestamps): arr.mask = [False, False, True] numpy.testing.assert_array_equal(result["t"], arr, "Incorrect result returned") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_selection(self, duckdb_cursor, pandas, integers, timestamps): + def test_pandas_selection(self, duckdb_cursor, integers, timestamps): import datetime from packaging.version import Version # I don't know when this exactly changed, but 2.0.3 does not support this, recent versions do - if Version(pandas.__version__) <= Version("2.0.3"): + if Version(pd.__version__) <= Version("2.0.3"): pytest.skip("The resulting dtype is 'object' when given a Series with dtype Int32DType") duckdb_cursor.execute("SELECT * FROM integers") result = duckdb_cursor.fetchdf() array = numpy.ma.masked_array(numpy.arange(11)) array.mask = [False] * 10 + [True] - arr = {"i": pandas.Series(array.data, dtype=pandas.Int32Dtype)} - arr["i"][array.mask] = pandas.NA - arr = pandas.DataFrame(arr) - pandas.testing.assert_frame_equal(result, arr) + arr = {"i": pd.Series(array.data, dtype=pd.Int32Dtype)} + arr["i"][array.mask] = pd.NA + arr = pd.DataFrame(arr) + pd.testing.assert_frame_equal(result, arr) duckdb_cursor.execute("SELECT * FROM timestamps") result = duckdb_cursor.fetchdf() - df = pandas.DataFrame( + df = pd.DataFrame( { - "t": pandas.Series( + "t": pd.Series( data=[ datetime.datetime(year=1992, month=10, day=3, hour=18, minute=34, second=45), datetime.datetime(year=2010, month=1, day=1, hour=0, minute=0, second=1), @@ -116,7 +115,7 @@ def test_pandas_selection(self, duckdb_cursor, pandas, integers, timestamps): ) } ) - pandas.testing.assert_frame_equal(result, df) + pd.testing.assert_frame_equal(result, df) # def test_numpy_creation(self, duckdb_cursor): # # numpyarray = {'i': numpy.arange(10), 'v': numpy.random.randint(100, size=(1, 10))} # segfaults diff --git a/tests/fast/api/test_dbapi08.py b/tests/fast/api/test_dbapi08.py index def4e925..79b2ce0b 100644 --- a/tests/fast/api/test_dbapi08.py +++ b/tests/fast/api/test_dbapi08.py @@ -1,21 +1,19 @@ # test fetchdf with various types -import pytest -from conftest import NumpyPandas +import pandas as pd import duckdb class TestType: - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_fetchdf(self, pandas): + def test_fetchdf(self): con = duckdb.connect() con.execute("CREATE TABLE items(item VARCHAR)") con.execute("INSERT INTO items VALUES ('jeans'), (''), (NULL)") res = con.execute("SELECT item FROM items").fetchdf() - assert isinstance(res, pandas.core.frame.DataFrame) + assert isinstance(res, pd.core.frame.DataFrame) - df = pandas.DataFrame({"item": ["jeans", "", None]}) + df = pd.DataFrame({"item": ["jeans", "", None]}) print(res) print(df) - pandas.testing.assert_frame_equal(res, df) + pd.testing.assert_frame_equal(res, df, check_dtype=False) diff --git a/tests/fast/api/test_duckdb_connection.py b/tests/fast/api/test_duckdb_connection.py index 246b9d92..efcc2203 100644 --- a/tests/fast/api/test_duckdb_connection.py +++ b/tests/fast/api/test_duckdb_connection.py @@ -1,7 +1,7 @@ import re +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -25,10 +25,9 @@ def tmp_database(tmp_path_factory): # This file contains tests for DuckDBPyConnection methods, # wrapped by the 'duckdb' module, to execute with the 'default_connection' class 
TestDuckDBConnection: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append(self, pandas): + def test_append(self): duckdb.execute("Create table integers (i integer)") - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -345,13 +344,12 @@ def test_unregister_with_scary_name(self, duckdb_cursor): with pytest.raises(duckdb.CatalogException): duckdb_cursor.sql(f'select * from "{escaped_scary_name}"') - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_out_of_scope(self, pandas): + def test_relation_out_of_scope(self): def temporary_scope(): # Create a connection, we will return this con = duckdb.connect() # Create a dataframe - df = pandas.DataFrame({"a": [1, 2, 3]}) + df = pd.DataFrame({"a": [1, 2, 3]}) # The dataframe has to be registered as well # making sure it does not go out of scope con.register("df", df) @@ -389,10 +387,11 @@ def test_interrupt(self): assert duckdb.interrupt is not None def test_wrap_shadowing(self): - pd = NumpyPandas() + import pandas as pd_local + import duckdb - df = pd.DataFrame({"a": [1, 2, 3]}) # noqa: F841 + df = pd_local.DataFrame({"a": [1, 2, 3]}) # noqa: F841 res = duckdb.sql("from df").fetchall() assert res == [(1,), (2,), (3,)] diff --git a/tests/fast/api/test_duckdb_query.py b/tests/fast/api/test_duckdb_query.py index 04531e49..8be3287c 100644 --- a/tests/fast/api/test_duckdb_query.py +++ b/tests/fast/api/test_duckdb_query.py @@ -1,5 +1,5 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb from duckdb import Value @@ -21,9 +21,8 @@ def test_duckdb_query(self, duckdb_cursor): res = duckdb_cursor.sql("select 42; select 84;").fetchall() assert res == [(84,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_duckdb_from_query_multiple_statements(self, pandas): - tst_df = pandas.DataFrame({"a": [1, 23, 3, 5]}) # noqa: F841 + def test_duckdb_from_query_multiple_statements(self): + tst_df = pd.DataFrame({"a": [1, 23, 3, 5]}) # noqa: F841 res = duckdb.sql( """ diff --git a/tests/fast/api/test_native_tz.py b/tests/fast/api/test_native_tz.py index 66b06565..61b9ba24 100644 --- a/tests/fast/api/test_native_tz.py +++ b/tests/fast/api/test_native_tz.py @@ -1,4 +1,5 @@ import datetime +import zoneinfo from pathlib import Path import pytest @@ -12,6 +13,17 @@ filename = str(Path(__file__).parent / ".." 
/ "data" / "tz.parquet") +def get_tz_string(obj): + if isinstance(obj, zoneinfo.ZoneInfo): + # Pandas 3.0.0 creates ZoneInfo objects + return obj.key + if hasattr(obj, "zone"): + # Before 3.0.0 Pandas created tzdata objects + return obj.zone + msg = f"Can't get tz string from {obj}" + raise ValueError(msg) + + class TestNativeTimeZone: def test_native_python_timestamp_timezone(self, duckdb_cursor): duckdb_cursor.execute("SET timezone='America/Los_Angeles';") @@ -46,7 +58,7 @@ def test_native_python_time_timezone(self, duckdb_cursor): def test_pandas_timestamp_timezone(self, duckdb_cursor): res = duckdb_cursor.execute("SET timezone='America/Los_Angeles';") res = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").df() - assert res.dtypes["tz"].tz.zone == "America/Los_Angeles" + assert get_tz_string(res.dtypes["tz"].tz) == "America/Los_Angeles" assert res["tz"][0].hour == 14 assert res["tz"][0].minute == 52 @@ -65,16 +77,16 @@ def test_pandas_timestamp_time(self, duckdb_cursor): Version(pa.__version__) < Version("15.0.0"), reason="pyarrow 14.0.2 'to_pandas' causes a DeprecationWarning" ) def test_arrow_timestamp_timezone(self, duckdb_cursor): - res = duckdb_cursor.execute("SET timezone='America/Los_Angeles';") + duckdb_cursor.execute("SET timezone='America/Los_Angeles';") table = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").fetch_arrow_table() res = table.to_pandas() - assert res.dtypes["tz"].tz.zone == "America/Los_Angeles" + assert get_tz_string(res.dtypes["tz"].tz) == "America/Los_Angeles" assert res["tz"][0].hour == 14 assert res["tz"][0].minute == 52 duckdb_cursor.execute("SET timezone='UTC';") res = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").fetch_arrow_table().to_pandas() - assert res.dtypes["tz"].tz.zone == "UTC" + assert get_tz_string(res.dtypes["tz"].tz) == "UTC" assert res["tz"][0].hour == 21 assert res["tz"][0].minute == 52 diff --git a/tests/fast/api/test_to_csv.py b/tests/fast/api/test_to_csv.py index 97f13d8b..1354888a 100644 --- a/tests/fast/api/test_to_csv.py +++ b/tests/fast/api/test_to_csv.py @@ -3,17 +3,17 @@ import os import tempfile +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas, getTimeSeriesData +from conftest import getTimeSeriesData import duckdb class TestToCSV: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_basic_to_csv(self, pandas): + def test_basic_to_csv(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name) @@ -21,10 +21,9 @@ def test_basic_to_csv(self, pandas): csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_sep(self, pandas): + def test_to_csv_sep(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, sep=",") @@ -32,10 +31,9 @@ def test_to_csv_sep(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, sep=",") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - 
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_na_rep(self, pandas): + def test_to_csv_na_rep(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, na_rep="test") @@ -43,10 +41,9 @@ def test_to_csv_na_rep(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, na_values="test") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_header(self, pandas): + def test_to_csv_header(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name) @@ -54,10 +51,9 @@ def test_to_csv_header(self, pandas): csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quotechar(self, pandas): + def test_to_csv_quotechar(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quotechar="'", sep=",") @@ -65,10 +61,9 @@ def test_to_csv_quotechar(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, sep=",", quotechar="'") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_escapechar(self, pandas): + def test_to_csv_escapechar(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_bool": [True, False], "c_float": [1.0, 3.2], @@ -81,12 +76,11 @@ def test_to_csv_escapechar(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, quotechar='"', escapechar="!") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_date_format(self, pandas): + def test_to_csv_date_format(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame(getTimeSeriesData()) + df = pd.DataFrame(getTimeSeriesData()) dt_index = df.index - df = pandas.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) + df = pd.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, date_format="%Y%m%d") @@ -94,11 +88,10 @@ def test_to_csv_date_format(self, pandas): assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_timestamp_format(self, pandas): + def test_to_csv_timestamp_format(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 data = [datetime.time(hour=23, minute=1, second=34, microsecond=234345)] - df = 
pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + df = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, timestamp_format="%m/%d/%Y") @@ -106,68 +99,61 @@ def test_to_csv_timestamp_format(self, pandas): assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_off(self, pandas): + def test_to_csv_quoting_off(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quoting=None) csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_on(self, pandas): + def test_to_csv_quoting_on(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quoting="force") csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_quote_all(self, pandas): + def test_to_csv_quoting_quote_all(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quoting=csv.QUOTE_ALL) csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_encoding_incorrect(self, pandas): + def test_to_csv_encoding_incorrect(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) with pytest.raises( duckdb.InvalidInputException, match="Invalid Input Error: The only supported encoding option is 'UTF8" ): rel.to_csv(temp_file_name, encoding="nope") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_encoding_correct(self, pandas): + def test_to_csv_encoding_correct(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, encoding="UTF-8") csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_compression_gzip(self, pandas): + def test_compression_gzip(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", 
"string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, compression="gzip") csv_rel = duckdb.read_csv(temp_file_name, compression="gzip") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_partition(self, pandas): + def test_to_csv_partition(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category": ["a", "a", "b", "b"], "c_bool": [True, False, True, True], @@ -190,10 +176,9 @@ def test_to_csv_partition(self, pandas): assert csv_rel.execute().fetchall() == expected - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_partition_with_columns_written(self, pandas): + def test_to_csv_partition_with_columns_written(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category": ["a", "a", "b", "b"], "c_bool": [True, False, True, True], @@ -210,10 +195,9 @@ def test_to_csv_partition_with_columns_written(self, pandas): ) assert res.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite(self, pandas): + def test_to_csv_overwrite(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], @@ -238,10 +222,9 @@ def test_to_csv_overwrite(self, pandas): ] assert csv_rel.execute().fetchall() == expected - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite_with_columns_written(self, pandas): + def test_to_csv_overwrite_with_columns_written(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], @@ -264,10 +247,9 @@ def test_to_csv_overwrite_with_columns_written(self, pandas): res = duckdb.sql("FROM rel order by all") assert res.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite_not_enabled(self, pandas): + def test_to_csv_overwrite_not_enabled(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], @@ -282,12 +264,11 @@ def test_to_csv_overwrite_not_enabled(self, pandas): with pytest.raises(duckdb.IOException, match="OVERWRITE"): rel.to_csv(temp_file_name, header=True, partition_by=["c_category_1"]) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_per_thread_output(self, pandas): + def test_to_csv_per_thread_output(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 num_threads = duckdb.sql("select current_setting('threads')").fetchone()[0] print("num_threads:", num_threads) - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category": ["a", "a", "b", "b"], "c_bool": [True, False, True, True], @@ -301,10 +282,9 @@ def test_to_csv_per_thread_output(self, pandas): csv_rel = 
duckdb.read_csv(f"{temp_file_name}/*.csv", header=True) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_use_tmp_file(self, pandas): + def test_to_csv_use_tmp_file(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], diff --git a/tests/fast/api/test_to_parquet.py b/tests/fast/api/test_to_parquet.py index 370ab8e4..5c70bf3f 100644 --- a/tests/fast/api/test_to_parquet.py +++ b/tests/fast/api/test_to_parquet.py @@ -3,15 +3,14 @@ import re import tempfile +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb class TestToParquet: - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_basic_to_parquet(self, pd): + def test_basic_to_parquet(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) @@ -21,8 +20,7 @@ def test_basic_to_parquet(self, pd): csv_rel = duckdb.read_parquet(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_compression_gzip(self, pd): + def test_compression_gzip(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) @@ -50,9 +48,8 @@ def test_field_ids(self): """ ).execute().fetchall() == [("duckdb_schema", None), ("i", 42), ("my_struct", 43), ("j", 44)] - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("row_group_size_bytes", [122880 * 1024, "2MB"]) - def test_row_group_size_bytes(self, pd, row_group_size_bytes): + def test_row_group_size_bytes(self, row_group_size_bytes): con = duckdb.connect() con.execute("SET preserve_insertion_order=false;") @@ -63,8 +60,7 @@ def test_row_group_size_bytes(self, pd, row_group_size_bytes): parquet_rel = con.read_parquet(temp_file_name) assert rel.execute().fetchall() == parquet_rel.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_row_group_size(self, pd): + def test_row_group_size(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) @@ -72,9 +68,8 @@ def test_row_group_size(self, pd): parquet_rel = duckdb.read_parquet(temp_file_name) assert rel.execute().fetchall() == parquet_rel.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("write_columns", [None, True, False]) - def test_partition(self, pd, write_columns): + def test_partition(self, write_columns): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -89,9 +84,8 @@ def test_partition(self, pd, write_columns): expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")] assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("write_columns", [None, True, False]) - def 
test_overwrite(self, pd, write_columns): + def test_overwrite(self, write_columns): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -108,8 +102,7 @@ def test_overwrite(self, pd, write_columns): assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_use_tmp_file(self, pd): + def test_use_tmp_file(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -124,8 +117,7 @@ def test_use_tmp_file(self, pd): result = duckdb.read_parquet(temp_file_name) assert rel.execute().fetchall() == result.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_per_thread_output(self, pd): + def test_per_thread_output(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 num_threads = duckdb.sql("select current_setting('threads')").fetchone()[0] print("threads:", num_threads) @@ -141,8 +133,7 @@ def test_per_thread_output(self, pd): result = duckdb.read_parquet(f"{temp_file_name}/*.parquet") assert rel.execute().fetchall() == result.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_append(self, pd): + def test_append(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -173,8 +164,7 @@ def test_append(self, pd): ] assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_filename_pattern_with_index(self, pd): + def test_filename_pattern_with_index(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -199,8 +189,7 @@ def test_filename_pattern_with_index(self, pd): expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")] assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_filename_pattern_with_uuid(self, pd): + def test_filename_pattern_with_uuid(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -242,9 +231,8 @@ def test_file_size_bytes_basic(self, file_size_bytes): result = duckdb.read_parquet(f"{temp_file_name}/*.parquet") assert len(result.execute().fetchall()) == 10000 - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"]) - def test_file_size_bytes_human_readable(self, pd, file_size_bytes): + def test_file_size_bytes_human_readable(self, file_size_bytes): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { diff --git a/tests/fast/arrow/test_6796.py b/tests/fast/arrow/test_6796.py index bf557038..a9e877d5 100644 --- a/tests/fast/arrow/test_6796.py +++ b/tests/fast/arrow/test_6796.py @@ -1,15 +1,14 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb pyarrow = pytest.importorskip("pyarrow") -@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) -def test_6796(pandas): +def test_6796(): conn = duckdb.connect() - input_df = pandas.DataFrame({"foo": ["bar"]}) + input_df = pd.DataFrame({"foo": ["bar"]}) 
conn.register("input_df", input_df) query = """ diff --git a/tests/fast/pandas/test_2304.py b/tests/fast/pandas/test_2304.py index c60b1b4a..e40c2dd1 100644 --- a/tests/fast/pandas/test_2304.py +++ b/tests/fast/pandas/test_2304.py @@ -1,14 +1,12 @@ import numpy as np -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestPandasMergeSameName: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_2304(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_2304(self, duckdb_cursor): + df1 = pd.DataFrame( { "id_1": [1, 1, 1, 2, 2], "agedate": np.array(["2010-01-01", "2010-02-01", "2010-03-01", "2020-02-01", "2020-03-01"]).astype( @@ -19,7 +17,7 @@ def test_2304(self, duckdb_cursor, pandas): } ) - df2 = pandas.DataFrame( + df2 = pd.DataFrame( { "id_1": [1, 1, 2], "agedate": np.array(["2010-01-01", "2010-02-01", "2020-03-01"]).astype("datetime64[D]"), @@ -54,9 +52,8 @@ def test_2304(self, duckdb_cursor, pandas): assert result == expected_result - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pd_names(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_pd_names(self, duckdb_cursor): + df1 = pd.DataFrame( { "id": [1, 1, 2], "id_1": [1, 1, 2], @@ -64,9 +61,9 @@ def test_pd_names(self, duckdb_cursor, pandas): } ) - df2 = pandas.DataFrame({"id": [1, 1, 2], "id_1": [1, 1, 2], "id_2": [1, 1, 1]}) + df2 = pd.DataFrame({"id": [1, 1, 2], "id_1": [1, 1, 2], "id_2": [1, 1, 1]}) - exp_result = pandas.DataFrame( + exp_result = pd.DataFrame( { "id": [1, 1, 2, 1, 1], "id_1": [1, 1, 2, 1, 1], @@ -85,11 +82,10 @@ def test_pd_names(self, duckdb_cursor, pandas): ON (df1.id_1=df2.id_1)""" result_df = con.execute(query).fetchdf() - pandas.testing.assert_frame_equal(exp_result, result_df) + pd.testing.assert_frame_equal(exp_result, result_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_repeat_name(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_repeat_name(self, duckdb_cursor): + df1 = pd.DataFrame( { "id": [1], "id_1": [1], @@ -97,9 +93,9 @@ def test_repeat_name(self, duckdb_cursor, pandas): } ) - df2 = pandas.DataFrame({"id": [1]}) + df2 = pd.DataFrame({"id": [1]}) - exp_result = pandas.DataFrame( + exp_result = pd.DataFrame( { "id": [1], "id_1": [1], @@ -119,4 +115,4 @@ def test_repeat_name(self, duckdb_cursor, pandas): ON (df1.id=df2.id) """ ).fetchdf() - pandas.testing.assert_frame_equal(exp_result, result_df) + pd.testing.assert_frame_equal(exp_result, result_df) diff --git a/tests/fast/pandas/test_append_df.py b/tests/fast/pandas/test_append_df.py index d93cfa2d..be287a8f 100644 --- a/tests/fast/pandas/test_append_df.py +++ b/tests/fast/pandas/test_append_df.py @@ -1,15 +1,14 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb class TestAppendDF: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_to_table_append(self, duckdb_cursor, pandas): + def test_df_to_table_append(self, duckdb_cursor): conn = duckdb.connect() conn.execute("Create table integers (i integer)") - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -17,11 +16,10 @@ def test_df_to_table_append(self, duckdb_cursor, pandas): conn.append("integers", df_in) assert conn.execute("select count(*) from integers").fetchone()[0] == 5 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append_by_name(self, pandas): + def 
test_append_by_name(self): con = duckdb.connect() con.execute("create table tbl (a integer, b bool, c varchar)") - df_in = pandas.DataFrame({"c": ["duck", "db"], "b": [False, True], "a": [4, 2]}) + df_in = pd.DataFrame({"c": ["duck", "db"], "b": [False, True], "a": [4, 2]}) # By default we append by position, causing the following exception: with pytest.raises( duckdb.ConversionException, match="Conversion Error: Could not convert string 'duck' to INT32" ) @@ -33,29 +31,27 @@ def test_append_by_name(self, pandas): res = con.table("tbl").fetchall() assert res == [(4, False, "duck"), (2, True, "db")] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append_by_name_quoted(self, pandas): + def test_append_by_name_quoted(self): con = duckdb.connect() con.execute( """ create table tbl ("needs to be quoted" integer, other varchar) """ ) - df_in = pandas.DataFrame({"needs to be quoted": [1, 2, 3]}) + df_in = pd.DataFrame({"needs to be quoted": [1, 2, 3]}) con.append("tbl", df_in, by_name=True) res = con.table("tbl").fetchall() assert res == [(1, None), (2, None), (3, None)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append_by_name_no_exact_match(self, pandas): + def test_append_by_name_no_exact_match(self): con = duckdb.connect() con.execute("create table tbl (a integer, b bool)") - df_in = pandas.DataFrame({"c": ["a", "b"], "b": [True, False], "a": [42, 1337]}) + df_in = pd.DataFrame({"c": ["a", "b"], "b": [True, False], "a": [42, 1337]}) # Too many columns raises an error, because the columns can't be found in the targeted table with pytest.raises(duckdb.BinderException, match='Table "tbl" does not have a column with name "c"'): con.append("tbl", df_in, by_name=True) - df_in = pandas.DataFrame({"b": [False, False, False]}) + df_in = pd.DataFrame({"b": [False, False, False]}) # Not matching all columns is not a problem, as they will be filled with NULL instead con.append("tbl", df_in, by_name=True) @@ -66,7 +62,7 @@ def test_append_by_name_no_exact_match(self, pandas): # Empty the table con.execute("create or replace table tbl (a integer, b bool)") - df_in = pandas.DataFrame({"a": [1, 2, 3]}) + df_in = pd.DataFrame({"a": [1, 2, 3]}) con.append("tbl", df_in, by_name=True) res = con.table("tbl").fetchall() # Also works for missing columns *after* the supplied ones diff --git a/tests/fast/pandas/test_bug5922.py b/tests/fast/pandas/test_bug5922.py index b75ddf1b..196764e3 100644 --- a/tests/fast/pandas/test_bug5922.py +++ b/tests/fast/pandas/test_bug5922.py @@ -1,13 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestPandasAcceptFloat16: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_accept_float16(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"col": [1, 2, 3]}) + def test_pandas_accept_float16(self, duckdb_cursor): + df = pd.DataFrame({"col": [1, 2, 3]}) df16 = df.astype({"col": "float16"}) # noqa: F841 con = duckdb.connect() con.execute("CREATE TABLE tbl AS SELECT * FROM df16") diff --git a/tests/fast/pandas/test_copy_on_write.py b/tests/fast/pandas/test_copy_on_write.py index 176c2133..417fae0d 100644 --- a/tests/fast/pandas/test_copy_on_write.py +++ b/tests/fast/pandas/test_copy_on_write.py @@ -1,26 +1,27 @@ import datetime import pytest +from packaging.version import Version import duckdb # https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html pandas = pytest.importorskip("pandas", "1.5", reason="copy_on_write 
does not exist in earlier versions") +# Starting from Pandas 3.0.0 copy-on-write can no longer be disabled and this setting is deprecated +pre_3_0 = Version(pandas.__version__) < Version("3.0.0") # Make sure the variable get's properly reset even in case of error @pytest.fixture(autouse=True) def scoped_copy_on_write_setting(): - old_value = pandas.options.mode.copy_on_write - pandas.options.mode.copy_on_write = True - yield - # Reset it at the end of the function - pandas.options.mode.copy_on_write = old_value - return - - -def convert_to_result(col): - return [(x,) for x in col] + if pre_3_0: + old_value = pandas.options.mode.copy_on_write + pandas.options.mode.copy_on_write = True + yield + # Reset it at the end of the function + pandas.options.mode.copy_on_write = old_value + else: + yield class TestCopyOnWrite: @@ -35,7 +36,6 @@ class TestCopyOnWrite: ], ) def test_copy_on_write(self, col): - assert pandas.options.mode.copy_on_write con = duckdb.connect() df_in = pandas.DataFrame( # noqa: F841 { @@ -45,5 +45,5 @@ def test_copy_on_write(self, col): rel = con.sql("select * from df_in") res = rel.fetchall() print(res) - expected = convert_to_result(col) + expected = [(x,) for x in col] assert res == expected diff --git a/tests/fast/pandas/test_create_table_from_pandas.py b/tests/fast/pandas/test_create_table_from_pandas.py index 436fd0c8..b9937de2 100644 --- a/tests/fast/pandas/test_create_table_from_pandas.py +++ b/tests/fast/pandas/test_create_table_from_pandas.py @@ -1,12 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb -def assert_create(internal_data, expected_result, data_type, pandas): +def assert_create(internal_data, expected_result, data_type): conn = duckdb.connect() - df_in = pandas.DataFrame(data=internal_data, dtype=data_type) # noqa: F841 + df_in = pd.DataFrame(data=internal_data, dtype=data_type) # noqa: F841 conn.execute("CREATE TABLE t AS SELECT * FROM df_in") @@ -14,9 +13,9 @@ def assert_create(internal_data, expected_result, data_type, pandas): assert result == expected_result -def assert_create_register(internal_data, expected_result, data_type, pandas): +def assert_create_register(internal_data, expected_result, data_type): conn = duckdb.connect() - df_in = pandas.DataFrame(data=internal_data, dtype=data_type) + df_in = pd.DataFrame(data=internal_data, dtype=data_type) conn.register("dataframe", df_in) conn.execute("CREATE TABLE t AS SELECT * FROM dataframe") @@ -25,15 +24,14 @@ def assert_create_register(internal_data, expected_result, data_type, pandas): class TestCreateTableFromPandas: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_integer_create_table(self, duckdb_cursor, pandas): + def test_integer_create_table(self, duckdb_cursor): # TODO: This should work with other data types e.g., int8... 
diff --git a/tests/fast/pandas/test_create_table_from_pandas.py b/tests/fast/pandas/test_create_table_from_pandas.py
index 436fd0c8..b9937de2 100644
--- a/tests/fast/pandas/test_create_table_from_pandas.py
+++ b/tests/fast/pandas/test_create_table_from_pandas.py
@@ -1,12 +1,11 @@
-import pytest
-from conftest import ArrowPandas, NumpyPandas
+import pandas as pd

 import duckdb


-def assert_create(internal_data, expected_result, data_type, pandas):
+def assert_create(internal_data, expected_result, data_type):
     conn = duckdb.connect()
-    df_in = pandas.DataFrame(data=internal_data, dtype=data_type)  # noqa: F841
+    df_in = pd.DataFrame(data=internal_data, dtype=data_type)  # noqa: F841

     conn.execute("CREATE TABLE t AS SELECT * FROM df_in")

@@ -14,9 +13,9 @@ def assert_create(internal_data, expected_result, data_type, pandas):
     assert result == expected_result


-def assert_create_register(internal_data, expected_result, data_type, pandas):
+def assert_create_register(internal_data, expected_result, data_type):
     conn = duckdb.connect()
-    df_in = pandas.DataFrame(data=internal_data, dtype=data_type)
+    df_in = pd.DataFrame(data=internal_data, dtype=data_type)

     conn.register("dataframe", df_in)
     conn.execute("CREATE TABLE t AS SELECT * FROM dataframe")

@@ -25,15 +24,14 @@ def assert_create_register(internal_data, expected_result, data_type, pandas):

 class TestCreateTableFromPandas:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_integer_create_table(self, duckdb_cursor, pandas):
+    def test_integer_create_table(self, duckdb_cursor):
         # TODO: This should work with other data types e.g., int8...  # noqa: TD002, TD003
         data_types = ["Int8", "Int16", "Int32", "Int64"]
         internal_data = [1, 2, 3, 4]
         expected_result = [(1,), (2,), (3,), (4,)]
         for data_type in data_types:
             print(data_type)
-            assert_create_register(internal_data, expected_result, data_type, pandas)
-            assert_create(internal_data, expected_result, data_type, pandas)
+            assert_create_register(internal_data, expected_result, data_type)
+            assert_create(internal_data, expected_result, data_type)

     # TODO: Also test other data types  # noqa: TD002, TD003
diff --git a/tests/fast/pandas/test_datetime_time.py b/tests/fast/pandas/test_datetime_time.py
index 0b2642b0..a2fda09a 100644
--- a/tests/fast/pandas/test_datetime_time.py
+++ b/tests/fast/pandas/test_datetime_time.py
@@ -1,8 +1,8 @@
 from datetime import datetime, time, timezone

 import numpy as np
+import pandas as pd
 import pytest
-from conftest import ArrowPandas, NumpyPandas

 import duckdb

@@ -10,25 +10,22 @@ class TestDateTimeTime:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_time_high(self, duckdb_cursor, pandas):
+    def test_time_high(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql("SELECT make_time(23, 1, 34.234345) AS '0'").df()
         data = [time(hour=23, minute=1, second=34, microsecond=234345)]
-        df_in = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")})
+        df_in = pd.DataFrame({"0": pd.Series(data=data, dtype="object")})
         df_out = duckdb.query_df(df_in, "df", "select * from df").df()
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_time_low(self, duckdb_cursor, pandas):
+    def test_time_low(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql("SELECT make_time(00, 01, 1.000) AS '0'").df()
         data = [time(hour=0, minute=1, second=1)]
-        df_in = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")})
+        df_in = pd.DataFrame({"0": pd.Series(data=data, dtype="object")})
         df_out = duckdb.query_df(df_in, "df", "select * from df").df()
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
     @pytest.mark.parametrize("input", ["2263-02-28", "9999-01-01"])
-    def test_pandas_datetime_big(self, pandas, input):
+    def test_pandas_datetime_big(self, input):
         duckdb_con = duckdb.connect()

         duckdb_con.execute("create table test (date DATE)")
@@ -36,8 +33,8 @@ def test_pandas_datetime_big(self, pandas, input):
         res = duckdb_con.execute("select * from test").df()

         date_value = np.array([f"{input}"], dtype="datetime64[us]")
-        df = pandas.DataFrame({"date": date_value})
-        pandas.testing.assert_frame_equal(res, df)
+        df = pd.DataFrame({"date": date_value})
+        pd.testing.assert_frame_equal(res, df)

     def test_timezone_datetime(self):
         con = duckdb.connect()
diff --git a/tests/fast/pandas/test_datetime_timestamp.py b/tests/fast/pandas/test_datetime_timestamp.py
index c6d4e3a9..063be160 100644
--- a/tests/fast/pandas/test_datetime_timestamp.py
+++ b/tests/fast/pandas/test_datetime_timestamp.py
@@ -1,39 +1,35 @@
 import datetime

+import pandas as pd
 import pytest
-from conftest import ArrowPandas, NumpyPandas
 from packaging.version import Version

-pd = pytest.importorskip("pandas")
-

 class TestDateTimeTimeStamp:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_timestamp_high(self, pandas, duckdb_cursor):
+    def test_timestamp_high(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql("SELECT '2260-01-01 23:59:00'::TIMESTAMP AS '0'").df()
-        df_in = pandas.DataFrame(  # noqa: F841
+        df_in = pd.DataFrame(  # noqa: F841
             {
-                0: pandas.Series(
+                0: pd.Series(
                     data=[datetime.datetime(year=2260, month=1, day=1, hour=23, minute=59)],
                     dtype="datetime64[us]",
                 )
             }
         )
         df_out = duckdb_cursor.sql("select * from df_in").df()
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_timestamp_low(self, pandas, duckdb_cursor):
+    def test_timestamp_low(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql(
             """
             SELECT '1680-01-01 23:59:00.234243'::TIMESTAMP AS '0'
         """
         ).df()
-        df_in = pandas.DataFrame(
+        df_in = pd.DataFrame(
             {
-                "0": pandas.Series(
+                "0": pd.Series(
                     data=[
-                        pandas.Timestamp(
+                        pd.Timestamp(
                             datetime.datetime(year=1680, month=1, day=1, hour=23, minute=59, microsecond=234243),
                             unit="us",
                         )
@@ -46,13 +42,12 @@ def test_timestamp_low(self, pandas, duckdb_cursor):
         print("df_in:", df_in["0"].dtype)
         df_out = duckdb_cursor.sql("select * from df_in").df()
         print("df_out:", df_out["0"].dtype)
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

     @pytest.mark.skipif(
         Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones"
     )
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_timestamp_timezone_regular(self, pandas, duckdb_cursor):
+    def test_timestamp_timezone_regular(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql(
             """
             SELECT timestamp '2022-01-01 12:00:00' AT TIME ZONE 'Pacific/Easter' as "0"
@@ -61,9 +56,9 @@
         offset = datetime.timedelta(hours=-2)
         timezone = datetime.timezone(offset)

-        df_in = pandas.DataFrame(  # noqa: F841
+        df_in = pd.DataFrame(  # noqa: F841
             {
-                0: pandas.Series(
+                0: pd.Series(
                     data=[datetime.datetime(year=2022, month=1, day=1, hour=15, tzinfo=timezone)], dtype="object"
                 )
             }
@@ -71,13 +66,12 @@
         df_out = duckdb_cursor.sql("select * from df_in").df()
         print(df_out)
         print(duckdb_time)
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

     @pytest.mark.skipif(
         Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones"
     )
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_timestamp_timezone_negative_extreme(self, pandas, duckdb_cursor):
+    def test_timestamp_timezone_negative_extreme(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql(
             """
             SELECT timestamp '2022-01-01 12:00:00' AT TIME ZONE 'Chile/EasterIsland' as "0"
@@ -87,21 +81,20 @@ def test_timestamp_timezone_negative_extreme(self, pandas, duckdb_cursor):
         offset = datetime.timedelta(hours=-19)
         timezone = datetime.timezone(offset)

-        df_in = pandas.DataFrame(  # noqa: F841
+        df_in = pd.DataFrame(  # noqa: F841
             {
-                0: pandas.Series(
+                0: pd.Series(
                     data=[datetime.datetime(year=2021, month=12, day=31, hour=22, tzinfo=timezone)], dtype="object"
                 )
             }
         )
         df_out = duckdb_cursor.sql("select * from df_in").df()
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

     @pytest.mark.skipif(
         Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones"
     )
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_timestamp_timezone_positive_extreme(self, pandas, duckdb_cursor):
+    def test_timestamp_timezone_positive_extreme(self, duckdb_cursor):
         duckdb_time = duckdb_cursor.sql(
             """
             SELECT timestamp '2021-12-31 23:00:00' AT TIME ZONE 'Etc/GMT-14' as "0"
@@ -111,22 +104,21 @@ def test_timestamp_timezone_positive_extreme(self, pandas, duckdb_cursor):
         offset = datetime.timedelta(hours=14)
         timezone = datetime.timezone(offset)

-        df_in = pandas.DataFrame(  # noqa: F841
+        df_in = pd.DataFrame(  # noqa: F841
             {
-                0: pandas.Series(
+                0: pd.Series(
                     data=[datetime.datetime(year=2021, month=12, day=31, hour=23, tzinfo=timezone)], dtype="object"
                 )
             }
         )
         df_out = duckdb_cursor.sql("""select * from df_in""").df()
-        pandas.testing.assert_frame_equal(df_out, duckdb_time)
+        pd.testing.assert_frame_equal(df_out, duckdb_time)

     @pytest.mark.skipif(
         Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones"
     )
     @pytest.mark.parametrize("unit", ["ms", "ns", "s"])
     def test_timestamp_timezone_coverage(self, unit, duckdb_cursor):
-        pd = pytest.importorskip("pandas")
         ts_df = pd.DataFrame(  # noqa: F841
             {"ts": pd.Series(data=[pd.Timestamp(datetime.datetime(1990, 12, 21))], dtype=f"datetime64[{unit}]")}
         )
diff --git a/tests/fast/pandas/test_df_analyze.py b/tests/fast/pandas/test_df_analyze.py
index 96cd426d..d9881ffa 100644
--- a/tests/fast/pandas/test_df_analyze.py
+++ b/tests/fast/pandas/test_df_analyze.py
@@ -1,58 +1,51 @@
-import numpy as np
+import pandas as pd
 import pytest
-from conftest import ArrowPandas, NumpyPandas
+from conftest import is_string_dtype

 import duckdb


-def create_generic_dataframe(data, pandas):
-    return pandas.DataFrame({"col0": pandas.Series(data=data, dtype="object")})
+def create_generic_dataframe(data):
+    return pd.DataFrame({"col0": pd.Series(data=data, dtype="object")})


 class TestResolveObjectColumns:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_sample_low_correct(self, duckdb_cursor, pandas):
-        print(pandas.backend)
+    def test_sample_low_correct(self, duckdb_cursor):
         duckdb_conn = duckdb.connect()
         duckdb_conn.execute("SET pandas_analyze_sample=3")
         data = [1000008, 6, 9, 4, 1, 6]
-        df = create_generic_dataframe(data, pandas)
+        df = create_generic_dataframe(data)
         roundtripped_df = duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df()
         duckdb_df = duckdb_conn.query("select * FROM (VALUES (1000008), (6), (9), (4), (1), (6)) as '0'").df()
-        pandas.testing.assert_frame_equal(duckdb_df, roundtripped_df, check_dtype=False)
+        pd.testing.assert_frame_equal(duckdb_df, roundtripped_df, check_dtype=False)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_sample_low_incorrect_detected(self, duckdb_cursor, pandas):
+    def test_sample_low_incorrect_detected(self, duckdb_cursor):
         duckdb_conn = duckdb.connect()
         duckdb_conn.execute("SET pandas_analyze_sample=2")
         # size of list (6) divided by 'pandas_analyze_sample' (2) is the increment used
         # in this case index 0 (1000008) and index 3 ([4]) are checked, which don't match
         data = [1000008, 6, 9, [4], 1, 6]
-        df = create_generic_dataframe(data, pandas)
+        df = create_generic_dataframe(data)
         roundtripped_df = duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df()
         # Sample high enough to detect mismatch in types, fallback to VARCHAR
-        assert roundtripped_df["col0"].dtype == np.dtype("object")
+        assert is_string_dtype(roundtripped_df["col0"].dtype)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_sample_zero(self, duckdb_cursor, pandas):
+    def test_sample_zero_infers_varchar(self, duckdb_cursor):
+        """Test that with analyze disabled, object columns are treated as VARCHAR."""
         duckdb_conn = duckdb.connect()
         # Disable dataframe analyze
         duckdb_conn.execute("SET pandas_analyze_sample=0")
         data = [1000008, 6, 9, 3, 1, 6]
-        df = create_generic_dataframe(data, pandas)
+        df = create_generic_dataframe(data)
         roundtripped_df = duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df()
-        # Always converts to VARCHAR
-        if pandas.backend == "pyarrow":
-            assert roundtripped_df["col0"].dtype == np.dtype("int64")
-        else:
-            assert roundtripped_df["col0"].dtype == np.dtype("object")
+        # Always converts to VARCHAR when analyze is disabled
+        assert is_string_dtype(roundtripped_df["col0"].dtype)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_sample_low_incorrect_undetected(self, duckdb_cursor, pandas):
+    def test_sample_low_incorrect_undetected(self, duckdb_cursor):
         duckdb_conn = duckdb.connect()
         duckdb_conn.execute("SET pandas_analyze_sample=1")
         data = [1000008, 6, 9, [4], [1], 6]
-        df = create_generic_dataframe(data, pandas)
+        df = create_generic_dataframe(data)
         # Sample size is too low to detect the mismatch, exception is raised when trying to convert
         with pytest.raises(duckdb.InvalidInputException, match="Failed to cast value: Unimplemented type for cast"):
             duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df()
@@ -65,12 +58,11 @@ def test_reset_analyze_sample_setting(self, duckdb_cursor):
         res = duckdb_cursor.execute("select current_setting('pandas_analyze_sample')").fetchall()
         assert res == [(1000,)]

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_10750(self, duckdb_cursor, pandas):
+    def test_10750(self, duckdb_cursor):
         max_row_number = 2000
         data = {"id": list(range(max_row_number + 1)), "content": [None for _ in range(max_row_number + 1)]}

-        pdf = pandas.DataFrame(data=data)
+        pdf = pd.DataFrame(data=data)
         duckdb_cursor.register("content", pdf)
         res = duckdb_cursor.query("select id from content").fetchall()
         expected = [(i,) for i in range(2001)]
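The rewritten assertions above lean on an `is_string_dtype` helper imported from the test suite's conftest. Its implementation is not part of this patch; a plausible sketch (an assumption, not the actual conftest code) that accepts both the 2.x `object` representation and the 3.x string dtype:

    import numpy as np
    import pandas as pd

    def is_string_dtype(dtype) -> bool:
        # object columns (pandas 2.x) and native string dtypes (pandas 3.x) both qualify
        return dtype == np.dtype("object") or pd.api.types.is_string_dtype(dtype)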
diff --git a/tests/fast/pandas/test_df_object_resolution.py b/tests/fast/pandas/test_df_object_resolution.py
index 58ae0c94..0c5ab311 100644
--- a/tests/fast/pandas/test_df_object_resolution.py
+++ b/tests/fast/pandas/test_df_object_resolution.py
@@ -7,16 +7,17 @@
 from decimal import Decimal

 import numpy as np
+import pandas as pd
 import pytest
-from conftest import ArrowPandas, NumpyPandas
+from conftest import is_string_dtype

 import duckdb

 standard_vector_size = duckdb.__standard_vector_size__


-def create_generic_dataframe(data, pandas):
-    return pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")})
+def create_generic_dataframe(data):
+    return pd.DataFrame({"0": pd.Series(data=data, dtype="object")})


 def create_repeated_nulls(size):
@@ -42,11 +43,11 @@ def __str__(self) -> str:

 # To avoid DECIMAL being upgraded to DOUBLE (because DOUBLE outranks DECIMAL as a LogicalType)
 # These floats had their precision preserved as string and are now cast to decimal.Decimal
-def ConvertStringToDecimal(data: list, pandas):
+def ConvertStringToDecimal(data: list):
     for i in range(len(data)):
         if isinstance(data[i], str):
             data[i] = decimal.Decimal(data[i])
-    data = pandas.Series(data=data, dtype="object")
+    data = pd.Series(data=data, dtype="object")
     return data

@@ -74,9 +75,9 @@ def construct_map(pair):
 ]


-def check_struct_upgrade(expected_type: str, creation_method, pair: ObjectPair, pandas, cursor):
+def check_struct_upgrade(expected_type: str, creation_method, pair: ObjectPair, cursor):
     column_data = creation_method(pair)
-    df = pandas.DataFrame(data={"col": column_data})
+    df = pd.DataFrame(data={"col": column_data})
     rel = cursor.query("select col from df")
     res = rel.fetchall()
     print("COLUMN_DATA", column_data)
@@ -85,29 +86,25 @@ class TestResolveObjectColumns:
-    # TODO: add support for ArrowPandas  # noqa: TD002, TD003
-    @pytest.mark.parametrize("pandas", [NumpyPandas()])
-    def test_integers(self, pandas, duckdb_cursor):
+    def test_integers(self, duckdb_cursor):
         data = [5, 0, 3]
-        df_in = create_generic_dataframe(data, pandas)
+        df_in = create_generic_dataframe(data)
         # These are float64 because pandas would force these to be float64 even if we set them to int8, int16,
         # int32, int64 respectively
-        df_expected_res = pandas.DataFrame({"0": pandas.Series(data=data, dtype="int32")})
+        df_expected_res = pd.DataFrame({"0": pd.Series(data=data, dtype="int32")})
         df_out = duckdb_cursor.sql("SELECT * FROM df_in").df()
         print(df_out)
-        pandas.testing.assert_frame_equal(df_expected_res, df_out)
+        pd.testing.assert_frame_equal(df_expected_res, df_out)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_struct_correct(self, pandas, duckdb_cursor):
+    def test_struct_correct(self, duckdb_cursor):
         data = [{"a": 1, "b": 3, "c": 3, "d": 7}]
-        df = pandas.DataFrame({"0": pandas.Series(data=data)})
+        df = pd.DataFrame({"0": pd.Series(data=data)})
         duckdb_col = duckdb_cursor.sql("SELECT {a: 1, b: 3, c: 3, d: 7} as '0'").df()
         converted_col = duckdb_cursor.sql("SELECT * FROM df").df()
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_fallback_different_keys(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_map_fallback_different_keys(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
@@ -118,7 +115,7 @@ def test_map_fallback_different_keys(self, pandas, duckdb_cursor):
         )
         converted_df = duckdb_cursor.sql("SELECT * FROM x").df()

-        y = pandas.DataFrame(
+        y = pd.DataFrame(
             [
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
@@ -128,11 +125,10 @@ def test_map_fallback_different_keys(self, pandas, duckdb_cursor):
             ]
         )
         equal_df = duckdb_cursor.sql("SELECT * FROM y").df()
-        pandas.testing.assert_frame_equal(converted_df, equal_df)
+        pd.testing.assert_frame_equal(converted_df, equal_df)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_fallback_incorrect_amount_of_keys(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_map_fallback_incorrect_amount_of_keys(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
@@ -142,7 +138,7 @@ def test_map_fallback_incorrect_amount_of_keys(self, pandas, duckdb_cursor):
             ]
         )
         converted_df = duckdb_cursor.sql("SELECT * FROM x").df()
-        y = pandas.DataFrame(
+        y = pd.DataFrame(
             [
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
@@ -152,11 +148,10 @@ def test_map_fallback_incorrect_amount_of_keys(self, pandas, duckdb_cursor):
             ]
         )
         equal_df = duckdb_cursor.sql("SELECT * FROM y").df()
-        pandas.testing.assert_frame_equal(converted_df, equal_df)
+        pd.testing.assert_frame_equal(converted_df, equal_df)
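The fallback these tests pin down: dict rows whose key sets don't line up cannot be unified into one STRUCT, so the scan degrades to a MAP. A minimal illustration of the idea (the expected type mirrors what the tests above assert):

    import duckdb
    import pandas as pd

    df = pd.DataFrame({"col": [{"a": 1}, {"b": 2}]})  # mismatched keys
    rel = duckdb.sql("select col from df")
    print(rel.types)  # expected: a MAP(VARCHAR, INTEGER) rather than a STRUCT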
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_struct_value_upgrade(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_struct_value_upgrade(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"a": 1, "b": 3, "c": 3, "d": "string"}],
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
@@ -165,7 +160,7 @@ def test_struct_value_upgrade(self, pandas, duckdb_cursor):
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
             ]
         )
-        y = pandas.DataFrame(
+        y = pd.DataFrame(
             [
                 [{"a": 1, "b": 3, "c": 3, "d": "string"}],
                 [{"a": 1, "b": 3, "c": 3, "d": "7"}],
@@ -176,11 +171,10 @@ def test_struct_value_upgrade(self, pandas, duckdb_cursor):
             ]
         )
         converted_df = duckdb_cursor.sql("SELECT * FROM x").df()
         equal_df = duckdb_cursor.sql("SELECT * FROM y").df()
-        pandas.testing.assert_frame_equal(converted_df, equal_df)
+        pd.testing.assert_frame_equal(converted_df, equal_df)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_struct_null(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_struct_null(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [None],
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
@@ -189,7 +183,7 @@ def test_struct_null(self, pandas, duckdb_cursor):
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
             ]
         )
-        y = pandas.DataFrame(
+        y = pd.DataFrame(
             [
                 [None],
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
@@ -200,11 +194,10 @@ def test_struct_null(self, pandas, duckdb_cursor):
             ]
         )
         converted_df = duckdb_cursor.sql("SELECT * FROM x").df()
         equal_df = duckdb_cursor.sql("SELECT * FROM y").df()
-        pandas.testing.assert_frame_equal(converted_df, equal_df)
+        pd.testing.assert_frame_equal(converted_df, equal_df)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_fallback_value_upgrade(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_map_fallback_value_upgrade(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"a": 1, "b": 3, "c": 3, "d": "test"}],
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
@@ -213,7 +206,7 @@ def test_map_fallback_value_upgrade(self, pandas, duckdb_cursor):
                 [{"a": 1, "b": 3, "c": 3, "d": 7}],
             ]
         )
-        y = pandas.DataFrame(
+        y = pd.DataFrame(
             [
                 [{"a": "1", "b": "3", "c": "3", "d": "test"}],
                 [{"a": "1", "b": "3", "c": "3", "d": "7"}],
@@ -224,11 +217,10 @@ def test_map_fallback_value_upgrade(self, pandas, duckdb_cursor):
             ]
         )
         converted_df = duckdb_cursor.sql("SELECT * FROM x").df()
         equal_df = duckdb_cursor.sql("SELECT * FROM y").df()
-        pandas.testing.assert_frame_equal(converted_df, equal_df)
+        pd.testing.assert_frame_equal(converted_df, equal_df)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_correct(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_map_correct(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
@@ -255,23 +247,21 @@ def test_map_correct(self, pandas, duckdb_cursor):
         duckdb_col = duckdb_cursor.sql("select a from tmp AS '0'").df()
         print(duckdb_col.columns)
         print(converted_col.columns)
-        pandas.testing.assert_frame_equal(converted_col, duckdb_col)
+        pd.testing.assert_frame_equal(converted_col, duckdb_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
     @pytest.mark.parametrize("sample_size", [1, 10])
     @pytest.mark.parametrize("fill", [1000, 10000])
     @pytest.mark.parametrize("get_data", [create_repeated_nulls, create_trailing_non_null])
-    def test_analyzing_nulls(self, pandas, duckdb_cursor, fill, sample_size, get_data):
+    def test_analyzing_nulls(self, duckdb_cursor, fill, sample_size, get_data):
         data = get_data(fill)
-        df1 = pandas.DataFrame(data={"col1": data})
+        df1 = pd.DataFrame(data={"col1": data})

         duckdb_cursor.execute(f"SET GLOBAL pandas_analyze_sample={sample_size}")
         df = duckdb_cursor.execute("select * from df1").df()
-        pandas.testing.assert_frame_equal(df1, df)
+        pd.testing.assert_frame_equal(df1, df, check_dtype=False)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_nested_map(self, pandas, duckdb_cursor):
-        df = pandas.DataFrame(data={"col1": [{"a": {"b": {"x": "A", "y": "B"}}}, {"c": {"b": {"x": "A"}}}]})
+    def test_nested_map(self, duckdb_cursor):
+        df = pd.DataFrame(data={"col1": [{"a": {"b": {"x": "A", "y": "B"}}}, {"c": {"b": {"x": "A"}}}]})

         rel = duckdb_cursor.sql("select * from df")
         expected_rel = duckdb_cursor.sql(
@@ -287,9 +277,8 @@ def test_nested_map(self, pandas, duckdb_cursor):
         expected_res = str(expected_rel)
         assert res == expected_res

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_value_upgrade(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_map_value_upgrade(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, "test"]}],
                 [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}],
@@ -321,36 +310,31 @@ def test_map_value_upgrade(self, pandas, duckdb_cursor):
         duckdb_col = duckdb_cursor.sql("select a from tmp2 AS '0'").df()
         print(duckdb_col.columns)
         print(converted_col.columns)
-        pandas.testing.assert_frame_equal(converted_col, duckdb_col)
+        pd.testing.assert_frame_equal(converted_col, duckdb_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_duplicate(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([[{"key": ["a", "a", "b"], "value": [4, 0, 4]}]])
+    def test_map_duplicate(self, duckdb_cursor):
+        x = pd.DataFrame([[{"key": ["a", "a", "b"], "value": [4, 0, 4]}]])
         with pytest.raises(duckdb.InvalidInputException, match="Map keys must be unique"):
             duckdb_cursor.sql("select * from x").show()

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_nullkey(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([[{"key": [None, "a", "b"], "value": [4, 0, 4]}]])
+    def test_map_nullkey(self, duckdb_cursor):
+        x = pd.DataFrame([[{"key": [None, "a", "b"], "value": [4, 0, 4]}]])
         with pytest.raises(duckdb.InvalidInputException, match="Map keys can not be NULL"):
             converted_col = duckdb_cursor.sql("select * from x").df()

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_nullkeylist(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([[{"key": None, "value": None}]])
+    def test_map_nullkeylist(self, duckdb_cursor):
+        x = pd.DataFrame([[{"key": None, "value": None}]])
         converted_col = duckdb_cursor.sql("select * from x").df()
         duckdb_col = duckdb_cursor.sql("SELECT MAP(NULL, NULL) as '0'").df()
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_fallback_nullkey(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([[{"a": 4, None: 0, "c": 4}], [{"a": 4, None: 0, "d": 4}]])
+    def test_map_fallback_nullkey(self, duckdb_cursor):
+        x = pd.DataFrame([[{"a": 4, None: 0, "c": 4}], [{"a": 4, None: 0, "d": 4}]])
         with pytest.raises(duckdb.InvalidInputException, match="Map keys can not be NULL"):
             converted_col = duckdb_cursor.sql("select * from x").df()

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_map_fallback_nullkey_coverage(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_map_fallback_nullkey_coverage(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{"key": None, "value": None}],
                 [{"key": None, None: 5}],
@@ -359,8 +343,7 @@ def test_map_fallback_nullkey_coverage(self, pandas, duckdb_cursor):
         with pytest.raises(duckdb.InvalidInputException, match="Map keys can not be NULL"):
             converted_col = duckdb_cursor.sql("select * from x").df()

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_structs_in_nested_types(self, pandas, duckdb_cursor):
+    def test_structs_in_nested_types(self, duckdb_cursor):
         # This test is testing a bug that occurred when type upgrades occurred inside nested types
         # STRUCT(key1 varchar) + STRUCT(key1 varchar, key2 varchar) turns into MAP
         # But when inside a nested structure, this upgrade did not happen properly
@@ -373,20 +356,19 @@ def test_structs_in_nested_types(self, pandas, duckdb_cursor):
         }

         for pair in pairs.values():
-            check_struct_upgrade("MAP(VARCHAR, INTEGER)[]", construct_list, pair, pandas, duckdb_cursor)
+            check_struct_upgrade("MAP(VARCHAR, INTEGER)[]", construct_list, pair, duckdb_cursor)

         for key, pair in pairs.items():
             expected_type = "MAP(VARCHAR, MAP(VARCHAR, INTEGER))" if key == "v4" else "STRUCT(v1 MAP(VARCHAR, INTEGER))"
-            check_struct_upgrade(expected_type, construct_struct, pair, pandas, duckdb_cursor)
+            check_struct_upgrade(expected_type, construct_struct, pair, duckdb_cursor)

         for pair in pairs.values():
-            check_struct_upgrade("MAP(VARCHAR, MAP(VARCHAR, INTEGER))", construct_map, pair, pandas, duckdb_cursor)
+            check_struct_upgrade("MAP(VARCHAR, MAP(VARCHAR, INTEGER))", construct_map, pair, duckdb_cursor)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_structs_of_different_sizes(self, pandas, duckdb_cursor):
+    def test_structs_of_different_sizes(self, duckdb_cursor):
         # This list has both a STRUCT(v1) and a STRUCT(v1, v2) member
         # Those can't be combined
-        df = pandas.DataFrame(
+        df = pd.DataFrame(
             data={
                 "col": [
                     [
@@ -416,9 +398,8 @@ def test_structs_of_different_sizes(self, pandas, duckdb_cursor):
         ):
             res = duckdb_cursor.execute("select $1", [malformed_struct])

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_struct_key_conversion(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_struct_key_conversion(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [{IntString(5): 1, IntString(-25): 3, IntString(32): 3, IntString(32456): 7}],
             ]
@@ -426,43 +407,38 @@ def test_struct_key_conversion(self, pandas, duckdb_cursor):
         duckdb_col = duckdb_cursor.sql("select {'5':1, '-25':3, '32':3, '32456':7} as '0'").df()
         converted_col = duckdb_cursor.sql("select * from x").df()
         duckdb_cursor.sql("drop view if exists tbl")
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_correct(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([{"0": [[5], [34], [-245]]}])
+    def test_list_correct(self, duckdb_cursor):
+        x = pd.DataFrame([{"0": [[5], [34], [-245]]}])
         duckdb_col = duckdb_cursor.sql("select [[5], [34], [-245]] as '0'").df()
         converted_col = duckdb_cursor.sql("select * from x").df()
         duckdb_cursor.sql("drop view if exists tbl")
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_contains_null(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([{"0": [[5], None, [-245]]}])
+    def test_list_contains_null(self, duckdb_cursor):
+        x = pd.DataFrame([{"0": [[5], None, [-245]]}])
         duckdb_col = duckdb_cursor.sql("select [[5], NULL, [-245]] as '0'").df()
         converted_col = duckdb_cursor.sql("select * from x").df()
         duckdb_cursor.sql("drop view if exists tbl")
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_starts_with_null(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([{"0": [None, [5], [-245]]}])
+    def test_list_starts_with_null(self, duckdb_cursor):
+        x = pd.DataFrame([{"0": [None, [5], [-245]]}])
         duckdb_col = duckdb_cursor.sql("select [NULL, [5], [-245]] as '0'").df()
         converted_col = duckdb_cursor.sql("select * from x").df()
         duckdb_cursor.sql("drop view if exists tbl")
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_value_upgrade(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame([{"0": [["5"], [34], [-245]]}])
+    def test_list_value_upgrade(self, duckdb_cursor):
+        x = pd.DataFrame([{"0": [["5"], [34], [-245]]}])
         duckdb_rel = duckdb_cursor.sql("select [['5'], ['34'], ['-245']] as '0'")
         duckdb_col = duckdb_rel.df()
         converted_col = duckdb_cursor.sql("select * from x").df()
-        pandas.testing.assert_frame_equal(duckdb_col, converted_col)
+        pd.testing.assert_frame_equal(duckdb_col, converted_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_column_value_upgrade(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_list_column_value_upgrade(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [[1, 25, 300]],
                 [[500, 345, 30]],
@@ -496,46 +472,35 @@ def test_list_column_value_upgrade(self, pandas, duckdb_cursor):
         duckdb_col = duckdb_cursor.sql("select a from tmp3 AS '0'").df()
         print(duckdb_col.columns)
         print(converted_col.columns)
-        pandas.testing.assert_frame_equal(converted_col, duckdb_col)
+        pd.testing.assert_frame_equal(converted_col, duckdb_col)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_ubigint_object_conversion(self, pandas, duckdb_cursor):
+    def test_ubigint_object_conversion(self, duckdb_cursor):
         # UBIGINT + TINYINT would result in HUGEINT, but conversion to HUGEINT is not supported yet from pandas->duckdb
         # So this instead becomes a DOUBLE
         data = [18446744073709551615, 0]
-        x = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")})
+        x = pd.DataFrame({"0": pd.Series(data=data, dtype="object")})
         converted_col = duckdb_cursor.sql("select * from x").df()
-        if pandas.backend == "numpy_nullable":
-            float64 = np.dtype("float64")
-            assert isinstance(converted_col["0"].dtype, float64.__class__)
-        else:
-            uint64 = np.dtype("uint64")
-            assert isinstance(converted_col["0"].dtype, uint64.__class__)
-
-    @pytest.mark.parametrize("pandas", [NumpyPandas()])
-    def test_double_object_conversion(self, pandas, duckdb_cursor):
+        float64 = np.dtype("float64")
+        assert isinstance(converted_col["0"].dtype, float64.__class__)
+
+    def test_double_object_conversion(self, duckdb_cursor):
         data = [18446744073709551616, 0]
-        x = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")})
+        x = pd.DataFrame({"0": pd.Series(data=data, dtype="object")})
         converted_col = duckdb_cursor.sql("select * from x").df()
         double_dtype = np.dtype("float64")
         assert isinstance(converted_col["0"].dtype, double_dtype.__class__)
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
     @pytest.mark.xfail(
         condition=platform.system() == "Emscripten",
         reason="older numpy raises a warning when running with Pyodide",
     )
-    def test_numpy_object_with_stride(self, pandas, duckdb_cursor):
-        df = pandas.DataFrame(columns=["idx", "evens", "zeros"])
-
-        df["idx"] = list(range(10))
-        for col in df.columns[1:]:
-            df[col].values[:] = 0
+    def test_numpy_object_with_stride(self, duckdb_cursor):
+        # Create 2D array in C-order (row-major)
+        data = np.zeros((10, 3), dtype=np.int64)
+        data[:, 0] = np.arange(10)
+        data[:, 1] = np.arange(0, 20, 2)

-        counter = 0
-        for i in range(10):
-            df.loc[df["idx"] == i, "evens"] += counter
-            counter += 2
+        df = pd.DataFrame(data, columns=["idx", "evens", "zeros"])

         res = duckdb_cursor.sql("select * from df").fetchall()
         assert res == [
@@ -551,27 +516,24 @@ def test_numpy_object_with_stride(self, pandas, duckdb_cursor):
             (9, 18, 0),
         ]

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numpy_stringliterals(self, pandas, duckdb_cursor):
-        df = pandas.DataFrame({"x": list(map(np.str_, range(3)))})
+    def test_numpy_stringliterals(self, duckdb_cursor):
+        df = pd.DataFrame({"x": list(map(np.str_, range(3)))})
         res = duckdb_cursor.execute("select * from df").fetchall()
         assert res == [("0",), ("1",), ("2",)]

-    @pytest.mark.parametrize("pandas", [NumpyPandas()])
-    def test_integer_conversion_fail(self, pandas, duckdb_cursor):
+    def test_integer_conversion_fail(self, duckdb_cursor):
         data = [2**10000, 0]
-        x = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")})
+        x = pd.DataFrame({"0": pd.Series(data=data, dtype="object")})
         converted_col = duckdb_cursor.sql("select * from x").df()
         print(converted_col["0"])
-        double_dtype = np.dtype("object")
-        assert isinstance(converted_col["0"].dtype, double_dtype.__class__)
+        # default: VARCHAR
+        assert is_string_dtype(converted_col["0"].dtype)

     # Most of the time numpy.datetime64 is just a wrapper around a datetime.datetime object
     # But to support arbitrary precision, it can fall back to using an `int` internally
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])  # Which we don't support yet
-    def test_numpy_datetime(self, pandas, duckdb_cursor):
+    def test_numpy_datetime(self, duckdb_cursor):
         numpy = pytest.importorskip("numpy")

         data = []
@@ -579,25 +541,23 @@ def test_numpy_datetime(self, pandas, duckdb_cursor):
         data += [numpy.datetime64("2022-02-21T06:59:23.324812")] * standard_vector_size
         data += [numpy.datetime64("1974-06-05T13:12:01.000000")] * standard_vector_size
         data += [numpy.datetime64("2049-01-13T00:24:31.999999")] * standard_vector_size
-        x = pandas.DataFrame({"dates": pandas.Series(data=data, dtype="object")})
+        x = pd.DataFrame({"dates": pd.Series(data=data, dtype="object")})
         res = duckdb_cursor.sql("select distinct * from x").df()
         assert len(res["dates"].__array__()) == 4

-    @pytest.mark.parametrize("pandas", [NumpyPandas()])
-    def test_numpy_datetime_int_internally(self, pandas, duckdb_cursor):
+    def test_numpy_datetime_int_internally(self, duckdb_cursor):
         numpy = pytest.importorskip("numpy")
         data = [numpy.datetime64("2022-12-10T21:38:24.0000000000001")]
-        x = pandas.DataFrame({"dates": pandas.Series(data=data, dtype="object")})
+        x = pd.DataFrame({"dates": pd.Series(data=data, dtype="object")})
         with pytest.raises(
             duckdb.ConversionException,
             match=re.escape("Conversion Error: Unimplemented type for cast (BIGINT -> TIMESTAMP)"),
         ):
             rel = duckdb.query_df(x, "x", "create table dates as select dates::TIMESTAMP WITHOUT TIME ZONE from x")

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_fallthrough_object_conversion(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_fallthrough_object_conversion(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 [IntString(4)],
                 [IntString(2)],
@@ -605,11 +565,10 @@ def test_fallthrough_object_conversion(self, pandas, duckdb_cursor):
             ]
         )
         duckdb_col = duckdb_cursor.sql("select * from x").df()
-        df_expected_res = pandas.DataFrame({"0": pandas.Series(["4", "2", "0"])})
-        pandas.testing.assert_frame_equal(duckdb_col, df_expected_res)
+        df_expected_res = pd.DataFrame({"0": pd.Series(["4", "2", "0"])})
+        pd.testing.assert_frame_equal(duckdb_col, df_expected_res, check_dtype=False)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal(self, pandas, duckdb_cursor):
+    def test_numeric_decimal(self, duckdb_cursor):
         # DuckDB uses DECIMAL where possible, so all the 'float' types here are actually DECIMAL
         reference_query = """
             CREATE TABLE tbl AS SELECT * FROM (
@@ -625,14 +584,12 @@ def test_numeric_decimal(self, pandas, duckdb_cursor):
         duckdb_cursor.execute(reference_query)
         # Because of this we need to wrap these native floats as DECIMAL for this test, to avoid these decimals being
         # "upgraded" to DOUBLE
-        x = pandas.DataFrame(
+        x = pd.DataFrame(
             {
-                "0": ConvertStringToDecimal([5, "12.0", "-123.0", "-234234.0", None, "1.234"], pandas),
-                "1": ConvertStringToDecimal(
-                    [5002340, 13, "-12.0000000005", "7453324234.0", None, "-324234234"], pandas
-                ),
+                "0": ConvertStringToDecimal([5, "12.0", "-123.0", "-234234.0", None, "1.234"]),
+                "1": ConvertStringToDecimal([5002340, 13, "-12.0000000005", "7453324234.0", None, "-324234234"]),
                 "2": ConvertStringToDecimal(
-                    ["-234234234234.0", "324234234.00000005", -128, 345345, "1E5", "1324234359"], pandas
+                    ["-234234234234.0", "324234234.00000005", -128, 345345, "1E5", "1324234359"]
                 ),
             }
         )
@@ -641,9 +598,8 @@ def test_numeric_decimal(self, pandas, duckdb_cursor):

         assert conversion == reference

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_coverage(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_numeric_decimal_coverage(self, duckdb_cursor):
+        x = pd.DataFrame(
             {"0": [Decimal("nan"), Decimal("+nan"), Decimal("-nan"), Decimal("inf"), Decimal("+inf"), Decimal("-inf")]}
         )
         conversion = duckdb_cursor.sql("select * from x").fetchall()
@@ -659,22 +615,18 @@ def test_numeric_decimal_coverage(self, pandas, duckdb_cursor):
         assert str(conversion) == "[(nan,), (nan,), (nan,), (inf,), (inf,), (inf,)]"

     # Test that the column 'offset' is actually used when converting,
-
-    @pytest.mark.parametrize(
-        "pandas", [NumpyPandas(), ArrowPandas()]
-    )  # and that the same 2048 (STANDARD_VECTOR_SIZE) values are not being scanned over and over again
-    def test_multiple_chunks(self, pandas, duckdb_cursor):
+    # and that the same 2048 (STANDARD_VECTOR_SIZE) values are not being scanned over and over again
+    def test_multiple_chunks(self, duckdb_cursor):
         data = []
         data += [datetime.date(2022, 9, 13) for x in range(standard_vector_size)]
         data += [datetime.date(2022, 9, 14) for x in range(standard_vector_size)]
         data += [datetime.date(2022, 9, 15) for x in range(standard_vector_size)]
         data += [datetime.date(2022, 9, 16) for x in range(standard_vector_size)]
-        x = pandas.DataFrame({"dates": pandas.Series(data=data, dtype="object")})
+        x = pd.DataFrame({"dates": pd.Series(data=data, dtype="object")})
         res = duckdb_cursor.sql("select distinct * from x").df()
         assert len(res["dates"].__array__()) == 4

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor):
+    def test_multiple_chunks_aggregate(self, duckdb_cursor):
         duckdb_cursor.execute("SET GLOBAL pandas_analyze_sample=4096")
         duckdb_cursor.execute(
             "create table dates as select '2022-09-14'::DATE + INTERVAL (i::INTEGER) DAY as i from range(4096) tbl(i);"
@@ -684,7 +636,7 @@ def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor):
         date_df = res.copy()
         # Convert the dataframe to datetime
-        date_df["i"] = pandas.to_datetime(res["i"]).dt.date
+        date_df["i"] = pd.to_datetime(res["i"]).dt.date
         assert str(date_df["i"].dtype) == "object"

         expected_res = [
@@ -722,7 +674,7 @@ def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor):
         ]
         # Convert the dataframe to datetime
         date_df = res.copy()
-        date_df["i"] = pandas.to_datetime(res["i"]).dt.date
+        date_df["i"] = pd.to_datetime(res["i"]).dt.date
         assert str(date_df["i"].dtype) == "object"

         actual_res = duckdb_cursor.sql(
@@ -737,21 +689,19 @@ def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor):

         assert expected_res == actual_res

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_mixed_object_types(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_mixed_object_types(self, duckdb_cursor):
+        x = pd.DataFrame(
             {
-                "nested": pandas.Series(
+                "nested": pd.Series(
                     data=[{"a": 1, "b": 2}, [5, 4, 3], {"key": [1, 2, 3], "value": ["a", "b", "c"]}], dtype="object"
                 ),
             }
         )
         res = duckdb_cursor.sql("select * from x").df()
-        assert res["nested"].dtype == np.dtype("object")
+        assert is_string_dtype(res["nested"].dtype)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_struct_deeply_nested_in_struct(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_struct_deeply_nested_in_struct(self, duckdb_cursor):
+        x = pd.DataFrame(
             [
                 {
                     # STRUCT(b STRUCT(x VARCHAR, y VARCHAR))
@@ -768,9 +718,8 @@ def test_struct_deeply_nested_in_struct(self, pandas, duckdb_cursor):
         res = duckdb_cursor.sql("select * from x").fetchall()
         assert res == [({"b": {"x": "A", "y": "B"}},), ({"b": {"x": "A"}},)]

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_struct_deeply_nested_in_list(self, pandas, duckdb_cursor):
-        x = pandas.DataFrame(
+    def test_struct_deeply_nested_in_list(self, duckdb_cursor):
+        x = pd.DataFrame(
             {
                 "a": [
                     [
@@ -787,16 +736,14 @@ def test_struct_deeply_nested_in_list(self, pandas, duckdb_cursor):
         res = duckdb_cursor.sql("select * from x").fetchall()
         assert res == [([{"x": "A", "y": "B"}, {"x": "A"}],)]

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_analyze_sample_too_small(self, pandas, duckdb_cursor):
+    def test_analyze_sample_too_small(self, duckdb_cursor):
         data = [1 for _ in range(9)] + [[1, 2, 3]] + [1 for _ in range(9991)]
-        x = pandas.DataFrame({"a": pandas.Series(data=data)})
+        x = pd.DataFrame({"a": pd.Series(data=data)})
         with pytest.raises(duckdb.InvalidInputException, match="Failed to cast value: Unimplemented type for cast"):
             res = duckdb_cursor.sql("select * from x").df()

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_zero_fractional(self, pandas, duckdb_cursor):
-        decimals = pandas.DataFrame(
+    def test_numeric_decimal_zero_fractional(self, duckdb_cursor):
+        decimals = pd.DataFrame(
             data={
                 "0": [
                     Decimal("0.00"),
@@ -827,8 +774,7 @@ def test_numeric_decimal_zero_fractional(self, pandas, duckdb_cursor):

         assert conversion == reference

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_incompatible(self, pandas, duckdb_cursor):
+    def test_numeric_decimal_incompatible(self, duckdb_cursor):
         reference_query = """
             CREATE TABLE tbl AS SELECT * FROM (
                 VALUES
@@ -841,13 +787,11 @@ def test_numeric_decimal_incompatible(self, pandas, duckdb_cursor):
             ) tbl(a, b, c);
         """
         duckdb_cursor.execute(reference_query)
-        x = pandas.DataFrame(
+        x = pd.DataFrame(
             {
-                "0": ConvertStringToDecimal(["5", "12.0", "-123.0", "-234234.0", None, "1.234"], pandas),
-                "1": ConvertStringToDecimal([5002340, 13, "-12.0000000005", 7453324234, None, "-324234234"], pandas),
-                "2": ConvertStringToDecimal(
-                    [-234234234234, "324234234.00000005", -128, 345345, 0, "1324234359"], pandas
-                ),
+                "0": ConvertStringToDecimal(["5", "12.0", "-123.0", "-234234.0", None, "1.234"]),
+                "1": ConvertStringToDecimal([5002340, 13, "-12.0000000005", 7453324234, None, "-324234234"]),
+                "2": ConvertStringToDecimal([-234234234234, "324234234.00000005", -128, 345345, 0, "1324234359"]),
             }
         )
         reference = duckdb_cursor.sql("select * from tbl").fetchall()
@@ -857,11 +801,9 @@ def test_numeric_decimal_incompatible(self, pandas, duckdb_cursor):
         print(reference)
         print(conversion)

-    @pytest.mark.parametrize(
-        "pandas", [NumpyPandas(), ArrowPandas()]
-    )  # result: [('1E-28',), ('10000000000000000000000000.0',)]
-    def test_numeric_decimal_combined(self, pandas, duckdb_cursor):
-        decimals = pandas.DataFrame(
+    # result: [('1E-28',), ('10000000000000000000000000.0',)]
+    def test_numeric_decimal_combined(self, duckdb_cursor):
+        decimals = pd.DataFrame(
             data={"0": [Decimal("0.0000000000000000000000000001"), Decimal("10000000000000000000000000.0")]}
         )
         reference_query = """
@@ -879,9 +821,8 @@ def test_numeric_decimal_combined(self, pandas, duckdb_cursor):
         print(conversion)

     # result: [('1234.0',), ('123456789.0',), ('1234567890123456789.0',), ('0.1234567890123456789',)]
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_varying_sizes(self, pandas, duckdb_cursor):
-        decimals = pandas.DataFrame(
+    def test_numeric_decimal_varying_sizes(self, duckdb_cursor):
+        decimals = pd.DataFrame(
             data={
                 "0": [
                     Decimal("1234.0"),
@@ -907,14 +848,13 @@ def test_numeric_decimal_varying_sizes(self, pandas, duckdb_cursor):
         print(reference)
         print(conversion)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_fallback_to_double(self, pandas, duckdb_cursor):
+    def test_numeric_decimal_fallback_to_double(self, duckdb_cursor):
         # The widths of these decimal values are bigger than the max supported width for DECIMAL
         data = [
             Decimal("1.234567890123456789012345678901234567890123456789"),
             Decimal("123456789012345678901234567890123456789012345678.0"),
         ]
-        decimals = pandas.DataFrame(data={"0": data})
+        decimals = pd.DataFrame(data={"0": data})
         reference_query = """
             CREATE TABLE tbl AS SELECT * FROM (
                 VALUES
@@ -928,8 +868,7 @@ def test_numeric_decimal_fallback_to_double(self, pandas, duckdb_cursor):
         assert conversion == reference
         assert isinstance(conversion[0][0], float)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_double_mixed(self, pandas, duckdb_cursor):
+    def test_numeric_decimal_double_mixed(self, duckdb_cursor):
         data = [
             Decimal("1.234"),
             Decimal("1.234567891234567890123456789012345678901234567890123456789"),
@@ -940,7 +879,7 @@ def test_numeric_decimal_double_mixed(self, pandas, duckdb_cursor):
             Decimal("1232354.000000000000000000000000000035"),
             Decimal("123.5e300"),
         ]
-        decimals = pandas.DataFrame(data={"0": data})
+        decimals = pd.DataFrame(data={"0": data})
         reference_query = """
             CREATE TABLE tbl AS SELECT * FROM (
                 VALUES
@@ -960,10 +899,9 @@ def test_numeric_decimal_double_mixed(self, pandas, duckdb_cursor):
         assert conversion == reference
         assert isinstance(conversion[0][0], float)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_numeric_decimal_out_of_range(self, pandas, duckdb_cursor):
+    def test_numeric_decimal_out_of_range(self, duckdb_cursor):
         data = [Decimal("1.234567890123456789012345678901234567"), Decimal("123456789012345678901234567890123456.0")]
-        decimals = pandas.DataFrame(data={"0": data})
+        decimals = pd.DataFrame(data={"0": data})
         reference_query = """
             CREATE TABLE tbl AS SELECT * FROM (
                 VALUES
diff --git a/tests/fast/pandas/test_df_recursive_nested.py b/tests/fast/pandas/test_df_recursive_nested.py
index 871132ae..c3971cf6 100644
--- a/tests/fast/pandas/test_df_recursive_nested.py
+++ b/tests/fast/pandas/test_df_recursive_nested.py
@@ -1,5 +1,4 @@
-import pytest
-from conftest import ArrowPandas, NumpyPandas
+import pandas as pd

 import duckdb
 from duckdb import Value
@@ -21,39 +20,35 @@ def create_reference_query():

 class TestDFRecursiveNested:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_of_structs(self, duckdb_cursor, pandas):
+    def test_list_of_structs(self, duckdb_cursor):
         data = [[{"a": 5}, NULL, {"a": NULL}], NULL, [{"a": 5}, NULL, {"a": NULL}]]
         reference_query = create_reference_query()
-        df = pandas.DataFrame([{"a": data}])
+        df = pd.DataFrame([{"a": data}])
         check_equal(duckdb_cursor, df, reference_query, Value(data, "STRUCT(a INTEGER)[]"))

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_list_of_map(self, duckdb_cursor, pandas):
+    def test_list_of_map(self, duckdb_cursor):
         # LIST(MAP(VARCHAR, VARCHAR))
         data = [[{5: NULL}, NULL, {}], NULL, [NULL, {3: NULL, 2: "a", 4: NULL}, {"a": 1, "b": 2, "c": 3}]]
         reference_query = create_reference_query()
         print(reference_query)
-        df = pandas.DataFrame([{"a": data}])
+        df = pd.DataFrame([{"a": data}])
         check_equal(duckdb_cursor, df, reference_query, Value(data, "MAP(VARCHAR, VARCHAR)[][]"))

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_recursive_list(self, duckdb_cursor, pandas):
+    def test_recursive_list(self, duckdb_cursor):
         # LIST(LIST(LIST(LIST(INTEGER))))
         data = [[[[3, NULL, 5], NULL], NULL, [[5, -20, NULL]]], NULL, [[[NULL]], [[]], NULL]]
         reference_query = create_reference_query()
-        df = pandas.DataFrame([{"a": data}])
+        df = pd.DataFrame([{"a": data}])
         check_equal(duckdb_cursor, df, reference_query, Value(data, "INTEGER[][][][]"))

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_recursive_struct(self, duckdb_cursor, pandas):
+    def test_recursive_struct(self, duckdb_cursor):
         # STRUCT(STRUCT(STRUCT(LIST)))
         data = {
             "A": {"a": {"1": [1, 2, 3]}, "b": NULL, "c": {"1": NULL}},
             "B": {"a": {"1": [1, NULL, 3]}, "b": NULL, "c": {"1": NULL}},
         }
         reference_query = create_reference_query()
-        df = pandas.DataFrame([{"a": data}])
+        df = pd.DataFrame([{"a": data}])
         check_equal(
             duckdb_cursor,
             df,
@@ -89,8 +84,7 @@ def test_recursive_struct(self, duckdb_cursor, pandas):
             ),
         )

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_recursive_map(self, duckdb_cursor, pandas):
+    def test_recursive_map(self, duckdb_cursor):
         # MAP(
         #     MAP(
         #         INTEGER,
@@ -106,13 +100,12 @@ def test_recursive_map(self, duckdb_cursor, pandas):
             "value": [1, 2],
         }
         reference_query = create_reference_query()
-        df = pandas.DataFrame([{"a": data}])
+        df = pd.DataFrame([{"a": data}])
         check_equal(
             duckdb_cursor, df, reference_query, Value(data, "MAP(MAP(INTEGER, MAP(INTEGER, VARCHAR)), INTEGER)")
         )

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_recursive_stresstest(self, duckdb_cursor, pandas):
+    def test_recursive_stresstest(self, duckdb_cursor):
         data = [
             {
                 "a": {
@@ -134,7 +127,7 @@ def test_recursive_stresstest(self, duckdb_cursor, pandas):
             }
         ]
         reference_query = create_reference_query()
-        df = pandas.DataFrame([{"a": data}])
+        df = pd.DataFrame([{"a": data}])
         duckdb_type = """
             STRUCT(
                 a MAP(
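These nested-type roundtrips rely on `duckdb.Value` to pin a Python structure to an explicit DuckDB type instead of letting inference decide. A minimal standalone use of the same mechanism, following the `$1` parameter-binding style already used in the tests above:

    import duckdb
    from duckdb import Value

    con = duckdb.connect()
    v = Value([{"a": 5}, None], "STRUCT(a INTEGER)[]")
    print(con.execute("select $1 as c", [v]).fetchall())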
{"a": {"1": [1, NULL, 3]}, "b": NULL, "c": {"1": NULL}}, } reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal( duckdb_cursor, df, @@ -89,8 +84,7 @@ def test_recursive_struct(self, duckdb_cursor, pandas): ), ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_recursive_map(self, duckdb_cursor, pandas): + def test_recursive_map(self, duckdb_cursor): # MAP( # MAP( # INTEGER, @@ -106,13 +100,12 @@ def test_recursive_map(self, duckdb_cursor, pandas): "value": [1, 2], } reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal( duckdb_cursor, df, reference_query, Value(data, "MAP(MAP(INTEGER, MAP(INTEGER, VARCHAR)), INTEGER)") ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_recursive_stresstest(self, duckdb_cursor, pandas): + def test_recursive_stresstest(self, duckdb_cursor): data = [ { "a": { @@ -134,7 +127,7 @@ def test_recursive_stresstest(self, duckdb_cursor, pandas): } ] reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) duckdb_type = """ STRUCT( a MAP( diff --git a/tests/fast/pandas/test_implicit_pandas_scan.py b/tests/fast/pandas/test_implicit_pandas_scan.py index 76f2c200..af3a8758 100644 --- a/tests/fast/pandas/test_implicit_pandas_scan.py +++ b/tests/fast/pandas/test_implicit_pandas_scan.py @@ -1,43 +1,27 @@ # simple DB API testcase import pandas as pd -import pytest -from conftest import ArrowPandas, NumpyPandas -from packaging.version import Version import duckdb -numpy_nullable_df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val4", "CoL2": 17}]) - -try: - from pandas.compat import pa_version_under7p0 - - pyarrow_dtypes_enabled = not pa_version_under7p0 -except Exception: - pyarrow_dtypes_enabled = False - -if Version(pd.__version__) >= Version("2.0.0") and pyarrow_dtypes_enabled: - pyarrow_df = numpy_nullable_df.convert_dtypes(dtype_backend="pyarrow") -else: - # dtype_backend is not supported in pandas < 2.0.0 - pyarrow_df = numpy_nullable_df - class TestImplicitPandasScan: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_local_pandas_scan(self, duckdb_cursor, pandas): + def test_local_pandas_scan(self, duckdb_cursor): con = duckdb.connect() - df = pandas.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) # noqa: F841 + df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) # noqa: F841 r1 = con.execute("select * from df").fetchdf() assert r1["COL1"][0] == "val1" assert r1["COL1"][1] == "val3" assert r1["CoL2"][0] == 1.05 assert r1["CoL2"][1] == 17 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_global_pandas_scan(self, duckdb_cursor, pandas): + def test_global_pandas_scan(self, duckdb_cursor): + """Test that DuckDB can scan a module-level DataFrame variable.""" con = duckdb.connect() - r1 = con.execute(f"select * from {pandas.backend}_df").fetchdf() + # Create a global-scope dataframe for this test + global test_global_df + test_global_df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val4", "CoL2": 17}]) + r1 = con.execute("select * from test_global_df").fetchdf() assert r1["COL1"][0] == "val1" assert r1["COL1"][1] == "val4" assert r1["CoL2"][0] == 1.05 diff --git a/tests/fast/pandas/test_import_cache.py b/tests/fast/pandas/test_import_cache.py index eb1c8fb8..1b3a98ee 
100644 --- a/tests/fast/pandas/test_import_cache.py +++ b/tests/fast/pandas/test_import_cache.py @@ -1,29 +1,38 @@ +import importlib.util + +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb -@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) -def test_import_cache_explicit_dtype(pandas): - df = pandas.DataFrame( # noqa: F841 +@pytest.mark.parametrize( + "string_dtype", + [ + "python", + pytest.param( + "pyarrow", marks=pytest.mark.skipif(not importlib.util.find_spec("pyarrow"), reason="pyarrow not installed") + ), + ], +) +def test_import_cache_explicit_dtype(string_dtype): + df = pd.DataFrame( # noqa: F841 { "id": [1, 2, 3], - "value": pandas.Series(["123.123", pandas.NaT, pandas.NA], dtype=pandas.StringDtype(storage="python")), + "value": pd.Series(["123.123", pd.NaT, pd.NA], dtype=pd.StringDtype(storage=string_dtype)), } ) con = duckdb.connect() result_df = con.query("select id, value from df").df() - assert result_df["value"][1] is None - assert result_df["value"][2] is None + assert pd.isna(result_df["value"][1]) + assert pd.isna(result_df["value"][2]) -@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) -def test_import_cache_implicit_dtype(pandas): - df = pandas.DataFrame({"id": [1, 2, 3], "value": pandas.Series(["123.123", pandas.NaT, pandas.NA])}) # noqa: F841 +def test_import_cache_implicit_dtype(): + df = pd.DataFrame({"id": [1, 2, 3], "value": pd.Series(["123.123", pd.NaT, pd.NA])}) # noqa: F841 con = duckdb.connect() result_df = con.query("select id, value from df").df() - assert result_df["value"][1] is None - assert result_df["value"][2] is None + assert pd.isna(result_df["value"][1]) + assert pd.isna(result_df["value"][2]) diff --git a/tests/fast/pandas/test_issue_1767.py b/tests/fast/pandas/test_issue_1767.py index 48d3e852..1677001e 100644 --- a/tests/fast/pandas/test_issue_1767.py +++ b/tests/fast/pandas/test_issue_1767.py @@ -1,22 +1,20 @@ #!/usr/bin/env python -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb # Join from pandas not matching identical strings #1767 class TestIssue1767: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_unicode_join_pandas(self, duckdb_cursor, pandas): - A = pandas.DataFrame({"key": ["a", "п"]}) - B = pandas.DataFrame({"key": ["a", "п"]}) + def test_unicode_join_pandas(self, duckdb_cursor): + A = pd.DataFrame({"key": ["a", "п"]}) + B = pd.DataFrame({"key": ["a", "п"]}) con = duckdb.connect(":memory:") arrow = con.register("A", A).register("B", B) q = arrow.query("""SELECT key FROM "A" FULL JOIN "B" USING ("key") ORDER BY key""") result = q.df() d = {"key": ["a", "п"]} - df = pandas.DataFrame(data=d) - pandas.testing.assert_frame_equal(result, df) + df = pd.DataFrame(data=d) + pd.testing.assert_frame_equal(result, df, check_dtype=False) diff --git a/tests/fast/pandas/test_limit.py b/tests/fast/pandas/test_limit.py index 51c4a382..2fb6c769 100644 --- a/tests/fast/pandas/test_limit.py +++ b/tests/fast/pandas/test_limit.py @@ -1,13 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestLimitPandas: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_limit_df(self, duckdb_cursor, pandas): - df_in = pandas.DataFrame( + def test_limit_df(self, duckdb_cursor): + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -15,9 +13,8 @@ def test_limit_df(self, duckdb_cursor, pandas): limit_df = 
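`pd.StringDtype` accepts a storage backend, which is what the new parametrization exercises. A small illustration of the two variants (the pyarrow one requires pyarrow to be installed):

    import pandas as pd

    s_py = pd.Series(["a", None], dtype=pd.StringDtype(storage="python"))
    s_pa = pd.Series(["a", None], dtype=pd.StringDtype(storage="pyarrow"))
    # Both scan into DuckDB as VARCHAR; missing values come back as nulls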
diff --git a/tests/fast/pandas/test_issue_1767.py b/tests/fast/pandas/test_issue_1767.py
index 48d3e852..1677001e 100644
--- a/tests/fast/pandas/test_issue_1767.py
+++ b/tests/fast/pandas/test_issue_1767.py
@@ -1,22 +1,20 @@
 #!/usr/bin/env python

-import pytest
-from conftest import ArrowPandas, NumpyPandas
+import pandas as pd

 import duckdb


 # Join from pandas not matching identical strings #1767
 class TestIssue1767:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_unicode_join_pandas(self, duckdb_cursor, pandas):
-        A = pandas.DataFrame({"key": ["a", "п"]})
-        B = pandas.DataFrame({"key": ["a", "п"]})
+    def test_unicode_join_pandas(self, duckdb_cursor):
+        A = pd.DataFrame({"key": ["a", "п"]})
+        B = pd.DataFrame({"key": ["a", "п"]})
         con = duckdb.connect(":memory:")
         arrow = con.register("A", A).register("B", B)
         q = arrow.query("""SELECT key FROM "A" FULL JOIN "B" USING ("key") ORDER BY key""")
         result = q.df()
         d = {"key": ["a", "п"]}
-        df = pandas.DataFrame(data=d)
-        pandas.testing.assert_frame_equal(result, df)
+        df = pd.DataFrame(data=d)
+        pd.testing.assert_frame_equal(result, df, check_dtype=False)
diff --git a/tests/fast/pandas/test_limit.py b/tests/fast/pandas/test_limit.py
index 51c4a382..2fb6c769 100644
--- a/tests/fast/pandas/test_limit.py
+++ b/tests/fast/pandas/test_limit.py
@@ -1,13 +1,11 @@
-import pytest
-from conftest import ArrowPandas, NumpyPandas
+import pandas as pd

 import duckdb


 class TestLimitPandas:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_limit_df(self, duckdb_cursor, pandas):
-        df_in = pandas.DataFrame(
+    def test_limit_df(self, duckdb_cursor):
+        df_in = pd.DataFrame(
             {
                 "numbers": [1, 2, 3, 4, 5],
             }
@@ -15,9 +13,8 @@ def test_limit_df(self, duckdb_cursor, pandas):
         limit_df = duckdb.limit(df_in, 2)
         assert len(limit_df.execute().fetchall()) == 2

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_aggregate_df(self, duckdb_cursor, pandas):
-        df_in = pandas.DataFrame(
+    def test_aggregate_df(self, duckdb_cursor):
+        df_in = pd.DataFrame(
             {
                 "numbers": [1, 2, 2, 2],
             }
diff --git a/tests/fast/pandas/test_new_string_type.py b/tests/fast/pandas/test_new_string_type.py
new file mode 100644
index 00000000..bd13d53a
--- /dev/null
+++ b/tests/fast/pandas/test_new_string_type.py
@@ -0,0 +1,20 @@
+import pandas as pd
+import pytest
+from packaging.version import Version
+
+import duckdb
+
+
+@pytest.mark.skipif(
+    Version(pd.__version__) < Version("3.0"), reason="Pandas < 3.0 doesn't have the new string type yet"
+)
+def test_new_str_type_pandas_3_0():
+    df = pd.DataFrame({"s": ["DuckDB"]})  # noqa: F841
+    duckdb.sql("select * from df")
+
+
+@pytest.mark.skipif(Version(pd.__version__) >= Version("3.0"), reason="Pandas >= 3.0 has the new string type")
+def test_new_str_type_pandas_lt_3_0():
+    pd.options.future.infer_string = True
+    df = pd.DataFrame({"s": ["DuckDB"]})  # noqa: F841
+    duckdb.sql("select * from df")
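On pandas 2.x the second test opts in to the 3.0-style string semantics through a future flag. A standalone sketch of that switch (the option exists on pandas 2.1 and later):

    import pandas as pd

    pd.options.future.infer_string = True
    df = pd.DataFrame({"s": ["DuckDB"]})
    print(df["s"].dtype)  # a string dtype rather than object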
diff --git a/tests/fast/pandas/test_pandas_na.py b/tests/fast/pandas/test_pandas_na.py
index 6462c298..166fc21e 100644
--- a/tests/fast/pandas/test_pandas_na.py
+++ b/tests/fast/pandas/test_pandas_na.py
@@ -1,8 +1,9 @@
 import platform

 import numpy as np
+import pandas as pd
 import pytest
-from conftest import ArrowPandas, NumpyPandas
+from conftest import is_string_dtype

 import duckdb

@@ -10,27 +11,25 @@
 def assert_nullness(items, null_indices):
     for i in range(len(items)):
         if i in null_indices:
-            assert items[i] is None
+            assert pd.isna(items[i])
         else:
-            assert items[i] is not None
+            assert not pd.isna(items[i])


 @pytest.mark.skipif(platform.system() == "Emscripten", reason="Pandas interaction is broken in Pyodide 3.11")
 class TestPandasNA:
     @pytest.mark.parametrize("rows", [100, duckdb.__standard_vector_size__, 5000, 1000000])
-    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
-    def test_pandas_string_null(self, duckdb_cursor, rows, pd):
-        df: pd.DataFrame = pd.DataFrame(index=np.arange(rows))
+    def test_pandas_string_null(self, duckdb_cursor, rows):
+        df = pd.DataFrame(index=np.arange(rows))
         df["string_column"] = pd.Series(dtype="string")
         e_df_rel = duckdb_cursor.from_df(df)
         assert e_df_rel.types == ["VARCHAR"]
         roundtrip = e_df_rel.df()
-        assert roundtrip["string_column"].dtype == "object"
+        assert is_string_dtype(roundtrip["string_column"].dtype)
         expected = pd.DataFrame({"string_column": [None for _ in range(rows)]})
-        pd.testing.assert_frame_equal(expected, roundtrip)
+        pd.testing.assert_frame_equal(expected, roundtrip, check_dtype=False)

     def test_pandas_na(self, duckdb_cursor):
-        pd = pytest.importorskip("pandas", minversion="1.0.0", reason="Support for pandas.NA has not been added yet")
         # DataFrame containing a single pd.NA
         df = pd.DataFrame(pd.Series([pd.NA]))
@@ -74,7 +73,9 @@ def test_pandas_na(self, duckdb_cursor):
             }
         )
         assert str(nan_df["a"].dtype) == "float64"
-        assert str(na_df["a"].dtype) == "object"  # pd.NA values turn the column into 'object'
+        # pd.NA values turn the column into 'object' in Pandas 2.x;
+        # in Pandas 3.0+ the inferred dtype may differ, so only check that it is not float64
+        assert str(na_df["a"].dtype) != "float64"

         nan_result = duckdb_cursor.execute("select * from nan_df").df()
         na_result = duckdb_cursor.execute("select * from na_df").df()
diff --git a/tests/fast/pandas/test_pandas_unregister.py b/tests/fast/pandas/test_pandas_unregister.py
index ab83eb42..c89ae320 100644
--- a/tests/fast/pandas/test_pandas_unregister.py
+++ b/tests/fast/pandas/test_pandas_unregister.py
@@ -1,16 +1,15 @@
 import gc
 import tempfile

+import pandas as pd
 import pytest
-from conftest import ArrowPandas, NumpyPandas

 import duckdb


 class TestPandasUnregister:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_pandas_unregister1(self, duckdb_cursor, pandas):
-        df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
+    def test_pandas_unregister1(self, duckdb_cursor):
+        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
         connection = duckdb.connect(":memory:")
         connection.register("dataframe", df)
@@ -22,13 +21,12 @@ def test_pandas_unregister1(self, duckdb_cursor, pandas):
         connection.execute("DROP VIEW dataframe;")
         connection.execute("DROP VIEW IF EXISTS dataframe;")

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_pandas_unregister2(self, duckdb_cursor, pandas):
+    def test_pandas_unregister2(self, duckdb_cursor):
         with tempfile.NamedTemporaryFile() as tmp:
             db = tmp.name
             connection = duckdb.connect(db)

-            df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]])
+            df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
             connection.register("dataframe", df)
             connection.unregister("dataframe")  # Attempting to unregister.
diff --git a/tests/fast/pandas/test_parallel_pandas_scan.py b/tests/fast/pandas/test_parallel_pandas_scan.py
index 9ac7b738..7e04a933 100644
--- a/tests/fast/pandas/test_parallel_pandas_scan.py
+++ b/tests/fast/pandas/test_parallel_pandas_scan.py
@@ -2,13 +2,12 @@
 import datetime

 import numpy
-import pytest
-from conftest import ArrowPandas, NumpyPandas
+import pandas as pd

 import duckdb


-def run_parallel_queries(main_table, left_join_table, expected_df, pandas, iteration_count=5):
+def run_parallel_queries(main_table, left_join_table, expected_df, iteration_count=5):
     for _i in range(iteration_count):
         output_df = None
         sql = """
@@ -28,7 +27,7 @@ def run_parallel_queries(main_table, left_join_table, expected_df, pandas, itera
             duckdb_conn.register("main_table", main_table)
             duckdb_conn.register("left_join_table", left_join_table)
             output_df = duckdb_conn.execute(sql).fetchdf()
-            pandas.testing.assert_frame_equal(expected_df, output_df)
+            pd.testing.assert_frame_equal(expected_df, output_df, check_dtype=False)
             print(output_df)
         except Exception as err:
             print(err)
@@ -37,67 +36,59 @@ class TestParallelPandasScan:
-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_numeric_scan(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame([{"join_column": 3}])
-        left_join_table = pandas.DataFrame([{"join_column": 3, "other_column": 4}])
-        run_parallel_queries(main_table, left_join_table, left_join_table, pandas)
+    def test_parallel_numeric_scan(self, duckdb_cursor):
+        main_table = pd.DataFrame([{"join_column": 3}])
+        left_join_table = pd.DataFrame([{"join_column": 3, "other_column": 4}])
+        run_parallel_queries(main_table, left_join_table, left_join_table)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_ascii_text(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame([{"join_column": "text"}])
-        left_join_table = pandas.DataFrame([{"join_column": "text", "other_column": "more text"}])
-        run_parallel_queries(main_table, left_join_table, left_join_table, pandas)
+    def test_parallel_ascii_text(self, duckdb_cursor):
+        main_table = pd.DataFrame([{"join_column": "text"}])
+        left_join_table = pd.DataFrame([{"join_column": "text", "other_column": "more text"}])
+        run_parallel_queries(main_table, left_join_table, left_join_table)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_unicode_text(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame([{"join_column": "mühleisen"}])
-        left_join_table = pandas.DataFrame([{"join_column": "mühleisen", "other_column": "höhöhö"}])
-        run_parallel_queries(main_table, left_join_table, left_join_table, pandas)
+    def test_parallel_unicode_text(self, duckdb_cursor):
+        main_table = pd.DataFrame([{"join_column": "mühleisen"}])
+        left_join_table = pd.DataFrame([{"join_column": "mühleisen", "other_column": "höhöhö"}])
+        run_parallel_queries(main_table, left_join_table, left_join_table)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_complex_unicode_text(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame([{"join_column": "鴨"}])
-        left_join_table = pandas.DataFrame([{"join_column": "鴨", "other_column": "數據庫"}])
-        run_parallel_queries(main_table, left_join_table, left_join_table, pandas)
+    def test_parallel_complex_unicode_text(self, duckdb_cursor):
+        main_table = pd.DataFrame([{"join_column": "鴨"}])
+        left_join_table = pd.DataFrame([{"join_column": "鴨", "other_column": "數據庫"}])
+        run_parallel_queries(main_table, left_join_table, left_join_table)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_emojis(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️"}])
-        left_join_table = pandas.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️", "other_column": "🦆🍞🦆"}])
-        run_parallel_queries(main_table, left_join_table, left_join_table, pandas)
+    def test_parallel_emojis(self, duckdb_cursor):
+        main_table = pd.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️"}])
+        left_join_table = pd.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️", "other_column": "🦆🍞🦆"}])
+        run_parallel_queries(main_table, left_join_table, left_join_table)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_numeric_object(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame({"join_column": pandas.Series([3], dtype="Int8")})
-        left_join_table = pandas.DataFrame(
-            {"join_column": pandas.Series([3], dtype="Int8"), "other_column": pandas.Series([4], dtype="Int8")}
+    def test_parallel_numeric_object(self, duckdb_cursor):
+        main_table = pd.DataFrame({"join_column": pd.Series([3], dtype="Int8")})
+        left_join_table = pd.DataFrame(
+            {"join_column": pd.Series([3], dtype="Int8"), "other_column": pd.Series([4], dtype="Int8")}
         )
-        expected_df = pandas.DataFrame(
+        expected_df = pd.DataFrame(
             {"join_column": numpy.array([3], dtype=numpy.int8), "other_column": numpy.array([4], dtype=numpy.int8)}
         )
-        run_parallel_queries(main_table, left_join_table, expected_df, pandas)
+        run_parallel_queries(main_table, left_join_table, expected_df)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_timestamp(self, duckdb_cursor, pandas):
-        main_table = pandas.DataFrame({"join_column": [pandas.Timestamp("20180310T11:17:54Z")]})
-        left_join_table = pandas.DataFrame(
+    def test_parallel_timestamp(self, duckdb_cursor):
+        main_table = pd.DataFrame({"join_column": [pd.Timestamp("20180310T11:17:54Z")]})
+        left_join_table = pd.DataFrame(
             {
-                "join_column": [pandas.Timestamp("20180310T11:17:54Z")],
-                "other_column": [pandas.Timestamp("20190310T11:17:54Z")],
+                "join_column": [pd.Timestamp("20180310T11:17:54Z")],
+                "other_column": [pd.Timestamp("20190310T11:17:54Z")],
             }
         )
-        expected_df = pandas.DataFrame(
+        expected_df = pd.DataFrame(
             {
                 "join_column": numpy.array([datetime.datetime(2018, 3, 10, 11, 17, 54)], dtype="datetime64[ns]"),
                 "other_column": numpy.array([datetime.datetime(2019, 3, 10, 11, 17, 54)], dtype="datetime64[ns]"),
             }
         )
-        run_parallel_queries(main_table, left_join_table, expected_df, pandas)
+        run_parallel_queries(main_table, left_join_table, expected_df)

-    @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
-    def test_parallel_empty(self, duckdb_cursor, pandas):
-        df_empty = pandas.DataFrame({"A": []})
+    def test_parallel_empty(self, duckdb_cursor):
+        df_empty = pd.DataFrame({"A": []})
         duckdb_conn = duckdb.connect()
         duckdb_conn.execute("PRAGMA threads=4")
         duckdb_conn.execute("PRAGMA verify_parallelism")
"other_column": [pandas.Timestamp("20190310T11:17:54Z")], + "join_column": [pd.Timestamp("20180310T11:17:54Z")], + "other_column": [pd.Timestamp("20190310T11:17:54Z")], } ) - expected_df = pandas.DataFrame( + expected_df = pd.DataFrame( { "join_column": numpy.array([datetime.datetime(2018, 3, 10, 11, 17, 54)], dtype="datetime64[ns]"), "other_column": numpy.array([datetime.datetime(2019, 3, 10, 11, 17, 54)], dtype="datetime64[ns]"), } ) - run_parallel_queries(main_table, left_join_table, expected_df, pandas) + run_parallel_queries(main_table, left_join_table, expected_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_empty(self, duckdb_cursor, pandas): - df_empty = pandas.DataFrame({"A": []}) + def test_parallel_empty(self, duckdb_cursor): + df_empty = pd.DataFrame({"A": []}) duckdb_conn = duckdb.connect() duckdb_conn.execute("PRAGMA threads=4") duckdb_conn.execute("PRAGMA verify_parallelism") diff --git a/tests/fast/pandas/test_stride.py b/tests/fast/pandas/test_stride.py index cbe23cfd..65204ea8 100644 --- a/tests/fast/pandas/test_stride.py +++ b/tests/fast/pandas/test_stride.py @@ -57,7 +57,9 @@ def test_stride_timedelta(self, duckdb_cursor): ] } ) - pd.testing.assert_frame_equal(roundtrip, expected) + # DuckDB INTERVAL type stores in microseconds, so output is always timedelta64[us] + # Check values match without strict dtype comparison + pd.testing.assert_frame_equal(roundtrip, expected, check_dtype=False) def test_stride_fp64(self, duckdb_cursor): expected_df = pd.DataFrame(np.arange(20, dtype="float64").reshape(5, 4), columns=["a", "b", "c", "d"]) diff --git a/tests/fast/pandas/test_timestamp.py b/tests/fast/pandas/test_timestamp.py index 81651634..c6d080b8 100644 --- a/tests/fast/pandas/test_timestamp.py +++ b/tests/fast/pandas/test_timestamp.py @@ -65,7 +65,9 @@ def test_timestamp_timedelta(self): } ) df_from_duck = duckdb.from_df(df).df() - assert df_from_duck.equals(df) + # DuckDB INTERVAL type stores in microseconds, so output is always timedelta64[us] + # Check values match without strict dtype comparison + pd.testing.assert_frame_equal(df_from_duck, df, check_dtype=False) @pytest.mark.xfail( condition=platform.system() == "Emscripten" and os.environ.get("TZ") != "UTC", diff --git a/tests/fast/spark/test_spark_to_csv.py b/tests/fast/spark/test_spark_to_csv.py index 10e0028c..5003a20b 100644 --- a/tests/fast/spark/test_spark_to_csv.py +++ b/tests/fast/spark/test_spark_to_csv.py @@ -2,8 +2,9 @@ import datetime import os +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas, getTimeSeriesData +from conftest import getTimeSeriesData from spark_namespace import USE_ACTUAL_SPARK from duckdb import InvalidInputException, read_csv @@ -33,17 +34,15 @@ def df(spark): return dataframe -@pytest.fixture(params=[NumpyPandas(), ArrowPandas()]) -def pandas_df_ints(request, spark): - pandas = request.param - dataframe = pandas.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) +@pytest.fixture +def pandas_df_ints(spark): + dataframe = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) return dataframe -@pytest.fixture(params=[NumpyPandas(), ArrowPandas()]) -def pandas_df_strings(request, spark): - pandas = request.param - dataframe = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) +@pytest.fixture +def pandas_df_strings(spark): + dataframe = pd.DataFrame({"a": ["string1", "string2", "string3"]}) return dataframe @@ -69,10 +68,9 @@ def test_to_csv_sep(self, pandas_df_ints, spark, tmp_path): csv_rel = 
spark.read.csv(temp_file_name, sep=",") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_na_rep(self, pandas, spark, tmp_path): + def test_to_csv_na_rep(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + pandas_df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) df = spark.createDataFrame(pandas_df) @@ -81,10 +79,9 @@ def test_to_csv_na_rep(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, nullValue="test") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_header(self, pandas, spark, tmp_path): + def test_to_csv_header(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + pandas_df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) df = spark.createDataFrame(pandas_df) @@ -93,11 +90,10 @@ def test_to_csv_header(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name) assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quotechar(self, pandas, spark, tmp_path): + def test_to_csv_quotechar(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) + pandas_df = pd.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) df = spark.createDataFrame(pandas_df) @@ -106,10 +102,9 @@ def test_to_csv_quotechar(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, sep=",", quote="'") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_escapechar(self, pandas, spark, tmp_path): + def test_to_csv_escapechar(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame( + pandas_df = pd.DataFrame( { "c_bool": [True, False], "c_float": [1.0, 3.2], @@ -124,12 +119,11 @@ def test_to_csv_escapechar(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, quote='"', escape="!") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_date_format(self, pandas, spark, tmp_path): + def test_to_csv_date_format(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame(getTimeSeriesData()) + pandas_df = pd.DataFrame(getTimeSeriesData()) dt_index = pandas_df.index - pandas_df = pandas.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) + pandas_df = pd.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) df = spark.createDataFrame(pandas_df) @@ -139,11 +133,10 @@ def test_to_csv_date_format(self, pandas, spark, tmp_path): assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_timestamp_format(self, pandas, spark, tmp_path): + def test_to_csv_timestamp_format(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 data = [datetime.time(hour=23, minute=1, second=34, 
microsecond=234345)] - pandas_df = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + pandas_df = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) df = spark.createDataFrame(pandas_df) diff --git a/tests/fast/test_case_alias.py b/tests/fast/test_case_alias.py index d1afb4d8..f99b994e 100644 --- a/tests/fast/test_case_alias.py +++ b/tests/fast/test_case_alias.py @@ -1,15 +1,13 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestCaseAlias: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_case_alias(self, duckdb_cursor, pandas): + def test_case_alias(self, duckdb_cursor): con = duckdb.connect(":memory:") - df = pandas.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) + df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) r1 = con.from_df(df).query("df", "select * from df").df() assert r1["COL1"][0] == "val1" diff --git a/tests/fast/test_insert.py b/tests/fast/test_insert.py index c5de1589..6eeabd67 100644 --- a/tests/fast/test_insert.py +++ b/tests/fast/test_insert.py @@ -1,13 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestInsert: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_insert(self, pandas): - test_df = pandas.DataFrame({"i": [1, 2, 3], "j": ["one", "two", "three"]}) + def test_insert(self): + test_df = pd.DataFrame({"i": [1, 2, 3], "j": ["one", "two", "three"]}) # connect to an in-memory temporary database conn = duckdb.connect() # get a cursor @@ -18,7 +16,7 @@ def test_insert(self, pandas): rel.insert([2, "two"]) rel.insert([3, "three"]) rel_a3 = cursor.table("test").project("CAST(i as BIGINT)i, j").to_df() - pandas.testing.assert_frame_equal(rel_a3, test_df) + pd.testing.assert_frame_equal(rel_a3, test_df) def test_insert_with_schema(self, duckdb_cursor): duckdb_cursor.sql("create schema not_main") diff --git a/tests/fast/test_map.py b/tests/fast/test_map.py index 336b2775..622095c2 100644 --- a/tests/fast/test_map.py +++ b/tests/fast/test_map.py @@ -2,8 +2,8 @@ from datetime import date, timedelta from typing import NoReturn +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -17,15 +17,13 @@ def evil1(df): class TestMap: - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_evil_map(self, duckdb_cursor, pandas): + def test_evil_map(self, duckdb_cursor): testrel = duckdb.values([1, 2]) rel = testrel.map(evil1, schema={"i": str}) with pytest.raises(duckdb.InvalidInputException, match="Expected 1 columns from UDF, got 2"): rel.df() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_map(self, duckdb_cursor, pandas): + def test_map(self, duckdb_cursor): testrel = duckdb.values([1, 2]) conn = duckdb_cursor conn.execute("CREATE TABLE t (a integer)") @@ -57,16 +55,16 @@ def evil5(df) -> NoReturn: raise TypeError def return_dataframe(df): - return pandas.DataFrame({"A": [1]}) + return pd.DataFrame({"A": [1]}) def return_big_dataframe(df): - return pandas.DataFrame({"A": [1] * 5000}) + return pd.DataFrame({"A": [1] * 5000}) def return_none(df) -> None: return None def return_empty_df(df): - return pandas.DataFrame() + return pd.DataFrame() with pytest.raises(duckdb.InvalidInputException, match="Expected 1 columns from UDF, got 2"): print(testrel.map(evil1).df()) @@ -93,14 +91,14 @@ def return_empty_df(df): with pytest.raises(TypeError): 
print(testrel.map().df()) - testrel.map(return_dataframe).df().equals(pandas.DataFrame({"A": [1]})) + testrel.map(return_dataframe).df().equals(pd.DataFrame({"A": [1]})) with pytest.raises( duckdb.InvalidInputException, match="UDF returned more than 2048 rows, which is not allowed" ): testrel.map(return_big_dataframe).df() - empty_rel.map(return_dataframe).df().equals(pandas.DataFrame({"A": []})) + empty_rel.map(return_dataframe).df().equals(pd.DataFrame({"A": []})) with pytest.raises(duckdb.InvalidInputException, match="No return value from Python function"): testrel.map(return_none).df() @@ -118,18 +116,17 @@ def return_with_no_modification(df): # in this case we assume the returned type should be the same as the input type duckdb_cursor.values([b"1234"]).map(return_with_no_modification).fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_isse_3237(self, duckdb_cursor, pandas): + def test_issue_3237(self, duckdb_cursor): def process(rel): def mapper(x): dates = x["date"].to_numpy("datetime64[us]") days = x["days_to_add"].to_numpy("int") - x["result1"] = pandas.Series( - [pandas.to_datetime(y[0]).date() + timedelta(days=y[1].item()) for y in zip(dates, days)], + x["result1"] = pd.Series( + [pd.to_datetime(y[0]).date() + timedelta(days=y[1].item()) for y in zip(dates, days)], dtype="datetime64[us]", ) - x["result2"] = pandas.Series( - [pandas.to_datetime(y[0]).date() + timedelta(days=-y[1].item()) for y in zip(dates, days)], + x["result2"] = pd.Series( + [pd.to_datetime(y[0]).date() + timedelta(days=-y[1].item()) for y in zip(dates, days)], dtype="datetime64[us]", ) return x @@ -140,8 +137,8 @@ def mapper(x): rel = rel.project("*, IF(ABS(one) > ABS(two), one, two) as three") return rel - df = pandas.DataFrame( - {"date": pandas.Series([date(2000, 1, 1), date(2000, 1, 2)], dtype="datetime64[us]"), "days_to_add": [1, 2]} + df = pd.DataFrame( + {"date": pd.Series([date(2000, 1, 1), date(2000, 1, 2)], dtype="datetime64[us]"), "days_to_add": [1, 2]} ) rel = duckdb.from_df(df) rel = process(rel) @@ -172,10 +169,9 @@ def does_nothing(df): ): rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_explicit_schema_name_mismatch(self, pandas): + def test_explicit_schema_name_mismatch(self): def renames_column(df): - return pandas.DataFrame({"a": df["i"]}) + return pd.DataFrame({"a": df["i"]}) con = duckdb.connect() rel = con.sql("select i from range(10) tbl(i)") @@ -183,8 +179,7 @@ def renames_column(df): with pytest.raises(duckdb.InvalidInputException, match=re.escape("UDF column name mismatch")): rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_explicit_schema_error(self, pandas): + def test_explicit_schema_error(self): def no_op(df): return df @@ -196,8 +191,7 @@ def no_op(df): ): rel.map(no_op, schema=[int]) - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_returns_non_dataframe(self, pandas): + def test_returns_non_dataframe(self): def returns_series(df): return df.loc[:, "i"] con = duckdb.connect() rel = con.sql("select i, i as j from range(10) tbl(i)") with pytest.raises( duckdb.InvalidInputException, - match=re.escape( - "Expected the UDF to return an object of type 'pandas.DataFrame', found " - "'<class 'pandas.core.series.Series'>' instead" - ), + match=r"Expected the UDF to return an object of type 'pandas\.DataFrame', found " + r"'<class 'pandas\.core\.series\.Series'>' instead", ): rel = rel.map(returns_series) - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_explicit_schema_columncount_mismatch(self, pandas): +
def test_explicit_schema_columncount_mismatch(self): def returns_subset(df): - return pandas.DataFrame({"i": df.loc[:, "i"]}) + return pd.DataFrame({"i": df.loc[:, "i"]}) con = duckdb.connect() rel = con.sql("select i, i as j from range(10) tbl(i)") @@ -225,14 +216,13 @@ def returns_subset(df): ): rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_pyarrow_df(self, pandas): + def test_pyarrow_df(self): # PyArrow backed dataframes only exist on pandas >= 2.0.0 pytest.importorskip("pandas", "2.0.0") def basic_function(df): # Create a pyarrow backed dataframe - df = pandas.DataFrame({"a": [5, 3, 2, 1, 2]}).convert_dtypes(dtype_backend="pyarrow") + df = pd.DataFrame({"a": [5, 3, 2, 1, 2]}).convert_dtypes(dtype_backend="pyarrow") return df con = duckdb.connect() diff --git a/tests/fast/test_multithread.py b/tests/fast/test_multithread.py index dfefb918..ccf809c5 100644 --- a/tests/fast/test_multithread.py +++ b/tests/fast/test_multithread.py @@ -4,8 +4,8 @@ from pathlib import Path import numpy as np +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -25,11 +25,10 @@ def everything_succeeded(results: list[bool]): class DuckDBThreaded: - def __init__(self, duckdb_insert_thread_count, thread_function, pandas) -> None: + def __init__(self, duckdb_insert_thread_count, thread_function) -> None: self.duckdb_insert_thread_count = duckdb_insert_thread_count self.threads = [] self.thread_function = thread_function - self.pandas = pandas def multithread_test(self, result_verification=everything_succeeded): duckdb_conn = duckdb.connect() @@ -38,9 +37,7 @@ def multithread_test(self, result_verification=everything_succeeded): # Create all threads for i in range(self.duckdb_insert_thread_count): self.threads.append( - threading.Thread( - target=self.thread_function, args=(duckdb_conn, queue, self.pandas), name="duckdb_thread_" + str(i) - ) + threading.Thread(target=self.thread_function, args=(duckdb_conn, queue), name="duckdb_thread_" + str(i)) ) # Record for every thread if they succeeded or not @@ -58,7 +55,7 @@ def multithread_test(self, result_verification=everything_succeeded): assert result_verification(thread_results) -def execute_query_same_connection(duckdb_conn, queue, pandas): +def execute_query_same_connection(duckdb_conn, queue): try: duckdb_conn.execute("select i from (values (42), (84), (NULL), (128)) tbl(i)") queue.put(False) @@ -66,7 +63,7 @@ def execute_query_same_connection(duckdb_conn, queue, pandas): queue.put(True) -def execute_query(duckdb_conn, queue, pandas): +def execute_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -76,7 +73,7 @@ def execute_query(duckdb_conn, queue, pandas): queue.put(False) -def insert_runtime_error(duckdb_conn, queue, pandas): +def insert_runtime_error(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -86,7 +83,7 @@ def insert_runtime_error(duckdb_conn, queue, pandas): queue.put(True) -def execute_many_query(duckdb_conn, queue, pandas): +def execute_many_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -119,7 +116,7 @@ def execute_many_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchone_query(duckdb_conn, queue, pandas): +def fetchone_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -129,7 +126,7 @@ def fetchone_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchall_query(duckdb_conn, queue, pandas): +def 
fetchall_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -139,7 +136,7 @@ def fetchall_query(duckdb_conn, queue, pandas): queue.put(False) -def conn_close(duckdb_conn, queue, pandas): +def conn_close(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -149,7 +146,7 @@ def conn_close(duckdb_conn, queue, pandas): queue.put(False) -def fetchnp_query(duckdb_conn, queue, pandas): +def fetchnp_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -159,7 +156,7 @@ def fetchnp_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchdf_query(duckdb_conn, queue, pandas): +def fetchdf_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -169,7 +166,7 @@ def fetchdf_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchdf_chunk_query(duckdb_conn, queue, pandas): +def fetchdf_chunk_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -179,7 +176,7 @@ def fetchdf_chunk_query(duckdb_conn, queue, pandas): queue.put(False) -def fetch_arrow_query(duckdb_conn, queue, pandas): +def fetch_arrow_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -189,7 +186,7 @@ def fetch_arrow_query(duckdb_conn, queue, pandas): queue.put(False) -def fetch_record_batch_query(duckdb_conn, queue, pandas): +def fetch_record_batch_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -199,7 +196,7 @@ def fetch_record_batch_query(duckdb_conn, queue, pandas): queue.put(False) -def transaction_query(duckdb_conn, queue, pandas): +def transaction_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") @@ -214,11 +211,11 @@ def transaction_query(duckdb_conn, queue, pandas): queue.put(False) -def df_append(duckdb_conn, queue, pandas): +def df_append(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") - df = pandas.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) + df = pd.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) try: duckdb_conn.append("T", df) queue.put(True) @@ -226,10 +223,10 @@ def df_append(duckdb_conn, queue, pandas): queue.put(False) -def df_register(duckdb_conn, queue, pandas): +def df_register(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() - df = pandas.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) + df = pd.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) try: duckdb_conn.register("T", df) queue.put(True) @@ -237,10 +234,10 @@ def df_register(duckdb_conn, queue, pandas): queue.put(False) -def df_unregister(duckdb_conn, queue, pandas): +def df_unregister(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() - df = pandas.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) + df = pd.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) try: duckdb_conn.register("T", df) duckdb_conn.unregister("T") @@ -249,7 +246,7 @@ def df_unregister(duckdb_conn, queue, pandas): queue.put(False) -def arrow_register_unregister(duckdb_conn, queue, pandas): +def arrow_register_unregister(duckdb_conn, queue): # Get a new connection pa = pytest.importorskip("pyarrow") duckdb_conn = duckdb.connect() @@ -262,7 +259,7 @@ def arrow_register_unregister(duckdb_conn, queue, pandas): queue.put(False) -def table(duckdb_conn, 
queue, pandas): +def table(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") @@ -273,7 +270,7 @@ def table(duckdb_conn, queue, pandas): queue.put(False) -def view(duckdb_conn, queue, pandas): +def view(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") @@ -285,7 +282,7 @@ def view(duckdb_conn, queue, pandas): queue.put(False) -def values(duckdb_conn, queue, pandas): +def values(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -295,7 +292,7 @@ def values(duckdb_conn, queue, pandas): queue.put(False) -def from_query(duckdb_conn, queue, pandas): +def from_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -305,10 +302,10 @@ def from_query(duckdb_conn, queue, pandas): queue.put(False) -def from_df(duckdb_conn, queue, pandas): +def from_df(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() - df = pandas.DataFrame(["bla", "blabla"] * 10, columns=["A"]) # noqa: F841 + df = pd.DataFrame(["bla", "blabla"] * 10, columns=["A"]) # noqa: F841 try: duckdb_conn.execute("select * from df").fetchall() queue.put(True) @@ -316,7 +313,7 @@ def from_df(duckdb_conn, queue, pandas): queue.put(False) -def from_arrow(duckdb_conn, queue, pandas): +def from_arrow(duckdb_conn, queue): # Get a new connection pa = pytest.importorskip("pyarrow") duckdb_conn = duckdb.connect() @@ -328,7 +325,7 @@ def from_arrow(duckdb_conn, queue, pandas): queue.put(False) -def from_csv_auto(duckdb_conn, queue, pandas): +def from_csv_auto(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() filename = str(Path(__file__).parent / "data" / "integers.csv") @@ -339,7 +336,7 @@ def from_csv_auto(duckdb_conn, queue, pandas): queue.put(False) -def from_parquet(duckdb_conn, queue, pandas): +def from_parquet(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() filename = str(Path(__file__).parent / "data" / "binary_string.parquet") @@ -350,7 +347,7 @@ def from_parquet(duckdb_conn, queue, pandas): queue.put(False) -def description(_, queue, __): +def description(_, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE test (i bool, j TIME, k VARCHAR)") @@ -364,7 +361,7 @@ def description(_, queue, __): queue.put(False) -def cursor(duckdb_conn, queue, pandas): +def cursor(duckdb_conn, queue): # Get a new connection cx = duckdb_conn.cursor() try: @@ -375,136 +372,111 @@ def cursor(duckdb_conn, queue, pandas): class TestDuckMultithread: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_execute(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, execute_query, pandas) + def test_execute(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, execute_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_execute_many(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, execute_many_query, pandas) + def test_execute_many(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, execute_many_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchone(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchone_query, pandas) + def test_fetchone(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchone_query) 
duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchall(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchall_query, pandas) + def test_fetchall(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchall_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_close(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, conn_close, pandas) + def test_close(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, conn_close) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchnp(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchnp_query, pandas) + def test_fetchnp(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchnp_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchdf(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchdf_query, pandas) + def test_fetchdf(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchdf_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchdfchunk(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchdf_chunk_query, pandas) + def test_fetchdfchunk(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchdf_chunk_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetcharrow(self, duckdb_cursor, pandas): + def test_fetcharrow(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, fetch_arrow_query, pandas) + duck_threads = DuckDBThreaded(10, fetch_arrow_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetch_record_batch(self, duckdb_cursor, pandas): + def test_fetch_record_batch(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, fetch_record_batch_query, pandas) + duck_threads = DuckDBThreaded(10, fetch_record_batch_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_transaction(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, transaction_query, pandas) + def test_transaction(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, transaction_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_append(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, df_append, pandas) + def test_df_append(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, df_append) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_register(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, df_register, pandas) + def test_df_register(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, df_register) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_unregister(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, df_unregister, pandas) + def test_df_unregister(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, df_unregister) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", 
[NumpyPandas(), ArrowPandas()]) - def test_arrow_register_unregister(self, duckdb_cursor, pandas): + def test_arrow_register_unregister(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, arrow_register_unregister, pandas) + duck_threads = DuckDBThreaded(10, arrow_register_unregister) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_table(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, table, pandas) + def test_table(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, table) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_view(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, view, pandas) + def test_view(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, view) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_values(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, values, pandas) + def test_values(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, values) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_query(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_query, pandas) + def test_from_query(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_DF(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_df, pandas) + def test_from_DF(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_df) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_arrow(self, duckdb_cursor, pandas): + def test_from_arrow(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, from_arrow, pandas) + duck_threads = DuckDBThreaded(10, from_arrow) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_csv_auto(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_csv_auto, pandas) + def test_from_csv_auto(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_csv_auto) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_parquet(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_parquet, pandas) + def test_from_parquet(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_parquet) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_description(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, description, pandas) + def test_description(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, description) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_cursor(self, duckdb_cursor, pandas): + def test_cursor(self, duckdb_cursor): def only_some_succeed(results: list[bool]) -> bool: if not any(result for result in results): return False return not all(result for result in results) - duck_threads = DuckDBThreaded(10, cursor, pandas) + duck_threads = DuckDBThreaded(10, cursor) duck_threads.multithread_test(only_some_succeed) diff --git a/tests/fast/test_parameter_list.py 
b/tests/fast/test_parameter_list.py index 22413999..6d101bcb 100644 --- a/tests/fast/test_parameter_list.py +++ b/tests/fast/test_parameter_list.py @@ -1,5 +1,5 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -12,10 +12,9 @@ def test_bool(self, duckdb_cursor): res = conn.execute("select count(*) from bool_table where a =?", [True]) assert res.fetchone()[0] == 1 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_exception(self, duckdb_cursor, pandas): + def test_exception(self, duckdb_cursor): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } diff --git a/tests/fast/test_relation.py b/tests/fast/test_relation.py index f386b091..220fb954 100644 --- a/tests/fast/test_relation.py +++ b/tests/fast/test_relation.py @@ -2,13 +2,11 @@ import datetime import gc import os -import platform import tempfile import numpy as np import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb from duckdb import ColumnExpression @@ -39,10 +37,9 @@ def test_csv_auto(self): csv_rel = duckdb.from_csv_auto(temp_file_name) assert df_rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_view(self, duckdb_cursor, pandas): + def test_relation_view(self, duckdb_cursor): def create_view(duckdb_cursor) -> None: - df_in = pandas.DataFrame({"numbers": [1, 2, 3, 4, 5]}) + df_in = pd.DataFrame({"numbers": [1, 2, 3, 4, 5]}) rel = duckdb_cursor.query("select * from df_in") rel.to_view("my_view") @@ -536,15 +533,6 @@ def test_relation_print(self): 1024, 2048, 5000, - 1000000, - pytest.param( - 10000000, - marks=pytest.mark.skipif( - condition=platform.system() == "Emscripten", - reason="Emscripten/Pyodide builds run out of memory at this scale, and error might not " - "thrown reliably", - ), - ), ], ) def test_materialized_relation(self, duckdb_cursor, num_rows): diff --git a/tests/fast/test_relation_dependency_leak.py b/tests/fast/test_relation_dependency_leak.py index 659e1c28..db83ff1c 100644 --- a/tests/fast/test_relation_dependency_leak.py +++ b/tests/fast/test_relation_dependency_leak.py @@ -1,6 +1,7 @@ import os import numpy as np +import pandas as pd import pytest try: @@ -9,67 +10,61 @@ can_run = True except ImportError: can_run = False -from conftest import ArrowPandas, NumpyPandas psutil = pytest.importorskip("psutil") -def check_memory(function_to_check, pandas, duckdb_cursor): +def check_memory(function_to_check, duckdb_cursor): process = psutil.Process(os.getpid()) mem_usage = process.memory_info().rss / (10**9) for __ in range(100): - function_to_check(pandas, duckdb_cursor) + function_to_check(duckdb_cursor) cur_mem_usage = process.memory_info().rss / (10**9) # This seems a good empirical value assert cur_mem_usage / 3 < mem_usage -def from_df(pandas, duckdb_cursor): - df = pandas.DataFrame({"x": np.random.rand(1_000_000)}) +def from_df(duckdb_cursor): + df = pd.DataFrame({"x": np.random.rand(1_000_000)}) return duckdb_cursor.from_df(df) -def from_arrow(pandas, duckdb_cursor): +def from_arrow(duckdb_cursor): data = pa.array(np.random.rand(1_000_000), type=pa.float32()) arrow_table = pa.Table.from_arrays([data], ["a"]) duckdb_cursor.from_arrow(arrow_table) -def arrow_replacement(pandas, duckdb_cursor): +def arrow_replacement(duckdb_cursor): data = pa.array(np.random.rand(1_000_000), type=pa.float32()) arrow_table = pa.Table.from_arrays([data], ["a"]) 
# noqa: F841 duckdb_cursor.query("select sum(a) from arrow_table").fetchall() -def pandas_replacement(pandas, duckdb_cursor): - df = pandas.DataFrame({"x": np.random.rand(1_000_000)}) # noqa: F841 +def pandas_replacement(duckdb_cursor): + df = pd.DataFrame({"x": np.random.rand(1_000_000)}) # noqa: F841 duckdb_cursor.query("select sum(x) from df").fetchall() class TestRelationDependencyMemoryLeak: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_arrow_leak(self, pandas, duckdb_cursor): + def test_from_arrow_leak(self, duckdb_cursor): if not can_run: return - check_memory(from_arrow, pandas, duckdb_cursor) + check_memory(from_arrow, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_df_leak(self, pandas, duckdb_cursor): - check_memory(from_df, pandas, duckdb_cursor) + def test_from_df_leak(self, duckdb_cursor): + check_memory(from_df, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_arrow_replacement_scan_leak(self, pandas, duckdb_cursor): + def test_arrow_replacement_scan_leak(self, duckdb_cursor): if not can_run: return - check_memory(arrow_replacement, pandas, duckdb_cursor) + check_memory(arrow_replacement, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_replacement_scan_leak(self, pandas, duckdb_cursor): - check_memory(pandas_replacement, pandas, duckdb_cursor) + def test_pandas_replacement_scan_leak(self, duckdb_cursor): + check_memory(pandas_replacement, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_view_leak(self, pandas, duckdb_cursor): - rel = from_df(pandas, duckdb_cursor) + def test_relation_view_leak(self, duckdb_cursor): + rel = from_df(duckdb_cursor) rel.create_view("bla") duckdb_cursor.unregister("bla") assert rel.query("bla", "select count(*) from bla").fetchone()[0] == 1_000_000 diff --git a/tests/fast/test_runtime_error.py b/tests/fast/test_runtime_error.py index 9f1975a0..44910a13 100644 --- a/tests/fast/test_runtime_error.py +++ b/tests/fast/test_runtime_error.py @@ -1,5 +1,5 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -61,10 +61,9 @@ def test_arrow_record_batch_reader_error(self): with pytest.raises(duckdb.ProgrammingError, match="There is no query result"): res.fetch_arrow_reader(1) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_cache_fetchall(self, pandas): + def test_relation_cache_fetchall(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -78,10 +77,9 @@ def test_relation_cache_fetchall(self, pandas): # so the dependency of 'x' on 'df_in' is not registered in 'rel' rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_cache_execute(self, pandas): + def test_relation_cache_execute(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -92,10 +90,9 @@ def test_relation_cache_execute(self, pandas): with pytest.raises(duckdb.ProgrammingError, match="Table with name df_in does not exist"): rel.execute() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_query_error(self, pandas): + def test_relation_query_error(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 
5], } @@ -106,10 +103,9 @@ def test_relation_query_error(self, pandas): with pytest.raises(duckdb.CatalogException, match="Table with name df_in does not exist"): rel.query("bla", "select * from bla") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_conn_broken_statement_error(self, pandas): + def test_conn_broken_statement_error(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -128,11 +124,10 @@ def test_conn_prepared_statement_error(self): ): conn.execute("select * from integers where a =? and b=?", [1]) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_closed_conn_exceptions(self, pandas): + def test_closed_conn_exceptions(self): conn = duckdb.connect() conn.close() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } diff --git a/tests/slow/test_materialized_relation.py b/tests/slow/test_materialized_relation.py new file mode 100644 index 00000000..69008adc --- /dev/null +++ b/tests/slow/test_materialized_relation.py @@ -0,0 +1,52 @@ +import platform + +import pytest + + +class TestMaterializedRelationSlow: + @pytest.mark.parametrize( + "num_rows", + [ + 1000000, + pytest.param( + 10000000, + marks=pytest.mark.skipif( + condition=platform.system() == "Emscripten", + reason="Emscripten/Pyodide builds run out of memory at this scale, and the error might not " + "be thrown reliably", + ), + ), + ], + ) + def test_materialized_relation(self, duckdb_cursor, num_rows): + # Anything that is not a SELECT statement becomes a materialized relation, so we use `CALL` + query = f"call repeat_row(42, 'test', 'this is a long string', true, num_rows={num_rows})" + rel = duckdb_cursor.sql(query) + res = rel.fetchone() + assert res is not None + + res = rel.fetchmany(num_rows) + assert len(res) == num_rows - 1 + + res = rel.fetchmany(5) + assert len(res) == 0 + res = rel.fetchmany(5) + assert len(res) == 0 + res = rel.fetchone() + assert res is None + + rel.execute() + res = rel.fetchone() + assert res is not None + + res = rel.fetchall() + assert len(res) == num_rows - 1 + res = rel.fetchall() + assert len(res) == num_rows + + rel = duckdb_cursor.sql(query) + projection = rel.select("column0") + assert projection.fetchall() == [(42,) for _ in range(num_rows)] + + filtered = rel.filter("column1 != 'test'") + assert filtered.fetchall() == []
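For reviewers who want to try the timedelta changes locally, a minimal sketch (not part of the patch) of the roundtrip behavior the updated tests assert, run against a build that includes this change:

import duckdb
import pandas as pd

# INTERVAL results now convert to timedelta64[us], matching DuckDB's
# microsecond storage; before this change the dtype was timedelta64[ns].
df = duckdb.sql("select interval 5 seconds as t").df()
print(df["t"].dtype)  # expected: timedelta64[us]

# Scanning pandas timedelta columns now accepts all four units
# (ns/us/ms/s); values are normalized to microseconds on the way in.
src = pd.DataFrame({"t": pd.Series([pd.Timedelta(seconds=5)], dtype="timedelta64[ms]")})
print(duckdb.sql("select * from src").types)  # expected: [INTERVAL]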
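Likewise, a small sketch of the new-string-type handling covered by tests/fast/pandas/test_new_string_type.py; the pd.options.future.infer_string opt-in assumes pandas >= 2.1 and is the default behavior on 3.0+:

import duckdb
import pandas as pd
from packaging.version import Version

# On pandas 2.x, opt in to the future string inference; pandas 3.0+
# already infers Python strings as the new 'str' dtype by default.
if Version(pd.__version__) < Version("3.0"):
    pd.options.future.infer_string = True

df = pd.DataFrame({"s": ["DuckDB"]})
rel = duckdb.sql("select * from df")
print(rel.types)       # expected: [VARCHAR]
print(rel.fetchall())  # expected: [('DuckDB',)]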