9 changes: 6 additions & 3 deletions pyproject.toml
@@ -234,7 +234,8 @@ stubdeps = [ # dependencies used for typehints in the stubs
"typing-extensions",
]
test = [ # dependencies used for running tests
"adbc-driver-manager; sys_platform != 'win32' or platform_machine != 'ARM64'",
"adbc-driver-manager>=1.10.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
"adbc-driver-manager>=1.7.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
"pytest",
"pytest-reraise",
"pytest-timeout",
@@ -252,8 +253,10 @@ test = [ # dependencies used for running tests
"requests",
"urllib3",
"fsspec>=2022.11.0; sys_platform != 'win32' or platform_machine != 'ARM64'",
"pandas>=2.0.0",
"pyarrow>=18.0.0; sys_platform != 'win32' or platform_machine != 'ARM64'",
"pandas>=3.0.0; python_version > '3.10'",
"pandas<3.0.0; python_version < '3.11'",
"pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
"pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
"torch>=2.2.2; python_version < '3.14' and ( sys_platform != 'darwin' or platform_machine != 'x86_64' or python_version < '3.13' ) and ( sys_platform != 'win32' or platform_machine != 'ARM64' or python_version > '3.11' )",
"tensorflow==2.14.0; sys_platform == 'darwin' and python_version < '3.12'",
"tensorflow-cpu>=2.14.0; sys_platform == 'linux' and platform_machine != 'aarch64' and python_version < '3.12'",
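Note on the markers above: pip evaluates each `;` clause per environment, so the paired `python_version` bounds are mutually exclusive and every interpreter resolves exactly one pin (pandas>=3.0.0 on Python 3.11+, pandas<3.0.0 on 3.9/3.10; likewise for adbc-driver-manager and pyarrow). A minimal sketch, assuming the third-party packaging library, of how such a marker evaluates:

# Sketch: evaluate the new pandas markers against a hypothetical interpreter.
# Assumes the 'packaging' library; marker strings are copied from the hunk above.
from packaging.markers import Marker

new_pin = Marker("python_version > '3.10'")   # guards pandas>=3.0.0
old_pin = Marker("python_version < '3.11'")   # guards pandas<3.0.0

env = {"python_version": "3.9"}
print(new_pin.evaluate(env), old_pin.evaluate(env))  # False True -> pandas<3.0.0 wins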
41 changes: 22 additions & 19 deletions src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp
@@ -18,25 +18,28 @@ namespace duckdb {
// Pandas Specific Types (e.g., categorical, datetime_tz,...)
enum class NumpyNullableType : uint8_t {
//! NumPy dtypes
BOOL, //! bool_, bool8
INT_8, //! byte, int8
UINT_8, //! ubyte, uint8
INT_16, //! int16, short
UINT_16, //! uint16, ushort
INT_32, //! int32, intc
UINT_32, //! uint32, uintc,
INT_64, //! int64, int0, int_, intp, matrix
UINT_64, //! uint64, uint, uint0, uintp
FLOAT_16, //! float16, half
FLOAT_32, //! float32, single
FLOAT_64, //! float64, float_, double
OBJECT, //! object
UNICODE, //! <U1, unicode_, str_, str0
DATETIME_S, //! datetime64[s], <M8[s]
DATETIME_MS, //! datetime64[ms], <M8[ms]
DATETIME_NS, //! datetime64[ns], <M8[ns]
DATETIME_US, //! datetime64[us], <M8[us]
TIMEDELTA, //! timedelta64[D], timedelta64
BOOL, //! bool_, bool8
INT_8, //! byte, int8
UINT_8, //! ubyte, uint8
INT_16, //! int16, short
UINT_16, //! uint16, ushort
INT_32, //! int32, intc
UINT_32, //! uint32, uintc,
INT_64, //! int64, int0, int_, intp, matrix
UINT_64, //! uint64, uint, uint0, uintp
FLOAT_16, //! float16, half
FLOAT_32, //! float32, single
FLOAT_64, //! float64, float_, double
OBJECT, //! object
UNICODE, //! <U1, unicode_, str_, str0
DATETIME_S, //! datetime64[s], <M8[s]
DATETIME_MS, //! datetime64[ms], <M8[ms]
DATETIME_NS, //! datetime64[ns], <M8[ns]
DATETIME_US, //! datetime64[us], <M8[us]
TIMEDELTA_NS, //! timedelta64[ns]
TIMEDELTA_US, //! timedelta64[us]
TIMEDELTA_MS, //! timedelta64[ms]
TIMEDELTA_S, //! timedelta64[s]

//! ------------------------------------------------------------
//! Extension Types
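The old single TIMEDELTA member assumed nanosecond input; since pandas 2.0 a column can carry any of four resolutions, which the one-member-per-unit split mirrors. For orientation, the dtype strings involved (plain numpy, nothing DuckDB-specific):

# The four numpy timedelta64 resolutions the enum now distinguishes.
import numpy as np

for unit in ("ns", "us", "ms", "s"):
    dt = np.dtype(f"timedelta64[{unit}]")
    print(str(dt), np.timedelta64(1, unit) / np.timedelta64(1, "us"))
# timedelta64[ns] 0.001, timedelta64[us] 1.0, timedelta64[ms] 1000.0, timedelta64[s] 1000000.0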
2 changes: 1 addition & 1 deletion src/duckdb_py/numpy/array_wrapper.cpp
@@ -112,7 +112,7 @@ struct IntervalConvert {
template <class DUCKDB_T, class NUMPY_T>
static int64_t ConvertValue(interval_t val, NumpyAppendData &append_data) {
(void)append_data;
return Interval::GetNanoseconds(val);
return Interval::GetMicro(val);
}

template <class NUMPY_T, bool PANDAS>
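Both helpers already exist on DuckDB's Interval class; the switch to microseconds pairs with the timedelta64[us] output dtype below and also widens the representable range of the int64 tick count. Back-of-the-envelope, illustrative arithmetic only:

# int64 range at each tick resolution.
SECONDS_PER_YEAR = 365.25 * 24 * 3600
print(2**63 / 1e9 / SECONDS_PER_YEAR)  # ~292 years when ticks are nanoseconds
print(2**63 / 1e6 / SECONDS_PER_YEAR)  # ~292,000 years when ticks are microseconds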
25 changes: 23 additions & 2 deletions src/duckdb_py/numpy/numpy_scan.cpp
@@ -302,7 +302,10 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset,
}
break;
}
case NumpyNullableType::TIMEDELTA: {
case NumpyNullableType::TIMEDELTA_NS:
case NumpyNullableType::TIMEDELTA_US:
case NumpyNullableType::TIMEDELTA_MS:
case NumpyNullableType::TIMEDELTA_S: {
auto src_ptr = reinterpret_cast<const int64_t *>(array.data());
auto tgt_ptr = FlatVector::GetData<interval_t>(out);
auto &mask = FlatVector::Validity(out);
@@ -314,7 +317,25 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset,
mask.SetInvalid(row);
continue;
}
int64_t micro = src_ptr[source_idx] / 1000;

int64_t micro;
switch (bind_data.numpy_type.type) {
case NumpyNullableType::TIMEDELTA_NS:
micro = src_ptr[source_idx] / 1000; // ns -> us
break;
case NumpyNullableType::TIMEDELTA_US:
micro = src_ptr[source_idx]; // already us
break;
case NumpyNullableType::TIMEDELTA_MS:
micro = src_ptr[source_idx] * 1000; // ms -> us
break;
case NumpyNullableType::TIMEDELTA_S:
micro = src_ptr[source_idx] * 1000000; // s -> us
break;
default:
throw InternalException("Unexpected timedelta type");
}

int64_t days = micro / Interval::MICROS_PER_DAY;
micro = micro % Interval::MICROS_PER_DAY;
int64_t months = days / Interval::DAYS_PER_MONTH;
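The per-unit switch normalizes every source resolution to microseconds before the existing days/months decomposition runs. A hedged round-trip sketch of what this enables (column name and values are illustrative; assumes DuckDB's replacement scan finds the local DataFrame):

# Sketch: lower-resolution timedelta columns should now scan as INTERVAL.
import duckdb
import pandas as pd

df = pd.DataFrame({"d": pd.to_timedelta([1, 90], unit="s").astype("timedelta64[s]")})
print(duckdb.sql("SELECT d FROM df").fetchall())
# Expected: [(datetime.timedelta(seconds=1),), (datetime.timedelta(seconds=90),)]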
2 changes: 1 addition & 1 deletion src/duckdb_py/numpy/raw_array_wrapper.cpp
@@ -108,7 +108,7 @@ string RawArrayWrapper::DuckDBToNumpyDtype(const LogicalType &type) {
case LogicalTypeId::DATE:
return "datetime64[us]";
case LogicalTypeId::INTERVAL:
return "timedelta64[ns]";
return "timedelta64[us]";
case LogicalTypeId::TIME:
case LogicalTypeId::TIME_TZ:
case LogicalTypeId::VARCHAR:
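Output-side counterpart of the GetMicro change above: INTERVAL columns now materialize as microsecond-resolution timedeltas. A quick way to observe it (sketch):

# Sketch: the numpy dtype DuckDB hands back for an INTERVAL column.
import duckdb

res = duckdb.sql("SELECT INTERVAL 3 DAYS AS d").fetchnumpy()
print(res["d"].dtype)  # timedelta64[us] after this change (previously ns)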
19 changes: 17 additions & 2 deletions src/duckdb_py/numpy/type.cpp
@@ -58,11 +58,23 @@ static NumpyNullableType ConvertNumpyTypeInternal(const string &col_type_str) {
if (col_type_str == "string") {
return NumpyNullableType::STRING;
}
if (col_type_str == "str") {
return NumpyNullableType::STRING;
}
if (col_type_str == "object") {
return NumpyNullableType::OBJECT;
}
if (col_type_str == "timedelta64[ns]") {
return NumpyNullableType::TIMEDELTA;
return NumpyNullableType::TIMEDELTA_NS;
}
if (col_type_str == "timedelta64[us]") {
return NumpyNullableType::TIMEDELTA_US;
}
if (col_type_str == "timedelta64[ms]") {
return NumpyNullableType::TIMEDELTA_MS;
}
if (col_type_str == "timedelta64[s]") {
return NumpyNullableType::TIMEDELTA_S;
}
// We use 'StartsWith' because it might have ', tz' at the end, indicating timezone
if (StringUtil::StartsWith(col_type_str, "datetime64[ns")) {
@@ -140,7 +152,10 @@ LogicalType NumpyToLogicalType(const NumpyType &col_type) {
return LogicalType::VARCHAR;
case NumpyNullableType::OBJECT:
return LogicalType::VARCHAR;
case NumpyNullableType::TIMEDELTA:
case NumpyNullableType::TIMEDELTA_NS:
case NumpyNullableType::TIMEDELTA_US:
case NumpyNullableType::TIMEDELTA_MS:
case NumpyNullableType::TIMEDELTA_S:
return LogicalType::INTERVAL;
case NumpyNullableType::DATETIME_MS: {
if (col_type.has_timezone) {
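ConvertNumpyTypeInternal matches on the stringified dtype, so the four new branches line up one-to-one with the strings pandas reports, and NumpyToLogicalType then folds them all into LogicalType::INTERVAL. For reference, the strings as pandas produces them:

# The dtype strings the parser now recognizes, exactly as pandas reports them.
import pandas as pd

s = pd.Series(pd.to_timedelta([1], unit="ms"))
for unit in ("ns", "us", "ms", "s"):
    print(str(s.astype(f"timedelta64[{unit}]").dtype))
# -> timedelta64[ns], timedelta64[us], timedelta64[ms], timedelta64[s]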
2 changes: 1 addition & 1 deletion src/duckdb_py/pyresult.cpp
@@ -304,7 +304,7 @@ void DuckDBPyResult::ConvertDateTimeTypes(PandasDataFrame &df, bool date_as_object) {
// We need to create the column anew because the exact dt changed to a new timezone
ReplaceDFColumn(df, names[i].c_str(), i, new_value);
} else if (date_as_object && result->types[i] == LogicalType::DATE) {
auto new_value = df[names[i].c_str()].attr("dt").attr("date");
py::object new_value = df[names[i].c_str()].attr("dt").attr("date");
ReplaceDFColumn(df, names[i].c_str(), i, new_value);
}
}
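A plausible reading of this one-liner: chained attr() calls return pybind11's lazy attribute-accessor proxy when bound with auto, and typing the local as py::object forces the lookup into a plain object handle before ReplaceDFColumn takes it. User-visible behavior should be unchanged; a usage sketch, assuming fetchdf's date_as_object flag:

# Sketch: DATE columns come back as datetime.date objects when requested.
import duckdb

con = duckdb.connect()
df = con.execute("SELECT DATE '2000-01-01' AS d").fetchdf(date_as_object=True)
print(type(df["d"].iloc[0]))  # expected: <class 'datetime.date'>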
100 changes: 21 additions & 79 deletions tests/conftest.py
@@ -2,7 +2,7 @@
import warnings
from importlib import import_module
from pathlib import Path
from typing import Any, Union
from typing import Union

import pytest

@@ -19,13 +19,27 @@
pandas = None
pyarrow_dtype = None

# Check if pandas has arrow dtypes enabled
try:
from pandas.compat import pa_version_under7p0

pyarrow_dtypes_enabled = not pa_version_under7p0
except ImportError:
pyarrow_dtypes_enabled = False
# Version-aware helpers for Pandas 2.x vs 3.0 compatibility
def _get_pandas_ge_3():
if pandas is None:
return False
from packaging.version import Version

return Version(pandas.__version__) >= Version("3.0.0")


PANDAS_GE_3 = _get_pandas_ge_3()


def is_string_dtype(dtype):
"""Check if a dtype is a string dtype (works across Pandas 2.x and 3.0).

Uses pd.api.types.is_string_dtype() which handles:
- Pandas 2.x: object dtype for strings
- Pandas 3.0+: str (StringDtype) for strings
"""
return pandas.api.types.is_string_dtype(dtype)


def import_pandas():
@@ -113,78 +127,6 @@ def pandas_supports_arrow_backend():
return pandas_2_or_higher()


def numpy_pandas_df(*args, **kwargs):
return import_pandas().DataFrame(*args, **kwargs)


def arrow_pandas_df(*args, **kwargs):
df = numpy_pandas_df(*args, **kwargs)
return df.convert_dtypes(dtype_backend="pyarrow")


class NumpyPandas:
def __init__(self) -> None:
self.backend = "numpy_nullable"
self.DataFrame = numpy_pandas_df
self.pandas = import_pandas()

def __getattr__(self, name: str) -> Any: # noqa: ANN401
return getattr(self.pandas, name)


def convert_arrow_to_numpy_backend(df):
names = df.columns
df_content = {}
for name in names:
df_content[name] = df[name].array.__arrow_array__()
# This should convert the pyarrow chunked arrays into numpy arrays
return import_pandas().DataFrame(df_content)


def convert_to_numpy(df):
if (
pyarrow_dtypes_enabled
and pyarrow_dtype is not None
and any(True for x in df.dtypes if isinstance(x, pyarrow_dtype))
):
return convert_arrow_to_numpy_backend(df)
return df


def convert_and_equal(df1, df2, **kwargs):
df1 = convert_to_numpy(df1)
df2 = convert_to_numpy(df2)
import_pandas().testing.assert_frame_equal(df1, df2, **kwargs)


class ArrowMockTesting:
def __init__(self) -> None:
self.testing = import_pandas().testing
self.assert_frame_equal = convert_and_equal

def __getattr__(self, name: str) -> Any: # noqa: ANN401
return getattr(self.testing, name)


# This converts dataframes constructed with 'DataFrame(...)' to pyarrow backed dataframes
# Assert equal does the opposite, turning all pyarrow backed dataframes into numpy backed ones
# this is done because we don't produce pyarrow backed dataframes yet
class ArrowPandas:
def __init__(self) -> None:
self.pandas = import_pandas()
if pandas_2_or_higher() and pyarrow_dtypes_enabled:
self.backend = "pyarrow"
self.DataFrame = arrow_pandas_df
else:
# For backwards compatible reasons, just mock regular pandas
self.backend = "numpy_nullable"
self.DataFrame = self.pandas.DataFrame
self.testing = ArrowMockTesting()

def __getattr__(self, name: str) -> Any: # noqa: ANN401
return getattr(self.pandas, name)


@pytest.fixture
def require():
def _require(extension_name, db_name="") -> Union[duckdb.DuckDBPyConnection, None]:
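The new is_string_dtype helper is what lets tests stop caring whether strings come back as object (the pandas 2.x default) or the str/StringDtype default of pandas 3.0. Illustration:

# The same predicate passes on both pandas generations.
import pandas as pd

s = pd.Series(["a", "b"])
print(s.dtype)                                # object on 2.x, str on 3.0
print(pd.api.types.is_string_dtype(s.dtype))  # True on both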
26 changes: 11 additions & 15 deletions tests/coverage/test_pandas_categorical_coverage.py
@@ -1,5 +1,4 @@
import pytest
from conftest import NumpyPandas
import pandas as pd

import duckdb

@@ -9,23 +8,23 @@ def check_result_list(res):
assert res_item[0] == res_item[1]


def check_create_table(category, pandas):
def check_create_table(category):
conn = duckdb.connect()

conn.execute("PRAGMA enable_verification")
df_in = pandas.DataFrame(
df_in = pd.DataFrame(
{
"x": pandas.Categorical(category, ordered=True),
"y": pandas.Categorical(category, ordered=True),
"x": pd.Categorical(category, ordered=True),
"y": pd.Categorical(category, ordered=True),
"z": category,
}
)

category.append("bla")

df_in_diff = pandas.DataFrame( # noqa: F841
df_in_diff = pd.DataFrame( # noqa: F841
{
"k": pandas.Categorical(category, ordered=True),
"k": pd.Categorical(category, ordered=True),
}
)

@@ -68,14 +67,11 @@ def check_create_table(category, pandas):
conn.execute("DROP TABLE t1")


# TODO: extend tests with ArrowPandas # noqa: TD002, TD003
class TestCategory:
@pytest.mark.parametrize("pandas", [NumpyPandas()])
def test_category_string_uint16(self, duckdb_cursor, pandas):
def test_category_string_uint16(self, duckdb_cursor):
category = [str(i) for i in range(300)]
check_create_table(category, pandas)
check_create_table(category)

@pytest.mark.parametrize("pandas", [NumpyPandas()])
def test_category_string_uint32(self, duckdb_cursor, pandas):
def test_category_string_uint32(self, duckdb_cursor):
category = [str(i) for i in range(70000)]
check_create_table(category, pandas)
check_create_table(category)
11 changes: 5 additions & 6 deletions tests/extensions/test_httpfs.py
@@ -1,8 +1,8 @@
import datetime
import os

import pandas as pd
import pytest
from conftest import ArrowPandas, NumpyPandas

import duckdb

@@ -34,8 +34,7 @@ def test_s3fs(self, require):
res = rel.fetchone()
assert res == (1, 0, datetime.date(1965, 2, 28), 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 6, 0, 0, 0, 0)

@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
def test_httpfs(self, require, pandas):
def test_httpfs(self, require):
connection = require("httpfs")
try:
connection.execute("""
@@ -51,14 +50,14 @@ def test_httpfs(self, require, pandas):
raise

result_df = connection.fetchdf()
exp_result = pandas.DataFrame(
exp_result = pd.DataFrame(
{
"id": pandas.Series([1, 2, 3], dtype="int32"),
"id": pd.Series([1, 2, 3], dtype="int32"),
"first_name": ["Amanda", "Albert", "Evelyn"],
"last_name": ["Jordan", "Freeman", "Morgan"],
}
)
pandas.testing.assert_frame_equal(result_df, exp_result)
pd.testing.assert_frame_equal(result_df, exp_result, check_dtype=False)

def test_http_exception(self, require):
connection = require("httpfs")
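The added check_dtype=False is the same pandas-3 accommodation: first_name and last_name infer as object under pandas 2.x but as str under 3.0, while the values are identical. Minimal demonstration:

# Why check_dtype=False: string columns differ only in dtype across pandas versions.
import pandas as pd

a = pd.Series(["Amanda"], dtype="object")
b = pd.Series(["Amanda"])  # str dtype on pandas 3.0, object on 2.x
pd.testing.assert_series_equal(a, b, check_dtype=False)  # passes either way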