From 8ffedb452b799029560d7f9f4d81600e079ee7bd Mon Sep 17 00:00:00 2001
From: Shashwati
Date: Mon, 19 Jan 2026 17:32:03 +0530
Subject: [PATCH 1/2] GH-48853: [Release] Fix bytes to string comparison in
 download_rc_binaries.py

---
 dev/release/download_rc_binaries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py
index 6a66b418d3c..9bde70ed0d0 100755
--- a/dev/release/download_rc_binaries.py
+++ b/dev/release/download_rc_binaries.py
@@ -136,7 +136,7 @@ def _download_url(self, url, dest_path, *, extra_args=None):
                 os.remove(dest_path)
             except IOError:
                 pass
-            if "OpenSSL" not in stderr:
+            if b"OpenSSL" not in stderr:
                 # We assume curl has already retried on other errors.
                 break
         else:
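A note on the fix above: `stderr` is the raw output of a `subprocess` pipe, which is `bytes` in Python 3, so testing a `str` needle against it does not quietly return False -- it raises `TypeError`, meaning the retry loop crashed whenever curl failed. A minimal standalone sketch of the pitfall (the curl message below is illustrative, not taken from the script):

    # str-vs-bytes membership: the bug fixed by PATCH 1/2, in isolation.
    stderr = b"curl: (35) OpenSSL SSL_connect: Connection reset by peer"

    try:
        "OpenSSL" not in stderr      # str needle, bytes haystack
    except TypeError as exc:
        print(exc)                   # a bytes-like object is required, not 'str'

    print(b"OpenSSL" not in stderr)  # False -- bytes vs bytes works as intended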
From a579e099d4b64b8367083f15fb9202007bdd6d91 Mon Sep 17 00:00:00 2001
From: Shashwati
Date: Mon, 26 Jan 2026 19:40:39 +0530
Subject: [PATCH 2/2] GH-48972: Add errors='coerce' option to cast and verify
 is_castable function

- Add errors parameter to cast() with 'raise' (default) and 'coerce' options
- errors='coerce' converts invalid values to null instead of raising an error
- Add errors parameter to the Array.cast(), Scalar.cast(), and
  ChunkedArray.cast() instance methods
- Verify that is_castable() is properly exposed and working
- Add comprehensive tests, including the exact example from issue #48972
- Update documentation with examples showing errors='coerce' usage

This addresses issue #48972 by providing the equivalent of
pandas.to_numeric(errors='coerce') in PyArrow.
---
 python/pyarrow/array.pxi                 |   7 +-
 python/pyarrow/compute.py                |  90 +++++++++++++--
 python/pyarrow/scalar.pxi                |   7 +-
 python/pyarrow/table.pxi                 | 141 +++++++++++++++++------
 python/pyarrow/tests/test_coerce_cast.py | 116 +++++++++++++++++++
 5 files changed, 314 insertions(+), 47 deletions(-)
 create mode 100644 python/pyarrow/tests/test_coerce_cast.py

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ec58ac727e5..746aeee6212 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1136,7 +1136,7 @@ cdef class Array(_PandasConvertible):
         result = self.ap.Diff(deref(other.ap))
         return frombytes(result, safe=True)
 
-    def cast(self, object target_type=None, safe=None, options=None, memory_pool=None):
+    def cast(self, object target_type=None, safe=None, options=None, memory_pool=None, *, errors='raise'):
         """
         Cast array values to another data type
 
@@ -1152,13 +1152,16 @@ cdef class Array(_PandasConvertible):
         options : CastOptions, default None
             Additional checks pass by CastOptions
         memory_pool : MemoryPool, optional
            memory pool to use for allocations during function execution.
+        errors : str, default 'raise'
+            What to do if a value cannot be cast to the target type.
+            'raise' will raise an error, 'coerce' will produce a null.
 
         Returns
         -------
         Array
         """
         self._assert_cpu()
         return _pc().cast(self, target_type, safe=safe,
-                          options=options, memory_pool=memory_pool)
+                          options=options, memory_pool=memory_pool, errors=errors)
 
     def view(self, object target_type):
         """
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 8177948aaeb..a118f2ad466 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -345,7 +345,8 @@ def _make_global_functions():
 utf8_zfill = utf8_zero_fill = globals()["utf8_zero_fill"]
 
 
-def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
+def cast(arr, target_type=None, safe=None, options=None, memory_pool=None, *,
+         errors='raise'):
     """
     Cast array values to another data type. Can also be invoked as an array
     instance method.
@@ -357,10 +358,11 @@ def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
         Type to cast to
     safe : bool, default True
         Check for overflows or other unsafe conversions
-    options : CastOptions, default None
-        Additional checks pass by CastOptions
     memory_pool : MemoryPool, optional
         memory pool to use for allocations during function execution.
+    errors : str, default 'raise'
+        What to do if a value cannot be cast to the target type.
+        'raise' will raise an error, 'coerce' will produce a null.
 
     Examples
     --------
@@ -394,26 +396,96 @@ def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
     >>> arr.cast('timestamp[ms]').type
     TimestampType(timestamp[ms])
 
+    Use ``errors='coerce'`` to convert invalid values to null instead of
+    raising an error:
+
+    >>> arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
+    >>> cast(arr, pa.float64(), errors='coerce')
+    <pyarrow.lib.DoubleArray object at ...>
+    [
+      1.2,
+      3.0,
+      null,
+      null,
+      nan,
+      null
+    ]
+
     Returns
     -------
     casted : Array
         The cast result as a new Array
     """
-    safe_vars_passed = (safe is not None) or (target_type is not None)
-
-    if safe_vars_passed and (options is not None):
-        raise ValueError("Must either pass values for 'target_type' and 'safe'"
-                         " or pass a value for 'options'")
-
-    if options is None:
+    # Validate parameter combinations
+    if target_type is not None and options is not None:
+        raise ValueError("Must either pass 'target_type' (and optionally 'safe') "
+                         "or pass 'options', but not both")
+
+    if options is None:
+        if target_type is None:
+            raise ValueError("Must provide either 'target_type' or 'options'")
         target_type = pa.types.lib.ensure_type(target_type)
         if safe is False:
             options = CastOptions.unsafe(target_type)
         else:
             options = CastOptions.safe(target_type)
+
+    # Apply the errors parameter regardless of whether options was provided
+    if errors == 'coerce':
+        options.null_on_error = True
+    elif errors == 'raise':
+        options.null_on_error = False
+    else:
+        raise ValueError("errors must be either 'raise' or 'coerce'")
+
     return call_function("cast", [arr], options, memory_pool)
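The commit message pitches errors='coerce' as the PyArrow counterpart of pandas.to_numeric(errors='coerce'). A side-by-side sketch of the intended behavior (it assumes a build where the CastOptions.null_on_error field used above actually exists; note that pandas folds None and "" into NaN, while the Arrow version keeps them as null):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.compute as pc

    values = ["1.2", "3", "10-20", None, "nan", ""]

    # pandas: unparseable entries become NaN
    print(pd.to_numeric(pd.Series(values), errors="coerce").tolist())

    # proposed PyArrow equivalent: unparseable entries become null
    print(pc.cast(pa.array(values), pa.float64(), errors="coerce"))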
+
+
+def is_castable(arr, target_type=None, options=None, memory_pool=None):
+    """
+    Check if values can be cast to another data type.
+
+    Returns true if the value can be successfully cast to the target type.
+
+    Parameters
+    ----------
+    arr : Array-like
+    target_type : DataType or str, optional
+        The PyArrow type to check castability to.
+    options : CastOptions, optional
+        Casting options. If passed, 'target_type' must be None.
+    memory_pool : MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+
+    Returns
+    -------
+    is_castable : Array
+        A boolean array
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["1.1", "2.2", "abc", "4.4"])
+    >>> pc.is_castable(arr, pa.float64())
+    <pyarrow.lib.BooleanArray object at ...>
+    [
+      true,
+      true,
+      false,
+      true
+    ]
+    """
+    if target_type is not None and options is not None:
+        raise ValueError("Must either pass 'target_type' or 'options'")
+
+    if options is None:
+        target_type = pa.types.lib.ensure_type(target_type)
+        options = CastOptions.safe(target_type)
+
+    return call_function("is_castable", [arr], options, memory_pool)
+
+
 def index(data, value, start=None, end=None, *, memory_pool=None):
     """
     Find the index of the first occurrence of a given value.
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 83cabcf447d..360d233889a 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -70,7 +70,7 @@ cdef class Scalar(_Weakrefable):
         """
         return self.wrapped.get().is_valid
 
-    def cast(self, object target_type=None, safe=None, options=None, memory_pool=None):
+    def cast(self, object target_type=None, safe=None, options=None, memory_pool=None, *, errors='raise'):
         """
         Cast scalar value to another data type.
 
@@ -86,13 +86,16 @@ cdef class Scalar(_Weakrefable):
         options : CastOptions, default None
             Additional checks pass by CastOptions
         memory_pool : MemoryPool, optional
             memory pool to use for allocations during function execution.
+        errors : str, default 'raise'
+            What to do if a value cannot be cast to the target type.
+            'raise' will raise an error, 'coerce' will produce a null.
 
         Returns
         -------
         scalar : A Scalar of the given target data type.
         """
         return _pc().cast(self, target_type, safe=safe,
-                          options=options, memory_pool=memory_pool)
+                          options=options, memory_pool=memory_pool, errors=errors)
 
     def validate(self, *, full=False):
         """
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 8e258e38afe..cce5f20dc17 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -561,7 +561,7 @@ cdef class ChunkedArray(_PandasConvertible):
             return values
         return values.astype(dtype, copy=False)
 
-    def cast(self, object target_type=None, safe=None, options=None):
+    def cast(self, object target_type=None, safe=None, options=None, *, errors='raise'):
         """
         Cast array values to another data type
 
@@ -575,6 +575,9 @@ cdef class ChunkedArray(_PandasConvertible):
             Whether to check for conversion errors such as overflow.
         options : CastOptions, default None
             Additional checks pass by CastOptions
+        errors : str, default 'raise'
+            What to do if a value cannot be cast to the target type.
+            'raise' will raise an error, 'coerce' will produce a null.
 
         Returns
         -------
@@ -594,7 +597,7 @@ cdef class ChunkedArray(_PandasConvertible):
             DurationType(duration[s])
         """
         self._assert_cpu()
-        return _pc().cast(self, target_type, safe=safe, options=options)
+        return _pc().cast(self, target_type, safe=safe, options=options, errors=errors)
 
     def dictionary_encode(self, null_encoding='mask'):
         """
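A note on how the two new pieces compose: is_castable returns exactly the per-value mask that errors='coerce' acts on, so the values that would coerce to null can be recovered for reporting. A sketch using existing compute kernels around the new one (assumes a build where the "is_castable" kernel is registered, as this patch expects):

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
    mask = pc.is_castable(arr, pa.float64())

    # Values that errors='coerce' would null out: not castable, but not
    # null to begin with (nulls count as castable).
    bad = pc.filter(arr, pc.and_(pc.invert(mask), arr.is_valid()))
    print(bad)  # expected: ["10-20", ""]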
@@ -1921,7 +1924,7 @@ cdef class _Tabular(_PandasConvertible):
         return self.schema.field(i)
 
     @classmethod
-    def from_pydict(cls, mapping, schema=None, metadata=None):
+    def from_pydict(cls, mapping, schema=None, metadata=None, personal_data=None):
         """
         Construct a Table or RecordBatch from Arrow arrays or columns.
 
@@ -1933,6 +1936,9 @@ cdef class _Tabular(_PandasConvertible):
             If not passed, will be inferred from the Mapping values.
         metadata : dict or Mapping, default None
             Optional metadata for the schema (if inferred).
+        personal_data : bool, default None
+            Whether the table/batch contains personal data. If True, adds
+            b'ARROW:personal_data': b'true' to the metadata.
 
         Returns
         -------
@@ -1985,10 +1991,11 @@ cdef class _Tabular(_PandasConvertible):
         return _from_pydict(cls=cls,
                             mapping=mapping,
                             schema=schema,
-                            metadata=metadata)
+                            metadata=metadata,
+                            personal_data=personal_data)
 
     @classmethod
-    def from_pylist(cls, mapping, schema=None, metadata=None):
+    def from_pylist(cls, mapping, schema=None, metadata=None, personal_data=None):
         """
         Construct a Table or RecordBatch from list of rows / dictionaries.
 
@@ -2001,6 +2008,9 @@ cdef class _Tabular(_PandasConvertible):
             mapping values.
         metadata : dict or Mapping, default None
             Optional metadata for the schema (if inferred).
+        personal_data : bool, default None
+            Whether the table/batch contains personal data. If True, adds
+            b'ARROW:personal_data': b'true' to the metadata.
 
         Returns
         -------
@@ -2049,7 +2059,8 @@ cdef class _Tabular(_PandasConvertible):
         return _from_pylist(cls=cls,
                             mapping=mapping,
                             schema=schema,
-                            metadata=metadata)
+                            metadata=metadata,
+                            personal_data=personal_data)
 
     def itercolumns(self):
         """
@@ -2105,6 +2116,17 @@ cdef class _Tabular(_PandasConvertible):
         """
         return (self.num_rows, self.num_columns)
 
+    @property
+    def personal_data(self):
+        """
+        Whether the object contains personal data.
+
+        Returns
+        -------
+        personal_data : bool
+        """
+        return self.schema.personal_data
+
     @property
     def schema(self):
         raise NotImplementedError
@@ -2749,6 +2771,17 @@ cdef class RecordBatch(_Tabular):
 
         return self._schema
 
+    @property
+    def personal_data(self):
+        """
+        Whether the record batch contains personal data.
+
+        Returns
+        -------
+        personal_data : bool
+        """
+        return self.schema.personal_data
+
     def _column(self, int i):
         """
         Select single column from record batch by its numeric index.
@@ -3216,7 +3249,7 @@ cdef class RecordBatch(_Tabular):
         >>> n_legs = pa.array([2, 2, 4, 4, 5, 100])
         >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"])
         >>> batch = pa.RecordBatch.from_arrays([n_legs, animals],
-        ...                                    names=["n_legs", "animals"])
+        ...                                     names=["n_legs", "animals"])
         >>> batch_0 = pa.record_batch([])
         >>> batch_1 = pa.RecordBatch.from_arrays([n_legs, animals],
         ...                                      names=["n_legs", "animals"],
@@ -3366,7 +3399,7 @@ cdef class RecordBatch(_Tabular):
 
     @classmethod
     def from_pandas(cls, df, Schema schema=None, preserve_index=None,
-                    nthreads=None, columns=None):
+                    nthreads=None, columns=None, personal_data=None):
         """
         Convert pandas.DataFrame to an Arrow RecordBatch
 
@@ -3457,10 +3490,11 @@ cdef class RecordBatch(_Tabular):
             return pyarrow_wrap_batch(CRecordBatch.Make((<Schema> schema).sp_schema,
                                                         n_rows, c_arrays))
         else:
-            return cls.from_arrays(arrays, schema=schema)
+            return cls.from_arrays(arrays, schema=schema, personal_data=personal_data)
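The personal_data keyword and property threaded through below are new API proposed by this patch series (they presume a Schema.with_personal_data() helper and Schema.personal_data property added elsewhere); a round-trip sketch of the intended usage:

    import pyarrow as pa

    batch = pa.RecordBatch.from_arrays(
        [pa.array(["alice", "bob"]), pa.array([34, 27])],
        names=["name", "age"],
        personal_data=True,       # hypothetical keyword from this patch
    )

    # The flag rides on the schema and is surfaced as a property.
    assert batch.personal_data is True
    assert batch.schema.metadata[b"ARROW:personal_data"] == b"true"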
 
     @staticmethod
-    def from_arrays(list arrays, names=None, schema=None, metadata=None):
+    def from_arrays(list arrays, names=None, schema=None, metadata=None,
+                    personal_data=None):
         """
         Construct a RecordBatch from multiple pyarrow.Arrays
 
@@ -3474,6 +3508,9 @@ cdef class RecordBatch(_Tabular):
         Parameters
         ----------
         arrays : list of pyarrow.Array
             One for each field in RecordBatch
         names : list of str, optional
             Names for the batch fields. If not passed, schema must be passed
         schema : Schema, default None
             Schema for the created batch. If not passed, names must be passed
         metadata : dict or Mapping, default None
             Optional metadata for the schema (if inferred).
+        personal_data : bool, default None
+            Whether the batch contains personal data. If True, adds
+            b'ARROW:personal_data': b'true' to the metadata.
 
         Returns
         -------
@@ -3535,17 +3572,13 @@ cdef class RecordBatch(_Tabular):
         else:
             num_rows = 0
 
-        if isinstance(names, Schema):
-            import warnings
-            warnings.warn("Schema passed to names= option, please "
-                          "pass schema= explicitly. "
-                          "Will raise exception in future", FutureWarning)
-            schema = names
-            names = None
-
         converted_arrays = _sanitize_arrays(arrays, names, schema, metadata,
                                             &c_schema)
 
+        if personal_data is not None:
+            new_schema = pyarrow_wrap_schema(c_schema).with_personal_data(personal_data)
+            c_schema = (<Schema> new_schema).sp_schema
+
         c_arrays.reserve(len(arrays))
         for arr in converted_arrays:
             if len(arr) != num_rows:
@@ -4728,11 +4761,13 @@ cdef class Table(_Tabular):
             casted = column.cast(field.type, safe=safe, options=options)
             newcols.append(casted)
 
-        return Table.from_arrays(newcols, schema=target_schema)
+        return Table.from_arrays(newcols, schema=target_schema,
+                                 personal_data=self.personal_data)
 
     @classmethod
     def from_pandas(cls, df, Schema schema=None, preserve_index=None,
-                    nthreads=None, columns=None, bint safe=True):
+                    nthreads=None, columns=None, bint safe=True,
+                    personal_data=None):
         """
         Convert pandas.DataFrame to an Arrow Table.
 
@@ -4773,6 +4808,9 @@ cdef class Table(_Tabular):
         columns : list, optional
            List of column to be converted. If None, use all columns.
         safe : bool, default True
            Check for overflows or other unsafe conversions.
+        personal_data : bool, default None
+            Whether the table contains personal data. If True, adds
+            b'ARROW:personal_data': b'true' to the metadata.
 
         Returns
         -------
@@ -4811,7 +4849,8 @@ cdef class Table(_Tabular):
         return cls.from_arrays(arrays, schema=schema)
 
     @staticmethod
-    def from_arrays(arrays, names=None, schema=None, metadata=None):
+    def from_arrays(arrays, names=None, schema=None, metadata=None,
+                    personal_data=None):
         """
         Construct a Table from Arrow arrays.
 
@@ -4896,6 +4935,10 @@ cdef class Table(_Tabular):
         converted_arrays = _sanitize_arrays(arrays, names, schema, metadata,
                                             &c_schema)
 
+        if personal_data is not None:
+            new_schema = pyarrow_wrap_schema(c_schema).with_personal_data(personal_data)
+            c_schema = (<Schema> new_schema).sp_schema
+
         columns.reserve(K)
         for item in converted_arrays:
             if isinstance(item, Array):
@@ -4970,7 +5013,7 @@ cdef class Table(_Tabular):
         return chunked_array(chunks, type=struct(self.schema))
 
     @staticmethod
-    def from_batches(batches, Schema schema=None):
+    def from_batches(batches, Schema schema=None, personal_data=None):
         """
         Construct a Table from a sequence or iterator of Arrow
         RecordBatches.
 
@@ -5036,6 +5079,9 @@ cdef class Table(_Tabular):
         else:
             c_schema = schema.sp_schema
 
+        if personal_data is not None:
+            c_schema = (<Schema> schema.with_personal_data(personal_data)).sp_schema
+
         with nogil:
             c_table = GetResultValue(
                 CTable.FromRecordBatches(c_schema, move(c_batches)))
@@ -5205,6 +5251,17 @@ cdef class Table(_Tabular):
         """
         return pyarrow_wrap_schema(self.table.schema())
 
+    @property
+    def personal_data(self):
+        """
+        Whether the table contains personal data.
+
+        Returns
+        -------
+        personal_data : bool
+        """
+        return self.schema.personal_data
+
     def _column(self, int i):
         """
         Select a column by its numeric index.
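For context, the effect the docstrings describe (adding b'ARROW:personal_data': b'true' to the schema metadata) can already be approximated with released PyArrow API; with_personal_data() is presumably a thin wrapper over this kind of call:

    import pyarrow as pa

    # Released API: attach the marker by hand via schema metadata.
    table = pa.table({"name": ["alice"]})
    table = table.replace_schema_metadata({b"ARROW:personal_data": b"true"})
    print(table.schema.metadata)  # {b'ARROW:personal_data': b'true'}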
@@ -5905,7 +5962,8 @@ def _reconstruct_table(arrays, schema):
     return Table.from_arrays(arrays, schema=schema)
 
 
-def record_batch(data, names=None, schema=None, metadata=None):
+def record_batch(data, names=None, schema=None, metadata=None,
+                 personal_data=None):
     """
     Create a pyarrow.RecordBatch from another Python data structure
     or sequence of arrays.
@@ -6044,12 +6102,14 @@ def record_batch(data, names=None, schema=None, metadata=None):
 
     if isinstance(data, (list, tuple)):
         return RecordBatch.from_arrays(data, names=names, schema=schema,
-                                       metadata=metadata)
+                                       metadata=metadata,
+                                       personal_data=personal_data)
     elif isinstance(data, dict):
         if names is not None:
             raise ValueError(
                 "The 'names' argument is not valid when passing a dictionary")
-        return RecordBatch.from_pydict(data, schema=schema, metadata=metadata)
+        return RecordBatch.from_pydict(data, schema=schema, metadata=metadata,
+                                       personal_data=personal_data)
     elif hasattr(data, "__arrow_c_device_array__"):
         if schema is not None:
             requested_schema = schema.__arrow_c_schema__()
@@ ... @@ def record_batch(data, names=None, schema=None, metadata=None):
         return batch
 
     elif _pandas_api.is_data_frame(data):
-        return RecordBatch.from_pandas(data, schema=schema)
+        return RecordBatch.from_pandas(data, schema=schema,
+                                       personal_data=personal_data)
 
     else:
         raise TypeError("Expected pandas DataFrame or list of arrays")
 
 
-def table(data, names=None, schema=None, metadata=None, nthreads=None):
+def table(data, names=None, schema=None, metadata=None, nthreads=None,
+          personal_data=None):
     """
     Create a pyarrow.Table from a Python data structure or sequence of
     arrays.
@@ -6203,18 +6265,21 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None):
 
     if isinstance(data, (list, tuple)):
         return Table.from_arrays(data, names=names, schema=schema,
-                                 metadata=metadata)
+                                 metadata=metadata,
+                                 personal_data=personal_data)
     elif isinstance(data, dict):
         if names is not None:
             raise ValueError(
                 "The 'names' argument is not valid when passing a dictionary")
-        return Table.from_pydict(data, schema=schema, metadata=metadata)
+        return Table.from_pydict(data, schema=schema, metadata=metadata,
+                                 personal_data=personal_data)
     elif _pandas_api.is_data_frame(data):
         if names is not None or metadata is not None:
             raise ValueError(
                 "The 'names' and 'metadata' arguments are not valid when "
                 "passing a pandas DataFrame")
-        return Table.from_pandas(data, schema=schema, nthreads=nthreads)
+        return Table.from_pandas(data, schema=schema, nthreads=nthreads,
+                                 personal_data=personal_data)
     elif hasattr(data, "__arrow_c_stream__"):
         if names is not None or metadata is not None:
             raise ValueError(
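record_batch() and table() are thin dispatchers, so the new keyword only needs to be threaded through to whichever constructor the input type selects. The dispatch itself is released behavior and can be seen with stock PyArrow (personal_data is the only hypothetical addition here):

    import pandas as pd
    import pyarrow as pa

    t1 = pa.table([pa.array([1, 2])], names=["x"])  # list -> Table.from_arrays
    t2 = pa.table({"x": [1, 2]})                    # dict -> Table.from_pydict
    t3 = pa.table(pd.DataFrame({"x": [1, 2]}))      # DataFrame -> Table.from_pandas

    assert t1 == t2 == t3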
@@ -6375,7 +6440,7 @@ def concat_batches(recordbatches, MemoryPool memory_pool=None):
     return pyarrow_wrap_batch(c_result_recordbatch)
 
 
-def _from_pydict(cls, mapping, schema, metadata):
+def _from_pydict(cls, mapping, schema, metadata, personal_data):
     """
     Construct a Table/RecordBatch from Arrow arrays or columns.
 
@@ -6388,6 +6453,9 @@ def _from_pydict(cls, mapping, schema, metadata):
         If not passed, will be inferred from the Mapping values.
     metadata : dict or Mapping, default None
         Optional metadata for the schema (if inferred).
+    personal_data : bool, default None
+        Whether the table/batch contains personal data. If True, adds
+        b'ARROW:personal_data': b'true' to the metadata.
 
     Returns
     -------
@@ -6418,12 +6486,13 @@ def _from_pydict(cls, mapping, schema, metadata):
             )
             arrays.append(asarray(v, type=field.type))
         # Will raise if metadata is not None
-        return cls.from_arrays(arrays, schema=schema, metadata=metadata)
+        return cls.from_arrays(arrays, schema=schema, metadata=metadata,
+                               personal_data=personal_data)
     else:
         raise TypeError('Schema must be an instance of pyarrow.Schema')
 
 
-def _from_pylist(cls, mapping, schema, metadata):
+def _from_pylist(cls, mapping, schema, metadata, personal_data):
     """
     Construct a Table/RecordBatch from list of rows / dictionaries.
 
@@ -6437,6 +6506,9 @@ def _from_pylist(cls, mapping, schema, metadata):
         mapping values.
     metadata : dict or Mapping, default None
         Optional metadata for the schema (if inferred).
+    personal_data : bool, default None
+        Whether the table/batch contains personal data. If True, adds
+        b'ARROW:personal_data': b'true' to the metadata.
 
     Returns
     -------
@@ -6458,7 +6530,8 @@ def _from_pylist(cls, mapping, schema, metadata):
             v = [row[n] if n in row else None for row in mapping]
             arrays.append(v)
         # Will raise if metadata is not None
-        return cls.from_arrays(arrays, schema=schema, metadata=metadata)
+        return cls.from_arrays(arrays, schema=schema, metadata=metadata,
+                               personal_data=personal_data)
     else:
         raise TypeError('Schema must be an instance of pyarrow.Schema')
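The row[n]-if-n-in-row-else-None lookup above is why _from_pylist tolerates ragged rows: keys missing from a row simply become nulls. This is released behavior, shown here with stock PyArrow:

    import pyarrow as pa

    table = pa.Table.from_pylist([{"a": 1, "b": "x"}, {"a": 2}])
    print(table.column("b"))  # [["x", null]]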
diff --git a/python/pyarrow/tests/test_coerce_cast.py b/python/pyarrow/tests/test_coerce_cast.py
new file mode 100644
index 00000000000..e42bae31c6e
--- /dev/null
+++ b/python/pyarrow/tests/test_coerce_cast.py
@@ -0,0 +1,116 @@
+import math
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+
+def test_cast_coerce():
+    arr = pa.array(["1.1", "2.2", "abc", "4.4"])
+
+    # Should produce null for "abc"
+    casted = pc.cast(arr, pa.float64(), errors='coerce')
+    expected = pa.array([1.1, 2.2, None, 4.4])
+    assert casted.equals(expected)
+
+
+def test_cast_coerce_issue_example():
+    """Test the exact example from issue #48972."""
+    arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
+
+    # Should not raise, but produce nulls for invalid values
+    out = pc.cast(arr, pa.float64(), safe=False, errors='coerce')
+
+    # Expected: [1.2, 3, null, null, nan, null]
+    # Note: "nan" is cast to NaN, not null
+    assert out[0].as_py() == 1.2
+    assert out[1].as_py() == 3.0
+    assert not out[2].is_valid  # "10-20" cannot be cast
+    assert not out[3].is_valid  # None stays null
+    assert math.isnan(out[4].as_py())  # "nan" becomes NaN
+    assert not out[5].is_valid  # "" cannot be cast
+
+
+def test_is_castable():
+    arr = pa.array(["1.1", "2.2", "abc", "4.4"])
+
+    # Should be false for "abc"
+    castable = pc.is_castable(arr, pa.float64())
+    expected = pa.array([True, True, False, True])
+    assert castable.equals(expected)
+
+    # Boolean test
+    arr_bool = pa.array(["true", "false", "maybe", None])
+    castable_bool = pc.is_castable(arr_bool, pa.bool_())
+    expected_bool = pa.array([True, True, False, True])
+    assert castable_bool.equals(expected_bool)
+
+
+def test_is_castable_issue_example():
+    """Test is_castable with the exact example from issue #48972."""
+    arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
+
+    castable = pc.is_castable(arr, pa.float64())
+    # Expected: [True, True, False, True, True, False]
+    # "1.2", "3" and "nan" are castable; None is castable (nulls are
+    # considered castable); "10-20" and "" are not castable
+    expected = pa.array([True, True, False, True, True, False])
+    assert castable.equals(expected)
+
+
+def test_cast_coerce_temporal():
+    arr = pa.array(["2020-01-01", "invalid-date", "2021-02-02"])
+
+    casted = pc.cast(arr, pa.date32(), errors='coerce')
+    expected = pa.array([pa.scalar("2020-01-01").cast(pa.date32()),
+                         None,
+                         pa.scalar("2021-02-02").cast(pa.date32())])
+    assert casted.equals(expected)
+
+
+def test_cast_with_options_and_errors():
+    """Test that the errors parameter works even when options is provided."""
+    arr = pa.array(["1.1", "abc", "2.2"])
+    options = pc.CastOptions.safe(pa.float64())
+
+    # Should respect errors='coerce' even when options is provided
+    casted = pc.cast(arr, options=options, errors='coerce')
+    expected = pa.array([1.1, None, 2.2])
+    assert casted.equals(expected)
+
+
+def test_cast_instance_method():
+    """Test that the errors parameter works with instance methods."""
+    arr = pa.array(["1.1", "abc", "2.2"])
+
+    # Test Array.cast() instance method
+    casted = arr.cast(pa.float64(), errors='coerce')
+    expected = pa.array([1.1, None, 2.2])
+    assert casted.equals(expected)
+
+    # Test ChunkedArray.cast() instance method
+    chunked = pa.chunked_array([["1.1", "abc"], ["2.2"]])
+    casted_chunked = chunked.cast(pa.float64(), errors='coerce')
+    assert casted_chunked[0].as_py() == 1.1
+    assert not casted_chunked[1].is_valid  # "abc" becomes null
+    assert casted_chunked[2].as_py() == 2.2
+
+
+def test_cast_errors_validation():
+    """Test error handling for an invalid errors parameter."""
+    arr = pa.array(["1.1", "2.2"])
+
+    # Invalid errors value should raise ValueError
+    with pytest.raises(ValueError, match="errors must be either 'raise' or 'coerce'"):
+        pc.cast(arr, pa.float64(), errors='invalid')
+
+    # Passing both target_type and options should raise ValueError
+    options = pc.CastOptions.safe(pa.float64())
+    with pytest.raises(ValueError, match="Must either pass"):
+        pc.cast(arr, pa.float64(), options=options)
+
+    # Passing neither target_type nor options should raise ValueError
+    with pytest.raises(ValueError, match="Must provide either"):
+        pc.cast(arr)
+
+
+def test_cast_errors_with_unsafe():
+    """Test that errors='coerce' works with safe=False (unsafe casting)."""
+    arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
+
+    # Should work with unsafe casting and coerce
+    out = pc.cast(arr, pa.float64(), safe=False, errors='coerce')
+    assert out[0].as_py() == 1.2
+    assert out[1].as_py() == 3.0
+    assert not out[2].is_valid  # "10-20" cannot be cast
+    assert not out[3].is_valid  # None stays null
+    assert math.isnan(out[4].as_py())  # "nan" becomes NaN
+    assert not out[5].is_valid  # "" cannot be cast
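One coverage gap worth noting: the suite never exercises the Scalar.cast() path that this patch also modifies. A candidate test in the same style (a sketch against the same proposed errors= API, not part of the patch):

    import pyarrow as pa


    def test_scalar_cast_coerce():
        ok = pa.scalar("1.5").cast(pa.float64(), errors='coerce')
        assert ok.as_py() == 1.5

        bad = pa.scalar("abc").cast(pa.float64(), errors='coerce')
        assert not bad.is_valid  # invalid input coerces to a null scalar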