Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ repos:
?^ci/scripts/python_sdist_build\.sh$|
?^ci/scripts/python_sdist_test\.sh$|
?^ci/scripts/python_wheel_unix_test\.sh$|
?^ci/scripts/python_test_type_annotations\.sh$|
?^ci/scripts/r_build\.sh$|
?^ci/scripts/r_revdepcheck\.sh$|
?^ci/scripts/release_test\.sh$|
Expand Down Expand Up @@ -379,6 +380,7 @@ repos:
# TODO: Remove this when we fix all lint failures
files: >-
(
?^ci/scripts/python_test_type_annotations\.sh$|
?^dev/release/05-binary-upload\.sh$|
?^dev/release/binary-recover\.sh$|
?^dev/release/post-03-binary\.sh$|
Expand Down
43 changes: 43 additions & 0 deletions ci/scripts/python_test_type_annotations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

set -ex

# First positional argument: directory the type checkers are run from.
pyarrow_dir="${1}"

if [[ "${PYARROW_TEST_ANNOTATIONS}" == "ON" ]]; then
  # Activate the virtualenv only when one has been configured.
  if [[ -n "${ARROW_PYTHON_VENV:-}" ]]; then
    # shellcheck source=/dev/null
    source "${ARROW_PYTHON_VENV}/bin/activate"
  fi

  # Library stubs. Some libraries ship their own type hints, so the
  # library itself is installed instead of a separate stubs package.
  pip install \
    fsspec \
    pandas-stubs \
    scipy-stubs \
    types-cffi \
    types-psutil \
    types-requests \
    types-python-dateutil

  # Type checkers
  pip install mypy pyright ty

  # Run every checker from inside the pyarrow directory; with `set -e`
  # the first failing checker aborts the script.
  pushd "${pyarrow_dir}"
  mypy
  pyright --stats
  ty check --verbose --output-format concise
  popd
else
  echo "Skipping type annotation tests"
fi
2 changes: 2 additions & 0 deletions ci/scripts/python_wheel_macos_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ export CMAKE_PREFIX_PATH=${build_dir}/install
export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION}

pushd ${source_dir}/python
# Install libcst for build-time stub docstring extraction
python -m pip install libcst
python setup.py bdist_wheel
popd

Expand Down
2 changes: 1 addition & 1 deletion ci/scripts/python_wheel_validate_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def validate_wheel(path):
]
assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}"
print(f"The wheel: {wheels[0]} seems valid.")

# TODO(GH-32609): Validate some docstrings were generated and added.

def main():
parser = argparse.ArgumentParser()
Expand Down
3 changes: 3 additions & 0 deletions ci/scripts/python_wheel_windows_build.bat
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist

pushd C:\arrow\python

@REM Install libcst for build-time stub docstring extraction
%PYTHON_CMD% -m pip install libcst

@REM Build wheel
%PYTHON_CMD% setup.py bdist_wheel || exit /B 1

Expand Down
2 changes: 2 additions & 0 deletions ci/scripts/python_wheel_xlinux_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ export ARROW_HOME=/tmp/arrow-dist
export CMAKE_PREFIX_PATH=/tmp/arrow-dist

pushd /arrow/python
# Install libcst for build-time stub docstring extraction
python -m pip install libcst
python setup.py bdist_wheel

echo "=== Strip symbols from wheel ==="
Expand Down
4 changes: 3 additions & 1 deletion compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1531,13 +1531,15 @@ services:
BUILD_DOCS_CPP: "ON"
BUILD_DOCS_PYTHON: "ON"
PYTEST_ARGS: "--doctest-modules --doctest-cython"
PYARROW_TEST_ANNOTATIONS: "ON"
volumes: *conda-volumes
command:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
archery numpydoc --allow-rule GL10,PR01,PR03,PR04,PR05,PR10,RT03,YD01 &&
/arrow/ci/scripts/python_test.sh /arrow"]
/arrow/ci/scripts/python_test.sh /arrow &&
/arrow/ci/scripts/python_test_type_annotations.sh /arrow/python"]

conda-python-dask:
# Possible $DASK parameters:
Expand Down
233 changes: 233 additions & 0 deletions dev/update_stub_docstrings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Extract docstrings from pyarrow runtime and insert them into stub files.

Usage (from python/ directory with pyarrow built):
python ../dev/update_stub_docstrings.py pyarrow-stubs
"""

import argparse
import importlib
import inspect
import shutil
import sys
from pathlib import Path
from textwrap import indent

import libcst
from libcst import matchers as m


def _resolve_object(module, path):
"""Resolve an object by dotted path from a module."""
if not path:
return module, None, module.__name__

parts = path.split(".")
parent = None
obj = module

for part in parts:
parent = obj
try:
obj = getattr(obj, part)
except AttributeError:
try:
obj = vars(parent).get(part)
if obj is not None:
continue
except TypeError:
pass
return None, None, None

return obj, parent, getattr(obj, "__name__", parts[-1])


def _get_docstring(name, module, indentation):
    """Extract and format a docstring for insertion into a stub file.

    Parameters
    ----------
    name : str
        Dotted path of the object, resolved relative to ``module``.
    module : module
        Runtime module the object is looked up in.
    indentation : int
        Nesting level; the docstring body is prefixed once per level.

    Returns
    -------
    str or None
        Source text of a triple-quoted string literal, or None when the
        object cannot be resolved or has no usable docstring.
    """
    obj, parent, obj_name = _resolve_object(module, name)
    if obj is None:
        print(f"{name} not found in {module.__name__}")
        return None

    docstring = inspect.getdoc(obj)
    if not docstring:
        return None

    # Remove signature prefix: when the docstring starts with the object's
    # (optionally parent-qualified) name, the first two lines are dropped.
    parent_name = getattr(parent, "__name__", None) if parent else None
    if docstring.startswith(obj_name) or (
        parent_name and docstring.startswith(f"{parent_name}.{obj_name}")
    ):
        docstring = "\n".join(docstring.splitlines()[2:])

    # Skip empty docstrings
    if not docstring.strip():
        return None

    # The text is emitted verbatim as the source of a string literal, so it
    # must be escaped: an embedded triple quote would terminate the literal
    # early (invalid stub), and a raw backslash would be re-read as an
    # escape sequence. Backslashes are escaped first so the ones introduced
    # for the quotes are not doubled.
    docstring = docstring.replace("\\", "\\\\").replace('"""', '\\"\\"\\"')

    # NOTE(review): one space per nesting level, as in the visible source;
    # confirm this is the intended width for the stub files.
    prefix = " " * indentation
    return '"""\n' + indent(docstring + '\n"""', prefix)


class DocstringInserter(libcst.CSTTransformer):
    """CST transformer that copies runtime docstrings into stub nodes.

    Names found in the stub are prefixed with ``namespace`` (when non-empty)
    and looked up on ``module`` through ``_get_docstring``.
    """

    def __init__(self, module, namespace):
        self.module = module
        self.base_namespace = namespace
        # Names of the classes/functions enclosing the node being visited.
        self.stack = []
        # Current nesting depth, forwarded to _get_docstring.
        self.indentation = 0

    @staticmethod
    def _docstring_stmt(docstring):
        """Wrap ``docstring`` source text in an expression statement."""
        return libcst.SimpleStatementLine(
            body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]
        )

    def _push_scope(self, name):
        """Record entry into a class or function called ``name``."""
        self.stack.append(name)
        self.indentation += 1

    def _pop_scope(self):
        """Record exit from the innermost class or function."""
        self.stack.pop()
        self.indentation -= 1

    def _full_name(self):
        dotted = ".".join(self.stack)
        if self.base_namespace:
            return f"{self.base_namespace}.{dotted}"
        return dotted

    def leave_Module(self, original_node, updated_node):
        # Matches ``<target> = _clone_signature(...)`` statements; a
        # docstring statement is appended right after each of them.
        clone_assignment = m.SimpleStatementLine(
            body=[
                m.Assign(value=m.Call(func=m.Name(value="_clone_signature"))),
                m.ZeroOrMore(),
            ]
        )
        statements = []
        for statement in updated_node.body:
            statements.append(statement)
            if not m.matches(statement, clone_assignment):
                continue
            target = statement.body[0].targets[0].target.value
            if self.base_namespace:
                target = f"{self.base_namespace}.{target}"
            doc = _get_docstring(target, self.module, 0)
            if doc:
                statements.append(self._docstring_stmt(doc))
        return updated_node.with_changes(body=statements)

    def visit_ClassDef(self, node):
        self._push_scope(node.name.value)

    def leave_ClassDef(self, original_node, updated_node):
        doc = _get_docstring(self._full_name(), self.module, self.indentation)

        if doc:
            # Class whose body starts with a bare ``...`` statement.
            placeholder_body = m.ClassDef(
                body=m.IndentedBlock(
                    body=[
                        m.SimpleStatementLine(
                            body=[m.Expr(m.Ellipsis()), m.ZeroOrMore()]
                        ),
                        m.ZeroOrMore(),
                    ]
                )
            )
            # Class whose body starts with a function definition.
            function_first_body = m.ClassDef(
                body=m.IndentedBlock(body=[m.FunctionDef(), m.ZeroOrMore()])
            )

            if m.matches(updated_node, placeholder_body):
                # Swap the ``...`` expression for the docstring literal.
                ellipsis_expr = updated_node.body.body[0].body[0].value
                updated_node = updated_node.deep_replace(
                    ellipsis_expr, libcst.SimpleString(value=doc)
                )
            elif m.matches(updated_node, function_first_body):
                # Put the docstring in front of the existing members.
                members = [self._docstring_stmt(doc), *updated_node.body.body]
                updated_node = updated_node.with_changes(
                    body=updated_node.body.with_changes(body=members)
                )

        self._pop_scope()
        return updated_node

    def visit_FunctionDef(self, node):
        self._push_scope(node.name.value)

    def leave_FunctionDef(self, original_node, updated_node):
        qualified = self._full_name()
        # Only functions whose whole body is an inline ``...`` are touched.
        stub_function = m.FunctionDef(
            body=m.SimpleStatementSuite(body=[m.Expr(m.Ellipsis())])
        )

        if m.matches(original_node, stub_function):
            doc = _get_docstring(qualified, self.module, self.indentation)
            if doc:
                updated_node = updated_node.with_changes(
                    body=libcst.IndentedBlock(body=[self._docstring_stmt(doc)])
                )

        self._pop_scope()
        return updated_node


# Stub file stems whose names are looked up under the "lib" namespace
# instead of a namespace named after the stub file itself.
LIB_MODULES = {
    "_ipc",
    "_types",
    "array",
    "builder",
    "compat",
    "config",
    "device",
    "error",
    "io",
    "memory",
    "pandas_shim",
    "scalar",
    "table",
    "tensor",
}


def add_docstrings_to_stubs(stubs_dir):
    """Update all stub files in stubs_dir with docstrings from pyarrow runtime.

    Every ``*.pyi`` file below ``stubs_dir`` (except ``_stubs_typing.pyi``)
    is parsed, passed through ``DocstringInserter`` and rewritten in place.

    Parameters
    ----------
    stubs_dir : str or Path
        Directory that is searched recursively for stub files.

    Raises
    ------
    ImportError
        If ``pyarrow`` cannot be imported.
    """
    stubs_dir = Path(stubs_dir)
    print(f"Updating stub docstrings in: {stubs_dir}")

    pyarrow = importlib.import_module("pyarrow")

    for stub_file in stubs_dir.rglob('*.pyi'):
        if stub_file.name == "_stubs_typing.pyi":
            continue

        # Map the stub file to the dotted namespace (relative to pyarrow)
        # in which its names are looked up.
        module_name = stub_file.stem
        package_name = stub_file.parent.name
        if module_name in LIB_MODULES:
            namespace = "lib"
        elif package_name in ("parquet", "interchange"):
            # A sub-package's __init__ stub describes the package itself;
            # "<package>.__init__" would never resolve to its members.
            if module_name == "__init__":
                namespace = package_name
            else:
                namespace = f"{package_name}.{module_name}"
        elif module_name == "__init__":
            namespace = ""
        else:
            namespace = module_name

        print(f" {stub_file.name} -> {namespace or '(root)'}")
        # Read and write explicitly as UTF-8 (the default encoding of Python
        # source): the locale encoding used otherwise can fail on, or
        # corrupt, non-ASCII docstrings, e.g. on Windows.
        tree = libcst.parse_module(stub_file.read_text(encoding="utf-8"))
        modified = tree.visit(DocstringInserter(pyarrow, namespace))
        stub_file.write_text(modified.code, encoding="utf-8")


def copy_stubs(src_dir, dest_dir):
    """Mirror every ``.pyi`` file below src_dir into dest_dir.

    The relative directory layout is preserved and missing target
    directories are created. Nothing happens when src_dir does not exist.
    """
    source_root = Path(src_dir)
    target_root = Path(dest_dir)
    if not source_root.exists():
        return

    print(f"Copying stubs: {source_root} -> {target_root}")
    for stub in source_root.rglob("*.pyi"):
        target = target_root / stub.relative_to(source_root)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(stub, target)


def update_stubs_for_build(stubs_dir, build_lib):
    """Entry point for setup.py: refresh stub docstrings, then ship the stubs.

    ``build_lib`` is placed at the front of ``sys.path`` while the stubs
    under ``stubs_dir`` are updated, and ``stubs_dir/pyarrow`` is then copied
    into ``build_lib/pyarrow``. The ``sys.path`` entry is removed again even
    when one of the steps fails.
    """
    stubs_root = Path(stubs_dir)
    build_root = Path(build_lib)

    sys.path.insert(0, str(build_root))
    try:
        add_docstrings_to_stubs(stubs_root)
        copy_stubs(stubs_root / "pyarrow", build_root / "pyarrow")
    finally:
        sys.path.pop(0)


if __name__ == "__main__":
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "stubs_dir",
        type=Path,
        help="Path to pyarrow-stubs folder",
    )
    options = cli.parse_args()

    # The script is meant to be run from the python/ directory, so the
    # current directory is put first on the import path.
    sys.path.insert(0, ".")
    add_docstrings_to_stubs(options.stubs_dir.resolve())
Loading
Loading