diff --git a/docs/source/pipelines/yaml.rst b/docs/source/pipelines/yaml.rst
index 5c499ae3d..408ad3899 100644
--- a/docs/source/pipelines/yaml.rst
+++ b/docs/source/pipelines/yaml.rst
@@ -58,19 +58,6 @@ Those pipelines consist of methods from HTTomolibgpu (GPU) and HTTomolib (CPU) b
     .. literalinclude:: ../pipelines_full/FISTA3d_tomobar.yaml
         :language: yaml
-.. _tutorials_pl_templates_cpu:
-
-Pipelines using TomoPy library
-------------------------------
-
-One can build CPU-only pipelines by using mostly TomoPy methods. They are expected to be slower than the pipelines above.
-
-.. dropdown:: CPU pipeline using auto-centering and the gridrec reconstruction method from TomoPy.
-
-    .. literalinclude:: ../pipelines_full/tomopy_gridrec.yaml
-        :language: yaml
-
-
 .. _tutorials_pl_templates_dls:
 
 DLS-specific pipelines
 ----------------------
@@ -95,17 +82,31 @@ Pipelines with parameter sweeps
 -------------------------------
 
 Here we demonstrate how to perform a sweep across multiple values of a single parameter (see :ref:`parameter_sweeping` for more details).
 
-.. note:: There is no need to add image saving plugin for sweep runs as it will be added automatically. It is also preferable to keep the `preview` small as the time of computation can be substantial.
+.. note:: There is no need to add an image saving plugin for sweep runs as it will be added automatically.
 
 .. dropdown:: Parameter sweep using the :code:`!SweepRange` tag to do a sweep over several CoR values of the :code:`center` parameter in the reconstruction method.
 
     .. literalinclude:: ../pipelines_full/sweep_center_FBP3d_tomobar.yaml
         :language: yaml
-        :emphasize-lines: 34-37
+        :emphasize-lines: 36-39
 
 .. dropdown:: Parameter sweep using the :code:`!Sweep` tag over several particular values (not a range) of the :code:`ratio_delta_beta` parameter for the Paganin filter.
 
     .. literalinclude:: ../pipelines_full/sweep_paganin_FBP3d_tomobar.yaml
         :language: yaml
-        :emphasize-lines: 53-56
+        :emphasize-lines: 51-54
+
+.. _tutorials_pl_templates_cpu:
+
+Pipelines using TomoPy library
+------------------------------
+
+One can build CPU-only pipelines by using mostly TomoPy methods.
+
+.. note:: Methods from TomoPy are expected to be slower than the GPU-accelerated methods from the libraries above.
+
+.. dropdown:: CPU pipeline using auto-centering and the gridrec reconstruction method from TomoPy.
+
+    .. literalinclude:: ../pipelines_full/tomopy_gridrec.yaml
+        :language: yaml
\ No newline at end of file
diff --git a/httomo/method_wrappers/generic.py b/httomo/method_wrappers/generic.py
index 1aa59deaf..ffe5dfade 100644
--- a/httomo/method_wrappers/generic.py
+++ b/httomo/method_wrappers/generic.py
@@ -14,7 +14,14 @@
     MethodRepository,
 )
 from httomo.runner.output_ref import OutputRef
-from httomo.utils import catch_gputime, catchtime, gpu_enabled, log_rank, xp
+from httomo.utils import (
+    catch_gputime,
+    catchtime,
+    gpu_enabled,
+    log_rank,
+    xp,
+    search_max_slices_iterative,
+)
 
 import numpy as np
 
@@ -447,6 +454,17 @@
                 * np.prod(non_slice_dims_shape)
                 * data_dtype.itemsize
             )
+        elif self.memory_gpu.method == "iterative":
+            # The iterative method may use the GPU
+            assert gpu_enabled, "GPU method used on a system without GPU support"
+            with xp.cuda.Device(self._gpu_id):
+                gpumem_cleanup()
+                return (
+                    self._calculate_max_slices_iterative(
+                        data_dtype, non_slice_dims_shape, available_memory
+                    ),
+                    available_memory,
+                )
         else:
             (
                 memory_bytes_method,
@@ -462,6 +480,31 @@
             available_memory - subtract_bytes
         ) // memory_bytes_method, available_memory
 
+    def _calculate_max_slices_iterative(
+        self,
+        data_dtype: np.dtype,
+        non_slice_dims_shape: Tuple[int, int],
+        available_memory: int,
+    ) -> int:
+        def get_mem_bytes(current_slices):
+            try:
+                memory_bytes = self._query.calculate_memory_bytes_for_slices(
+                    dims_shape=(
+                        current_slices,
+                        non_slice_dims_shape[0],
+                        non_slice_dims_shape[1],
+                    ),
+                    dtype=data_dtype,
+                    **self._unwrap_output_ref_values(),
+                )
+                return memory_bytes
+            except:
+                return 2**64
+            finally:
+                gpumem_cleanup()
+
+        return search_max_slices_iterative(available_memory, get_mem_bytes)
+
     def _unwrap_output_ref_values(self) -> Dict[str, Any]:
         """
         Iterate through params in `self.config_params` and, for any value of type `OutputRef`,
diff --git a/httomo/runner/methods_repository_interface.py b/httomo/runner/methods_repository_interface.py
index 88be412b8..75d3a5a56 100644
--- a/httomo/runner/methods_repository_interface.py
+++ b/httomo/runner/methods_repository_interface.py
@@ -50,6 +50,12 @@ def calculate_memory_bytes(
         """Calculate the memory required in bytes, returning bytes method and subtract bytes tuple"""
         ...  # pragma: no cover
 
+    def calculate_memory_bytes_for_slices(
+        self, dims_shape: Tuple[int, int, int], dtype: np.dtype, **kwargs
+    ) -> int:
+        """Calculate the memory required in bytes for a given 3D grid"""
+        ...  # pragma: no cover
+
     def calculate_output_dims(
         self, non_slice_dims_shape: Tuple[int, int], **kwargs
     ) -> Tuple[int, int]:
diff --git a/httomo/sweep_runner/param_sweep_runner.py b/httomo/sweep_runner/param_sweep_runner.py
index c453c79b9..f200c5617 100644
--- a/httomo/sweep_runner/param_sweep_runner.py
+++ b/httomo/sweep_runner/param_sweep_runner.py
@@ -16,12 +16,12 @@
 from httomo.sweep_runner.param_sweep_block import ParamSweepBlock
 from httomo.sweep_runner.side_output_manager import SideOutputManager
 from httomo.sweep_runner.stages import NonSweepStage, Stages, SweepStage
-from httomo.utils import catchtime, log_exception, log_once
-from httomo.runner.gpu_utils import get_available_gpu_memory
+from httomo.utils import catchtime, log_exception, log_once, search_max_slices_iterative
+from httomo.runner.gpu_utils import get_available_gpu_memory, gpumem_cleanup
 from httomo.preview import PreviewConfig, PreviewDimConfig
 from httomo.runner.dataset_store_interfaces import DataSetSource
 from httomo_backends.methods_database.packages.backends.httomolibgpu.supporting_funcs.prep.phase import (
-    _calc_memory_bytes_paganin_filter,
+    _calc_memory_bytes_for_slices_paganin_filter,
 )
 
 
@@ -322,8 +322,15 @@ def _slices_to_fit_memory_Paganin(source: DataSetSource) -> int:
     angles_total = source.aux_data.angles_length
     det_X_length = source.chunk_shape[2]
 
-    (memory_bytes_method, subtract_bytes) = _calc_memory_bytes_paganin_filter(
-        (angles_total, det_X_length), dtype=np.float32()
-    )
+    def get_mem_bytes(slices):
+        try:
+            return _calc_memory_bytes_for_slices_paganin_filter(
+                (slices, angles_total, det_X_length), dtype=np.float32()
+            )
+        except:
+            return 2**64
+        finally:
+            gpumem_cleanup()
 
-    return (available_memory - subtract_bytes) // memory_bytes_method
+    gpumem_cleanup()
+    return search_max_slices_iterative(available_memory, get_mem_bytes)
diff --git a/httomo/utils.py b/httomo/utils.py
index 26436760a..585db502f 100644
--- a/httomo/utils.py
+++ b/httomo/utils.py
@@ -299,3 +299,60 @@ def mpi_abort_excepthook(type, value, traceback):
     log_rank("\n".join(format_tb(traceback)), MPI.COMM_WORLD)
     MPI.COMM_WORLD.Abort()
     sys.__excepthook__(type, value, traceback)
+
+
+def search_max_slices_iterative(
+    available_memory: int, get_mem_bytes: Callable[[int], int]
+) -> int:
+    """
+    Approximates the maximum number of slices that fit into the available GPU memory for a given
+    memory-estimation function. The memory profile of the function must be monotonically increasing
+    in the number of slices. First, a linear approximation of the memory profile is performed; if
+    this is not accurate enough, a binary search follows to determine the number of fitting slices.
+    This function never returns a number of slices for which `get_mem_bytes(slices) > available_memory`.
+
+    :param available_memory: Bytes of available device memory
+    :type available_memory: int
+    :param get_mem_bytes: A callable that produces the bytes of device memory needed for a given number of slices.
+    :type get_mem_bytes: Callable[[int], int]
+    :return: The approximate maximum number of fitting slices.
+    :rtype: int
+    """
+    MEM_RATIO_THRESHOLD = 0.9  # accept a result that uses at least 90% of the available memory
+
+    # Find a number of slices that does not fit
+    current_slices = 100
+    slices_high = None
+    memory_bytes = get_mem_bytes(current_slices)
+    if memory_bytes > available_memory:
+        # Found upper limit, continue to binary search
+        slices_high = current_slices
+    else:
+        # linear approximation
+        current_slices = int(current_slices * available_memory / memory_bytes)
+        while True:
+            memory_bytes = get_mem_bytes(current_slices)
+            if memory_bytes > available_memory:
+                # Found upper limit, continue to binary search
+                break
+            elif memory_bytes >= available_memory * MEM_RATIO_THRESHOLD:
+                # This is "good enough", return
+                return current_slices
+
+            # If linear approximation is not enough, just double every iteration
+            current_slices *= 2
+        slices_high = current_slices
+
+    # Binary search between low and high
+    slices_low = 0
+    while slices_high - slices_low > 1:
+        current_slices = (slices_low + slices_high) // 2
+        memory_bytes = get_mem_bytes(current_slices)
+        if memory_bytes > available_memory:
+            slices_high = current_slices
+        elif memory_bytes >= available_memory * MEM_RATIO_THRESHOLD:
+            return current_slices
+        else:
+            slices_low = current_slices
+
+    return slices_low
diff --git a/tests/method_wrappers/test_generic.py b/tests/method_wrappers/test_generic.py
index e62fe0425..dad9ff70c 100644
--- a/tests/method_wrappers/test_generic.py
+++ b/tests/method_wrappers/test_generic.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union
+from typing import Callable, List, Optional, Union
 
 import numpy as np
 from httomo.method_wrappers import make_method_wrapper
@@ -689,6 +689,105 @@ def test_method(data):
     assert available_memory == 1_000_000_000
 
 
+def _linear_mem(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    return proj * x * y * dtype.itemsize
+
+
+def _linear_offset_mem(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    return (x * y + proj * x * y + proj * x**2) * dtype.itemsize
+
+
+def _quadratic_mem(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    return (4 * x * y + proj * proj * x * y) * dtype.itemsize
+
+
+THROW_OVER_SLICES = 77
+
+
+def _quadratic_mem_throws(*args, **kwargs):
+    proj, x, y = kwargs["dims_shape"]
+    dtype = kwargs["dtype"]
+    if proj > THROW_OVER_SLICES:
+        raise Exception("Memory estimator failed")
+    return (4 * x * y + proj * proj * x * y) * dtype.itemsize
+
+
+@pytest.mark.cupy
+@pytest.mark.parametrize("available_memory", [0, 1_000, 1_000_000, 1_000_000_000])
+@pytest.mark.parametrize(
+    "memcalc_fn",
+    [_linear_mem, _linear_offset_mem, _quadratic_mem, _quadratic_mem_throws],
+)
+def test_generic_calculate_max_slices_iterative(
+    mocker: MockerFixture,
+    dummy_block: DataSetBlock,
+    available_memory: int,
+    memcalc_fn: Callable,
+):
+    class FakeModule:
+        def test_method(data):
+            return data
+
+    mocker.patch(
+        "httomo.method_wrappers.generic.import_module", return_value=FakeModule
+    )
+
+    memory_gpu = GpuMemoryRequirement(multiplier=None, method="iterative")
+    repo = make_mock_repo(
+        mocker,
+        pattern=Pattern.projection,
+        output_dims_change=True,
+        implementation="gpu_cupy",
+        memory_gpu=memory_gpu,
+    )
+
+    memcalc_mock = mocker.patch.object(
+        repo.query("", ""), "calculate_memory_bytes_for_slices", side_effect=memcalc_fn
+    )
+    wrp = make_method_wrapper(
+        repo,
+        "mocked_module_path",
+        "test_method",
+        MPI.COMM_WORLD,
+        make_mock_preview_config(mocker),
+    )
+    shape_t = list(dummy_block.chunk_shape)
+    shape_t.pop(0)
+    shape = (shape_t[0], shape_t[1])
+    max_slices, _ = wrp.calculate_max_slices(
+        dummy_block.data.dtype,
+        shape,
+        available_memory,
+    )
+
+    check_slices = lambda slices: memcalc_mock(
+        dims_shape=(slices, shape[0], shape[1]), dtype=dummy_block.data.dtype
+    )
+    threshold = 0.9
+    if check_slices(1) > available_memory:
+        # Not even one slice fits, so zero slices must be reported
+        assert max_slices == 0
+    else:
+        # Computed slices must fit in the available memory
+        assert check_slices(max_slices) <= available_memory
+
+        if memcalc_fn == _quadratic_mem_throws and max_slices + 1 > THROW_OVER_SLICES:
+            with pytest.raises(Exception):
+                check_slices(max_slices + 1)
+        else:
+            # And one more slice must not fit OR above threshold
+            assert (
+                check_slices(max_slices + 1) > available_memory
+                or check_slices(max_slices) >= available_memory * threshold
+            )
+
+
 @pytest.mark.cupy
 def test_generic_calculate_output_dims(mocker: MockerFixture):
     class FakeModule:
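A minimal usage sketch of the new `search_max_slices_iterative` helper, assuming the patch above is applied so that `httomo.utils` exposes it; the memory model, dimensions, and memory budget below are invented for illustration and are not a real method's estimator:

```python
# Illustrative sketch only; assumes the patch above is applied so that
# httomo.utils provides search_max_slices_iterative. The memory model and
# numbers below are made up for demonstration.
import numpy as np

from httomo.utils import search_max_slices_iterative

DET_Y, DET_X = 160, 2560  # hypothetical non-slice dimensions of a block
DTYPE = np.dtype(np.float32)


def toy_mem_bytes(slices: int) -> int:
    # Pretend the method keeps the input block plus one temporary copy,
    # with a fixed 64 MiB overhead on top.
    block = slices * DET_Y * DET_X * DTYPE.itemsize
    return 2 * block + 64 * 1024**2


available = 8 * 1024**3  # pretend 8 GiB of free device memory
max_slices = search_max_slices_iterative(available, toy_mem_bytes)

# The helper never returns a slice count whose estimate exceeds the budget.
assert toy_mem_bytes(max_slices) <= available
print(max_slices)
```

The wrappers in the patch follow the same contract: both `_calculate_max_slices_iterative` and the Paganin sweep helper wrap their estimators so that any exception maps to an impossibly large byte count (`2**64`), which the search then treats as "does not fit".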