diff --git a/.drone.star.disabled b/.drone.star similarity index 65% rename from .drone.star.disabled rename to .drone.star index b9ae6888..d3741e87 100644 --- a/.drone.star.disabled +++ b/.drone.star @@ -14,8 +14,8 @@ def main(ctx): return generate( # Compilers [ - 'gcc >=5.0', - 'clang >=3.9', + 'gcc >=12.0', + 'clang >=17.0', 'msvc >=14.1', 'arm64-gcc latest', 's390x-gcc latest', @@ -27,16 +27,12 @@ def main(ctx): 'x86-msvc latest' ], # Standards - '>=11', + '>=20', # Asan is delegated to GHA asan=False, docs=False, - cache_dir='cache') + [ - linux_cxx("GCC 12 (no-mutex)", "g++-12", packages="g++-12", buildscript="drone", buildtype="boost", - image="cppalliance/droneubuntu2204:1", - environment={'B2_TOOLSET': 'gcc-12', 'B2_DEFINES': 'BOOST_URL_DISABLE_THREADS=1', - 'B2_CXXSTD': '17'}, globalenv={'B2_CI_VERSION': '1', 'B2_VARIANT': 'release'}), - ] + coverage=False, + cache_dir='cache') # from https://github.com/cppalliance/ci-automation diff --git a/.drone/drone.bat b/.drone/drone.bat new file mode 100644 index 00000000..fa83aba6 --- /dev/null +++ b/.drone/drone.bat @@ -0,0 +1,47 @@ + +@ECHO ON +setlocal enabledelayedexpansion + +set TRAVIS_OS_NAME=windows + +IF "!DRONE_BRANCH!" == "" ( + for /F %%i in ("!GITHUB_REF!") do @set TRAVIS_BRANCH=%%~nxi +) else ( + SET TRAVIS_BRANCH=!DRONE_BRANCH! +) + +if "%DRONE_JOB_BUILDTYPE%" == "boost" ( + +echo "Running boost job" +echo '==================================> INSTALL' +REM there seems to be some problem with b2 bootstrap on Windows +REM when CXX env variable is set +SET "CXX=" + +git clone https://github.com/boostorg/boost-ci.git boost-ci-cloned --depth 1 +cp -prf boost-ci-cloned/ci . +rm -rf boost-ci-cloned +REM source ci/travis/install.sh +REM The contents of install.sh below: + +for /F %%i in ("%DRONE_REPO%") do @set SELF=%%~nxi +SET BOOST_CI_TARGET_BRANCH=!TRAVIS_BRANCH! 
+SET BOOST_CI_SRC_FOLDER=%cd% +if "%BOOST_BRANCH%" == "" ( + SET BOOST_BRANCH=develop + if "%BOOST_CI_TARGET_BRANCH%" == "master" set BOOST_BRANCH=master +) + +call ci\common_install.bat + +echo '==================================> ZLIB' +git clone --branch v1.2.13 https://github.com/madler/zlib.git !BOOST_ROOT!\zlib-src --depth 1 +set ZLIB_SOURCE=!BOOST_ROOT!\zlib-src + +echo '==================================> COMPILE' + +REM set B2_TARGETS=libs/!SELF!/test libs/!SELF!/example +set B2_TARGETS=libs/!SELF!/test +call !BOOST_ROOT!\libs\!SELF!\ci\build.bat + +) diff --git a/.drone/drone.sh b/.drone/drone.sh new file mode 100755 index 00000000..004fd0a6 --- /dev/null +++ b/.drone/drone.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2020 Rene Rivera, Sam Darwin +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE.txt or copy at http://boost.org/LICENSE_1_0.txt) + +set -xe + +export DRONE_BUILD_DIR=$(pwd) +export VCS_COMMIT_ID=$DRONE_COMMIT +export GIT_COMMIT=$DRONE_COMMIT +export REPO_NAME=$DRONE_REPO +export USER=$(whoami) +export CC=${CC:-gcc} +export PATH=~/.local/bin:/usr/local/bin:$PATH +export TRAVIS_BUILD_DIR=$(pwd) +export TRAVIS_BRANCH=$DRONE_BRANCH +export TRAVIS_EVENT_TYPE=$DRONE_BUILD_EVENT + +common_install () { + if [ -z "$SELF" ]; then + export SELF=`basename $REPO_NAME` + fi + + git clone https://github.com/boostorg/boost-ci.git boost-ci-cloned --depth 1 + [ "$SELF" == "boost-ci" ] || cp -prf boost-ci-cloned/ci . + rm -rf boost-ci-cloned + + if [ "$TRAVIS_OS_NAME" == "osx" ]; then + unset -f cd + fi + + export BOOST_CI_TARGET_BRANCH="$TRAVIS_BRANCH" + export BOOST_CI_SRC_FOLDER=$(pwd) + + . 
./ci/common_install.sh +} + +if [[ $(uname) == "Linux" && "$B2_ASAN" == "1" ]]; then + echo 0 | sudo tee /proc/sys/kernel/randomize_va_space > /dev/null +fi + +if [ "$DRONE_JOB_BUILDTYPE" == "boost" ]; then + +echo '==================================> INSTALL' + +common_install + +echo '==================================> SCRIPT' + +. $BOOST_ROOT/libs/$SELF/ci/build.sh + +elif [ "$DRONE_JOB_BUILDTYPE" == "codecov" ]; then + +echo '==================================> INSTALL' + +common_install + +echo '==================================> SCRIPT' + +cd $BOOST_ROOT/libs/$SELF +ci/travis/codecov.sh + +elif [ "$DRONE_JOB_BUILDTYPE" == "valgrind" ]; then + +echo '==================================> INSTALL' + +common_install + +echo '==================================> SCRIPT' + +cd $BOOST_ROOT/libs/$SELF +ci/travis/valgrind.sh + +elif [ "$DRONE_JOB_BUILDTYPE" == "coverity" ]; then + +echo '==================================> INSTALL' + +common_install + +echo '==================================> SCRIPT' + +if [ -n "${COVERITY_SCAN_NOTIFICATION_EMAIL}" -a \( "$TRAVIS_BRANCH" = "develop" -o "$TRAVIS_BRANCH" = "master" \) -a \( "$DRONE_BUILD_EVENT" = "push" -o "$DRONE_BUILD_EVENT" = "cron" \) ] ; then +cd $BOOST_ROOT/libs/$SELF +ci/travis/coverity.sh +fi + +elif [ "$DRONE_JOB_BUILDTYPE" == "cmake1" ]; then + +set -xe + +echo '==================================> INSTALL' + +# already in the image +# pip install --user cmake + +echo '==================================> SCRIPT' + +export SELF=`basename $REPO_NAME` +BOOST_BRANCH=develop && [ "$DRONE_BRANCH" == "master" ] && BOOST_BRANCH=master || true +echo BOOST_BRANCH: $BOOST_BRANCH +cd .. +git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root +cd boost-root + +mkdir -p libs/$SELF +cp -r $DRONE_BUILD_DIR/* libs/$SELF +# git submodule update --init tools/boostdep +git submodule update --init --recursive + +cd libs/$SELF +mkdir __build__ && cd __build__ +cmake -DCMAKE_INSTALL_PREFIX=~/.local .. 
+cmake --build . --target install + +elif [ "$DRONE_JOB_BUILDTYPE" == "cmake-superproject" ]; then + +echo '==================================> INSTALL' + +common_install + +echo '==================================> COMPILE' + +# Warnings as errors -Werror not building. Remove for now: +# export CXXFLAGS="-Wall -Wextra -Werror" +export CXXFLAGS="-Wall -Wextra" +export CMAKE_OPTIONS=${CMAKE_OPTIONS:--DBUILD_TESTING=ON} +export CMAKE_SHARED_LIBS=${CMAKE_SHARED_LIBS:-1} + +mkdir __build_static +cd __build_static +cmake -DBOOST_ENABLE_CMAKE=1 -DBoost_VERBOSE=1 ${CMAKE_OPTIONS} \ + -DBOOST_INCLUDE_LIBRARIES=$SELF .. +cmake --build . +ctest --output-on-failure -R boost_$SELF + +cd .. + +if [ "$CMAKE_SHARED_LIBS" = 1 ]; then + +mkdir __build_shared +cd __build_shared +cmake -DBOOST_ENABLE_CMAKE=1 -DBoost_VERBOSE=1 ${CMAKE_OPTIONS} \ + -DBOOST_INCLUDE_LIBRARIES=$SELF -DBUILD_SHARED_LIBS=ON .. +cmake --build . +ctest --output-on-failure -R boost_$SELF + +fi + +fi diff --git a/doc/modules/ROOT/nav.adoc b/doc/modules/ROOT/nav.adoc index 15142409..fb6125af 100644 --- a/doc/modules/ROOT/nav.adoc +++ b/doc/modules/ROOT/nav.adoc @@ -1,8 +1,5 @@ * xref:index.adoc[Introduction] * xref:why-capy.adoc[Why Capy?] -* xref:why-not-cobalt.adoc[Why Not Cobalt?] -* xref:why-not-cobalt-2.adoc[Why Not Cobalt Concepts?] -* xref:why-not-tmc.adoc[Why Not TooManyCooks?] 
* xref:quick-start.adoc[Quick Start] * xref:2.cpp20-coroutines/2.intro.adoc[Introduction To {cpp}20 Coroutines] ** xref:2.cpp20-coroutines/2a.foundations.adoc[Part I: Foundations] @@ -48,14 +45,19 @@ ** xref:7.examples/7i.echo-server-corosio.adoc[Echo Server with Corosio] ** xref:7.examples/7j.stream-pipeline.adoc[Stream Pipeline] * xref:8.design/8.intro.adoc[Design] -** xref:8.design/8a.ReadStream.adoc[ReadStream] -** xref:8.design/8b.ReadSource.adoc[ReadSource] -** xref:8.design/8c.BufferSource.adoc[BufferSource] -** xref:8.design/8d.WriteStream.adoc[WriteStream] -** xref:8.design/8e.WriteSink.adoc[WriteSink] -** xref:8.design/8f.BufferSink.adoc[BufferSink] -** xref:8.design/8g.RunApi.adoc[Run API] -** xref:8.design/8h.TypeEraseAwaitable.adoc[Type-Erasing Awaitables] -** xref:8.design/8i.any_buffer_sink.adoc[AnyBufferSink] -** xref:8.design/8j.Executor.adoc[Executor] +** xref:8.design/8a.CapyLayering.adoc[Layered Abstractions] +** xref:8.design/8b.Separation.adoc[Why Capy Is Separate] +** xref:8.design/8c.ReadStream.adoc[ReadStream] +** xref:8.design/8d.ReadSource.adoc[ReadSource] +** xref:8.design/8e.BufferSource.adoc[BufferSource] +** xref:8.design/8f.WriteStream.adoc[WriteStream] +** xref:8.design/8g.WriteSink.adoc[WriteSink] +** xref:8.design/8h.BufferSink.adoc[BufferSink] +** xref:8.design/8i.TypeEraseAwaitable.adoc[Type-Erasing Awaitables] +** xref:8.design/8j.any_buffer_sink.adoc[AnyBufferSink] +** xref:8.design/8k.Executor.adoc[Executor] +** xref:8.design/8l.RunApi.adoc[Run API] +** xref:8.design/8m.WhyNotCobalt.adoc[Why Not Cobalt?] +** xref:8.design/8n.WhyNotCobaltConcepts.adoc[Why Not Cobalt Concepts?] +** xref:8.design/8o.WhyNotTMC.adoc[Why Not TooManyCooks?] 
* xref:reference:boost/capy.adoc[Reference] diff --git a/doc/modules/ROOT/pages/8.design/8a.CapyLayering.adoc b/doc/modules/ROOT/pages/8.design/8a.CapyLayering.adoc new file mode 100644 index 00000000..c29fda74 --- /dev/null +++ b/doc/modules/ROOT/pages/8.design/8a.CapyLayering.adoc @@ -0,0 +1,152 @@ +// +// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/capy +// + += Layered Abstractions + +{cpp} async libraries have traditionally forced users into a single abstraction level, and every choice comes with baggage. You go with templates and you get zero overhead, full optimization, and unreadable error messages that scroll for pages. Compile times explode. You cannot hide implementation behind a compilation boundary, so you have no ABI stability. You go with virtual dispatch and you get readable code, stable ABIs, and a runtime cost that every call path pays whether it needs to or not. + +This is a false binary. Users need different things at different layers of their system. A protocol parser needs zero-copy buffer access and zero overhead because it runs on every byte of every message. Business logic needs readability because the person maintaining it in two years needs to understand what it does. Library boundaries need ABI stability because you do not want downstream code recompiling every time an internal detail changes. + +One abstraction level cannot serve all of these needs simultaneously. The insight behind Capy's architecture is that users should choose the abstraction level appropriate to each part of their code, and the library should make that choice natural rather than painful. + + +== The Three Layers + +Capy offers three layers. They coexist. They interoperate. Users pick the one that matches their constraints. 
+ +The first layer is concepts. These are the template-based interfaces: `ReadStream`, `WriteStream`, `BufferSink`, `BufferSource`. Algorithms written against concepts get full optimization. The compiler sees through everything. There is no indirection, no vtable, no allocation overhead. This is what you use for hot inner loops, for protocol parsing, for any path where performance dominates: + +[source,cpp] +---- +template +io_task +push_to(Src& source, Sink& sink); +---- + +The cost is that templates propagate. Every caller sees the full implementation. Compile times grow. You cannot hide this behind a `.cpp` file. + +The second layer is type-erased wrappers. `any_stream`, `any_read_stream`, `any_write_stream`. These use a vtable internally, similar to `std::function` but specialized for I/O. You can write an algorithm against `any_stream&` and it compiles once, lives in a single translation unit, and works with any stream type: + +[source,cpp] +---- +task<> echo(any_stream& stream) +{ + char buf[1024]; + for(;;) + { + auto [ec, n] = co_await stream.read_some( + mutable_buffer(buf)); + if(ec.failed()) + co_return; + co_await write(stream, const_buffer(buf, n)); + } +} +---- + +The cost is a virtual call per I/O operation. For operations dominated by syscalls and network latency, this cost is invisible. For tight loops over in-memory buffers, it matters. + +The third layer is coroutine type erasure via `task<>`. This is the most powerful form of type erasure in the language. Inside a coroutine, when you write `co_await`, everything in the awaitable becomes type-erased from the perspective of the caller. The caller sees a `task<>`. The implementation is invisible. 
A pure virtual function returning `task<>` hides the stream type, the buffer strategy, the algorithm, the error handling - everything: + +[source,cpp] +---- +class connection_base { +public: + task<> run(); +protected: + virtual task<> do_handshake() = 0; + virtual task<> do_shutdown() = 0; +}; +---- + +The SSL derived class performs a TLS handshake inside `do_handshake()`. The TCP derived class just connects. The base class implements all the shared business logic against an `any_stream&` and never knows what is underneath. + +This is the layer you use for architectural boundaries. Plugin systems. Transport abstraction. Anything where you want complete separation between the interface and the implementation. + + +== Compilation Boundary Economics + +Benchmarks on Corosio gave us hard numbers on the cost of crossing a compilation boundary. The result is intuitive once you see it: the cost is proportional to the size of the coroutine. + +For small, fast coroutines - the kind that do a quick buffer manipulation and return - the overhead of virtual dispatch plus parameter marshalling is a significant percentage of total execution time. For larger operations - data transfers dominated by syscalls, protocol handshakes, connection establishment - the boundary cost vanishes into noise. + +This has a direct implication for how you structure code. Use concepts for tight inner operations where the work per call is small. Use type erasure at module boundaries where the work per call is large enough to absorb the overhead. The library does not make this decision for you. You make it based on your profiling data and your architecture requirements. + +The user chooses where the boundary falls. Not the library. + + +== Zero-Copy as a First-Class Concern + +Buffer sink and buffer source invert the traditional ownership model. 
Instead of the caller allocating a buffer, filling it with data, and handing it to the library, the library exposes its own internal storage and the caller fills it in place. Zero copies. The data goes directly where it needs to be. + +The `BufferSink` concept formalizes this with three operations. `prepare()` returns writable buffers from the sink's internal storage. The caller writes data into those buffers. `commit()` tells the sink how many bytes were written: + +[source,cpp] +---- +concept BufferSink = + requires(T& sink, std::span dest, + std::size_t n) + { + { sink.prepare(dest) } + -> std::same_as>; + { sink.commit(n) } -> IoAwaitable; + }; +---- + +This matters at the protocol level. The HTTP parser's internal buffer is the buffer you write into. The serializer's internal buffer is the buffer you read from. There is no intermediate copy between the network and the parser, and no intermediate copy between the serializer and the network. + +The key detail is that `commit()` returns an `IoAwaitable`. When the sink is backed by a socket, `commit()` suspends and performs the actual write. When the sink is an in-memory buffer - a string, a parser, a test fixture - `commit()` completes synchronously without creating a coroutine frame. Same code, same API, no overhead for the synchronous case. This is what makes the buffer abstractions practical for both production I/O and testing. + + +== Symmetric Transfer and the Pump + +Symmetric transfer is the mechanism that allows Corosio to match or beat ASIO callback throughput. When one coroutine completes and its continuation is ready to run, symmetric transfer reuses the same coroutine frame without allocation and bypasses the global work queue entirely. ASIO callbacks always go through the queue. Symmetric transfer skips that step. + +The pump mechanism extends this by allowing multiple inline completions before returning to the queue. 
If a chain of coroutines completes quickly, the pump lets them execute back-to-back without touching the scheduler. For throughput-sensitive workloads like HTTP servers, this is significant. + +The trade-off is P99 latency. While the pump is running inline completions, queued work waits. For latency-sensitive workloads, you want to return to the queue more frequently so that every piece of work gets prompt attention. The pump is configurable. You can disable it entirely for HFT-style workloads that care about tail latency, or let it ramp up for servers that care about throughput. + +The frame recycler is a per-thread cache of coroutine frames. Chain workloads that allocate and free frames in sequence benefit from this cache. Fan-out workloads that spawn many concurrent tasks can exhaust it. The `right_now` pattern addresses this for repeated invocations of the same operation: declare a stack object with a one-element frame cache, and repeated calls reuse that cache without touching the recycler at all. `when_all` could carry its own private frame cache sized to its arity, giving each child a frame from the parent's stash via a TLS hook. Every use case that you make better can make another use case worse. You have to pay attention to that which is not seen. + + +== The Type System as Architecture + +The derived class pattern is the practical application of everything described above. A base class implements business logic against type-erased references. Derived classes carry concrete types and implement the operations that differ between transports. + +Each derived class lives in its own translation unit. The linker only pulls in what is used. Users who need only TCP link only TCP code. Users who need SSL link the SSL translation unit. No variant that pulls in all transport code. No enum and switch that ties everything together. 
The type system enforces the separation: + +[source,cpp] +---- +// User who needs only plain TCP +tcp_connection conn(ctx); + +// User who needs TLS +ssl_connection conn(ctx, tls_context); + +// User who needs runtime transport selection +multi_connection conn(ctx, config); +---- + +This extends naturally to testing. Derive a mock connection that uses Capy's test stream with a fuse for error injection, and a mock timer for deterministic time control. The base class algorithm runs against the mock exactly as it would against a real connection. No conditional compilation, no test-only code paths in production logic, no `#ifdef TESTING`. + +A database library built this way can express protocol parsing with zero-copy buffer sinks for the hot path, implement connection logic against type-erased streams for maintainability, let users select TCP vs. SSL vs. Unix at the type level for linker efficiency, and test without linking OpenSSL or running a real server. The hot paths use concepts. The cold paths use virtual dispatch. The architectural boundaries use `task<>`. Every user finds the abstraction level they need. + + +== Choosing the Right Layer + +The question that matters is: can a library author look at their problem and immediately see which layer to use? If the answer is yes, the design is working. If they have to think about it, something is wrong. + +**Protocol parsing:** use `BufferSink` and `BufferSource` concepts as template parameters. Zero copy, zero overhead. Call member functions whose awaitables do all the work, with no coroutine frame allocation. The compiler optimizes everything. + +**Connection management:** use concrete types like `tcp_socket`. These give you `connect()` and `shutdown()` - the operations that are transport-specific. But the concrete type is derived from `io_stream`, a class that models `capy::Stream`, so you can pass `io_stream&` to a non-template function for the business logic that sits on top of the connection. 
+ +**Full transport abstraction across a library boundary:** use `any_stream`. Complete type erasure, but you lose connection management - there is no `connect()` on an `any_stream`. This means you have to carefully arrange your code so it genuinely requires a physical separation in the Lakos sense. The protocol logic and the connection logic live in separate components, and the type-erased boundary sits between them. + +The layers compose. An algorithm written against a `BufferSource` concept can be called from inside a coroutine that is type-erased behind a `task<>`, which is dispatched through a virtual function on a base class that holds an `any_stream&`. Each layer handles its part. Nothing leaks through the boundaries unless you want it to. + +This is what it means when we say the user chooses. Capy provides the tools. The user decides where the boundaries go based on what they know about their performance requirements, their compilation budget, and their architecture. The library does not impose a single answer because there is not one. diff --git a/doc/modules/ROOT/pages/8.design/8a.ReadStream.adoc b/doc/modules/ROOT/pages/8.design/8a.ReadStream.adoc deleted file mode 100644 index 931e2800..00000000 --- a/doc/modules/ROOT/pages/8.design/8a.ReadStream.adoc +++ /dev/null @@ -1,553 +0,0 @@ -= ReadStream Concept Design - -== Overview - -This document describes the design of the `ReadStream` concept: the -fundamental partial-read primitive in the concept hierarchy. It explains -why `read_some` is the correct building block, how composed algorithms -build on top of it, and the relationship to `ReadSource`. 
- -== Definition - -[source,cpp] ----- -template -concept ReadStream = - requires(T& stream, mutable_buffer_archetype buffers) - { - { stream.read_some(buffers) } -> IoAwaitable; - requires awaitable_decomposes_to< - decltype(stream.read_some(buffers)), - std::error_code, std::size_t>; - }; ----- - -A `ReadStream` provides a single operation: - -=== `read_some(buffers)` -- Partial Read - -Reads one or more bytes from the stream into the buffer sequence. -Returns `(error_code, std::size_t)` where `n` is the number of bytes -read. - -==== Semantics - -- On success: `!ec`, `n >= 1` and `n \<= buffer_size(buffers)`. -- On EOF: `ec == cond::eof`, `n == 0`. -- On error: `ec`, `n == 0`. -- If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`. - -The caller must not assume the buffer is filled. `read_some` may -return fewer bytes than the buffer can hold. This is the defining -property of a partial-read primitive. - -Once `read_some` returns an error (including EOF), the caller must -not call `read_some` again. The stream is done. Not all -implementations can reproduce a prior error on subsequent calls, so -the behavior after an error is undefined. - -Buffers in the sequence are filled completely before proceeding to the -next buffer in the sequence. - -==== Buffer Lifetime - -The caller must ensure that the memory referenced by `buffers` remains -valid until the `co_await` expression returns. - -==== Conforming Signatures - -[source,cpp] ----- -template -IoAwaitable auto read_some(Buffers buffers); ----- - -Buffer sequences should be accepted by value when the member function -is a coroutine, to ensure the sequence lives in the coroutine frame -across suspension points. - -== Concept Hierarchy - -`ReadStream` is the base of the read-side hierarchy: - ----- -ReadStream { read_some } - | - v -ReadSource { read_some, read } ----- - -`ReadSource` refines `ReadStream`. Every `ReadSource` is a -`ReadStream`. 
Algorithms constrained on `ReadStream` accept both raw -streams and sources. The `ReadSource` concept adds a complete-read -primitive on top of the partial-read primitive. - -This mirrors the write side: - ----- -WriteStream { write_some } - | - v -WriteSink { write_some, write, write_eof(buffers), write_eof() } ----- - -== Composed Algorithms - -Three composed algorithms build on `read_some`: - -=== `read(stream, buffers)` -- Fill a Buffer Sequence - -[source,cpp] ----- -auto read(ReadStream auto& stream, - MutableBufferSequence auto const& buffers) - -> io_task; ----- - -Loops `read_some` until the entire buffer sequence is filled or an -error (including EOF) occurs. On success, `n == buffer_size(buffers)`. - -[source,cpp] ----- -template -task<> read_header(Stream& stream) -{ - char header[16]; - auto [ec, n] = co_await read( - stream, mutable_buffer(header)); - if(ec == cond::eof) - co_return; // clean shutdown - if(ec) - co_return; - // header contains exactly 16 bytes -} ----- - -=== `read(stream, dynamic_buffer)` -- Read Until EOF - -[source,cpp] ----- -auto read(ReadStream auto& stream, - DynamicBufferParam auto&& buffers, - std::size_t initial_amount = 2048) - -> io_task; ----- - -Reads from the stream into a dynamic buffer until EOF is reached. The -buffer grows with a 1.5x factor when filled. On success (EOF), `ec` -is clear and `n` is the total bytes read. - -[source,cpp] ----- -template -task slurp(Stream& stream) -{ - std::string body; - auto [ec, n] = co_await read( - stream, string_dynamic_buffer(&body)); - if(ec) - co_return {}; - co_return body; -} ----- - -=== `read_until(stream, dynamic_buffer, match)` -- Delimited Read - -Reads from the stream into a dynamic buffer until a delimiter or match -condition is found. Used for line-oriented protocols and message -framing. 
- -[source,cpp] ----- -template -task<> read_line(Stream& stream) -{ - std::string line; - auto [ec, n] = co_await read_until( - stream, string_dynamic_buffer(&line), "\r\n"); - if(ec) - co_return; - // line contains data up to and including "\r\n" -} ----- - -== Use Cases - -=== Incremental Processing with `read_some` - -When processing data as it arrives without waiting for a full buffer, -`read_some` is the right choice. This is common for real-time data or -when the processing can handle partial input. - -[source,cpp] ----- -template -task<> echo(Stream& stream, WriteStream auto& dest) -{ - char buf[4096]; - for(;;) - { - auto [ec, n] = co_await stream.read_some( - mutable_buffer(buf)); - if(ec == cond::eof) - co_return; - if(ec) - co_return; - - // Forward whatever we received immediately - auto [wec, nw] = co_await dest.write_some( - const_buffer(buf, n)); - if(wec) - co_return; - } -} ----- - -=== Relaying from ReadStream to WriteStream - -When relaying data from a reader to a writer, `read_some` feeds -`write_some` directly. This is the fundamental streaming pattern. - -[source,cpp] ----- -template -task<> relay(Src& src, Dest& dest) -{ - char storage[65536]; - circular_dynamic_buffer cb(storage, sizeof(storage)); - - for(;;) - { - // Read into free space - auto mb = cb.prepare(cb.capacity()); - auto [rec, nr] = co_await src.read_some(mb); - cb.commit(nr); - - if(rec && rec != cond::eof) - co_return; - - // Drain to destination - while(cb.size() > 0) - { - auto [wec, nw] = co_await dest.write_some( - cb.data()); - if(wec) - co_return; - cb.consume(nw); - } - - if(rec == cond::eof) - co_return; - } -} ----- - -Because `ReadSource` refines `ReadStream`, this relay function also -accepts `ReadSource` types. An HTTP body source or a decompressor -can be relayed to a `WriteStream` using the same function. 
- -== Relationship to the Write Side - -[cols="1,1"] -|=== -| Read Side | Write Side - -| `ReadStream::read_some` -| `WriteStream::write_some` - -| `read` free function (composed) -| `write_now` (composed, eager) - -| `read_until` (composed, delimited) -| No write-side equivalent - -| `ReadSource::read` -| `WriteSink::write` -|=== - -== Design Foundations: Why Errors Exclude Data - -The `read_some` contract requires that `n` is 0 whenever `ec` is set. -Data and errors are mutually exclusive outcomes. This is the most -consequential design decision in the `ReadStream` concept, with -implications for every consumer of `read_some` in the library. The -rule follows Asio's established `AsyncReadStream` contract, is -reinforced by the behavior of POSIX and Windows I/O system calls, -and produces cleaner consumer code. This section explains the design -and its consequences. - -=== Reconstructing Kohlhoff's Reasoning - -Christopher Kohlhoff's Asio library defines an `AsyncReadStream` -concept with the identical requirement: on error, `bytes_transferred` -is 0. No design rationale document accompanies this rule. The -reasoning presented here was reconstructed from three sources: - -- *The Asio source code.* The function `non_blocking_recv1` in - `socket_ops.ipp` explicitly sets `bytes_transferred = 0` on every - error path. The function `complete_iocp_recv` maps Windows IOCP - errors to portable error codes, relying on the operating system's - guarantee that failed completions report zero bytes. These are - deliberate choices, not accidental pass-through of OS behavior. -- *A documentation note Kohlhoff left.* Titled "Why EOF is an error," - it gives two reasons: composed operations need EOF-as-error to - report contract violations, and EOF-as-error disambiguates the - end of a stream from a successful zero-byte read. The note is - terse but the implications are deep. 
-- *Analysis of the underlying system calls.* POSIX `recv()` and - Windows `WSARecv()` both enforce a binary outcome per call: data - or error, never both. This is not because the {cpp} abstraction - copied the OS, but because both levels face the same fundamental - constraint. - -The following sections examine each of these points and their -consequences. - -=== Alignment with Asio - -Asio's `AsyncReadStream` concept has enforced the same rule for over -two decades: on error, `bytes_transferred` is 0. This is a deliberate -design choice, not an accident. The Asio source code explicitly zeroes -`bytes_transferred` on every error path, and the underlying system -calls (POSIX `recv()`, Windows IOCP) enforce binary outcomes at the -OS level. The `read_some` contract follows this established practice. - -=== The Empty-Buffer Rule - -Every `ReadStream` must support the following: - -[quote] -`read_some(empty_buffer)` completes immediately with `{success, 0}`. - -This is a no-op. The caller passed no buffer space, so no I/O is -attempted. The operation does not inspect the stream's internal state -because that would require a probe capability -- a way to ask "is -there data? is the stream at EOF?" -- without actually reading. Not -every source supports probing. A TCP socket does not know that its -peer has closed until it calls `recv()` and gets 0 back. A pipe does -not know it is broken until a read fails. The empty-buffer rule is -therefore unconditional: return `{success, 0}` regardless of the -stream's state. - -This rule is a natural consequence of the contract, not a proof of -it. When no I/O is attempted, no state is discovered and no error -is reported. - -=== Why EOF Is an Error - -Kohlhoff's documentation note gives two reasons for making EOF an -error code rather than a success: - -*Composed operations need EOF-as-error to report contract violations.* -The composed `read(stream, buffer(buf, 100))` promises to fill -exactly 100 bytes. 
If the stream ends after 50, the operation did not -fulfill its contract. Reporting `{success, 50}` would be misleading -- -it suggests the operation completed normally. Reporting `{eof, 50}` -tells the caller both what happened (50 bytes landed in the buffer) -and why the operation stopped (the stream ended). EOF-as-error is the -mechanism by which composed operations explain early termination. - -*EOF-as-error disambiguates the empty-buffer no-op from the end of a -stream.* Without EOF-as-error, both `read_some(empty_buffer)` on a -live stream and `read_some(non_empty_buffer)` on an exhausted stream -would produce `{success, 0}`. The caller could not distinguish "I -passed no buffer" from "the stream is done." Making EOF an error code -separates these two cases cleanly. - -These two reasons reinforce each other. Composed operations need EOF -to be an error code so they can report early termination. The -empty-buffer rule needs EOF to be an error code so `{success, 0}` -is unambiguously a no-op. Together with the rule that errors exclude -data, `read_some` results form a clean trichotomy: success with -data, or an error (including EOF) without data. - -=== The Write-Side Asymmetry - -On the write side, `WriteSink` provides `write_eof(buffers)` to -atomically combine the final data with the EOF signal. A natural -question follows: if the write side fuses data with EOF, why does the -read side forbid it? - -The answer is that the two sides of the I/O boundary have different -roles. The writer _decides_ when to signal EOF. The reader -_discovers_ it. This asymmetry has three consequences: - -*`write_eof` exists for correctness, not convenience.* Protocol -framings require the final data and the EOF marker to be emitted -together so the peer observes a complete message. HTTP chunked -encoding needs the terminal `0\r\n\r\n` coalesced with the final -data chunk. A TLS session needs the close-notify alert coalesced -with the final application data. 
A compressor needs `Z_FINISH` -applied to the final input. These are correctness requirements, not -optimizations. On the read side, whether the last bytes arrive with -EOF or on a separate call does not change what the reader observes. -The data and the order are identical either way. - -*`write_eof` is a separate function the caller explicitly invokes.* -`write_some` never signals EOF. The writer opts into data-plus-EOF -by calling a different function. The call site reads `write_eof(data)` -and the intent is unambiguous. If `read_some` could return data with -EOF, every call to `read_some` would _sometimes_ be a data-only -operation and _sometimes_ a data-plus-EOF operation. The stream -decides which mode the caller gets, at runtime. Every call site must -handle both possibilities. The burden falls on every consumer in the -codebase, not on a single call site that opted into the combined -behavior. - -*A hypothetical `read_eof` makes no sense.* On the write side, -`write_eof` exists because the producer signals the end of data. On -the read side, the consumer does not tell the stream to end -- it -discovers that the stream has ended. EOF flows from producer to -consumer, not the reverse. There is no action the reader can take to -"read the EOF." The reader discovers EOF as a side effect of -attempting to read. - -=== A Clean Trichotomy - -With the current contract, every `read_some` result falls into -exactly one of three mutually exclusive cases: - -- **Success**: `!ec`, `n >= 1` -- data arrived, process it. -- **EOF**: `ec == cond::eof`, `n == 0` -- stream ended, no data. -- **Error**: `ec`, `n == 0` -- failure, no data. - -Data is present if and only if the operation succeeded. This -invariant -- _data implies success_ -- eliminates an entire category -of reasoning from every read loop. 
The common pattern is: - -[source,cpp] ----- -auto [ec, n] = co_await stream.read_some(buf); -if(ec) - break; // EOF or error -- no data to handle -process(buf, n); // only reached on success, n >= 1 ----- - -If `read_some` could return `n > 0` with EOF, the loop becomes: - -[source,cpp] ----- -auto [ec, n] = co_await stream.read_some(buf); -if(n > 0) - process(buf, n); // must handle data even on EOF -if(ec) - break; ----- - -Every consumer pays this tax: an extra branch to handle data -accompanying EOF. The branch is easy to forget. Forgetting it -silently drops the final bytes of the stream -- a bug that only -manifests when the source delivers EOF with its last data rather than -on a separate call. A TCP socket receiving data in one packet and FIN -in another will not trigger the bug. A memory source that knows its -remaining length will. The non-determinism makes the bug difficult to -reproduce and diagnose. - -The clean trichotomy eliminates this class of bugs entirely. - -=== Conforming Sources - -Every concrete `ReadStream` implementation naturally separates its -last data delivery from its EOF signal: - -- **TCP sockets**: `read_some` maps to a single `recv()` or - `WSARecv()` call, returning whatever the kernel has buffered. The - kernel delivers bytes on one call and returns 0 on the next. The - separation is inherent in the POSIX and Windows APIs. -- **TLS streams**: `read_some` decrypts and returns one TLS record's - worth of application data. The close-notify alert arrives as a - separate record. -- **HTTP content-length body**: the source delivers bytes up to the - content-length limit. Once the limit is reached, the next - `read_some` returns EOF. -- **HTTP chunked body**: the unchunker delivers decoded data from - chunks. The terminal `0\r\n\r\n` is parsed on a separate pass that - returns EOF. -- **Compression (inflate)**: the decompressor delivers output bytes. - When `Z_STREAM_END` is detected, the next read returns EOF. 
-- **Memory source**: returns `min(requested, remaining)` bytes. When - `remaining` reaches 0, the next call returns EOF. -- **QUIC streams**: `read_some` returns data from received QUIC - frames. Stream FIN is delivered as EOF on a subsequent call. -- **Buffered read streams**: `read_some` returns data from an - internal buffer, refilling from the underlying stream when empty. - EOF propagates from the underlying stream. -- **Test mock streams**: `read_some` returns configurable data and - error sequences for testing. - -No source is forced into an unnatural pattern. The `read_some` call -that discovers EOF is the natural result of attempting to read from -an exhausted stream -- not a separate probing step. Once the caller -receives EOF, it stops reading. - -=== Composed Operations and Partial Results - -The composed `read` algorithm (and `ReadSource::read`) _does_ report -`n > 0` on EOF, because it accumulates data across multiple internal -`read_some` calls. When the underlying stream signals EOF -mid-accumulation, discarding the bytes already gathered would be -wrong. The caller needs `n` to know how much valid data landed in the -buffer. - -The design separates concerns cleanly: the single-shot primitive -(`read_some`) delivers unambiguous results with a clean trichotomy. -Composed operations that accumulate state (`read`) report what they -accumulated, including partial results on EOF. Callers who need -partial-on-EOF semantics get them through the composed layer, while -the primitive layer remains clean. - -=== Evidence from the Asio Implementation - -The Asio source code confirms this design at every level. - -On POSIX platforms, `non_blocking_recv1` in `socket_ops.ipp` calls -`recv()` and branches on the result. If `recv()` returns a positive -value, the bytes are reported as a successful transfer. If `recv()` -returns 0 on a stream socket, EOF is reported. 
If `recv()` returns --1, the function explicitly sets `bytes_transferred = 0` before -returning the error. The POSIX `recv()` system call itself enforces -binary outcomes: it returns `N > 0` on success, `0` on EOF, or `-1` -on error. A single call never returns both data and an error. - -On Windows, `complete_iocp_recv` processes the results from -`GetQueuedCompletionStatus`. It maps `ERROR_NETNAME_DELETED` to -`connection_reset` and `ERROR_PORT_UNREACHABLE` to -`connection_refused`. Windows IOCP similarly reports zero -`bytes_transferred` on failed completions. The operating system -enforces the same binary outcome per I/O completion. - -The one edge case is POSIX signal interruption (`EINTR`). If a signal -arrives after `recv()` has already copied some bytes, the kernel -returns the partial byte count as success rather than `-1`/`EINTR`. -Asio handles this transparently by retrying on `EINTR`, so the -caller never observes it. Even the kernel does not combine data with -an error -- it chooses to report the partial data as success. - -=== Convergent Design with POSIX - -POSIX `recv()` independently enforces the same rule: `N > 0` on -success, `-1` on error, `0` on EOF. The kernel never returns "here -are your last 5 bytes, and also EOF." It delivers the available bytes -on one call and returns 0 on the next. This is not because the {cpp} -abstraction copied POSIX semantics. It is because the kernel faces -the same fundamental constraint: state is discovered through the act -of I/O. The alignment between `read_some` and `recv()` is convergent -design, not leaky abstraction. - -== Summary - -`ReadStream` provides `read_some` as the single partial-read -primitive. This is deliberately minimal: - -- Algorithms that need to fill a buffer completely use the `read` - composed algorithm. -- Algorithms that need delimited reads use `read_until`. -- Algorithms that need to process data as it arrives use `read_some` - directly. 
-- `ReadSource` refines `ReadStream` by adding `read` for - complete-read semantics. - -The contract that errors exclude data follows Asio's established -`AsyncReadStream` contract, aligns with POSIX and Windows system -call semantics, and produces a clean trichotomy that makes every -read loop safe by construction. diff --git a/doc/modules/ROOT/pages/8.design/8b.Separation.adoc b/doc/modules/ROOT/pages/8.design/8b.Separation.adoc new file mode 100644 index 00000000..850149b5 --- /dev/null +++ b/doc/modules/ROOT/pages/8.design/8b.Separation.adoc @@ -0,0 +1,171 @@ +// +// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/capy +// + += Why Capy Is Separate + +"Why are Capy and Corosio two separate libraries? Why not just put everything in one place?" + +The answer is physical design. Capy and Corosio sit at different levels of the physical hierarchy. They encapsulate different information, change for different reasons, and have different platform dependencies. Merging them would degrade the design along every axis that matters for a large-scale system: testability, reusability, and build cost. + +This document applies well-established physical design principles to show why the separation is a structural requirement. + + +== What Lives Where + +**Capy** provides the foundational abstractions for coroutine-based I/O. Tasks. Buffers. Stream concepts. Executors. The IoAwaitable protocol. Type-erased streams. Composition primitives like `when_all` and `when_any`. It is pure {cpp}20. It does not include a single line of platform-specific code. No sockets. No file descriptors. No `#ifdef _WIN32`. + +**Corosio** provides platform networking. TCP sockets. TLS streams. DNS resolution. Timers. Signal handling. 
It implements four platform-specific event loop backends: IOCP on Windows, epoll on Linux, kqueue on macOS/BSD, and POSIX select as a fallback. Corosio depends on Capy. Capy does not depend on Corosio. + +The dependency arrow points in one direction. That is not an accident. + + +== Levelization + +Three principles underpin the physical organization of large systems: + +. Fine-grained encapsulation (Parnas, 1972) +. Acyclic physical dependencies (Dijkstra, 1968) +. Well-documented internal interface boundaries (Myers, 1978) + +Lakos synthesized these into a discipline called levelization. The idea is not a means of achieving fine-grained components. It is a means of organizing the implied dependencies of the logical entities in a system so that the component dependencies are acyclic (see Fig 0-15, p. 22 of Lakos'20). + +The levels are straightforward: + +* A component that depends on nothing is *level 0*. +* A component that depends only on level-0 components is *level 1*. +* A component that depends on level-1 components is *level 2*. +* And so on. + +This creates a directed acyclic graph where dependencies flow in one direction. If the graph has a cycle, the design is broken. The presence of acyclic dependencies does not guarantee good design, but the presence of cycles guarantees bad design. + +[quote, John Lakos, 'https://www.amazon.com/dp/0201633620[Large-Scale C++ Software Design] (1996)'] +____ +Systems with [acyclic] physical hierarchies are fundamentally +easier and more economical to maintain, test, and reuse than +tightly interdependent systems. +____ + +Knowing that logical designs must be levelized, you alter the logical designs accordingly. This is the insight that separates engineers who have built at scale from those who have not. + +Capy sits at a lower level. It provides tasks, buffers, stream concepts, and executors - abstractions that do not depend on any particular I/O backend. Corosio sits at a higher level. 
It provides sockets, TLS, and event loops that depend on Capy's abstractions. + +Components at different levels belong in different packages. This is a structural requirement, not a style preference. + + +== Cumulative Component Dependency + +Lakos quantified the cost of getting levels wrong with Cumulative Component Dependency (CCD): the sum over all components in a subsystem of the number of components needed in order to test each component incrementally (see Figure 4-22, p. 191 of Lakos'96). + +CCD ranges from N for a perfectly horizontal (flat) design to N-squared for a vertical or cyclically dependent one. The metric is additive for independent subsystems. If two independent libraries each have CCD of 5, combining them without adding cross-dependencies gives CCD 10 - exactly the sum: + +---- +-------- ---------- + [3] [3] + / \ / \ +[1] [1] [1] [1] +-------- --------- +CCD = 5 CCD = 5 + + ------------------- + [3] [3] + / \ / \ + [1] [1] [1] [1] + ------------------- + CCD = 10 +---- + +Each component should have a single purpose. Ideally all of the functionality within a component is primitive - if you can write a function in terms of a type rather than as a member of that type, write a free function (or today, a template function constrained by a concept). This keeps levels flat and CCD low. + +Merging two libraries at different levels inflates CCD. Every component that only needs buffers and tasks now drags in sockets, TLS, and four platform backends. Testing cost, build cost, and cognitive cost all increase. + + +== Deep Modules + +Ousterhout's model for module quality measures interface area against implementation depth. A deep module has a small interface and a large implementation. + +[quote, John Ousterhout, 'https://web.stanford.edu/~ouster/cgi-bin/aposd.php[A Philosophy of Software Design] (2021)'] +____ +The best modules are those that provide powerful functionality, +but have a simple interface. +____ + +Capy is a deep module. 
Its public surface is narrow: a handful of concepts (`ReadStream`, `WriteStream`, `BufferSource`, `BufferSink`), a task type, an executor model, and buffer utilities. Behind that surface lives a substantial implementation: coroutine frame allocation, forward propagation of executors and stop tokens, type-erased stream machinery, and composition primitives. + +Corosio is also a deep module, but a different one. It hides platform-specific event loop complexity (IOCP, epoll, kqueue, select) behind a uniform socket and timer interface. + +These two modules hide different information. That is the practical reason they are separate. Lakos would say: do not collocate two independent systems, because doing so creates gratuitous physical dependencies. Ousterhout would say: modules that hide different information should remain different modules. + +Capy pulls the complexity of coroutine execution, buffer management, and context propagation downward, so that libraries like Http and Corosio do not have to deal with it. Merging Capy into Corosio does not eliminate that complexity. It buries it inside a larger library where it is harder to find, harder to test, and impossible to reuse without taking the whole thing. + + +== Writing Against the Narrowest Interface + +A `ReadStream` concept captures the essential operation: anything you can `read_some` from. TCP sockets, TLS streams, file handles, in-memory buffers - one generic algorithm works with all of them. That algorithm belongs in Capy, not Corosio, because it depends only on the concept, not on any particular implementation. + +Stepanov's principle applies here: algorithms should be abstracted away from particular implementations so that the minimum requirements the algorithm assumes are the only requirements the code uses. In practice, zero-overhead abstraction is an ideal rather than a guarantee - Chandler Carruth has argued persuasively that real compilers on real hardware rarely achieve it perfectly. 
But the principle of coding against minimal requirements remains sound, even when the abstraction has some cost. + +If you can express your algorithm using Capy instead of Corosio, you depend on fewer things. Fewer dependencies means lower CCD, easier testing, and broader reuse. + + +== The Existence Proof + +Boost.Http is a sans-I/O HTTP/1.1 protocol library. It parses requests, serializes responses, and implements routing. It is written entirely against Capy. It has zero dependency on Corosio. + +This is not a hypothetical. It is a real library, shipping today. It works with any I/O backend that satisfies Capy's stream concepts. You could plug in Corosio's TCP sockets, or Asio's sockets, or a mock stream for testing. The protocol logic does not care. + +If Capy were merged into Corosio, Boost.Http would be forced to depend on platform networking it never touches. Every user who wants to parse HTTP headers would need to link against IOCP on Windows, epoll on Linux, and kqueue on macOS. The HTTP parser does not use sockets. It should not pay for sockets. + +This is precisely the excessive link-time dependency that levelization is designed to prevent. Merging Capy into Corosio does not create a cycle, but it forces every consumer of Capy's abstractions to inherit Corosio's platform dependencies. The cost is paid by everyone, even those who need nothing from Corosio. + + +== Testing in Isolation + +With Capy as a separate library, you can test buffer algorithms, stream concepts, and task machinery without a network stack. No sockets. No event loops. No platform dependencies. Just pure {cpp}20 coroutine logic. + +With Corosio as a separate library, you can test socket behavior, DNS resolution, and timer accuracy against a known-good Capy foundation. + +Merge them, and every test of a buffer copy routine must compile against platform I/O headers. Every CI run must configure platform-specific backends even to test portable abstractions. The test matrix explodes. 
Each unnecessary dependency is small, but they accumulate, and once they accumulate they are nearly impossible to remove. + + +== Platform Isolation + +Capy is portable {cpp}20. It compiles on any conforming compiler with no platform-specific code. It can be used on embedded systems, in WebAssembly, on platforms that do not have sockets, and in environments where the I/O backend has not been written yet. + +Corosio contains four platform backends, each a substantial body of platform-specific code: + +* *IOCP* on Windows (sockets, overlapped I/O, NT timers) +* *epoll* on Linux +* *kqueue* on macOS and BSD +* *select* as a POSIX fallback + +Merging these into Capy would mean that a developer who wants a `task<>` type or a `circular_dynamic_buffer` must compile against platform I/O headers. Keeping Capy separate ensures that none of the headers a consumer includes transitively pull in anything from the platform I/O layer. Consumers take only what they need. + + +== Conclusion + +Good design separates things that change for different reasons. Capy changes when the coroutine execution model evolves - new composition primitives, new buffer types, refinements to the IoAwaitable protocol. Corosio changes when platform I/O APIs evolve - new io_uring features on Linux, new IOCP capabilities on Windows, new TLS backends. + +The converse is also important: things that change together should not be separated. An unstable implementation detail that serves only one component belongs inside that component, not in a separate library. Capy and Corosio do not change together. They have different rates of change, different levels of abstraction, and different platform dependencies. + +These are distinct reasons for separation. Levelization demands acyclic dependencies between packages. Isolation prevents excessive compile-time and link-time coupling. Abstraction - hiding unnecessary details - reduces the interface each consumer must understand. 
The three reinforce each other, but they are separate concerns. + +Capy is the narrow waist. It is the small-surface-area interface that hides substantial machinery. It is the lower-level foundation that everything else builds on. Merging it into Corosio would force every consumer of portable abstractions to pay for platform networking they do not use. + +Keep them separate. The architecture demands it. + + +== References + +. John Lakos. https://www.amazon.com/dp/0201633620[_Large-Scale C++ Software Design._] Addison-Wesley, 1996. +. John Lakos. https://informit.com/articles/article.aspx?p=2995361[_Large-Scale C++, Volume I: Process and Architecture._] Addison-Wesley, 2020. +. John Ousterhout. https://web.stanford.edu/~ouster/cgi-bin/aposd.php[_A Philosophy of Software Design._] Yaknyam Press, 2nd Edition, 2021. +. Alexander Stepanov. https://stepanovpapers.com/drdobbs-interview.pdf["Al Stevens Interviews Alex Stepanov."] _Dr. Dobb's Journal_, 1995. +. D.L. Parnas. "On the Criteria To Be Used in Decomposing Systems into Modules." _Communications of the ACM_, 1972. +. E.W. Dijkstra. "The Structure of the 'THE'-Multiprogramming System." _Communications of the ACM_, 1968. +. G.J. Myers. _Composite/Structured Design._ Van Nostrand Reinhold, 1978. diff --git a/doc/modules/ROOT/pages/8.design/8c.ReadStream.adoc b/doc/modules/ROOT/pages/8.design/8c.ReadStream.adoc new file mode 100644 index 00000000..f37c1bb7 --- /dev/null +++ b/doc/modules/ROOT/pages/8.design/8c.ReadStream.adoc @@ -0,0 +1,370 @@ += ReadStream Concept Design + +== Overview + +This document describes the design of the `ReadStream` concept: the fundamental partial-read primitive in the concept hierarchy. It explains why `read_some` is the correct building block, how composed algorithms build on top of it, and the relationship to `ReadSource`. 
+
+== Definition
+
+[source,cpp]
+----
+template<class T>
+concept ReadStream =
+    requires(T& stream, mutable_buffer_archetype buffers)
+    {
+        { stream.read_some(buffers) } -> IoAwaitable;
+        requires awaitable_decomposes_to<
+            decltype(stream.read_some(buffers)),
+            std::error_code, std::size_t>;
+    };
+----
+
+A `ReadStream` provides a single operation:
+
+=== `read_some(buffers)` -- Partial Read
+
+Reads one or more bytes from the stream into the buffer sequence. Returns `(error_code ec, std::size_t n)` where `n` is the number of bytes read.
+
+==== Semantics
+
+- On success: `!ec`, `n >= 1` and `n \<= buffer_size(buffers)`.
+- On EOF: `ec == cond::eof`, `n == 0`.
+- On error: `ec`, `n == 0`.
+- If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`.
+
+The caller must not assume the buffer is filled. `read_some` may return fewer bytes than the buffer can hold. This is the defining property of a partial-read primitive.
+
+Once `read_some` returns an error (including EOF), the caller must not call `read_some` again. The stream is done. Not all implementations can reproduce a prior error on subsequent calls, so the behavior after an error is undefined.
+
+Buffers in the sequence are filled completely before proceeding to the next buffer in the sequence.
+
+==== Buffer Lifetime
+
+The caller must ensure that the memory referenced by `buffers` remains valid until the `co_await` expression returns.
+
+==== Conforming Signatures
+
+[source,cpp]
+----
+template<MutableBufferSequence Buffers>
+IoAwaitable auto read_some(Buffers buffers);
+----
+
+Buffer sequences should be accepted by value when the member function is a coroutine, to ensure the sequence lives in the coroutine frame across suspension points.
+
+== Concept Hierarchy
+
+`ReadStream` is the base of the read-side hierarchy:
+
+----
+ReadStream { read_some }
+      |
+      v
+ReadSource { read_some, read }
+----
+
+`ReadSource` refines `ReadStream`. Every `ReadSource` is a `ReadStream`. 
Algorithms constrained on `ReadStream` accept both raw streams and sources. The `ReadSource` concept adds a complete-read primitive on top of the partial-read primitive.
+
+This mirrors the write side:
+
+----
+WriteStream { write_some }
+      |
+      v
+WriteSink { write_some, write, write_eof(buffers), write_eof() }
+----
+
+== Composed Algorithms
+
+Three composed algorithms build on `read_some`:
+
+=== `read(stream, buffers)` -- Fill a Buffer Sequence
+
+[source,cpp]
+----
+auto read(ReadStream auto& stream,
+    MutableBufferSequence auto const& buffers)
+    -> io_task;
+----
+
+Loops `read_some` until the entire buffer sequence is filled or an error (including EOF) occurs. On success, `n == buffer_size(buffers)`.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<> read_header(Stream& stream)
+{
+    char header[16];
+    auto [ec, n] = co_await read(
+        stream, mutable_buffer(header));
+    if(ec == cond::eof)
+        co_return; // clean shutdown
+    if(ec)
+        co_return;
+    // header contains exactly 16 bytes
+}
+----
+
+=== `read(stream, dynamic_buffer)` -- Read Until EOF
+
+[source,cpp]
+----
+auto read(ReadStream auto& stream,
+    DynamicBufferParam auto&& buffers,
+    std::size_t initial_amount = 2048)
+    -> io_task;
+----
+
+Reads from the stream into a dynamic buffer until EOF is reached. The buffer grows with a 1.5x factor when filled. On success (EOF), `ec` is clear and `n` is the total bytes read.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<std::string> slurp(Stream& stream)
+{
+    std::string body;
+    auto [ec, n] = co_await read(
+        stream, string_dynamic_buffer(&body));
+    if(ec)
+        co_return {};
+    co_return body;
+}
+----
+
+=== `read_until(stream, dynamic_buffer, match)` -- Delimited Read
+
+Reads from the stream into a dynamic buffer until a delimiter or match condition is found. Used for line-oriented protocols and message framing. 
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<> read_line(Stream& stream)
+{
+    std::string line;
+    auto [ec, n] = co_await read_until(
+        stream, string_dynamic_buffer(&line), "\r\n");
+    if(ec)
+        co_return;
+    // line contains data up to and including "\r\n"
+}
+----
+
+== Use Cases
+
+=== Incremental Processing with `read_some`
+
+When processing data as it arrives without waiting for a full buffer, `read_some` is the right choice. This is common for real-time data or when the processing can handle partial input.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<> echo(Stream& stream, WriteStream auto& dest)
+{
+    char buf[4096];
+    for(;;)
+    {
+        auto [ec, n] = co_await stream.read_some(
+            mutable_buffer(buf));
+        if(ec == cond::eof)
+            co_return;
+        if(ec)
+            co_return;
+
+        // Forward whatever we received immediately
+        auto [wec, nw] = co_await dest.write_some(
+            const_buffer(buf, n));
+        if(wec)
+            co_return;
+    }
+}
+----
+
+=== Relaying from ReadStream to WriteStream
+
+When relaying data from a reader to a writer, `read_some` feeds `write_some` directly. This is the fundamental streaming pattern.
+
+[source,cpp]
+----
+template<ReadStream Src, WriteStream Dest>
+task<> relay(Src& src, Dest& dest)
+{
+    char storage[65536];
+    circular_dynamic_buffer cb(storage, sizeof(storage));
+
+    for(;;)
+    {
+        // Read into free space
+        auto mb = cb.prepare(cb.capacity());
+        auto [rec, nr] = co_await src.read_some(mb);
+        cb.commit(nr);
+
+        if(rec && rec != cond::eof)
+            co_return;
+
+        // Drain to destination
+        while(cb.size() > 0)
+        {
+            auto [wec, nw] = co_await dest.write_some(
+                cb.data());
+            if(wec)
+                co_return;
+            cb.consume(nw);
+        }
+
+        if(rec == cond::eof)
+            co_return;
+    }
+}
+----
+
+Because `ReadSource` refines `ReadStream`, this relay function also accepts `ReadSource` types. An HTTP body source or a decompressor can be relayed to a `WriteStream` using the same function. 
+ +== Relationship to the Write Side + +[cols="1,1"] +|=== +| Read Side | Write Side + +| `ReadStream::read_some` +| `WriteStream::write_some` + +| `read` free function (composed) +| `write_now` (composed, eager) + +| `read_until` (composed, delimited) +| No write-side equivalent + +| `ReadSource::read` +| `WriteSink::write` +|=== + +== Design Foundations: Why Errors Exclude Data + +The `read_some` contract requires that `n` is 0 whenever `ec` is set. Data and errors are mutually exclusive outcomes. This is the most consequential design decision in the `ReadStream` concept, with implications for every consumer of `read_some` in the library. The rule follows Asio's established `AsyncReadStream` contract, is reinforced by the behavior of POSIX and Windows I/O system calls, and produces cleaner consumer code. This section explains the design and its consequences. + +=== Reconstructing Kohlhoff's Reasoning + +Christopher Kohlhoff's Asio library defines an `AsyncReadStream` concept with the identical requirement: on error, `bytes_transferred` is 0. No design rationale document accompanies this rule. The reasoning presented here was reconstructed from three sources: + +- *The Asio source code.* The function `non_blocking_recv1` in `socket_ops.ipp` explicitly sets `bytes_transferred = 0` on every error path. The function `complete_iocp_recv` maps Windows IOCP errors to portable error codes, relying on the operating system's guarantee that failed completions report zero bytes. These are deliberate choices, not accidental pass-through of OS behavior. +- *A documentation note Kohlhoff left.* Titled "Why EOF is an error," it gives two reasons: composed operations need EOF-as-error to report contract violations, and EOF-as-error disambiguates the end of a stream from a successful zero-byte read. The note is terse but the implications are deep. 
+- *Analysis of the underlying system calls.* POSIX `recv()` and Windows `WSARecv()` both enforce a binary outcome per call: data or error, never both. This is not because the {cpp} abstraction copied the OS, but because both levels face the same fundamental constraint. + +The following sections examine each of these points and their consequences. + +=== Alignment with Asio + +Asio's `AsyncReadStream` concept has enforced the same rule for over two decades: on error, `bytes_transferred` is 0. This is a deliberate design choice, not an accident. The Asio source code explicitly zeroes `bytes_transferred` on every error path, and the underlying system calls (POSIX `recv()`, Windows IOCP) enforce binary outcomes at the OS level. The `read_some` contract follows this established practice. + +=== The Empty-Buffer Rule + +Every `ReadStream` must support the following: + +[quote] +`read_some(empty_buffer)` completes immediately with `{success, 0}`. + +This is a no-op. The caller passed no buffer space, so no I/O is attempted. The operation does not inspect the stream's internal state because that would require a probe capability -- a way to ask "is there data? is the stream at EOF?" -- without actually reading. Not every source supports probing. A TCP socket does not know that its peer has closed until it calls `recv()` and gets 0 back. A pipe does not know it is broken until a read fails. The empty-buffer rule is therefore unconditional: return `{success, 0}` regardless of the stream's state. + +This rule is a natural consequence of the contract, not a proof of it. When no I/O is attempted, no state is discovered and no error is reported. + +=== Why EOF Is an Error + +Kohlhoff's documentation note gives two reasons for making EOF an error code rather than a success: + +*Composed operations need EOF-as-error to report contract violations.* The composed `read(stream, buffer(buf, 100))` promises to fill exactly 100 bytes. 
If the stream ends after 50, the operation did not fulfill its contract. Reporting `{success, 50}` would be misleading -- it suggests the operation completed normally. Reporting `{eof, 50}` tells the caller both what happened (50 bytes landed in the buffer) and why the operation stopped (the stream ended). EOF-as-error is the mechanism by which composed operations explain early termination. + +*EOF-as-error disambiguates the empty-buffer no-op from the end of a stream.* Without EOF-as-error, both `read_some(empty_buffer)` on a live stream and `read_some(non_empty_buffer)` on an exhausted stream would produce `{success, 0}`. The caller could not distinguish "I passed no buffer" from "the stream is done." Making EOF an error code separates these two cases cleanly. + +These two reasons reinforce each other. Composed operations need EOF to be an error code so they can report early termination. The empty-buffer rule needs EOF to be an error code so `{success, 0}` is unambiguously a no-op. Together with the rule that errors exclude data, `read_some` results form a clean trichotomy: success with data, or an error (including EOF) without data. + +=== The Write-Side Asymmetry + +On the write side, `WriteSink` provides `write_eof(buffers)` to atomically combine the final data with the EOF signal. A natural question follows: if the write side fuses data with EOF, why does the read side forbid it? + +The answer is that the two sides of the I/O boundary have different roles. The writer _decides_ when to signal EOF. The reader _discovers_ it. This asymmetry has three consequences: + +*`write_eof` exists for correctness, not convenience.* Protocol framings require the final data and the EOF marker to be emitted together so the peer observes a complete message. HTTP chunked encoding needs the terminal `0\r\n\r\n` coalesced with the final data chunk. A TLS session needs the close-notify alert coalesced with the final application data. 
A compressor needs `Z_FINISH` applied to the final input. These are correctness requirements, not optimizations. On the read side, whether the last bytes arrive with EOF or on a separate call does not change what the reader observes. The data and the order are identical either way. + +*`write_eof` is a separate function the caller explicitly invokes.* `write_some` never signals EOF. The writer opts into data-plus-EOF by calling a different function. The call site reads `write_eof(data)` and the intent is unambiguous. If `read_some` could return data with EOF, every call to `read_some` would _sometimes_ be a data-only operation and _sometimes_ a data-plus-EOF operation. The stream decides which mode the caller gets, at runtime. Every call site must handle both possibilities. The burden falls on every consumer in the codebase, not on a single call site that opted into the combined behavior. + +*A hypothetical `read_eof` makes no sense.* On the write side, `write_eof` exists because the producer signals the end of data. On the read side, the consumer does not tell the stream to end -- it discovers that the stream has ended. EOF flows from producer to consumer, not the reverse. There is no action the reader can take to "read the EOF." The reader discovers EOF as a side effect of attempting to read. + +=== A Clean Trichotomy + +With the current contract, every `read_some` result falls into exactly one of three mutually exclusive cases: + +- **Success**: `!ec`, `n >= 1` -- data arrived, process it. +- **EOF**: `ec == cond::eof`, `n == 0` -- stream ended, no data. +- **Error**: `ec`, `n == 0` -- failure, no data. + +Data is present if and only if the operation succeeded. This invariant -- _data implies success_ -- eliminates an entire category of reasoning from every read loop. 
The common pattern is: + +[source,cpp] +---- +auto [ec, n] = co_await stream.read_some(buf); +if(ec) + break; // EOF or error -- no data to handle +process(buf, n); // only reached on success, n >= 1 +---- + +If `read_some` could return `n > 0` with EOF, the loop becomes: + +[source,cpp] +---- +auto [ec, n] = co_await stream.read_some(buf); +if(n > 0) + process(buf, n); // must handle data even on EOF +if(ec) + break; +---- + +Every consumer pays this tax: an extra branch to handle data accompanying EOF. The branch is easy to forget. Forgetting it silently drops the final bytes of the stream -- a bug that only manifests when the source delivers EOF with its last data rather than on a separate call. A TCP socket receiving data in one packet and FIN in another will not trigger the bug. A memory source that knows its remaining length will. The non-determinism makes the bug difficult to reproduce and diagnose. + +The clean trichotomy eliminates this class of bugs entirely. + +=== Conforming Sources + +Every concrete `ReadStream` implementation naturally separates its last data delivery from its EOF signal: + +- **TCP sockets**: `read_some` maps to a single `recv()` or `WSARecv()` call, returning whatever the kernel has buffered. The kernel delivers bytes on one call and returns 0 on the next. The separation is inherent in the POSIX and Windows APIs. +- **TLS streams**: `read_some` decrypts and returns one TLS record's worth of application data. The close-notify alert arrives as a separate record. +- **HTTP content-length body**: the source delivers bytes up to the content-length limit. Once the limit is reached, the next `read_some` returns EOF. +- **HTTP chunked body**: the unchunker delivers decoded data from chunks. The terminal `0\r\n\r\n` is parsed on a separate pass that returns EOF. +- **Compression (inflate)**: the decompressor delivers output bytes. When `Z_STREAM_END` is detected, the next read returns EOF. 
+- **Memory source**: returns `min(requested, remaining)` bytes. When `remaining` reaches 0, the next call returns EOF. +- **QUIC streams**: `read_some` returns data from received QUIC frames. Stream FIN is delivered as EOF on a subsequent call. +- **Buffered read streams**: `read_some` returns data from an internal buffer, refilling from the underlying stream when empty. EOF propagates from the underlying stream. +- **Test mock streams**: `read_some` returns configurable data and error sequences for testing. + +No source is forced into an unnatural pattern. The `read_some` call that discovers EOF is the natural result of attempting to read from an exhausted stream -- not a separate probing step. Once the caller receives EOF, it stops reading. + +=== Composed Operations and Partial Results + +The composed `read` algorithm (and `ReadSource::read`) _does_ report `n > 0` on EOF, because it accumulates data across multiple internal `read_some` calls. When the underlying stream signals EOF mid-accumulation, discarding the bytes already gathered would be wrong. The caller needs `n` to know how much valid data landed in the buffer. + +The design separates concerns cleanly: the single-shot primitive (`read_some`) delivers unambiguous results with a clean trichotomy. Composed operations that accumulate state (`read`) report what they accumulated, including partial results on EOF. Callers who need partial-on-EOF semantics get them through the composed layer, while the primitive layer remains clean. + +=== Evidence from the Asio Implementation + +The Asio source code confirms this design at every level. + +On POSIX platforms, `non_blocking_recv1` in `socket_ops.ipp` calls `recv()` and branches on the result. If `recv()` returns a positive value, the bytes are reported as a successful transfer. If `recv()` returns 0 on a stream socket, EOF is reported. If `recv()` returns -1, the function explicitly sets `bytes_transferred = 0` before returning the error. 
The POSIX `recv()` system call itself enforces binary outcomes: it returns `N > 0` on success, `0` on EOF, or `-1` on error. A single call never returns both data and an error.
+
+On Windows, `complete_iocp_recv` processes the results from `GetQueuedCompletionStatus`. It maps `ERROR_NETNAME_DELETED` to `connection_reset` and `ERROR_PORT_UNREACHABLE` to `connection_refused`. Windows IOCP similarly reports zero `bytes_transferred` on failed completions. The operating system enforces the same binary outcome per I/O completion.
+
+The one edge case is POSIX signal interruption (`EINTR`). If a signal arrives after `recv()` has already copied some bytes, the kernel returns the partial byte count as success rather than `-1`/`EINTR`. If no bytes have been transferred yet, `recv()` instead fails with `-1`/`EINTR`, and Asio retries that case transparently, so the caller never observes the interruption. Even the kernel does not combine data with an error -- it chooses to report the partial data as success.
+
+=== Convergent Design with POSIX
+
+POSIX `recv()` independently enforces the same rule: `N > 0` on success, `-1` on error, `0` on EOF. The kernel never returns "here are your last 5 bytes, and also EOF." It delivers the available bytes on one call and returns 0 on the next. This is not because the {cpp} abstraction copied POSIX semantics. It is because the kernel faces the same fundamental constraint: state is discovered through the act of I/O. The alignment between `read_some` and `recv()` is convergent design, not leaky abstraction.
+
+== Summary
+
+`ReadStream` provides `read_some` as the single partial-read primitive. This is deliberately minimal:
+
+- Algorithms that need to fill a buffer completely use the `read` composed algorithm.
+- Algorithms that need delimited reads use `read_until`.
+- Algorithms that need to process data as it arrives use `read_some` directly.
+- `ReadSource` refines `ReadStream` by adding `read` for complete-read semantics.
+ +The contract that errors exclude data follows Asio's established `AsyncReadStream` contract, aligns with POSIX and Windows system call semantics, and produces a clean trichotomy that makes every read loop safe by construction. diff --git a/doc/modules/ROOT/pages/8.design/8b.ReadSource.adoc b/doc/modules/ROOT/pages/8.design/8d.ReadSource.adoc similarity index 68% rename from doc/modules/ROOT/pages/8.design/8b.ReadSource.adoc rename to doc/modules/ROOT/pages/8.design/8d.ReadSource.adoc index 81e6af27..4b8c364d 100644 --- a/doc/modules/ROOT/pages/8.design/8b.ReadSource.adoc +++ b/doc/modules/ROOT/pages/8.design/8d.ReadSource.adoc @@ -2,10 +2,7 @@ == Overview -This document describes the design of the `ReadSource` concept: a -refinement of `ReadStream` that adds a complete-read primitive. It -explains how `ReadSource` relates to `ReadStream`, why the refinement -hierarchy mirrors the write side, and the use cases each serves. +This document describes the design of the `ReadSource` concept: a refinement of `ReadStream` that adds a complete-read primitive. It explains how `ReadSource` relates to `ReadStream`, why the refinement hierarchy mirrors the write side, and the use cases each serves. == Definition @@ -23,14 +20,11 @@ concept ReadSource = }; ---- -`ReadSource` refines `ReadStream`. Every `ReadSource` is a -`ReadStream`. A `ReadSource` provides two operations: +`ReadSource` refines `ReadStream`. Every `ReadSource` is a `ReadStream`. A `ReadSource` provides two operations: === `read_some(buffers)` -- Partial Read (inherited from `ReadStream`) -Reads one or more bytes from the source into the buffer sequence. -Returns `(error_code, std::size_t)` where `n` is the number of bytes -read. May return fewer bytes than the buffer can hold. +Reads one or more bytes from the source into the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes read. May return fewer bytes than the buffer can hold. ==== Semantics @@ -39,42 +33,28 @@ read. 
May return fewer bytes than the buffer can hold. - On error: `ec`, `n == 0`. - If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`. -Once `read_some` returns an error (including EOF), the caller must -not call `read_some` again. The stream is done. Not all -implementations can reproduce a prior error on subsequent calls, so -the behavior after an error is undefined. +Once `read_some` returns an error (including EOF), the caller must not call `read_some` again. The stream is done. Not all implementations can reproduce a prior error on subsequent calls, so the behavior after an error is undefined. === `read(buffers)` -- Complete Read -Reads data into the buffer sequence. Either fills the entire buffer -or returns an error. Returns `(error_code, std::size_t)` where `n` -is the number of bytes read. +Reads data into the buffer sequence. Either fills the entire buffer or returns an error. Returns `(error_code, std::size_t)` where `n` is the number of bytes read. ==== Semantics -- On success: `!ec`, `n == buffer_size(buffers)`. The buffer is - completely filled. -- On EOF: `ec == cond::eof`, `n` is the number of bytes read before - EOF was reached (may be less than `buffer_size(buffers)`). +- On success: `!ec`, `n == buffer_size(buffers)`. The buffer is completely filled. +- On EOF: `ec == cond::eof`, `n` is the number of bytes read before EOF was reached (may be less than `buffer_size(buffers)`). - On error: `ec`, `n` is the number of bytes read before the error. - If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`. -Successful partial reads are not permitted. Either the entire buffer -is filled, or the operation returns with an error. This is the -defining property of a complete-read primitive. +Successful partial reads are not permitted. Either the entire buffer is filled, or the operation returns with an error. This is the defining property of a complete-read primitive. 
-Once `read` returns an error (including EOF), the caller must not -call `read` or `read_some` again. The source is done. Not all -implementations can reproduce a prior error on subsequent calls, so -the behavior after an error is undefined. +Once `read` returns an error (including EOF), the caller must not call `read` or `read_some` again. The source is done. Not all implementations can reproduce a prior error on subsequent calls, so the behavior after an error is undefined. -When the buffer sequence contains multiple buffers, each buffer is -filled completely before proceeding to the next. +When the buffer sequence contains multiple buffers, each buffer is filled completely before proceeding to the next. ==== Buffer Lifetime -The caller must ensure that the memory referenced by `buffers` remains -valid until the `co_await` expression returns. +The caller must ensure that the memory referenced by `buffers` remains valid until the `co_await` expression returns. ==== Conforming Signatures @@ -105,35 +85,23 @@ WriteStream { write_some } WriteSink { write_some, write, write_eof(buffers), write_eof() } ---- -Algorithms constrained on `ReadStream` accept both raw streams and -sources. Algorithms that need the complete-read guarantee constrain on -`ReadSource`. +Algorithms constrained on `ReadStream` accept both raw streams and sources. Algorithms that need the complete-read guarantee constrain on `ReadSource`. == Why ReadSource Refines ReadStream Every concrete `ReadSource` type has a natural `read_some`: -- **HTTP content-length body**: `read_some` returns - `min(available_from_network, remaining_content_length)` bytes. It - is the underlying stream's `read_some` capped by the body's limit. -- **HTTP chunked body**: `read_some` delivers whatever unchunked - data is available from the current chunk. 
-- **Decompression source** (inflate, zstd): `read_some` does one - decompression pass -- feeds available compressed input to the - decompressor and returns whatever output is produced. This is - how `zlib::inflate()` naturally works. -- **File source**: `read_some` is a single `read()` syscall. It is - the OS primitive. +- **HTTP content-length body**: `read_some` returns `min(available_from_network, remaining_content_length)` bytes. It is the underlying stream's `read_some` capped by the body's limit. +- **HTTP chunked body**: `read_some` delivers whatever unchunked data is available from the current chunk. +- **Decompression source** (inflate, zstd): `read_some` does one decompression pass -- feeds available compressed input to the decompressor and returns whatever output is produced. This is how `zlib::inflate()` naturally works. +- **File source**: `read_some` is a single `read()` syscall. It is the OS primitive. - **Memory source**: `read_some` returns `min(requested, remaining)`. -No concrete source type lacks a meaningful `read_some`. The claim that -"many sources can't meaningfully offer `read_some`" does not hold up -under scrutiny. +No concrete source type lacks a meaningful `read_some`. The claim that "many sources can't meaningfully offer `read_some`" does not hold up under scrutiny. === The Relay Argument -If `ReadSource` were disjoint from `ReadStream`, generic relay code -would need two separate implementations: +If `ReadSource` were disjoint from `ReadStream`, generic relay code would need two separate implementations: [source,cpp] ---- @@ -155,13 +123,11 @@ template task<> relay(Src& src, Dest& dest); ---- -This is the same argument that justified `WriteSink` refining -`WriteStream`. +This is the same argument that justified `WriteSink` refining `WriteStream`. 
=== The Latency Argument -With only `read` (complete read), a relay must wait for the entire -buffer to fill before forwarding any data: +With only `read` (complete read), a relay must wait for the entire buffer to fill before forwarding any data: [source,cpp] ---- @@ -179,9 +145,7 @@ auto [ec, n] = co_await src.read_some(mutable_buffer(buf, 65536)); co_await dest.write_some(const_buffer(buf, n)); ---- -For a decompressor backed by a slow network connection, `read_some` -lets you decompress and forward whatever is available instead of -blocking until the entire buffer is filled. +For a decompressor backed by a slow network connection, `read_some` lets you decompress and forward whatever is available instead of blocking until the entire buffer is filled. == Member Function Comparison @@ -214,21 +178,15 @@ auto read(ReadSource auto& source, -> io_task; ---- -Reads from the source into a dynamic buffer until EOF. The buffer -grows with a 1.5x factor when filled. On success (EOF), `ec` is clear -and `n` is total bytes read. +Reads from the source into a dynamic buffer until EOF. The buffer grows with a 1.5x factor when filled. On success (EOF), `ec` is clear and `n` is total bytes read. -This is the `ReadSource` equivalent of the `ReadStream` overload. Both -use the same `read` free function name, distinguished by concept -constraints. +This is the `ReadSource` equivalent of the `ReadStream` overload. Both use the same `read` free function name, distinguished by concept constraints. == Use Cases === Reading an HTTP Body -An HTTP body with a known content length is a `ReadSource`. The caller -reads into a buffer, and the source ensures exactly the right number -of bytes are delivered. +An HTTP body with a known content length is a `ReadSource`. The caller reads into a buffer, and the source ensures exactly the right number of bytes are delivered. 
[source,cpp] ---- @@ -249,8 +207,7 @@ task read_body(Source& body, std::size_t content_length) === Reading into a Dynamic Buffer -When the body size is unknown (e.g., chunked encoding), read until -EOF using the dynamic buffer overload. +When the body size is unknown (e.g., chunked encoding), read until EOF using the dynamic buffer overload. [source,cpp] ---- @@ -268,8 +225,7 @@ task read_chunked_body(Source& body) === Reading Fixed-Size Records from a Source -When a source produces structured records of known size, `read` -guarantees each record is completely filled. +When a source produces structured records of known size, `read` guarantees each record is completely filled. [source,cpp] ---- @@ -299,10 +255,7 @@ task<> process_records(Source& source) === Decompression with Low-Latency Relay -A decompression source wraps a `ReadStream` and produces decompressed -data. Using `read_some` (inherited from `ReadStream`), a relay can -forward decompressed data as it becomes available instead of waiting -for a full buffer. +A decompression source wraps a `ReadStream` and produces decompressed data. Using `read_some` (inherited from `ReadStream`), a relay can forward decompressed data as it becomes available instead of waiting for a full buffer. [source,cpp] ---- @@ -333,9 +286,7 @@ task<> relay_decompressed(Source& inflater, Sink& dest) === Relaying from ReadSource to WriteSink -When connecting a source to a sink, `read_some` provides low-latency -forwarding. The final chunk uses `write_eof` for atomic delivery plus -EOF signaling. +When connecting a source to a sink, `read_some` provides low-latency forwarding. The final chunk uses `write_eof` for atomic delivery plus EOF signaling. [source,cpp] ---- @@ -363,15 +314,11 @@ task<> relay(Src& src, Sink& dest) } ---- -Because `ReadSource` refines `ReadStream`, this relay accepts -`ReadSource` types (HTTP bodies, decompressors, files) as well as -raw `ReadStream` types (TCP sockets, TLS streams). 
+Because `ReadSource` refines `ReadStream`, this relay accepts `ReadSource` types (HTTP bodies, decompressors, files) as well as raw `ReadStream` types (TCP sockets, TLS streams). === Type-Erased Source -The `any_read_source` wrapper type-erases a `ReadSource` behind a -virtual interface. This is useful when the concrete source type is -not known at compile time. +The `any_read_source` wrapper type-erases a `ReadSource` behind a virtual interface. This is useful when the concrete source type is not known at compile time. [source,cpp] ---- @@ -393,51 +340,28 @@ task<> handle_request(any_read_source& body) Examples of types that satisfy `ReadSource`: -- **HTTP content-length body**: `read_some` returns available bytes - capped by remaining length. `read` fills the buffer, enforcing the - content length limit. -- **HTTP chunked body**: `read_some` delivers available unchunked - data. `read` decodes chunk framing and fills the buffer. -- **Decompression source** (inflate, zstd): `read_some` does one - decompression pass. `read` loops decompression until the buffer - is filled. -- **File source**: `read_some` is a single `read()` syscall. `read` - loops until the buffer is filled or EOF. -- **Memory source**: `read_some` returns available bytes. `read` - fills the buffer from the memory region. +- **HTTP content-length body**: `read_some` returns available bytes capped by remaining length. `read` fills the buffer, enforcing the content length limit. +- **HTTP chunked body**: `read_some` delivers available unchunked data. `read` decodes chunk framing and fills the buffer. +- **Decompression source** (inflate, zstd): `read_some` does one decompression pass. `read` loops decompression until the buffer is filled. +- **File source**: `read_some` is a single `read()` syscall. `read` loops until the buffer is filled or EOF. +- **Memory source**: `read_some` returns available bytes. `read` fills the buffer from the memory region. 
== Why `read_some` Returns No Data on EOF -The `read_some` contract (inherited from `ReadStream`) requires that -when `ec == cond::eof`, `n` is always 0. Data and EOF are delivered -in separate calls. See xref:8a.ReadStream.adoc#_design_foundations_why_errors_exclude_data[ReadStream: Why Errors Exclude Data] -for the full rationale. The key points: - -- The clean trichotomy (success/EOF/error, where data implies success) - eliminates an entire class of bugs where callers accidentally drop - the final bytes of a stream. -- Write-side atomicity (`write_eof(buffers)`) serves correctness for - protocol framing. Read-side piggybacking would be a minor - optimization with significant API cost. -- Every concrete source type naturally separates its last data - delivery from its EOF indication. +The `read_some` contract (inherited from `ReadStream`) requires that when `ec == cond::eof`, `n` is always 0. Data and EOF are delivered in separate calls. See xref:8a.ReadStream.adoc#_design_foundations_why_errors_exclude_data[ReadStream: Why Errors Exclude Data] for the full rationale. The key points: + +- The clean trichotomy (success/EOF/error, where data implies success) eliminates an entire class of bugs where callers accidentally drop the final bytes of a stream. +- Write-side atomicity (`write_eof(buffers)`) serves correctness for protocol framing. Read-side piggybacking would be a minor optimization with significant API cost. +- Every concrete source type naturally separates its last data delivery from its EOF indication. - POSIX `read()` follows the same model. -This contract carries over to `ReadSource` unchanged. The `read` -member function (complete read) _does_ allow `n > 0` on EOF, because -it is a composed loop that accumulates data across multiple internal -`read_some` calls. When the underlying stream signals EOF -mid-accumulation, discarding the bytes already gathered would be -wrong. The caller needs `n` to know how much valid data landed in the -buffer. 
+This contract carries over to `ReadSource` unchanged. The `read` member function (complete read) _does_ allow `n > 0` on EOF, because it is a composed loop that accumulates data across multiple internal `read_some` calls. When the underlying stream signals EOF mid-accumulation, discarding the bytes already gathered would be wrong. The caller needs `n` to know how much valid data landed in the buffer. == Summary -`ReadSource` refines `ReadStream` by adding `read` for complete-read -semantics. The refinement relationship enables: +`ReadSource` refines `ReadStream` by adding `read` for complete-read semantics. The refinement relationship enables: -- Generic algorithms constrained on `ReadStream` work on both raw - streams and sources. +- Generic algorithms constrained on `ReadStream` work on both raw streams and sources. - `read_some` provides low-latency forwarding in relays. - `read` provides the complete-fill guarantee for structured data. diff --git a/doc/modules/ROOT/pages/8.design/8c.BufferSource.adoc b/doc/modules/ROOT/pages/8.design/8e.BufferSource.adoc similarity index 55% rename from doc/modules/ROOT/pages/8.design/8c.BufferSource.adoc rename to doc/modules/ROOT/pages/8.design/8e.BufferSource.adoc index 10958480..a9793ea9 100644 --- a/doc/modules/ROOT/pages/8.design/8c.BufferSource.adoc +++ b/doc/modules/ROOT/pages/8.design/8e.BufferSource.adoc @@ -2,20 +2,9 @@ == Overview -This document describes the design of the `BufferSource` concept, the -rationale behind each member function, and the relationship between -`BufferSource`, `ReadSource`, and the `push_to` algorithm. -`BufferSource` models the "callee owns buffers" pattern on the read -side: the source exposes its internal storage as read-only buffers and -the caller consumes data directly from them, enabling zero-copy data -transfer. 
- -Where `ReadSource` requires the caller to supply mutable buffers for -the source to fill, `BufferSource` inverts the ownership: the source -provides read-only views into its own memory and the caller reads from -them in place. The two concepts are independent -- neither refines the -other -- but the type-erased wrapper `any_buffer_source` satisfies -both, bridging the two patterns behind a single runtime interface. +This document describes the design of the `BufferSource` concept, the rationale behind each member function, and the relationship between `BufferSource`, `ReadSource`, and the `push_to` algorithm. `BufferSource` models the "callee owns buffers" pattern on the read side: the source exposes its internal storage as read-only buffers and the caller consumes data directly from them, enabling zero-copy data transfer. + +Where `ReadSource` requires the caller to supply mutable buffers for the source to fill, `BufferSource` inverts the ownership: the source provides read-only views into its own memory and the caller reads from them in place. The two concepts are independent -- neither refines the other -- but the type-erased wrapper `any_buffer_source` satisfies both, bridging the two patterns behind a single runtime interface. == Concept Definition @@ -33,9 +22,7 @@ concept BufferSource = }; ---- -`BufferSource` is a standalone concept. It does not refine `ReadSource` -or `ReadStream`. The two concept families model different ownership -patterns and can coexist on the same concrete type. +`BufferSource` is a standalone concept. It does not refine `ReadSource` or `ReadStream`. The two concept families model different ownership patterns and can coexist on the same concrete type. == Caller vs Callee Buffer Ownership @@ -66,20 +53,13 @@ The library provides two concept families for reading data: files, decompression output buffers, kernel receive buffers). |=== -Both patterns are necessary. 
A memory-mapped file source naturally owns -the mapped region; the caller reads directly from the mapped pages -without copying. Conversely, an application that needs to fill a -fixed-size header struct naturally provides its own mutable buffer for -the source to fill. +Both patterns are necessary. A memory-mapped file source naturally owns the mapped region; the caller reads directly from the mapped pages without copying. Conversely, an application that needs to fill a fixed-size header struct naturally provides its own mutable buffer for the source to fill. == Member Functions === `pull(dest)` -- Expose Readable Buffers -Fills the provided span with const buffer descriptors pointing to the -source's internal storage. This operation is asynchronous because the -source may need to perform I/O to produce data (e.g., reading from a -socket, decompressing a block). +Fills the provided span with const buffer descriptors pointing to the source's internal storage. This operation is asynchronous because the source may need to perform I/O to produce data (e.g., reading from a socket, decompressing a block). ==== Signature @@ -92,39 +72,24 @@ Returns `(error_code, std::span)`. ==== Semantics -- **Data available**: `!ec` and `bufs.size() > 0`. The returned span - contains buffer descriptors pointing to readable data in the source's - internal storage. -- **Source exhausted**: `ec == cond::eof` and `bufs.empty()`. No more - data is available; the transfer is complete. +- **Data available**: `!ec` and `bufs.size() > 0`. The returned span contains buffer descriptors pointing to readable data in the source's internal storage. +- **Source exhausted**: `ec == cond::eof` and `bufs.empty()`. No more data is available; the transfer is complete. - **Error**: `ec` is `true` and `ec != cond::eof`. An error occurred. -Calling `pull` multiple times without an intervening `consume` returns -the same unconsumed data. 
This idempotency lets the caller inspect the -data, decide how much to process, and then advance the position with -`consume`. +Calling `pull` multiple times without an intervening `consume` returns the same unconsumed data. This idempotency lets the caller inspect the data, decide how much to process, and then advance the position with `consume`. ==== Why Asynchronous -Unlike `BufferSink::prepare`, which is synchronous, `pull` is -asynchronous. The asymmetry exists because the two operations have -fundamentally different costs: +Unlike `BufferSink::prepare`, which is synchronous, `pull` is asynchronous. The asymmetry exists because the two operations have fundamentally different costs: -- `prepare` returns pointers to _empty_ memory the sink already owns. - No data movement is involved; it is pure bookkeeping. -- `pull` may need to _produce_ data before it can return buffer - descriptors. A file source reads from disk. A decompression source - feeds compressed input to the decompressor. A network source waits - for data to arrive on a socket. These operations require I/O. +- `prepare` returns pointers to _empty_ memory the sink already owns. No data movement is involved; it is pure bookkeeping. +- `pull` may need to _produce_ data before it can return buffer descriptors. A file source reads from disk. A decompression source feeds compressed input to the decompressor. A network source waits for data to arrive on a socket. These operations require I/O. -Making `pull` synchronous would force the source to pre-buffer all data -before the caller can begin consuming it, defeating the streaming model. +Making `pull` synchronous would force the source to pre-buffer all data before the caller can begin consuming it, defeating the streaming model. ==== Why a Span Parameter -The caller provides the output span rather than the source returning a -fixed-size container. 
This lets the caller control the stack allocation -and avoids heap allocation for the buffer descriptor array: +The caller provides the output span rather than the source returning a fixed-size container. This lets the caller control the stack allocation and avoids heap allocation for the buffer descriptor array: [source,cpp] ---- @@ -132,14 +97,11 @@ const_buffer arr[16]; auto [ec, bufs] = co_await source.pull(arr); ---- -The source fills as many descriptors as it can (up to `dest.size()`) -and returns the populated subspan. +The source fills as many descriptors as it can (up to `dest.size()`) and returns the populated subspan. === `consume(n)` -- Advance the Read Position -Advances the source's internal read position by `n` bytes. The next -call to `pull` returns data starting after the consumed bytes. This -operation is synchronous. +Advances the source's internal read position by `n` bytes. The next call to `pull` returns data starting after the consumed bytes. This operation is synchronous. ==== Signature @@ -151,22 +113,16 @@ void consume(std::size_t n) noexcept; ==== Semantics - Advances the read position by `n` bytes. -- `n` must not exceed the total size of the buffers returned by the - most recent `pull`. -- After `consume`, the buffers returned by the prior `pull` are - invalidated. The caller must call `pull` again to obtain new buffer - descriptors. +- `n` must not exceed the total size of the buffers returned by the most recent `pull`. +- After `consume`, the buffers returned by the prior `pull` are invalidated. The caller must call `pull` again to obtain new buffer descriptors. ==== Why Synchronous -`consume` is synchronous because it is pure bookkeeping: advancing an -offset or releasing a reference. No I/O is involved. The asynchronous -work (producing data, performing I/O) happens in `pull`. +`consume` is synchronous because it is pure bookkeeping: advancing an offset or releasing a reference. No I/O is involved. 
The asynchronous work (producing data, performing I/O) happens in `pull`. ==== Why Separate from `pull` -Separating `consume` from `pull` gives the caller explicit control over -how much data to process before advancing: +Separating `consume` from `pull` gives the caller explicit control over how much data to process before advancing: [source,cpp] ---- @@ -181,13 +137,9 @@ if(!ec) } ---- -This is essential for partial processing. A parser may examine the -pulled data, find that it contains an incomplete message, and consume -only the complete portion. The next `pull` returns the remainder -prepended to any newly available data. +This is essential for partial processing. A parser may examine the pulled data, find that it contains an incomplete message, and consume only the complete portion. The next `pull` returns the remainder prepended to any newly available data. -If `pull` automatically consumed all returned data, the caller would -need to buffer unconsumed bytes itself, defeating the zero-copy benefit. +If `pull` automatically consumed all returned data, the caller would need to buffer unconsumed bytes itself, defeating the zero-copy benefit. == The Pull/Consume Protocol @@ -196,26 +148,17 @@ The `pull` and `consume` functions form a two-phase read protocol: 1. **Pull**: the source provides data (async, may involve I/O). 2. **Inspect**: the caller examines the returned buffers. 3. **Consume**: the caller indicates how many bytes were used (sync). -4. **Repeat**: the next `pull` returns data starting after the consumed - bytes. +4. **Repeat**: the next `pull` returns data starting after the consumed bytes. -This protocol enables several patterns that a single-call interface -cannot: +This protocol enables several patterns that a single-call interface cannot: -- **Partial consumption**: consume less than what was pulled. The - remainder is returned by the next `pull`. -- **Peek**: call `pull` to inspect data without consuming it. 
Call - `pull` again (without `consume`) to get the same data. -- **Scatter writes**: pull once, write the returned buffers to multiple - destinations (e.g., `write_some` to a socket), and consume only the - bytes that were successfully written. +- **Partial consumption**: consume less than what was pulled. The remainder is returned by the next `pull`. +- **Peek**: call `pull` to inspect data without consuming it. Call `pull` again (without `consume`) to get the same data. +- **Scatter writes**: pull once, write the returned buffers to multiple destinations (e.g., `write_some` to a socket), and consume only the bytes that were successfully written. == Relationship to `push_to` -`push_to` is a composed algorithm that transfers data from a -`BufferSource` to a `WriteSink` (or `WriteStream`). It is the -callee-owns-buffers counterpart to `pull_from`, which transfers from a -`ReadSource` (or `ReadStream`) to a `BufferSink`. +`push_to` is a composed algorithm that transfers data from a `BufferSource` to a `WriteSink` (or `WriteStream`). It is the callee-owns-buffers counterpart to `pull_from`, which transfers from a `ReadSource` (or `ReadStream`) to a `BufferSink`. [source,cpp] ---- @@ -231,11 +174,9 @@ push_to(Src& source, Stream& stream); The algorithm loops: 1. Call `source.pull(arr)` to get readable buffers. -2. Write the data to the sink via `sink.write(bufs)` or - `stream.write_some(bufs)`. +2. Write the data to the sink via `sink.write(bufs)` or `stream.write_some(bufs)`. 3. Call `source.consume(n)` to advance past the written bytes. -4. When `pull` signals EOF, call `sink.write_eof()` to finalize - the sink (WriteSink overload only). +4. When `pull` signals EOF, call `sink.write_eof()` to finalize the sink (WriteSink overload only). The two `push_to` overloads differ in how they write to the destination: @@ -253,35 +194,17 @@ The two `push_to` overloads differ in how they write to the destination: signal EOF (WriteStream has no EOF mechanism). 
|=== -`push_to` is the right tool when the data source satisfies -`BufferSource` and the destination satisfies `WriteSink` or -`WriteStream`. The source's internal buffers are passed directly to the -write call, avoiding any intermediate caller-owned buffer. +`push_to` is the right tool when the data source satisfies `BufferSource` and the destination satisfies `WriteSink` or `WriteStream`. The source's internal buffers are passed directly to the write call, avoiding any intermediate caller-owned buffer. == Relationship to `ReadSource` -`BufferSource` and `ReadSource` are independent concepts serving -different ownership models. A concrete type may satisfy one, the other, -or both. - -The type-erased wrapper `any_buffer_source` satisfies both concepts. -When the wrapped type satisfies only `BufferSource`, the `ReadSource` -operations (`read_some`, `read`) are synthesized from `pull` and -`consume` with a `buffer_copy` step: the wrapper pulls data from the -underlying source, copies it into the caller's mutable buffers, and -consumes the copied bytes. - -When the wrapped type satisfies both `BufferSource` and `ReadSource`, -the native `read_some` and `read` implementations are forwarded -directly across the type-erased boundary, avoiding the extra copy. -This dispatch is determined at compile time when the vtable is -constructed; at runtime the wrapper checks a single nullable function -pointer to select the forwarding path. - -This dual-concept bridge lets algorithms constrained on `ReadSource` -work with any `BufferSource` through `any_buffer_source`, and lets -algorithms constrained on `BufferSource` work natively with the -callee-owns-buffers pattern. +`BufferSource` and `ReadSource` are independent concepts serving different ownership models. A concrete type may satisfy one, the other, or both. + +The type-erased wrapper `any_buffer_source` satisfies both concepts. 
When the wrapped type satisfies only `BufferSource`, the `ReadSource` operations (`read_some`, `read`) are synthesized from `pull` and `consume` with a `buffer_copy` step: the wrapper pulls data from the underlying source, copies it into the caller's mutable buffers, and consumes the copied bytes. + +When the wrapped type satisfies both `BufferSource` and `ReadSource`, the native `read_some` and `read` implementations are forwarded directly across the type-erased boundary, avoiding the extra copy. This dispatch is determined at compile time when the vtable is constructed; at runtime the wrapper checks a single nullable function pointer to select the forwarding path. + +This dual-concept bridge lets algorithms constrained on `ReadSource` work with any `BufferSource` through `any_buffer_source`, and lets algorithms constrained on `BufferSource` work natively with the callee-owns-buffers pattern. === Transfer Algorithm Matrix @@ -310,9 +233,7 @@ callee-owns-buffers pattern. === Zero-Copy Transfer to a Socket -When the source's internal storage already contains the data to send, -`push_to` passes the source's buffers directly to the socket's -`write_some`, avoiding any intermediate copy. +When the source's internal storage already contains the data to send, `push_to` passes the source's buffers directly to the socket's `write_some`, avoiding any intermediate copy. [source,cpp] ---- @@ -328,9 +249,7 @@ task<> send_all(Source& source, Stream& socket) === Memory-Mapped File Source -A memory-mapped file is a natural `BufferSource`. The `pull` operation -returns buffer descriptors pointing directly into the mapped region. No -data is copied until the consumer explicitly copies it. +A memory-mapped file is a natural `BufferSource`. The `pull` operation returns buffer descriptors pointing directly into the mapped region. No data is copied until the consumer explicitly copies it. 
[source,cpp] ---- @@ -346,9 +265,7 @@ task<> serve_static_file(Source& mmap_source, Sink& response) === Partial Consumption with a Parser -A protocol parser pulls data, parses as much as it can, and consumes -only the parsed portion. The next `pull` returns the unparsed remainder -plus any newly arrived data. +A protocol parser pulls data, parses as much as it can, and consumes only the parsed portion. The next `pull` returns the unparsed remainder plus any newly arrived data. [source,cpp] ---- @@ -373,15 +290,11 @@ task parse_message(Source& source) } ---- -The parser consumes only the bytes it understood. If a message spans -two `pull` calls, the unconsumed tail from the first call is returned -at the start of the second. +The parser consumes only the bytes it understood. If a message spans two `pull` calls, the unconsumed tail from the first call is returned at the start of the second. === HTTP Request Body Source -An HTTP request body can be exposed through a `BufferSource` interface. -The concrete implementation handles transfer encoding (content-length, -chunked, compressed) behind the abstraction. +An HTTP request body can be exposed through a `BufferSource` interface. The concrete implementation handles transfer encoding (content-length, chunked, compressed) behind the abstraction. [source,cpp] ---- @@ -396,14 +309,11 @@ task<> handle_request( } ---- -The caller does not know whether the body uses content-length, chunked -encoding, or compression. The `BufferSource` interface handles the -difference. +The caller does not know whether the body uses content-length, chunked encoding, or compression. The `BufferSource` interface handles the difference. === Bridging to ReadSource via `any_buffer_source` -When a function is constrained on `ReadSource` but the concrete type -satisfies only `BufferSource`, `any_buffer_source` bridges the gap. 
+When a function is constrained on `ReadSource` but the concrete type satisfies only `BufferSource`, `any_buffer_source` bridges the gap. [source,cpp] ---- @@ -418,76 +328,45 @@ any_buffer_source abs(ring); auto data = co_await read_all(abs); ---- -The `read_some` and `read` methods pull data internally, copy it into -the caller's mutable buffers, and consume the copied bytes. This incurs -one buffer copy compared to using `pull` and `consume` directly. +The `read_some` and `read` methods pull data internally, copy it into the caller's mutable buffers, and consume the copied bytes. This incurs one buffer copy compared to using `pull` and `consume` directly. == Alternatives Considered === Single `pull` That Auto-Consumes -An earlier design had `pull` automatically consume all returned data, -eliminating the separate `consume` call. This was rejected because: +An earlier design had `pull` automatically consume all returned data, eliminating the separate `consume` call. This was rejected because: -- Partial consumption becomes impossible. A parser that finds an - incomplete message at the end of a pull would need to buffer the - remainder itself, negating the zero-copy benefit. -- Peek semantics (inspecting data without consuming it) require the - source to maintain a separate undo mechanism. -- The `WriteStream::write_some` pattern naturally consumes only `n` - bytes, so the remaining pulled data must survive for the next - `write_some` call. Without `consume`, the source would need to track - how much of its own returned data was actually used. +- Partial consumption becomes impossible. A parser that finds an incomplete message at the end of a pull would need to buffer the remainder itself, negating the zero-copy benefit. +- Peek semantics (inspecting data without consuming it) require the source to maintain a separate undo mechanism. 
+- The `WriteStream::write_some` pattern naturally consumes only `n` bytes, so the remaining pulled data must survive for the next `write_some` call. Without `consume`, the source would need to track how much of its own returned data was actually used. === `pull` Returning an Owned Container -A design where `pull` returned a `std::vector` or similar -owned container was considered. This was rejected because: +A design where `pull` returned a `std::vector` or similar owned container was considered. This was rejected because: -- Heap allocation on every pull is unacceptable for high-throughput - I/O paths. -- The span-based interface lets the caller control storage: a - stack-allocated array for the common case, or a heap-allocated array - for unusual situations. -- Returning a subspan of the caller's span is zero-overhead and - composes naturally with existing buffer algorithm interfaces. +- Heap allocation on every pull is unacceptable for high-throughput I/O paths. +- The span-based interface lets the caller control storage: a stack-allocated array for the common case, or a heap-allocated array for unusual situations. +- Returning a subspan of the caller's span is zero-overhead and composes naturally with existing buffer algorithm interfaces. === Synchronous `pull` -Making `pull` synchronous (like `BufferSink::prepare`) was considered. -This was rejected because: +Making `pull` synchronous (like `BufferSink::prepare`) was considered. This was rejected because: -- A source may need to perform I/O to produce data. A file source reads - from disk. A decompression source feeds compressed input to the - decompressor. A network source waits for data to arrive. -- Forcing synchronous `pull` would require the source to pre-buffer all - data before the caller starts consuming, breaking the streaming model - and inflating memory usage. 
-- The asymmetry with `prepare` is intentional: `prepare` returns - pointers to empty memory (no I/O needed), while `pull` returns - pointers to data that may need to be produced first. +- A source may need to perform I/O to produce data. A file source reads from disk. A decompression source feeds compressed input to the decompressor. A network source waits for data to arrive. +- Forcing synchronous `pull` would require the source to pre-buffer all data before the caller starts consuming, breaking the streaming model and inflating memory usage. +- The asymmetry with `prepare` is intentional: `prepare` returns pointers to empty memory (no I/O needed), while `pull` returns pointers to data that may need to be produced first. === `BufferSource` Refining `ReadSource` -A design where `BufferSource` refined `ReadSource` (requiring all types -to implement `read_some` and `read`) was considered. This was rejected -because: +A design where `BufferSource` refined `ReadSource` (requiring all types to implement `read_some` and `read`) was considered. This was rejected because: -- Many natural `BufferSource` types (memory-mapped files, ring buffers, - DMA receive descriptors) have no meaningful `read_some` primitive. - Their data path is pull-then-consume, not read-into-caller-buffer. -- Requiring `read_some` and `read` on every `BufferSource` would force - implementations to synthesize these operations even when they are - never called. -- The `any_buffer_source` wrapper provides the bridge when needed, - without burdening every concrete type. +- Many natural `BufferSource` types (memory-mapped files, ring buffers, DMA receive descriptors) have no meaningful `read_some` primitive. Their data path is pull-then-consume, not read-into-caller-buffer. +- Requiring `read_some` and `read` on every `BufferSource` would force implementations to synthesize these operations even when they are never called. 
+- The `any_buffer_source` wrapper provides the bridge when needed, without burdening every concrete type. === Combined Pull-and-Consume -A design with a single `read(dest) -> (error_code, span)` that both -pulled and advanced the position was considered. This is equivalent to -the auto-consume alternative above and was rejected for the same -reasons: it prevents partial consumption and peek semantics. +A design with a single `read(dest) -> (error_code, span)` that both pulled and advanced the position was considered. This is equivalent to the auto-consume alternative above and was rejected for the same reasons: it prevents partial consumption and peek semantics. == Summary @@ -506,8 +385,4 @@ reasons: it prevents partial consumption and peek semantics. | After processing or forwarding data: indicate how much was used. |=== -`BufferSource` is the callee-owns-buffers counterpart to `ReadSource`. -The `push_to` algorithm transfers data from a `BufferSource` to a -`WriteSink` or `WriteStream`, and `any_buffer_source` bridges the two -patterns by satisfying both `BufferSource` and `ReadSource` behind a -single type-erased interface. +`BufferSource` is the callee-owns-buffers counterpart to `ReadSource`. The `push_to` algorithm transfers data from a `BufferSource` to a `WriteSink` or `WriteStream`, and `any_buffer_source` bridges the two patterns by satisfying both `BufferSource` and `ReadSource` behind a single type-erased interface. 
diff --git a/doc/modules/ROOT/pages/8.design/8d.WriteStream.adoc b/doc/modules/ROOT/pages/8.design/8f.WriteStream.adoc similarity index 66% rename from doc/modules/ROOT/pages/8.design/8d.WriteStream.adoc rename to doc/modules/ROOT/pages/8.design/8f.WriteStream.adoc index 0af74d75..7e7ccaab 100644 --- a/doc/modules/ROOT/pages/8.design/8d.WriteStream.adoc +++ b/doc/modules/ROOT/pages/8.design/8f.WriteStream.adoc @@ -2,11 +2,7 @@ == Overview -This document describes the design of the `WriteStream` concept: the -fundamental partial-write primitive in the concept hierarchy. It explains -why `write_some` is the correct building block, how algorithms expressed -directly in terms of `write_some` can outperform composed complete-write -algorithms like `write_now`, and when each approach is appropriate. +This document describes the design of the `WriteStream` concept: the fundamental partial-write primitive in the concept hierarchy. It explains why `write_some` is the correct building block, how algorithms expressed directly in terms of `write_some` can outperform composed complete-write algorithms like `write_now`, and when each approach is appropriate. == Definition @@ -27,8 +23,7 @@ A `WriteStream` provides a single operation: === `write_some(buffers)` -- Partial Write -Writes one or more bytes from the buffer sequence. Returns -`(error_code, std::size_t)` where `n` is the number of bytes written. +Writes one or more bytes from the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes written. ==== Semantics @@ -36,14 +31,11 @@ Writes one or more bytes from the buffer sequence. Returns - On error: `ec`, `n == 0`. - If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`. -The caller must not assume that all bytes are consumed. `write_some` -may write fewer bytes than offered. This is the defining property of a -partial-write primitive. +The caller must not assume that all bytes are consumed. 
`write_some` may write fewer bytes than offered. This is the defining property of a partial-write primitive. ==== Buffer Lifetime -The caller must ensure that the memory referenced by `buffers` remains -valid until the `co_await` expression returns. +The caller must ensure that the memory referenced by `buffers` remains valid until the `co_await` expression returns. ==== Conforming Signatures @@ -53,9 +45,7 @@ template IoAwaitable auto write_some(Buffers buffers); ---- -Buffer sequences should be accepted by value when the member function -is a coroutine, to ensure the sequence lives in the coroutine frame -across suspension points. +Buffer sequences should be accepted by value when the member function is a coroutine, to ensure the sequence lives in the coroutine frame across suspension points. == Concept Hierarchy @@ -68,15 +58,11 @@ WriteStream { write_some } WriteSink { write_some, write, write_eof(buffers), write_eof() } ---- -Every `WriteSink` is a `WriteStream`. Algorithms constrained on -`WriteStream` accept both raw streams and sinks. The `WriteSink` -concept adds complete-write and EOF signaling on top of the partial-write -primitive. See the WriteSink design document for details. +Every `WriteSink` is a `WriteStream`. Algorithms constrained on `WriteStream` accept both raw streams and sinks. The `WriteSink` concept adds complete-write and EOF signaling on top of the partial-write primitive. See the WriteSink design document for details. == Composed Algorithms -Two composed algorithms build complete-write behavior on top of -`write_some`: +Two composed algorithms build complete-write behavior on top of `write_some`: === `write` (free function) @@ -87,8 +73,7 @@ auto write(WriteStream auto& stream, -> io_task; ---- -Loops `write_some` until the entire buffer sequence is consumed. Always -suspends (returns `task`). No frame caching. +Loops `write_some` until the entire buffer sequence is consumed. Always suspends (returns `task`). No frame caching. 
=== `write_now` (class template) @@ -104,33 +89,20 @@ public: }; ---- -Loops `write_some` until the entire buffer sequence is consumed, with -two advantages over the free function: +Loops `write_some` until the entire buffer sequence is consumed, with two advantages over the free function: -1. **Eager completion**: if every `write_some` returns synchronously - (its `await_ready` returns `true`), the entire operation completes - in `await_ready` with zero coroutine suspensions. -2. **Frame caching**: the internal coroutine frame is allocated once and - reused across calls. +1. **Eager completion**: if every `write_some` returns synchronously (its `await_ready` returns `true`), the entire operation completes in `await_ready` with zero coroutine suspensions. +2. **Frame caching**: the internal coroutine frame is allocated once and reused across calls. == Buffer Top-Up: Why `write_some` Can Outperform `write_now` -The critical design insight behind `write_some` as a primitive is that -the caller retains control after each partial write. This enables a -pattern called _buffer top-up_: after a partial write consumes some data, -the caller refills the buffer before the next write, keeping the buffer -as full as possible. This maximizes the payload of each system call. +The critical design insight behind `write_some` as a primitive is that the caller retains control after each partial write. This enables a pattern called _buffer top-up_: after a partial write consumes some data, the caller refills the buffer before the next write, keeping the buffer as full as possible. This maximizes the payload of each system call. -A composed algorithm like `write_now` cannot do this. It receives a fixed -buffer sequence and drains it to completion. When the kernel accepts only -part of the data, `write_now` must send the remainder in a second call -- -even though the remainder may be small. The caller has no opportunity to -read more data from the source between iterations. 
+A composed algorithm like `write_now` cannot do this. It receives a fixed buffer sequence and drains it to completion. When the kernel accepts only part of the data, `write_now` must send the remainder in a second call -- even though the remainder may be small. The caller has no opportunity to read more data from the source between iterations. === Diagram: Relaying 100KB from a ReadSource through a TCP Socket -Consider relaying 100KB from a `ReadSource` to a TCP socket. The kernel's -send buffer accepts at most 40KB per call. Compare two approaches: +Consider relaying 100KB from a `ReadSource` to a TCP socket. The kernel's send buffer accepts at most 40KB per call. Compare two approaches: ==== Approach A: `write_some` with Top-Up (3 syscalls) @@ -153,16 +125,11 @@ Step 4: [== 16KB ==] write_some --> 16KB (write_now, small payload) done. 100KB in 4 syscalls, two calls undersized. ---- -Every time `write_now` partially drains a buffer, the remainder is a -small payload that wastes a syscall. With top-up, the caller refills -the ring buffer between calls, keeping each syscall near capacity. +Every time `write_now` partially drains a buffer, the remainder is a small payload that wastes a syscall. With top-up, the caller refills the ring buffer between calls, keeping each syscall near capacity. === Code: `write_some` with Buffer Top-Up -This example reads from a `ReadSource` and writes to a `WriteStream` -using a `circular_dynamic_buffer`. After each partial write frees space -in the ring buffer, the caller reads more data from the source to refill -it before calling `write_some` again. +This example reads from a `ReadSource` and writes to a `WriteStream` using a `circular_dynamic_buffer`. After each partial write frees space in the ring buffer, the caller reads more data from the source to refill it before calling `write_some` again. 
[source,cpp] ---- @@ -212,15 +179,11 @@ task<> relay_with_topup(Source& src, Stream& dest) } ---- -After `write_some` accepts 40KB of a 64KB buffer, `consume(40KB)` frees -40KB. The caller immediately reads more data from the source into that -freed space. The next `write_some` again presents a full 64KB payload. +After `write_some` accepts 40KB of a 64KB buffer, `consume(40KB)` frees 40KB. The caller immediately reads more data from the source into that freed space. The next `write_some` again presents a full 64KB payload. === Code: `write_now` Without Top-Up -This example reads from a `ReadSource` and writes to a `WriteStream` -using `write_now`. Each chunk is drained to completion before the caller -can read more from the source. +This example reads from a `ReadSource` and writes to a `WriteStream` using `write_now`. Each chunk is drained to completion before the caller can read more from the source. [source,cpp] ---- @@ -255,10 +218,7 @@ task<> relay_with_write_now(Source& src, Stream& dest) } ---- -After the kernel accepts 40KB of a 64KB chunk, `write_now` must send -the remaining 24KB in a second `write_some`. The caller cannot intervene -to refill the buffer because `write_now` owns the loop. That 24KB write -wastes an opportunity to send a full 64KB payload. +After the kernel accepts 40KB of a 64KB chunk, `write_now` must send the remaining 24KB in a second `write_some`. The caller cannot intervene to refill the buffer because `write_now` owns the loop. That 24KB write wastes an opportunity to send a full 64KB payload. == When to Use Each Approach @@ -287,34 +247,25 @@ wastes an opportunity to send a full 64KB payload. === Rule of Thumb -- If the caller reads from a source and relays to a raw byte stream - (TCP socket), use `write_some` with a `circular_dynamic_buffer` - for buffer top-up. -- If the caller has a discrete, bounded payload and wants zero-fuss - complete-write semantics, use `write_now`. 
+- If the caller reads from a source and relays to a raw byte stream (TCP socket), use `write_some` with a `circular_dynamic_buffer` for buffer top-up. +- If the caller has a discrete, bounded payload and wants zero-fuss complete-write semantics, use `write_now`. - If the destination is a `WriteSink`, use `write` directly. == Conforming Types Examples of types that satisfy `WriteStream`: -- **TCP sockets**: `write_some` maps to a single `send()` or - `WSASend()` call. Partial writes are normal under load. +- **TCP sockets**: `write_some` maps to a single `send()` or `WSASend()` call. Partial writes are normal under load. - **TLS streams**: `write_some` encrypts and sends one TLS record. -- **Buffered write streams**: `write_some` appends to an internal - buffer and returns immediately when space is available, or drains - to the underlying stream when full. +- **Buffered write streams**: `write_some` appends to an internal buffer and returns immediately when space is available, or drains to the underlying stream when full. - **QUIC streams**: `write_some` sends one or more QUIC frames. -- **Test mock streams**: `write_some` records data and returns - configurable results for testing. +- **Test mock streams**: `write_some` records data and returns configurable results for testing. -All of these types also naturally extend to `WriteSink` by adding -`write`, `write_eof(buffers)`, and `write_eof()`. +All of these types also naturally extend to `WriteSink` by adding `write`, `write_eof(buffers)`, and `write_eof()`. == Relationship to `ReadStream` -The read-side counterpart is `ReadStream`, which requires `read_some`. -The same partial-transfer / composed-algorithm decomposition applies: +The read-side counterpart is `ReadStream`, which requires `read_some`. 
The same partial-transfer / composed-algorithm decomposition applies: [cols="1,1"] |=== @@ -330,26 +281,14 @@ The same partial-transfer / composed-algorithm decomposition applies: | `ReadSource::read` |=== -The asymmetry is that the read side does not have a `read_now` with -eager completion, because reads depend on data arriving from the -network -- the synchronous fast path is less reliably useful than -for writes into a buffered stream. +The asymmetry is that the read side does not have a `read_now` with eager completion, because reads depend on data arriving from the network -- the synchronous fast path is less reliably useful than for writes into a buffered stream. == Summary -`WriteStream` provides `write_some` as the single partial-write -primitive. This is deliberately minimal: - -- Algorithms that need complete-write semantics use `write_now` (for - `WriteStream`) or `write` (for `WriteSink`). -- Algorithms that need maximum throughput use `write_some` directly - with buffer top-up, achieving fewer syscalls than composed algorithms - by keeping the buffer full between iterations. -- The concept is the base of the hierarchy. `WriteSink` refines it by - adding `write`, `write_eof(buffers)`, and `write_eof()`. - -The choice between `write_some`, `write_now`, and `WriteSink::write` -is a throughput-versus-convenience trade-off. `write_some` gives the -caller maximum control. `write_now` gives the caller maximum simplicity. -`WriteSink::write` gives the concrete type maximum implementation -freedom. +`WriteStream` provides `write_some` as the single partial-write primitive. This is deliberately minimal: + +- Algorithms that need complete-write semantics use `write_now` (for `WriteStream`) or `write` (for `WriteSink`). +- Algorithms that need maximum throughput use `write_some` directly with buffer top-up, achieving fewer syscalls than composed algorithms by keeping the buffer full between iterations. +- The concept is the base of the hierarchy. 
`WriteSink` refines it by adding `write`, `write_eof(buffers)`, and `write_eof()`. + +The choice between `write_some`, `write_now`, and `WriteSink::write` is a throughput-versus-convenience trade-off. `write_some` gives the caller maximum control. `write_now` gives the caller maximum simplicity. `WriteSink::write` gives the concrete type maximum implementation freedom. diff --git a/doc/modules/ROOT/pages/8.design/8e.WriteSink.adoc b/doc/modules/ROOT/pages/8.design/8g.WriteSink.adoc similarity index 61% rename from doc/modules/ROOT/pages/8.design/8e.WriteSink.adoc rename to doc/modules/ROOT/pages/8.design/8g.WriteSink.adoc index fa85c74c..cc2b55f3 100644 --- a/doc/modules/ROOT/pages/8.design/8e.WriteSink.adoc +++ b/doc/modules/ROOT/pages/8.design/8g.WriteSink.adoc @@ -2,11 +2,7 @@ == Overview -This document describes the design of the `WriteSink` concept, the rationale -behind each member function, and the relationship between `WriteSink`, -`WriteStream`, and the `write_now` algorithm. The design was arrived at -through deliberation over several alternative approaches, each of which -is discussed here with its trade-offs. +This document describes the design of the `WriteSink` concept, the rationale behind each member function, and the relationship between `WriteSink`, `WriteStream`, and the `write_now` algorithm. The design was arrived at through deliberation over several alternative approaches, each of which is discussed here with its trade-offs. == Concept Hierarchy @@ -46,20 +42,15 @@ concept WriteSink = }; ---- -`WriteSink` refines `WriteStream`. Every `WriteSink` is a `WriteStream`. -Algorithms constrained on `WriteStream` accept both raw streams and sinks. +`WriteSink` refines `WriteStream`. Every `WriteSink` is a `WriteStream`. Algorithms constrained on `WriteStream` accept both raw streams and sinks. == Member Functions === `write_some(buffers)` -- Partial Write -Writes one or more bytes from the buffer sequence. May consume less than -the full sequence. 
Returns `(error_code, std::size_t)` where `n` is the -number of bytes written. +Writes one or more bytes from the buffer sequence. May consume less than the full sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes written. -This is the low-level primitive inherited from `WriteStream`. It is -appropriate when the caller manages its own consumption loop or when -forwarding data incrementally without needing a complete-write guarantee. +This is the low-level primitive inherited from `WriteStream`. It is appropriate when the caller manages its own consumption loop or when forwarding data incrementally without needing a complete-write guarantee. ==== Semantics @@ -69,81 +60,56 @@ forwarding data incrementally without needing a complete-write guarantee. ==== When to Use -- Relay interiors: forwarding chunks of data as they arrive without waiting - for the entire payload to be consumed. -- Backpressure-aware pipelines: writing what the destination can accept - and returning control to the caller. +- Relay interiors: forwarding chunks of data as they arrive without waiting for the entire payload to be consumed. +- Backpressure-aware pipelines: writing what the destination can accept and returning control to the caller. - Implementing `write` or `write_now` on top of the primitive. === `write(buffers)` -- Complete Write -Writes the entire buffer sequence. All bytes are consumed before the -operation completes. Returns `(error_code, std::size_t)` where `n` is -the number of bytes written. +Writes the entire buffer sequence. All bytes are consumed before the operation completes. Returns `(error_code, std::size_t)` where `n` is the number of bytes written. ==== Semantics - On success: `!ec`, `n == buffer_size(buffers)`. -- On error: `ec`, `n` is the number of bytes written before - the error occurred. +- On error: `ec`, `n` is the number of bytes written before the error occurred. - If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`. 
==== When to Use - Writing complete protocol messages or frames. -- Serializing structured data where each fragment must be fully delivered - before producing the next. +- Serializing structured data where each fragment must be fully delivered before producing the next. - Any context where partial delivery is not meaningful. ==== Why `write` Belongs in the Concept -For many concrete types, `write` is the natural primitive, not a loop -over `write_some`: +For many concrete types, `write` is the natural primitive, not a loop over `write_some`: -- **File sinks**: the OS `write` call is the primitive. `write_some` - would simply delegate to `write`. -- **Buffered writers**: `write` is a `memcpy` into the circular buffer - (or drain-then-copy). It is not a loop over `write_some`. -- **Compression sinks** (deflate, zstd): `write` feeds data to the - compressor and flushes the output. The internal operation is a single - compression call, not iterated partial writes. +- **File sinks**: the OS `write` call is the primitive. `write_some` would simply delegate to `write`. +- **Buffered writers**: `write` is a `memcpy` into the circular buffer (or drain-then-copy). It is not a loop over `write_some`. +- **Compression sinks** (deflate, zstd): `write` feeds data to the compressor and flushes the output. The internal operation is a single compression call, not iterated partial writes. -Requiring `write` in the concept lets each type implement the operation -in the way that is natural and efficient for that type. +Requiring `write` in the concept lets each type implement the operation in the way that is natural and efficient for that type. === `write_eof(buffers)` -- Atomic Final Write -Writes the entire buffer sequence and then signals end-of-stream, as a -single atomic operation. Returns `(error_code, std::size_t)` where `n` -is the number of bytes written. +Writes the entire buffer sequence and then signals end-of-stream, as a single atomic operation. 
Returns `(error_code, std::size_t)` where `n` is the number of bytes written. After a successful call, no further writes or EOF signals are permitted. ==== Semantics -- On success: `!ec`, `n == buffer_size(buffers)`. The sink - is finalized. -- On error: `ec`, `n` is bytes written before the error. The - sink state is unspecified. +- On success: `!ec`, `n == buffer_size(buffers)`. The sink is finalized. +- On error: `ec`, `n` is bytes written before the error. The sink state is unspecified. ==== Why Atomicity Matters -Combining the final write with the EOF signal in a single operation -enables optimizations that two separate calls cannot: +Combining the final write with the EOF signal in a single operation enables optimizations that two separate calls cannot: -- **HTTP chunked encoding**: `write_eof(data)` can emit the data chunk - followed by the terminal `0\r\n\r\n` in a single system call. Calling - `write(data)` then `write_eof()` separately forces two calls and may - result in two TCP segments. -- **Compression (deflate)**: `write_eof(data)` can pass `Z_FINISH` to - the final `deflate()` call, producing the compressed data and the - stream trailer together. Separate `write` + `write_eof` would require - an extra flush. -- **TLS close-notify**: `write_eof(data)` can coalesce the final - application data with the TLS close-notify alert. +- **HTTP chunked encoding**: `write_eof(data)` can emit the data chunk followed by the terminal `0\r\n\r\n` in a single system call. Calling `write(data)` then `write_eof()` separately forces two calls and may result in two TCP segments. +- **Compression (deflate)**: `write_eof(data)` can pass `Z_FINISH` to the final `deflate()` call, producing the compressed data and the stream trailer together. Separate `write` + `write_eof` would require an extra flush. +- **TLS close-notify**: `write_eof(data)` can coalesce the final application data with the TLS close-notify alert. 
-This optimization cannot be achieved by splitting the operation into -`write(data)` followed by `write_eof()`. +This optimization cannot be achieved by splitting the operation into `write(data)` followed by `write_eof()`. === `write_eof()` -- Bare EOF Signal @@ -158,36 +124,22 @@ After a successful call, no further writes or EOF signals are permitted. ==== When to Use -When the final data has already been written via `write` or `write_some` -and only the EOF signal remains. This is less common than `write_eof(buffers)` -but necessary when the data and EOF are produced at different times. +When the final data has already been written via `write` or `write_some` and only the EOF signal remains. This is less common than `write_eof(buffers)` but necessary when the data and EOF are produced at different times. == Relationship to `write_now` -`write_now` is a composed algorithm that operates on any `WriteStream`. -It loops `write_some` until the entire buffer sequence is consumed. It -has two properties that a plain `write_some` loop does not: +`write_now` is a composed algorithm that operates on any `WriteStream`. It loops `write_some` until the entire buffer sequence is consumed. It has two properties that a plain `write_some` loop does not: -1. **Eager completion**: if every `write_some` call completes - synchronously (returns `true` from `await_ready`), the entire - `write_now` operation completes in `await_ready` with zero coroutine - suspensions. -2. **Frame caching**: the internal coroutine frame is cached and reused - across calls, eliminating repeated allocation. +1. **Eager completion**: if every `write_some` call completes synchronously (returns `true` from `await_ready`), the entire `write_now` operation completes in `await_ready` with zero coroutine suspensions. +2. **Frame caching**: the internal coroutine frame is cached and reused across calls, eliminating repeated allocation. 
-`write_now` is the right tool for code constrained on `WriteStream` -alone (for example, writing to a raw TCP socket). Code constrained on -`WriteSink` should use `write` directly, because the concrete type's -`write` may be more efficient than looping `write_some`, and because -`write_now` cannot replicate the atomic `write_eof(buffers)` operation. +`write_now` is the right tool for code constrained on `WriteStream` alone (for example, writing to a raw TCP socket). Code constrained on `WriteSink` should use `write` directly, because the concrete type's `write` may be more efficient than looping `write_some`, and because `write_now` cannot replicate the atomic `write_eof(buffers)` operation. == Use Cases === Serializing Structured Data -When producing output fragment by fragment (e.g., JSON serialization), -each fragment must be fully consumed before the next is produced. The -final fragment signals EOF. +When producing output fragment by fragment (e.g., JSON serialization), each fragment must be fully consumed before the next is produced. The final fragment signals EOF. [source,cpp] ---- @@ -209,14 +161,11 @@ task<> serialize_json(Sink& sink, json::value const& jv) } ---- -Here `write` guarantees each fragment is fully delivered, and -`write_eof` atomically writes the closing brace and finalizes the sink. +Here `write` guarantees each fragment is fully delivered, and `write_eof` atomically writes the closing brace and finalizes the sink. === Relaying a Streaming Body -When forwarding data from a source to a sink, the interior chunks use -`write_some` for incremental progress. The final chunk uses `write_eof` -for atomic delivery plus EOF. +When forwarding data from a source to a sink, the interior chunks use `write_some` for incremental progress. The final chunk uses `write_eof` for atomic delivery plus EOF. 
[source,cpp] ---- @@ -251,15 +200,11 @@ task<> relay(Source& src, Sink& dest) } ---- -The interior loop uses `write_some` because the relay does not need -complete-write guarantees for intermediate data. When `read_some` -returns EOF, `n` is 0 (per the `ReadStream` contract), so the relay -signals EOF via `write_eof()` with no data. +The interior loop uses `write_some` because the relay does not need complete-write guarantees for intermediate data. When `read_some` returns EOF, `n` is 0 (per the `ReadStream` contract), so the relay signals EOF via `write_eof()` with no data. === Writing Complete Messages -When sending discrete messages where each must be fully delivered, `write` -is the natural choice. +When sending discrete messages where each must be fully delivered, `write` is the natural choice. [source,cpp] ---- @@ -280,9 +225,7 @@ task<> send_messages(Sink& sink, std::span messages) === HTTP Response Body -An HTTP response handler writes the body through a type-erased sink. -The concrete implementation handles transfer encoding (content-length, -chunked, compressed) behind the `WriteSink` interface. +An HTTP response handler writes the body through a type-erased sink. The concrete implementation handles transfer encoding (content-length, chunked, compressed) behind the `WriteSink` interface. [source,cpp] ---- @@ -302,13 +245,11 @@ task<> send_response(any_write_sink& body, response const& resp) } ---- -The caller does not know whether the body is content-length, chunked, -or compressed. The `WriteSink` interface handles the difference. +The caller does not know whether the body is content-length, chunked, or compressed. The `WriteSink` interface handles the difference. === Compression Pipeline -A deflate sink wraps an underlying `WriteStream` and compresses data -on the fly. `write_eof` sets `Z_FINISH` on the final deflate call. +A deflate sink wraps an underlying `WriteStream` and compresses data on the fly. 
`write_eof` sets `Z_FINISH` on the final deflate call. [source,cpp] ---- @@ -326,10 +267,7 @@ task<> compress_and_send(Sink& sink, std::string_view data) === Buffered Writer -A buffered writer interposes a buffer between the caller and the -underlying stream. `write_some` appends to the buffer without draining. -`write` ensures all data is buffered (draining if necessary). `write_eof` -flushes the buffer and signals EOF to the underlying stream. +A buffered writer interposes a buffer between the caller and the underlying stream. `write_some` appends to the buffer without draining. `write` ensures all data is buffered (draining if necessary). `write_eof` flushes the buffer and signals EOF to the underlying stream. [source,cpp] ---- @@ -355,9 +293,7 @@ task<> buffered_output(Sink& sink) === Raw Stream with `write_now` -When only a `WriteStream` is available (no EOF signaling needed), the -`write_now` algorithm provides complete-write behavior with eager -completion and frame caching. +When only a `WriteStream` is available (no EOF signaling needed), the `write_now` algorithm provides complete-write behavior with eager completion and frame caching. [source,cpp] ---- @@ -377,65 +313,37 @@ task<> send_data(Stream& stream) } ---- -Because `WriteSink` refines `WriteStream`, `write_now` also works on -sinks. This can be useful when a function is generic over `WriteStream` -and does not need EOF signaling. +Because `WriteSink` refines `WriteStream`, `write_now` also works on sinks. This can be useful when a function is generic over `WriteStream` and does not need EOF signaling. == Alternatives Considered === WriteSink with Only `write` and `write_eof` -The initial design had `WriteSink` require only `write(buffers)`, -`write(buffers, bool eof)`, and `write_eof()`, with no `write_some`. -This made `WriteSink` disjoint from `WriteStream`: a function -constrained on `WriteStream` (using `write_some`) could not accept a -`WriteSink`, and vice versa. 
+The initial design had `WriteSink` require only `write(buffers)`, `write(buffers, bool eof)`, and `write_eof()`, with no `write_some`. This made `WriteSink` disjoint from `WriteStream`: a function constrained on `WriteStream` (using `write_some`) could not accept a `WriteSink`, and vice versa. -This was rejected because it prevents generic algorithms from working -across both streams and sinks. The refinement relationship -(`WriteSink` refines `WriteStream`) is strictly more useful. +This was rejected because it prevents generic algorithms from working across both streams and sinks. The refinement relationship (`WriteSink` refines `WriteStream`) is strictly more useful. === WriteSink with Only `write_some` and `write_eof` -A minimal design was considered where `WriteSink` required only -`write_some` and `write_eof`, with callers using `write_now` for -complete-write behavior. This approach has three problems: +A minimal design was considered where `WriteSink` required only `write_some` and `write_eof`, with callers using `write_now` for complete-write behavior. This approach has three problems: -1. **No atomic final write**: `write_now` over `write_some` followed by - `write_eof()` is two operations. This prevents concrete types from - coalescing the final data with the EOF signal (chunked encoding, - compression trailers, TLS close-notify). +1. **No atomic final write**: `write_now` over `write_some` followed by `write_eof()` is two operations. This prevents concrete types from coalescing the final data with the EOF signal (chunked encoding, compression trailers, TLS close-notify). -2. **`write` is the natural primitive for many types**: files, buffered - writers, and compression sinks implement `write` directly, not as a - loop over `write_some`. Forcing these types to express complete-write - semantics through a function called `write_some` is semantically - misleading. +2. 
**`write` is the natural primitive for many types**: files, buffered writers, and compression sinks implement `write` directly, not as a loop over `write_some`. Forcing these types to express complete-write semantics through a function called `write_some` is semantically misleading. -3. **Implementation burden on callers**: every caller that needs - complete-write behavior must construct a `write_now` object and - manage it, rather than calling `sink.write(buffers)` directly. +3. **Implementation burden on callers**: every caller that needs complete-write behavior must construct a `write_now` object and manage it, rather than calling `sink.write(buffers)` directly. === `write(buffers, bool eof)` Instead of `write_eof(buffers)` -An earlier version used `write(buffers, bool eof)` to combine data -writing with optional EOF signaling. This was replaced by -`write_eof(buffers)` because: +An earlier version used `write(buffers, bool eof)` to combine data writing with optional EOF signaling. This was replaced by `write_eof(buffers)` because: -- Boolean parameters are opaque at the call site. `write(data, true)` - does not convey intent as clearly as `write_eof(data)`. +- Boolean parameters are opaque at the call site. `write(data, true)` does not convey intent as clearly as `write_eof(data)`. - `write_eof` is self-documenting: the name states that EOF is signaled. - No risk of accidentally passing the wrong boolean value. === Three-Concept Hierarchy (`WriteStream` / `WriteCloser` / `WriteSink`) -A three-level hierarchy was considered, with an intermediate concept -(`WriteCloser` or similar) requiring `write_some` + `write_eof` but -not `write`. This was rejected because the intermediate concept serves -no practical purpose: any concrete type that has `write_some` and -`write_eof` can and should provide `write`. There is no use case where -a type offers partial writes and EOF signaling but cannot offer complete -writes. 
+A three-level hierarchy was considered, with an intermediate concept (`WriteCloser` or similar) requiring `write_some` + `write_eof` but not `write`. This was rejected because the intermediate concept serves no practical purpose: any concrete type that has `write_some` and `write_eof` can and should provide `write`. There is no use case where a type offers partial writes and EOF signaling but cannot offer complete writes. == Summary @@ -460,7 +368,4 @@ writes. | When the final data was already written separately. |=== -`WriteSink` refines `WriteStream`. The `write_now` algorithm operates on -any `WriteStream` and provides complete-write behavior with eager -completion and frame caching, but it cannot replicate the atomic -`write_eof(buffers)` that `WriteSink` enables. +`WriteSink` refines `WriteStream`. The `write_now` algorithm operates on any `WriteStream` and provides complete-write behavior with eager completion and frame caching, but it cannot replicate the atomic `write_eof(buffers)` that `WriteSink` enables. diff --git a/doc/modules/ROOT/pages/8.design/8f.BufferSink.adoc b/doc/modules/ROOT/pages/8.design/8h.BufferSink.adoc similarity index 63% rename from doc/modules/ROOT/pages/8.design/8f.BufferSink.adoc rename to doc/modules/ROOT/pages/8.design/8h.BufferSink.adoc index 740be7f2..f2389603 100644 --- a/doc/modules/ROOT/pages/8.design/8f.BufferSink.adoc +++ b/doc/modules/ROOT/pages/8.design/8h.BufferSink.adoc @@ -2,18 +2,9 @@ == Overview -This document describes the design of the `BufferSink` concept, the rationale -behind each member function, and the relationship between `BufferSink`, -`WriteSink`, and the `pull_from` algorithm. `BufferSink` models the -"callee owns buffers" pattern: the sink provides writable memory and the -caller writes directly into it, enabling zero-copy data transfer. 
- -Where `WriteSink` requires the caller to supply buffer sequences containing -the data to be written, `BufferSink` inverts the ownership: the sink -exposes its internal storage and the caller fills it in place. The two -concepts are independent -- neither refines the other -- but the -type-erased wrapper `any_buffer_sink` satisfies both, bridging the two -patterns behind a single runtime interface. +This document describes the design of the `BufferSink` concept, the rationale behind each member function, and the relationship between `BufferSink`, `WriteSink`, and the `pull_from` algorithm. `BufferSink` models the "callee owns buffers" pattern: the sink provides writable memory and the caller writes directly into it, enabling zero-copy data transfer. + +Where `WriteSink` requires the caller to supply buffer sequences containing the data to be written, `BufferSink` inverts the ownership: the sink exposes its internal storage and the caller fills it in place. The two concepts are independent -- neither refines the other -- but the type-erased wrapper `any_buffer_sink` satisfies both, bridging the two patterns behind a single runtime interface. == Concept Definition @@ -40,9 +31,7 @@ concept BufferSink = }; ---- -`BufferSink` is a standalone concept. It does not refine `WriteSink` or -`WriteStream`. The two concept families model different ownership -patterns and can coexist on the same concrete type. +`BufferSink` is a standalone concept. It does not refine `WriteSink` or `WriteStream`. The two concept families model different ownership patterns and can coexist on the same concrete type. == Caller vs Callee Buffer Ownership @@ -72,19 +61,13 @@ The library provides two concept families for writing data: hardware DMA descriptors). |=== -Both patterns are necessary. A compression sink, for example, naturally -owns the output buffer where compressed data lands; the caller feeds -uncompressed data and the compressor writes results directly into the -ring buffer. 
Conversely, an HTTP serializer naturally produces header -bytes into its own scratch space and then hands the buffer sequence to a -`WriteSink`. +Both patterns are necessary. A compression sink, for example, naturally owns the output buffer where compressed data lands; the caller feeds uncompressed data and the compressor writes results directly into the ring buffer. Conversely, an HTTP serializer naturally produces header bytes into its own scratch space and then hands the buffer sequence to a `WriteSink`. == Member Functions === `prepare(dest)` -- Expose Writable Buffers -Fills the provided span with mutable buffer descriptors pointing to the -sink's internal storage. This operation is synchronous. +Fills the provided span with mutable buffer descriptors pointing to the sink's internal storage. This operation is synchronous. ==== Signature @@ -95,33 +78,19 @@ std::span prepare(std::span dest); ==== Semantics -- Returns a (possibly empty) subspan of `dest` populated with buffer - descriptors. Each descriptor points to a writable region of the sink's - internal storage. -- If the returned span is empty, the sink has no available space. The - caller should call `commit` (possibly with `n == 0`) to flush - buffered data and then retry `prepare`. -- The returned buffers remain valid until the next call to `prepare`, - `commit`, `commit_eof`, or until the sink is destroyed. +- Returns a (possibly empty) subspan of `dest` populated with buffer descriptors. Each descriptor points to a writable region of the sink's internal storage. +- If the returned span is empty, the sink has no available space. The caller should call `commit` (possibly with `n == 0`) to flush buffered data and then retry `prepare`. +- The returned buffers remain valid until the next call to `prepare`, `commit`, `commit_eof`, or until the sink is destroyed. ==== Why Synchronous -`prepare` is synchronous because it is a bookkeeping operation: the sink -returns pointers into memory it already owns. 
No I/O or blocking is -involved. Making `prepare` asynchronous would force a coroutine -suspension on every iteration of the write loop, adding overhead with no -benefit. +`prepare` is synchronous because it is a bookkeeping operation: the sink returns pointers into memory it already owns. No I/O or blocking is involved. Making `prepare` asynchronous would force a coroutine suspension on every iteration of the write loop, adding overhead with no benefit. -When the sink has no available space, the correct response is to -`commit` the pending data (which _is_ asynchronous, as it may trigger -I/O), then call `prepare` again. This keeps the synchronous fast path -free of unnecessary suspensions. +When the sink has no available space, the correct response is to `commit` the pending data (which _is_ asynchronous, as it may trigger I/O), then call `prepare` again. This keeps the synchronous fast path free of unnecessary suspensions. ==== Why a Span Parameter -The caller provides the output span rather than the sink returning a -fixed-size container. This lets the caller control the stack allocation -and avoids heap allocation for the buffer descriptor array: +The caller provides the output span rather than the sink returning a fixed-size container. This lets the caller control the stack allocation and avoids heap allocation for the buffer descriptor array: [source,cpp] ---- @@ -129,37 +98,29 @@ mutable_buffer arr[16]; auto bufs = sink.prepare(arr); ---- -The sink fills as many descriptors as it can (up to `dest.size()`) and -returns the populated subspan. +The sink fills as many descriptors as it can (up to `dest.size()`) and returns the populated subspan. === `commit(n)` -- Finalize Written Data -Commits `n` bytes that the caller wrote into the buffers returned by -the most recent `prepare`. Returns `(error_code)`. +Commits `n` bytes that the caller wrote into the buffers returned by the most recent `prepare`. Returns `(error_code)`. ==== Semantics - On success: `!ec`. 
- On error: `ec`. - May trigger underlying I/O (flush to socket, compression pass, etc.). -- After `commit`, the buffers returned by the prior `prepare` are - invalidated. The caller must call `prepare` again before writing - more data. +- After `commit`, the buffers returned by the prior `prepare` are invalidated. The caller must call `prepare` again before writing more data. ==== When to Use -- After writing data into prepared buffers and needing to continue - the transfer. -- To flush when `prepare` returns an empty span (call `commit(0)` - to drain the sink's internal buffer and free space). +- After writing data into prepared buffers and needing to continue the transfer. +- To flush when `prepare` returns an empty span (call `commit(0)` to drain the sink's internal buffer and free space). === `commit_eof(n)` -- Commit Final Data and Signal EOF -Commits `n` bytes written to the most recent `prepare` buffers and -signals end-of-stream. Returns `(error_code)`. +Commits `n` bytes written to the most recent `prepare` buffers and signals end-of-stream. Returns `(error_code)`. -After a successful call, no further `prepare`, `commit`, or -`commit_eof` operations are permitted. +After a successful call, no further `prepare`, `commit`, or `commit_eof` operations are permitted. ==== Semantics @@ -168,28 +129,17 @@ After a successful call, no further `prepare`, `commit`, or ==== Why `commit_eof` Takes a Byte Count -Combining the final commit with the EOF signal in a single operation -enables the same optimizations that motivate `write_eof(buffers)` on -the `WriteSink` side: +Combining the final commit with the EOF signal in a single operation enables the same optimizations that motivate `write_eof(buffers)` on the `WriteSink` side: -- **HTTP chunked encoding**: `commit_eof(n)` can emit the data chunk - followed by the terminal `0\r\n\r\n` in a single system call. 
-- **Compression (deflate)**: `commit_eof(n)` can pass `Z_FINISH` to the - final `deflate()` call, producing the compressed data and the stream - trailer together. -- **TLS close-notify**: `commit_eof(n)` can coalesce the final - application data with the TLS close-notify alert. +- **HTTP chunked encoding**: `commit_eof(n)` can emit the data chunk followed by the terminal `0\r\n\r\n` in a single system call. +- **Compression (deflate)**: `commit_eof(n)` can pass `Z_FINISH` to the final `deflate()` call, producing the compressed data and the stream trailer together. +- **TLS close-notify**: `commit_eof(n)` can coalesce the final application data with the TLS close-notify alert. -A separate `commit(n)` followed by `commit_eof(0)` would prevent these -optimizations because the sink cannot know during `commit` that no more -data will follow. +A separate `commit(n)` followed by `commit_eof(0)` would prevent these optimizations because the sink cannot know during `commit` that no more data will follow. == Relationship to `pull_from` -`pull_from` is a composed algorithm that transfers data from a -`ReadSource` (or `ReadStream`) into a `BufferSink`. It is the -callee-owns-buffers counterpart to `push_to`, which transfers from a -`BufferSource` to a `WriteSink`. +`pull_from` is a composed algorithm that transfers data from a `ReadSource` (or `ReadStream`) into a `BufferSink`. It is the callee-owns-buffers counterpart to `push_to`, which transfers from a `BufferSource` to a `WriteSink`. [source,cpp] ---- @@ -207,13 +157,9 @@ The algorithm loops: 1. Call `sink.prepare(arr)` to get writable buffers. 2. Call `source.read(bufs)` (or `source.read_some(bufs)`) to fill them. 3. Call `sink.commit(n)` to finalize the data. -4. When the source signals EOF, call `sink.commit_eof(0)` to finalize - the sink. +4. When the source signals EOF, call `sink.commit_eof(0)` to finalize the sink. 
-`pull_from` is the right tool when the data source satisfies -`ReadSource` or `ReadStream` and the destination satisfies `BufferSink`. -It avoids the intermediate caller-owned buffer that a `WriteSink`-based -transfer would require. +`pull_from` is the right tool when the data source satisfies `ReadSource` or `ReadStream` and the destination satisfies `BufferSink`. It avoids the intermediate caller-owned buffer that a `WriteSink`-based transfer would require. The two `pull_from` overloads differ in how they read from the source: @@ -232,20 +178,11 @@ The two `pull_from` overloads differ in how they read from the source: == Relationship to `WriteSink` -`BufferSink` and `WriteSink` are independent concepts serving different -ownership models. A concrete type may satisfy one, the other, or both. +`BufferSink` and `WriteSink` are independent concepts serving different ownership models. A concrete type may satisfy one, the other, or both. -The type-erased wrapper `any_buffer_sink` satisfies both concepts. When -the wrapped type satisfies only `BufferSink`, the `WriteSink` operations -(`write_some`, `write`, `write_eof`) are synthesized from `prepare` and -`commit` with a `buffer_copy` step. When the wrapped type satisfies both -`BufferSink` and `WriteSink`, the native write operations are forwarded -directly through the virtual boundary with no extra copy. +The type-erased wrapper `any_buffer_sink` satisfies both concepts. When the wrapped type satisfies only `BufferSink`, the `WriteSink` operations (`write_some`, `write`, `write_eof`) are synthesized from `prepare` and `commit` with a `buffer_copy` step. When the wrapped type satisfies both `BufferSink` and `WriteSink`, the native write operations are forwarded directly through the virtual boundary with no extra copy. 
-This dual-concept bridge lets algorithms constrained on `WriteSink` work -with any `BufferSink` through `any_buffer_sink`, and lets algorithms -constrained on `BufferSink` work natively with the callee-owns-buffers -pattern. +This dual-concept bridge lets algorithms constrained on `WriteSink` work with any `BufferSink` through `any_buffer_sink`, and lets algorithms constrained on `BufferSink` work natively with the callee-owns-buffers pattern. === Transfer Algorithm Matrix @@ -274,9 +211,7 @@ pattern. === Zero-Copy Transfer -When the sink's internal storage is the final destination (a ring -buffer, a kernel page, a DMA region), the caller writes directly -into it with no intermediate copy. +When the sink's internal storage is the final destination (a ring buffer, a kernel page, a DMA region), the caller writes directly into it with no intermediate copy. [source,cpp] ---- @@ -321,8 +256,7 @@ task<> fill_sink(Sink& sink, std::string_view data) === Transferring from a ReadSource -The `pull_from` algorithm reads data directly into the sink's buffers, -avoiding a caller-owned intermediate buffer entirely. +The `pull_from` algorithm reads data directly into the sink's buffers, avoiding a caller-owned intermediate buffer entirely. [source,cpp] ---- @@ -336,8 +270,7 @@ task<> transfer(Source& source, Sink& sink) } ---- -Compare with the `WriteSink` approach, which requires an intermediate -buffer: +Compare with the `WriteSink` approach, which requires an intermediate buffer: [source,cpp] ---- @@ -368,9 +301,7 @@ The `BufferSink` path eliminates the `buf[8192]` intermediate buffer. === HTTP Response Body Sink -An HTTP response body can be consumed through a `BufferSink` interface. -The concrete implementation handles transfer encoding behind the -abstraction. +An HTTP response body can be consumed through a `BufferSink` interface. The concrete implementation handles transfer encoding behind the abstraction. 
[source,cpp] ---- @@ -385,15 +316,11 @@ task<> receive_body( } ---- -The caller does not know whether the body uses content-length, chunked -encoding, or compression. The `BufferSink` interface handles the -difference. +The caller does not know whether the body uses content-length, chunked encoding, or compression. The `BufferSink` interface handles the difference. === Compression Pipeline -A compression sink owns an output ring buffer where compressed data -lands. The caller writes uncompressed data into prepared buffers, and -`commit` triggers a compression pass. +A compression sink owns an output ring buffer where compressed data lands. The caller writes uncompressed data into prepared buffers, and `commit` triggers a compression pass. [source,cpp] ---- @@ -429,13 +356,11 @@ task<> compress_input(Sink& sink, std::span input) } ---- -The `commit_eof(0)` call lets the compression sink pass `Z_FINISH` to -the final deflate call, flushing the compressed stream trailer. +The `commit_eof(0)` call lets the compression sink pass `Z_FINISH` to the final deflate call, flushing the compressed stream trailer. === Bridging to WriteSink via `any_buffer_sink` -When a function is constrained on `WriteSink` but the concrete type -satisfies only `BufferSink`, `any_buffer_sink` bridges the gap. +When a function is constrained on `WriteSink` but the concrete type satisfies only `BufferSink`, `any_buffer_sink` bridges the gap. [source,cpp] ---- @@ -450,68 +375,41 @@ any_buffer_sink abs(ring); co_await send_message(abs, "hello"); ---- -When the wrapped type also satisfies `WriteSink`, `any_buffer_sink` -forwards the native write operations directly, avoiding the synthesized -`prepare` + `buffer_copy` + `commit` path. +When the wrapped type also satisfies `WriteSink`, `any_buffer_sink` forwards the native write operations directly, avoiding the synthesized `prepare` + `buffer_copy` + `commit` path. 
== Alternatives Considered === Combined Prepare-and-Commit -An alternative design combined the prepare and commit steps into a -single asynchronous operation: `write(dest) -> (error_code, span)`, -where the sink returns writable buffers and the commit happens on the -next call. This was rejected because: +An alternative design combined the prepare and commit steps into a single asynchronous operation: `write(dest) -> (error_code, span)`, where the sink returns writable buffers and the commit happens on the next call. This was rejected because: -- The synchronous `prepare` is a pure bookkeeping operation. Making it - asynchronous forces a coroutine suspension on every iteration, even - when the sink has space available. -- Separating `prepare` from `commit` lets the caller fill multiple - prepared buffers before incurring the cost of an asynchronous commit. -- The two-step protocol makes the buffer lifetime explicit: buffers - from `prepare` are valid until `commit` or `commit_eof`. +- The synchronous `prepare` is a pure bookkeeping operation. Making it asynchronous forces a coroutine suspension on every iteration, even when the sink has space available. +- Separating `prepare` from `commit` lets the caller fill multiple prepared buffers before incurring the cost of an asynchronous commit. +- The two-step protocol makes the buffer lifetime explicit: buffers from `prepare` are valid until `commit` or `commit_eof`. === `prepare` Returning a Count Instead of a Span -An earlier design had `prepare` fill a raw pointer array and return a -count (`std::size_t prepare(mutable_buffer* arr, std::size_t max)`). -This was replaced by the span-based interface because: +An earlier design had `prepare` fill a raw pointer array and return a count (`std::size_t prepare(mutable_buffer* arr, std::size_t max)`). This was replaced by the span-based interface because: -- `std::span` is self-describing: it carries both the - pointer and the size, eliminating a class of off-by-one errors. 
-- Returning a subspan of the input span is idiomatic {cpp} and composes - well with range-based code. -- The raw-pointer interface required two parameters (pointer + count) - where the span interface requires one. +- `std::span` is self-describing: it carries both the pointer and the size, eliminating a class of off-by-one errors. +- Returning a subspan of the input span is idiomatic {cpp} and composes well with range-based code. +- The raw-pointer interface required two parameters (pointer + count) where the span interface requires one. === Separate `flush` Operation -A design with an explicit `flush` method (distinct from `commit`) was -considered, where `commit` would only buffer data and `flush` would -trigger I/O. This was rejected because: +A design with an explicit `flush` method (distinct from `commit`) was considered, where `commit` would only buffer data and `flush` would trigger I/O. This was rejected because: -- It adds a fourth operation to the concept without clear benefit. The - `commit` operation already serves both roles: it finalizes the - caller's data and may trigger I/O at the sink's discretion. -- A sink that wants to defer I/O can do so internally by accumulating - committed data and flushing when its buffer is full. The caller does - not need to know when physical I/O occurs. -- Adding `flush` would complicate the `pull_from` algorithm, which - would need to decide when to call `flush` versus `commit`. +- It adds a fourth operation to the concept without clear benefit. The `commit` operation already serves both roles: it finalizes the caller's data and may trigger I/O at the sink's discretion. +- A sink that wants to defer I/O can do so internally by accumulating committed data and flushing when its buffer is full. The caller does not need to know when physical I/O occurs. +- Adding `flush` would complicate the `pull_from` algorithm, which would need to decide when to call `flush` versus `commit`. 
=== `BufferSink` Refining `WriteSink` -A design where `BufferSink` refined `WriteSink` (requiring all types to -implement both interfaces) was considered. This was rejected because: +A design where `BufferSink` refined `WriteSink` (requiring all types to implement both interfaces) was considered. This was rejected because: -- Many natural `BufferSink` types (ring buffers, DMA descriptors) have - no meaningful `write_some` primitive. Their data path is - prepare-then-commit, not write-from-caller-buffer. -- Requiring `write_some`, `write`, and `write_eof` on every - `BufferSink` would force implementations to synthesize these - operations even when they are never called. -- The `any_buffer_sink` wrapper provides the bridge when needed, - without burdening every concrete type. +- Many natural `BufferSink` types (ring buffers, DMA descriptors) have no meaningful `write_some` primitive. Their data path is prepare-then-commit, not write-from-caller-buffer. +- Requiring `write_some`, `write`, and `write_eof` on every `BufferSink` would force implementations to synthesize these operations even when they are never called. +- The `any_buffer_sink` wrapper provides the bridge when needed, without burdening every concrete type. == Summary @@ -533,8 +431,4 @@ implement both interfaces) was considered. This was rejected because: | Final iteration: deliver last data and close the stream. |=== -`BufferSink` is the callee-owns-buffers counterpart to `WriteSink`. The -`pull_from` algorithm transfers data from a `ReadSource` or -`ReadStream` into a `BufferSink`, and `any_buffer_sink` bridges the two -patterns by satisfying both `BufferSink` and `WriteSink` behind a -single type-erased interface. +`BufferSink` is the callee-owns-buffers counterpart to `WriteSink`. 
The `pull_from` algorithm transfers data from a `ReadSource` or `ReadStream` into a `BufferSink`, and `any_buffer_sink` bridges the two patterns by satisfying both `BufferSink` and `WriteSink` behind a single type-erased interface. diff --git a/doc/modules/ROOT/pages/8.design/8h.TypeEraseAwaitable.adoc b/doc/modules/ROOT/pages/8.design/8i.TypeEraseAwaitable.adoc similarity index 71% rename from doc/modules/ROOT/pages/8.design/8h.TypeEraseAwaitable.adoc rename to doc/modules/ROOT/pages/8.design/8i.TypeEraseAwaitable.adoc index d8cacaff..46fbec8c 100644 --- a/doc/modules/ROOT/pages/8.design/8h.TypeEraseAwaitable.adoc +++ b/doc/modules/ROOT/pages/8.design/8i.TypeEraseAwaitable.adoc @@ -2,20 +2,13 @@ == Overview -The `any_*` wrappers type-erase stream and source concepts so that -algorithms can operate on heterogeneous concrete types through a -uniform interface. Each wrapper preallocates storage for the -type-erased awaitable at construction time, achieving zero -steady-state allocation. +The `any_*` wrappers type-erase stream and source concepts so that algorithms can operate on heterogeneous concrete types through a uniform interface. Each wrapper preallocates storage for the type-erased awaitable at construction time, achieving zero steady-state allocation. -Two vtable layouts are used depending on how many operations the -wrapper exposes. +Two vtable layouts are used depending on how many operations the wrapper exposes. == Single-Operation: Flat Vtable -When a wrapper exposes exactly one async operation (e.g. -`any_read_stream` with `read_some`, or `any_write_stream` with -`write_some`), all function pointers live in a single flat vtable: +When a wrapper exposes exactly one async operation (e.g. 
`any_read_stream` with `read_some`, or `any_write_stream` with `write_some`), all function pointers live in a single flat vtable: [source,cpp] ---- @@ -33,14 +26,11 @@ struct vtable }; ---- -The inner awaitable can be constructed in either `await_ready` or -`await_suspend`, depending on whether the outer awaitable has a -short-circuit path. +The inner awaitable can be constructed in either `await_ready` or `await_suspend`, depending on whether the outer awaitable has a short-circuit path. === Construct in await_ready (any_read_stream) -When there is no outer short-circuit, constructing in `await_ready` -lets immediate completions skip `await_suspend` entirely: +When there is no outer short-circuit, constructing in `await_ready` lets immediate completions skip `await_suspend` entirely: [source,cpp] ---- @@ -64,9 +54,7 @@ io_result await_resume() { === Construct in await_suspend (any_write_stream) -When the outer awaitable has a short-circuit (empty buffers), -construction is deferred to `await_suspend` so the inner awaitable -is never created on the fast path: +When the outer awaitable has a short-circuit (empty buffers), construction is deferred to `await_suspend` so the inner awaitable is never created on the fast path: [source,cpp] ---- @@ -96,13 +84,7 @@ Both variants touch the same two cache lines on the hot path. == Multi-Operation: Split Vtable with awaitable_ops -When a wrapper exposes multiple operations that produce different -awaitable types (e.g. `any_read_source` with `read_some` and -`read`, or `any_write_sink` with `write_some`, `write`, -`write_eof(buffers)`, and `write_eof()`), a split layout is -required. Each `construct` call returns a pointer to a -`static constexpr awaitable_ops` matching the awaitable it -created. +When a wrapper exposes multiple operations that produce different awaitable types (e.g. 
`any_read_source` with `read_some` and `read`, or `any_write_sink` with `write_some`, `write`, `write_eof(buffers)`, and `write_eof()`), a split layout is required. Each `construct` call returns a pointer to a `static constexpr awaitable_ops` matching the awaitable it created. [source,cpp] ---- @@ -125,9 +107,7 @@ struct vtable }; ---- -The inner awaitable is constructed in `await_suspend`. Outer -`await_ready` handles short-circuits (e.g. empty buffers) before -the inner type is ever created: +The inner awaitable is constructed in `await_suspend`. Outer `await_ready` handles short-circuits (e.g. empty buffers) before the inner type is ever created: [source,cpp] ---- @@ -169,10 +149,7 @@ Split (any_read_source, any_write_sink): 3 cache lines (separate .rodata address, defeats spatial prefetch) ---- -The flat layout keeps all per-awaitable function pointers adjacent -to `construct_awaitable` in a single 64-byte structure. The split -layout places `vtable` and `awaitable_ops` at unrelated addresses -in `.rodata`, adding one cache miss on the hot path. +The flat layout keeps all per-awaitable function pointers adjacent to `construct_awaitable` in a single 64-byte structure. The split layout places `vtable` and `awaitable_ops` at unrelated addresses in `.rodata`, adding one cache miss on the hot path. == When to Use Which @@ -192,11 +169,4 @@ in `.rodata`, adding one cache miss on the hot path. == Why the Flat Layout Cannot Scale -With multiple operations, each `construct` call produces a -different concrete awaitable type. The per-awaitable function -pointers (`await_ready`, `await_suspend`, `await_resume`, -`destroy`) must match the type that was constructed. The split -layout solves this by returning the correct `awaitable_ops const*` -from each `construct` call. The flat layout would require -duplicating all four function pointers in the vtable for every -operation -- workable for two operations, unwieldy for four. 
+With multiple operations, each `construct` call produces a different concrete awaitable type. The per-awaitable function pointers (`await_ready`, `await_suspend`, `await_resume`, `destroy`) must match the type that was constructed. The split layout solves this by returning the correct `awaitable_ops const*` from each `construct` call. The flat layout would require duplicating all four function pointers in the vtable for every operation -- workable for two operations, unwieldy for four. diff --git a/doc/modules/ROOT/pages/8.design/8i.any_buffer_sink.adoc b/doc/modules/ROOT/pages/8.design/8i.any_buffer_sink.adoc deleted file mode 100644 index a5630682..00000000 --- a/doc/modules/ROOT/pages/8.design/8i.any_buffer_sink.adoc +++ /dev/null @@ -1,409 +0,0 @@ -= any_buffer_sink Design - -== Overview - -This document describes the design of `any_buffer_sink`, a type-erased -wrapper that satisfies both `BufferSink` and `WriteSink`. The central -design goal is to serve two fundamentally different data-production -patterns through a single runtime interface, with no performance -compromise for either. - -Data producers fall into two categories: - -- **Generators** produce data on demand. They do not hold the data - in advance; they compute or serialize it into memory that someone - else provides. An HTTP header serializer, a JSON encoder, and a - compression engine are generators. - -- **Buffered sources** already have data sitting in buffers. A - memory-mapped file, a ring buffer that received data from a socket, - and a pre-serialized response body are buffered sources. - -These two patterns require different buffer ownership models. -Generators need writable memory from the sink (the `BufferSink` -pattern). Buffered sources need to hand their existing buffers to -the sink (the `WriteSink` pattern). Forcing either pattern through -the other's interface introduces an unnecessary copy. - -`any_buffer_sink` exposes both interfaces. 
The caller chooses the -one that matches how its data is produced. The wrapper dispatches -to the underlying concrete sink through the optimal path, achieving -zero-copy when the concrete type supports it and falling back to a -synthesized path when it does not. - -== The Two Interfaces - -=== BufferSink: Callee-Owned Buffers - -The `BufferSink` interface (`prepare`, `commit`, `commit_eof`) is -designed for generators. The sink owns the memory. The generator -asks for writable space, fills it, and commits: - -[source,cpp] ----- -any_buffer_sink abs(concrete_sink{}); - -mutable_buffer arr[16]; -auto bufs = abs.prepare(arr); -// serialize directly into bufs -auto [ec] = co_await abs.commit(bytes_written); ----- - -The data lands in the sink's internal storage with no intermediate -copy. If the concrete sink is backed by a kernel page, a DMA -descriptor, or a ring buffer, the bytes go directly to their final -destination. - -=== WriteSink: Caller-Owned Buffers - -The `WriteSink` interface (`write_some`, `write`, `write_eof`) is -designed for buffered sources. The caller already has the data in -buffers and passes them to the sink: - -[source,cpp] ----- -any_buffer_sink abs(concrete_sink{}); - -// Data already in buffers -- pass them directly -auto [ec, n] = co_await abs.write(existing_buffers); - -// Or atomically write and signal EOF -auto [ec2, n2] = co_await abs.write_eof(final_buffers); ----- - -When the concrete sink natively supports `WriteSink`, the caller's -buffers propagate directly through the type-erased boundary. The -sink receives the original buffer descriptors pointing to the -caller's memory. No data is copied into an intermediate staging -area. - -== Dispatch Strategy - -The vtable records whether the wrapped concrete type satisfies -`WriteSink` in addition to `BufferSink`. This determination is made -at compile time when the vtable is constructed. At runtime, each -`WriteSink` operation checks a single nullable function pointer to -select its path. 
- -=== Native Forwarding (BufferSink + WriteSink) - -When the concrete type satisfies both concepts, the `WriteSink` -vtable slots are populated with functions that construct the -concrete type's own `write_some`, `write`, `write_eof(buffers)`, -and `write_eof()` awaitables in the cached storage. The caller's -buffer descriptors pass straight through: - ----- -caller buffers → vtable → concrete write(buffers) → I/O ----- - -No `prepare`, no `buffer_copy`, no `commit`. The concrete type -receives the caller's buffers and can submit them directly to the -operating system, the compression library, or the next pipeline -stage. - -This is the zero-copy path for buffered sources writing to a sink -that natively accepts caller-owned buffers. - -=== Synthesized Path (BufferSink Only) - -When the concrete type satisfies only `BufferSink`, the `WriteSink` -vtable slots are null. The wrapper synthesizes the `WriteSink` -operations from the `BufferSink` primitives: - ----- -caller buffers → prepare → buffer_copy → commit → I/O ----- - -For `write_some`: - -1. Call `prepare` to get writable space from the sink. -2. Copy data from the caller's buffers into the prepared space - with `buffer_copy`. -3. Call `commit` to finalize. - -For `write` and `write_eof`: the same loop, repeated until all -data is consumed. `write_eof` finishes with `commit_eof` to signal -end-of-stream. - -This path incurs one buffer copy, which is unavoidable: the -concrete sink only knows how to accept data through its own -`prepare`/`commit` protocol, so the caller's buffers must be copied -into the sink's internal storage. - -== Why This Matters - -=== No Compromise - -A naive design would pick one interface and synthesize the other -unconditionally. If the wrapper only exposed `BufferSink`, every -buffered source would pay a copy to move its data into the sink's -prepared buffers. 
If the wrapper only exposed `WriteSink`, every -generator would need to allocate its own intermediate buffer, fill -it, then hand it to the sink -- paying a copy that the `BufferSink` -path avoids. - -`any_buffer_sink` avoids both penalties. Each data-production -pattern uses the interface designed for it. The only copy that -occurs is the one that is structurally unavoidable: when a -`WriteSink` operation targets a concrete type that only speaks -`BufferSink`. - -=== True Zero-Copy for Buffered Sources - -Consider an HTTP server where the response body is a memory-mapped -file. The file's pages are already in memory. Through the -`WriteSink` interface, those pages can propagate directly to the -underlying transport: - -[source,cpp] ----- -// body_source is a BufferSource backed by mmap pages -// response_sink wraps a concrete type satisfying both concepts - -any_buffer_sink response_sink(&concrete); - -const_buffer arr[16]; -for(;;) -{ - auto [ec, bufs] = co_await body_source.pull(arr); - if(ec == cond::eof) - { - auto [ec2] = co_await response_sink.write_eof(); - break; - } - if(ec) - break; - // bufs point directly into mmap pages - // write() propagates them through the vtable to the concrete sink - auto [ec2, n] = co_await response_sink.write(bufs); - if(ec2) - break; - body_source.consume(n); -} ----- - -The mapped pages flow from `body_source.pull` through -`response_sink.write` to the concrete transport with no -intermediate copy. If the concrete sink can scatter-gather those -buffers into a `writev` system call, the data moves from the -page cache to the network card without touching user-space memory -a second time. - -=== Generators Write In-Place - -An HTTP header serializer generates bytes on the fly. It does not -hold the output in advance. 
Through the `BufferSink` interface, it -writes directly into whatever memory the concrete sink provides: - -[source,cpp] ----- -task<> serialize_headers( - any_buffer_sink& sink, - response const& resp) -{ - mutable_buffer arr[16]; - - for(auto const& field : resp.fields()) - { - auto bufs = sink.prepare(arr); - // serialize field directly into bufs - std::size_t n = format_field(bufs, field); - auto [ec] = co_await sink.commit(n); - if(ec) - co_return; - } - // headers done; body follows through the same sink -} ----- - -The serializer never allocates a scratch buffer for the formatted -output. The bytes land directly in the sink's internal storage, -which might be a chunked-encoding buffer, a TLS record buffer, or -a circular buffer feeding a socket. - -== Awaitable Caching - -`any_buffer_sink` uses the split vtable pattern described in -xref:8h.TypeEraseAwaitable.adoc[Type-Erasing Awaitables]. Multiple -async operations (`commit`, `commit_eof`, plus the four `WriteSink` -operations when the concrete type supports them) share a single -cached awaitable storage region. - -The constructor computes the maximum size and alignment across all -awaitable types that the concrete type can produce and allocates -that storage once. This reserves all virtual address space at -construction time, so memory usage is measurable at server startup -rather than growing piecemeal as requests arrive. - -Two separate `awaitable_ops` structs are used: - -- `awaitable_ops` for operations yielding `io_result<>` - (`commit`, `commit_eof`, `write_eof()`) -- `write_awaitable_ops` for operations yielding - `io_result` (`write_some`, `write`, - `write_eof(buffers)`) - -Each `construct_*` function in the vtable creates the concrete -awaitable in the cached storage and returns a pointer to the -matching `static constexpr` ops table. The wrapper stores this -pointer as `active_ops_` or `active_write_ops_` and uses it for -`await_ready`, `await_suspend`, `await_resume`, and destruction. 
- -== Ownership Modes - -=== Owning - -[source,cpp] ----- -any_buffer_sink abs(my_concrete_sink{args...}); ----- - -The wrapper allocates storage for the concrete sink and moves it -in. The wrapper owns the sink and destroys it in its destructor. -The awaitable cache is allocated separately. - -If either allocation fails, the constructor cleans up via an -internal guard and propagates the exception. - -=== Non-Owning (Reference) - -[source,cpp] ----- -my_concrete_sink sink; -any_buffer_sink abs(&sink); ----- - -The wrapper stores a pointer without allocating storage for the -sink. The concrete sink must outlive the wrapper. Only the -awaitable cache is allocated. - -This mode is useful when the concrete sink is managed by a -higher-level object (e.g., an HTTP connection that owns the -transport) and the wrapper is a short-lived handle passed to a -body-production function. - -== Relationship to any_buffer_source - -`any_buffer_source` is the read-side counterpart, satisfying both -`BufferSource` and `ReadSource`. The same dual-interface principle -applies in mirror image: - -[cols="1,1,1"] -|=== -| Direction | Primary concept | Secondary concept - -| Writing (any_buffer_sink) -| `BufferSink` (callee-owned) -| `WriteSink` (caller-owned) - -| Reading (any_buffer_source) -| `BufferSource` (callee-owned) -| `ReadSource` (caller-owned) -|=== - -Both wrappers enable the same design philosophy: the caller -chooses the interface that matches its data-production or -data-consumption pattern, and the wrapper dispatches optimally. - -== Alternatives Considered - -=== WriteSink-Only Wrapper - -A design where the type-erased wrapper satisfied only `WriteSink` -was considered. Generators would allocate their own scratch buffer, -serialize into it, and call `write`. This was rejected because: - -- Every generator pays a buffer copy that the `BufferSink` path - avoids. For high-throughput paths (HTTP header serialization, - compression output), this copy is measurable. 
-- Generators must manage scratch buffer lifetime and sizing. - The `prepare`/`commit` protocol pushes this responsibility to - the sink, which knows its own buffer topology. -- The `commit_eof(n)` optimization (coalescing final data with - stream termination) is lost. A generator calling `write` cannot - signal that its last write is the final one without a separate - `write_eof()` call, preventing the sink from combining them. - -=== BufferSink-Only Wrapper - -A design where the wrapper satisfied only `BufferSink` was -considered. Buffered sources would copy their data into the -sink's prepared buffers via `prepare` + `buffer_copy` + `commit`. -This was rejected because: - -- Every buffered source pays a copy that native `WriteSink` - forwarding avoids. When the source is a memory-mapped file and - the sink is a socket, this eliminates the zero-copy path - entirely. -- The `buffer_copy` step becomes the bottleneck for large - transfers, dominating what would otherwise be a pure I/O - operation. -- Buffered sources that produce scatter-gather buffer sequences - (multiple non-contiguous regions) must copy each region - individually into prepared buffers, losing the ability to pass - the entire scatter-gather list to a `writev` system call. - -=== Separate Wrapper Types - -A design with two distinct wrappers (`any_buffer_sink` satisfying -only `BufferSink` and `any_write_sink` satisfying only `WriteSink`) -was considered. The caller would choose which wrapper to construct -based on its data-production pattern. This was rejected because: - -- The caller and the sink are often decoupled. An HTTP server - framework provides the sink; the user provides the body - producer. The framework cannot know at compile time whether the - user will call `prepare`/`commit` or `write`/`write_eof`. -- Requiring two wrapper types forces the framework to either - pick one (losing the other pattern) or expose both (complicating - the API). 
-- A single wrapper that satisfies both concepts lets the - framework hand one object to the body producer, which uses - whichever interface is natural. No choice is imposed on the - framework or the user. - -=== Always Synthesizing WriteSink - -A design where the `WriteSink` operations were always synthesized -from `prepare` + `buffer_copy` + `commit`, even when the concrete -type natively supports `WriteSink`, was considered. This would -simplify the vtable by removing the nullable write-forwarding -slots. This was rejected because: - -- The buffer copy is measurable. For a concrete type that can - accept caller-owned buffers directly (e.g., a socket wrapper - with `writev` support), the synthesized path adds a copy that - native forwarding avoids. -- The `write_eof(buffers)` atomicity guarantee is lost. The - synthesized path must decompose it into `prepare` + - `buffer_copy` + `commit_eof`, which the concrete type cannot - distinguish from a non-final commit followed by an empty - `commit_eof`. This prevents optimizations like coalescing the - last data chunk with a chunked-encoding terminator. - -== Summary - -`any_buffer_sink` satisfies both `BufferSink` and `WriteSink` -behind a single type-erased interface. The dual API lets each -data-production pattern use the interface designed for it: - -[cols="1,2,2"] -|=== -| Producer type | Interface | Data path - -| Generator (produces on demand) -| `prepare` / `commit` / `commit_eof` -| Writes directly into sink's internal storage. Zero copy. - -| Buffered source (data already in memory) -| `write_some` / `write` / `write_eof` -| Buffers propagate through the vtable. Zero copy when the concrete - type natively supports `WriteSink`. One copy (synthesized) when - it does not. -|=== - -The dispatch is determined at construction time through nullable -vtable slots. At runtime, a single pointer check selects the native -or synthesized path. Neither pattern pays for the other's -existence. 
diff --git a/doc/modules/ROOT/pages/8.design/8j.Executor.adoc b/doc/modules/ROOT/pages/8.design/8j.Executor.adoc deleted file mode 100644 index 62f60cb7..00000000 --- a/doc/modules/ROOT/pages/8.design/8j.Executor.adoc +++ /dev/null @@ -1,622 +0,0 @@ -= Executor Concept Design - -== Overview - -This document describes the design of the `Executor` concept: the -interface through which coroutines are scheduled for execution. It -explains the relationship to Asio's executor model, why `dispatch` -returns `void`, why `defer` was dropped, how `executor_ref` -achieves zero-allocation type erasure, and the I/O completion -pattern that motivates the entire design. - -The `Executor` concept exists to answer one question: when a -coroutine is ready to run, _where_ does it run? The concept -captures the rules for scheduling coroutine resumption, tracking -outstanding work for graceful shutdown, and accessing the -execution context that owns the executor. Every I/O awaitable in -Corosio -- sockets, acceptors, timers, resolvers -- depends on -this concept to dispatch completions back to the correct executor. - -== Definition - -[source,cpp] ----- -template -concept Executor = - std::is_nothrow_copy_constructible_v && - std::is_nothrow_move_constructible_v && - requires(E& e, E const& ce, E const& ce2, - std::coroutine_handle<> h) - { - { ce == ce2 } noexcept -> std::convertible_to; - { ce.context() } noexcept; - requires std::is_lvalue_reference_v< - decltype(ce.context())> && - std::derived_from< - std::remove_reference_t< - decltype(ce.context())>, - execution_context>; - { ce.on_work_started() } noexcept; - { ce.on_work_finished() } noexcept; - - { ce.dispatch(h) }; - { ce.post(h) }; - }; ----- - -An `Executor` provides exactly two operations on a coroutine -handle: - -=== `dispatch(h)` -- Execute If Safe - -If the executor determines it is safe (e.g., the current thread -is already associated with the executor's context), resumes the -coroutine inline via `h.resume()`. 
Otherwise, posts the coroutine -for later execution. Returns `void`. - -=== `post(h)` -- Always Queue - -Queues the coroutine for later execution without ever executing -it inline. Never blocks. Use when guaranteed asynchrony is -required. - -The remaining operations support context access, lifecycle -management, and identity: - -=== `context()` -- Access the Execution Context - -Returns an lvalue reference to the `execution_context` that -created this executor. The context provides service -infrastructure, frame allocators, and shutdown coordination. - -=== `on_work_started()` / `on_work_finished()` -- Track Work - -Paired calls that track outstanding work. When the count reaches -zero, the context's event loop (`run()`) returns. These calls -must be balanced: each `on_work_started` must have a matching -`on_work_finished`. - -=== `operator==` -- Equality Comparison - -Two executors are equal if they submit work to the same -destination. This enables the same-executor optimization: when a -completion's executor matches the caller's, the dispatch can -skip the indirection. - -== Relationship to Asio - -Kohlhoff's Asio library established the executor-as-policy model -that Capy inherits. As described in -https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0113r0.html[P0113R0]: - -[quote] -____ -An executor is to function execution as an allocator is to -allocation. -____ - -Capy retains the core elements of this model: - -- **Work tracking.** `on_work_started` / `on_work_finished` for - graceful shutdown. -- **`dispatch` / `post` duality.** Execute-if-safe versus - always-queue. -- **`execution_context` base class.** Service infrastructure - and context lifetime management. -- **Equality comparison.** Same-executor optimization. - -Capy removes or changes: - -- **`defer`.** Dropped entirely. See <>. -- **Function object submission.** Asio executors accept - arbitrary callables. Capy executors accept only - `std::coroutine_handle<>`. 
This removes the need for - allocator-aware function erasure and enables a simpler, - cheaper type-erased wrapper (`executor_ref`). -- **`dispatch` return type.** Asio's `dispatch` returns void - for the same reason Capy's does, but Capy also considered - and rejected a `coroutine_handle<>` return for symmetric - transfer. See <>. - -The result is a concept that preserves Asio's proven execution -model while removing the machinery that a coroutine-native -library does not need. - -[[why-dispatch-returns-void]] -== Why `dispatch` Returns `void` - -An earlier design had `dispatch` return -`std::coroutine_handle<>` so that callers could use it for -symmetric transfer from `await_suspend`. This was rejected -because it violates a fundamental constraint of the I/O layer. - -=== The Problem: Synchronous Completion During `await_suspend` - -When an I/O awaitable initiates an operation inside -`await_suspend`, the I/O might complete immediately. If it does, -the completion path would call `dispatch(h)` while the caller's -`await_suspend` is still on the call stack. If `dispatch` -resumed the coroutine inline via `h.resume()`, the coroutine -would execute while `await_suspend` has not yet returned -- -resuming a coroutine from inside `await_suspend` before the -suspension machinery completes risks undefined behavior. - -The {cpp} standard describes the sequencing in -https://eel.is/c++draft/expr.await[[expr.await]/5.1]: - -[quote] -____ -If the result of await-ready is false, the coroutine is -considered suspended. Then, await-suspend is evaluated. -____ - -Although the standard considers the coroutine suspended before -`await_suspend` is called, resuming it from _within_ -`await_suspend` creates a nested resumption on the same call -stack. The resumed coroutine runs, potentially suspends again or -completes, and then control returns into the middle of -`await_suspend`. If the coroutine was destroyed during -resumption, `await_suspend` returns into a destroyed frame. 
- -=== Why I/O Awaitables Return `void` or `std::noop_coroutine()` - -To avoid this, all I/O awaitables return `void` or -`std::noop_coroutine()` from `await_suspend`. Both forms -guarantee that the caller is fully suspended and the call stack -has unwound before any completion handler can resume the -coroutine. The I/O operation is initiated during `await_suspend`, -but the completion is dispatched later -- from the event loop, -after `await_suspend` has returned. - -https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0913r1.html[P0913R1] -introduced the `coroutine_handle` return type for symmetric -transfer, which is the correct mechanism for coroutine-to-coroutine -control transfer (as used by `task` internally). But I/O -awaitables cannot use it because the I/O completion is -asynchronous relative to `await_suspend` -- it comes from the -reactor or proactor, not from the awaitable itself. - -=== Consequence for `dispatch` - -Since the primary consumer of `dispatch` is I/O completion -- -called _after_ the coroutine is suspended, from the event loop --- `dispatch` does not need to participate in symmetric transfer. -It calls `h.resume()` inline when safe and returns `void`. A -conforming implementation looks like: - -[source,cpp] ----- -void dispatch(std::coroutine_handle<> h) const -{ - if(ctx_.running_in_this_thread()) - h.resume(); - else - post(h); -} ----- - -After `dispatch` returns, the state of `h` is unspecified. The -coroutine may have completed, been destroyed, or suspended at a -different point. Callers must not use `h` after calling -`dispatch`. - -[[why-not-defer]] -== Why Two Operations, Not Three - -Asio provides three submission methods: `dispatch`, `post`, and -`defer`. Capy provides only `dispatch` and `post`. - -=== What `defer` Does - -https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0113r0.html[P0113R0] -describes `defer`: - -[quote] -____ -A defer operation is similar to a post operation... 
However, a -defer operation also implies a relationship between the caller -and the function object being submitted. It is intended for use -when submitting a function object that represents a continuation -of the caller. -____ - -The optimization this enables is thread-local queuing. When the -caller is already executing within the executor's context, -`defer` saves the continuation to a thread-local queue instead -of the shared work queue. From P0113R0: - -[quote] -____ -If the caller is executing within the thread pool, saves the -function object to a thread-local queue. Once control returns to -the thread pool, the function object is scheduled for execution -as soon as possible. -____ - -=== Why Coroutines Make `defer` Redundant - -In a callback-based library, when an asynchronous operation -completes, the completion handler must be submitted to the -executor as a function object. If the handler is the caller's -continuation, `defer` tells the executor "this is my next step; -optimize accordingly." - -In a coroutine-native library, this optimization is provided by -the language itself. -https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0913r1.html[P0913R1] -introduced symmetric transfer specifically to eliminate the need -for queues and schedulers in coroutine-to-coroutine control -transfer: - -[quote] -____ -Currently Coroutines TS only supports asymmetric control transfer -where suspend always returns control back to the current -coroutine caller or resumer. In order to emulate symmetric -coroutine to coroutine control transfer, one needs to build a -queue and a scheduler. -____ - -When `task::await_suspend` returns the parent's coroutine -handle, the compiler performs a tail-call-like transfer directly -to the parent. No queue, no executor submission, no `defer`. -The optimization that `defer` provides through a runtime hint, -symmetric transfer provides through a compile-time guarantee. 
- -Corosio confirms this in practice: its entire I/O layer -- -sockets, acceptors, timers, resolvers, signals -- across all -three backends (epoll, IOCP, select) uses only `dispatch` and -`post`. No code path requires `defer`. - -== Why `std::coroutine_handle<>`, Not Typed Handles - -The executor accepts `std::coroutine_handle<>` -- the type-erased -handle -- rather than `std::coroutine_handle
<P>
` for a specific -promise type `P`. - -This decision has three consequences: - -- **Type erasure is possible.** `executor_ref` wraps any - executor behind a uniform interface. If `dispatch` and `post` - were templated on the promise type, the vtable would need to - be generic over all promise types, making type erasure - impractical. - -- **Executor implementations are independent of coroutine - internals.** An executor schedules resumption. It does not - need to know what the coroutine's promise type is, what value - it produces, or how it handles exceptions. The type-erased - handle provides exactly the right interface: `resume()` and - nothing else. - -- **I/O operation structures stay simple.** Every pending I/O - operation in Corosio stores two fields: `std::coroutine_handle<> h` (a - typedef for `std::coroutine_handle<>`) and - `capy::executor_ref ex`. Both are type-erased. The operation - structure does not need to be templated on the coroutine's - promise type, which keeps the I/O backend code - non-generic and out of headers. - -== Why Nothrow Copy and Move - -The concept requires `std::is_nothrow_copy_constructible_v` -and `std::is_nothrow_move_constructible_v`. - -Executors propagate through coroutine machinery at points where -exceptions cannot be handled: inside `await_suspend`, during -promise construction, and through type-erased wrappers like -`executor_ref`. An exception thrown from an executor copy at any -of these points would leave the coroutine in an unrecoverable -state -- suspended but with no executor to resume it through. - -The nothrow requirement eliminates this failure mode entirely. -In practice, executors are lightweight handles -- a pointer to -the execution context and perhaps a strand pointer or a priority -value. Nothrow copy and move are natural for such types. The -requirement does not impose a burden; it documents what is -already true of every reasonable executor implementation. 
- -== Work Tracking, Shutdown, and Executor Validity - -The `on_work_started` and `on_work_finished` operations serve -three roles. - -=== Event Loop Lifetime - -Work tracking is the mechanism by which the event loop knows -when to stop. When outstanding work reaches zero, `run()` -returns. This is not bookkeeping -- it is the event loop's -termination signal. - -In Corosio, `on_work_finished` triggers `stop()` when the -atomic work count reaches zero: - -[source,cpp] ----- -void on_work_finished() noexcept -{ - if(outstanding_work_.fetch_sub( - 1, std::memory_order_acq_rel) == 1) - stop(); -} ----- - -Every `run_async` call increments the count. When the launched -task completes, the count decrements. When no tasks remain, -`run()` returns. Without work tracking in the executor, the -event loop would need a separate signaling mechanism or would -spin indefinitely. - -=== Public Visibility - -These operations are public, not private with friendship. The -reason is extensibility: `work_guard` is the library's RAII -wrapper for work tracking, but users may define their own guards -with additional behaviors (logging, metrics, timeout detection). -Making work tracking private would require the library to grant -friendship to types it cannot anticipate. - -=== Executor Validity - -An executor becomes invalid when its context's `shutdown()` -returns. After shutdown: - -- `dispatch`, `post`, `on_work_started`, `on_work_finished`: - undefined behavior. -- Copy, comparison, `context()`: valid until the context is - destroyed. - -This two-phase model exists because shutdown drains outstanding -work. During the drain, executors must still be copyable (they -are stored in pending operations) and comparable (for -same-executor checks). Only the work-submission operations become -invalid, because the context has stopped accepting new work. 
- -== Why `context()` Returns `execution_context&` - -The `context()` operation returns a reference to the -`execution_context` base class, not the concrete derived type. - -This serves two purposes: - -- **Type erasure.** `executor_ref` can wrap any executor without - knowing its context type. If `context()` returned a concrete - type, the vtable would need a different return type per - executor type. - -- **Service lookup.** The `execution_context` base class provides - `use_service()` and `make_service()`, which is - sufficient for all runtime service discovery. I/O objects do - not need the concrete context type to find their services. - -Corosio demonstrates this pattern throughout its public API. -I/O objects accept any executor and extract the context via the -base class reference: - -[source,cpp] ----- -template - requires capy::Executor -explicit tcp_socket(Ex const& ex) - : tcp_socket(ex.context()) -{ -} ----- - -The socket constructor receives `execution_context&` and looks -up the socket service. The concrete context type -- `epoll_context`, -`iocp_context`, `select_context` -- is irrelevant to the socket. - -== The `executor_ref` Design - -`executor_ref` is a non-owning, type-erased wrapper for any -executor satisfying the `Executor` concept. It is the mechanism -by which I/O operations store and use executors without templates. - -=== Two Pointers - -The entire object is two pointers: - -[source,cpp] ----- -class executor_ref -{ - void const* ex_; // pointer to the executor - detail::executor_vtable const* vt_; // pointer to the vtable -}; ----- - -Two pointers fit in two registers. `executor_ref` can be passed -by value as cheaply as passing a pointer. No heap allocation, no -small-buffer optimization, no reference counting. 
- -=== Why Not `std::function` or `std::any` - -`std::function` and small-buffer-optimized type erasure wrappers -have overhead that executor usage cannot tolerate: - -- **Heap allocation.** `std::function` may allocate when the - callable exceeds the SBO threshold. Executor dispatch happens - on every I/O completion -- allocation on the hot path is - unacceptable. - -- **Reference counting.** `std::shared_ptr`-based wrappers add - atomic reference count operations on every copy. Executors are - copied frequently as they propagate through coroutine chains. - -- **Indirection.** SBO wrappers store either inline data or a - heap pointer, adding a branch on every operation. - -`executor_ref` avoids all three. The vtable pointer goes directly -to a `static constexpr` structure in `.rodata`. One indirection, -no branches, no allocation. - -=== Why Not {cpp} Virtual Functions - -{cpp} virtual dispatch places the vtable pointer inside each -heap-allocated object. Every virtual call chases a pointer from -the object to its vtable, which may reside at an unpredictable -address in memory. When objects of different types are -interleaved on the heap, their vtable pointers point to -different locations in `.rodata`, defeating spatial prefetch and -polluting the instruction cache. - -`executor_ref` separates the vtable from the object. The vtable -is a `static constexpr` structure -- one per executor type, -shared by all instances of that type. Because most programs use -only one or two executor types (a thread pool executor and -perhaps a strand), the vtable stays hot in L1 cache. The -executor pointer and the vtable pointer sit adjacent in the -`executor_ref` object, so both are loaded in a single cache line. - -=== Reference Semantics - -`executor_ref` stores a pointer to the executor, not a copy. The -executor must outlive the `executor_ref`. 
This matches how -executors propagate through coroutine chains: the executor is -owned by the execution context (which outlives all coroutines -running on it), and `executor_ref` is a lightweight handle -passed through `await_suspend` and stored in I/O operation -structures. - -== The I/O Completion Pattern - -The executor concept is designed around a single use case: I/O -completion dispatch. This pattern is the reason the concept -exists. - -=== Capture at Initiation - -When a coroutine `co_await`s an I/O awaitable, the awaitable's -`await_suspend` receives the caller's executor and stores it -as `executor_ref`: - -[source,cpp] ----- -template -auto await_suspend( - std::coroutine_handle<> h, - Ex const& ex) -> std::coroutine_handle<> -{ - // ex is captured as executor_ref in the operation - return socket_.connect(h, ex, endpoint_, token_, &ec_); -} ----- - -The operation structure stores both the coroutine handle and the -executor reference: - -[source,cpp] ----- -struct io_op : scheduler_op -{ - std::coroutine_handle<> h; - capy::executor_ref ex; - // ... error codes, buffers, etc. -}; ----- - -=== Dispatch at Completion - -When the I/O completes (from the reactor thread for epoll, the -completion port for IOCP, or the select loop), the operation -uses the stored executor to resume the coroutine: - -[source,cpp] ----- -void operator()() override -{ - // ... set error codes ... - capy::executor_ref saved_ex(std::move(ex)); - std::coroutine_handle<> saved_h(std::move(h)); - impl_ptr.reset(); - saved_ex.dispatch(saved_h); -} ----- - -`dispatch` checks whether the current thread is already running -on the executor's context. If so, the coroutine resumes inline. -If not, the coroutine is posted for later execution on the -correct context. - -=== Platform Independence - -This pattern is identical across all three Corosio backends: -epoll (Linux), IOCP (Windows), and select (POSIX fallback). 
The -executor concept and `executor_ref` provide the abstraction that -makes this possible. The backend-specific code deals with I/O -readiness or completion notification. The executor-specific code -deals with coroutine scheduling. The two concerns are cleanly -separated. - -== Why Not `std::execution` (P2300) - -https://wg21.link/P2300[P2300] defines a sender/receiver model -where execution context flows _backward_ from receiver to -sender via queries after `connect()`: - ----- -task async_work(); // Frame allocated NOW -auto sndr = async_work(); -auto op = connect(sndr, receiver); // Allocator available NOW -start(op); // -- too late ----- - -For coroutines, this ordering is fatal. Coroutine frame -allocation happens _before_ the coroutine body executes. The -compiler calls `operator new` first, then constructs the -promise, then begins execution. Any mechanism that provides the -allocator _after_ the coroutine call -- receiver queries, -`await_transform`, explicit method calls -- arrives after the -frame is already allocated with the wrong (or default) -allocator. - -Capy's model flows context _forward_ from launcher to task. -The `run_async(ex, alloc)(my_task())` two-phase invocation sets -the thread-local allocator _before_ the task expression is -evaluated, so `operator new` reads it in time. This is -described in detail in xref:8g.RunApi.adoc[Run API]. - -The same forward-flowing model applies to executors. The -launcher binds the executor before the task runs. The task's -promise stores the executor and propagates it to nested -awaitables via `await_transform`. Context flows from caller to -callee at every level, never backward. 
- -== Conforming Signatures - -A minimal executor implementation: - -[source,cpp] ----- -class my_executor -{ -public: - execution_context& context() const noexcept; - - void on_work_started() const noexcept; - void on_work_finished() const noexcept; - - void dispatch(std::coroutine_handle<> h) const; - void post(std::coroutine_handle<> h) const; - - bool operator==(my_executor const&) const noexcept; -}; ----- - -== Summary - -The `Executor` concept provides `dispatch` and `post` for -coroutine scheduling, work tracking for event loop lifetime, and -`context()` for service access. The design descends from Asio's -executor model but is adapted for coroutines: `defer` is -replaced by symmetric transfer, function objects are replaced by -`std::coroutine_handle<>`, and `dispatch` returns `void` -because I/O completions are dispatched after suspension, not -during it. - -`executor_ref` type-erases any executor into two pointers, -enabling platform-independent I/O completion dispatch with zero -allocation and predictable cache behavior. The -capture-at-initiation / dispatch-at-completion pattern is the -fundamental use case the concept serves. diff --git a/doc/modules/ROOT/pages/8.design/8j.any_buffer_sink.adoc b/doc/modules/ROOT/pages/8.design/8j.any_buffer_sink.adoc new file mode 100644 index 00000000..3e94d72f --- /dev/null +++ b/doc/modules/ROOT/pages/8.design/8j.any_buffer_sink.adoc @@ -0,0 +1,263 @@ += any_buffer_sink Design + +== Overview + +This document describes the design of `any_buffer_sink`, a type-erased wrapper that satisfies both `BufferSink` and `WriteSink`. The central design goal is to serve two fundamentally different data-production patterns through a single runtime interface, with no performance compromise for either. + +Data producers fall into two categories: + +- **Generators** produce data on demand. They do not hold the data in advance; they compute or serialize it into memory that someone else provides. 
An HTTP header serializer, a JSON encoder, and a compression engine are generators. + +- **Buffered sources** already have data sitting in buffers. A memory-mapped file, a ring buffer that received data from a socket, and a pre-serialized response body are buffered sources. + +These two patterns require different buffer ownership models. Generators need writable memory from the sink (the `BufferSink` pattern). Buffered sources need to hand their existing buffers to the sink (the `WriteSink` pattern). Forcing either pattern through the other's interface introduces an unnecessary copy. + +`any_buffer_sink` exposes both interfaces. The caller chooses the one that matches how its data is produced. The wrapper dispatches to the underlying concrete sink through the optimal path, achieving zero-copy when the concrete type supports it and falling back to a synthesized path when it does not. + +== The Two Interfaces + +=== BufferSink: Callee-Owned Buffers + +The `BufferSink` interface (`prepare`, `commit`, `commit_eof`) is designed for generators. The sink owns the memory. The generator asks for writable space, fills it, and commits: + +[source,cpp] +---- +any_buffer_sink abs(concrete_sink{}); + +mutable_buffer arr[16]; +auto bufs = abs.prepare(arr); +// serialize directly into bufs +auto [ec] = co_await abs.commit(bytes_written); +---- + +The data lands in the sink's internal storage with no intermediate copy. If the concrete sink is backed by a kernel page, a DMA descriptor, or a ring buffer, the bytes go directly to their final destination. + +=== WriteSink: Caller-Owned Buffers + +The `WriteSink` interface (`write_some`, `write`, `write_eof`) is designed for buffered sources. 
The caller already has the data in buffers and passes them to the sink: + +[source,cpp] +---- +any_buffer_sink abs(concrete_sink{}); + +// Data already in buffers -- pass them directly +auto [ec, n] = co_await abs.write(existing_buffers); + +// Or atomically write and signal EOF +auto [ec2, n2] = co_await abs.write_eof(final_buffers); +---- + +When the concrete sink natively supports `WriteSink`, the caller's buffers propagate directly through the type-erased boundary. The sink receives the original buffer descriptors pointing to the caller's memory. No data is copied into an intermediate staging area. + +== Dispatch Strategy + +The vtable records whether the wrapped concrete type satisfies `WriteSink` in addition to `BufferSink`. This determination is made at compile time when the vtable is constructed. At runtime, each `WriteSink` operation checks a single nullable function pointer to select its path. + +=== Native Forwarding (BufferSink + WriteSink) + +When the concrete type satisfies both concepts, the `WriteSink` vtable slots are populated with functions that construct the concrete type's own `write_some`, `write`, `write_eof(buffers)`, and `write_eof()` awaitables in the cached storage. The caller's buffer descriptors pass straight through: + +---- +caller buffers → vtable → concrete write(buffers) → I/O +---- + +No `prepare`, no `buffer_copy`, no `commit`. The concrete type receives the caller's buffers and can submit them directly to the operating system, the compression library, or the next pipeline stage. + +This is the zero-copy path for buffered sources writing to a sink that natively accepts caller-owned buffers. + +=== Synthesized Path (BufferSink Only) + +When the concrete type satisfies only `BufferSink`, the `WriteSink` vtable slots are null. The wrapper synthesizes the `WriteSink` operations from the `BufferSink` primitives: + +---- +caller buffers → prepare → buffer_copy → commit → I/O +---- + +For `write_some`: + +1. 
Call `prepare` to get writable space from the sink. +2. Copy data from the caller's buffers into the prepared space with `buffer_copy`. +3. Call `commit` to finalize. + +For `write` and `write_eof`: the same loop, repeated until all data is consumed. `write_eof` finishes with `commit_eof` to signal end-of-stream. + +This path incurs one buffer copy, which is unavoidable: the concrete sink only knows how to accept data through its own `prepare`/`commit` protocol, so the caller's buffers must be copied into the sink's internal storage. + +== Why This Matters + +=== No Compromise + +A naive design would pick one interface and synthesize the other unconditionally. If the wrapper only exposed `BufferSink`, every buffered source would pay a copy to move its data into the sink's prepared buffers. If the wrapper only exposed `WriteSink`, every generator would need to allocate its own intermediate buffer, fill it, then hand it to the sink -- paying a copy that the `BufferSink` path avoids. + +`any_buffer_sink` avoids both penalties. Each data-production pattern uses the interface designed for it. The only copy that occurs is the one that is structurally unavoidable: when a `WriteSink` operation targets a concrete type that only speaks `BufferSink`. + +=== True Zero-Copy for Buffered Sources + +Consider an HTTP server where the response body is a memory-mapped file. The file's pages are already in memory. 
Through the `WriteSink` interface, those pages can propagate directly to the underlying transport: + +[source,cpp] +---- +// body_source is a BufferSource backed by mmap pages +// response_sink wraps a concrete type satisfying both concepts + +any_buffer_sink response_sink(&concrete); + +const_buffer arr[16]; +for(;;) +{ + auto [ec, bufs] = co_await body_source.pull(arr); + if(ec == cond::eof) + { + auto [ec2] = co_await response_sink.write_eof(); + break; + } + if(ec) + break; + // bufs point directly into mmap pages + // write() propagates them through the vtable to the concrete sink + auto [ec2, n] = co_await response_sink.write(bufs); + if(ec2) + break; + body_source.consume(n); +} +---- + +The mapped pages flow from `body_source.pull` through `response_sink.write` to the concrete transport with no intermediate copy. If the concrete sink can scatter-gather those buffers into a `writev` system call, the data moves from the page cache to the network card without touching user-space memory a second time. + +=== Generators Write In-Place + +An HTTP header serializer generates bytes on the fly. It does not hold the output in advance. Through the `BufferSink` interface, it writes directly into whatever memory the concrete sink provides: + +[source,cpp] +---- +task<> serialize_headers( + any_buffer_sink& sink, + response const& resp) +{ + mutable_buffer arr[16]; + + for(auto const& field : resp.fields()) + { + auto bufs = sink.prepare(arr); + // serialize field directly into bufs + std::size_t n = format_field(bufs, field); + auto [ec] = co_await sink.commit(n); + if(ec) + co_return; + } + // headers done; body follows through the same sink +} +---- + +The serializer never allocates a scratch buffer for the formatted output. The bytes land directly in the sink's internal storage, which might be a chunked-encoding buffer, a TLS record buffer, or a circular buffer feeding a socket. 
+ +== Awaitable Caching + +`any_buffer_sink` uses the split vtable pattern described in xref:8h.TypeEraseAwaitable.adoc[Type-Erasing Awaitables]. Multiple async operations (`commit`, `commit_eof`, plus the four `WriteSink` operations when the concrete type supports them) share a single cached awaitable storage region. + +The constructor computes the maximum size and alignment across all awaitable types that the concrete type can produce and allocates that storage once. This reserves all virtual address space at construction time, so memory usage is measurable at server startup rather than growing piecemeal as requests arrive. + +Two separate `awaitable_ops` structs are used: + +- `awaitable_ops` for operations yielding `io_result<>` (`commit`, `commit_eof`, `write_eof()`) +- `write_awaitable_ops` for operations yielding `io_result` (`write_some`, `write`, `write_eof(buffers)`) + +Each `construct_*` function in the vtable creates the concrete awaitable in the cached storage and returns a pointer to the matching `static constexpr` ops table. The wrapper stores this pointer as `active_ops_` or `active_write_ops_` and uses it for `await_ready`, `await_suspend`, `await_resume`, and destruction. + +== Ownership Modes + +=== Owning + +[source,cpp] +---- +any_buffer_sink abs(my_concrete_sink{args...}); +---- + +The wrapper allocates storage for the concrete sink and moves it in. The wrapper owns the sink and destroys it in its destructor. The awaitable cache is allocated separately. + +If either allocation fails, the constructor cleans up via an internal guard and propagates the exception. + +=== Non-Owning (Reference) + +[source,cpp] +---- +my_concrete_sink sink; +any_buffer_sink abs(&sink); +---- + +The wrapper stores a pointer without allocating storage for the sink. The concrete sink must outlive the wrapper. Only the awaitable cache is allocated. 
+ +This mode is useful when the concrete sink is managed by a higher-level object (e.g., an HTTP connection that owns the transport) and the wrapper is a short-lived handle passed to a body-production function. + +== Relationship to any_buffer_source + +`any_buffer_source` is the read-side counterpart, satisfying both `BufferSource` and `ReadSource`. The same dual-interface principle applies in mirror image: + +[cols="1,1,1"] +|=== +| Direction | Primary concept | Secondary concept + +| Writing (any_buffer_sink) +| `BufferSink` (callee-owned) +| `WriteSink` (caller-owned) + +| Reading (any_buffer_source) +| `BufferSource` (callee-owned) +| `ReadSource` (caller-owned) +|=== + +Both wrappers enable the same design philosophy: the caller chooses the interface that matches its data-production or data-consumption pattern, and the wrapper dispatches optimally. + +== Alternatives Considered + +=== WriteSink-Only Wrapper + +A design where the type-erased wrapper satisfied only `WriteSink` was considered. Generators would allocate their own scratch buffer, serialize into it, and call `write`. This was rejected because: + +- Every generator pays a buffer copy that the `BufferSink` path avoids. For high-throughput paths (HTTP header serialization, compression output), this copy is measurable. +- Generators must manage scratch buffer lifetime and sizing. The `prepare`/`commit` protocol pushes this responsibility to the sink, which knows its own buffer topology. +- The `commit_eof(n)` optimization (coalescing final data with stream termination) is lost. A generator calling `write` cannot signal that its last write is the final one without a separate `write_eof()` call, preventing the sink from combining them. + +=== BufferSink-Only Wrapper + +A design where the wrapper satisfied only `BufferSink` was considered. Buffered sources would copy their data into the sink's prepared buffers via `prepare` + `buffer_copy` + `commit`. 
This was rejected because: + +- Every buffered source pays a copy that native `WriteSink` forwarding avoids. When the source is a memory-mapped file and the sink is a socket, this eliminates the zero-copy path entirely. +- The `buffer_copy` step becomes the bottleneck for large transfers, dominating what would otherwise be a pure I/O operation. +- Buffered sources that produce scatter-gather buffer sequences (multiple non-contiguous regions) must copy each region individually into prepared buffers, losing the ability to pass the entire scatter-gather list to a `writev` system call. + +=== Separate Wrapper Types + +A design with two distinct wrappers (`any_buffer_sink` satisfying only `BufferSink` and `any_write_sink` satisfying only `WriteSink`) was considered. The caller would choose which wrapper to construct based on its data-production pattern. This was rejected because: + +- The caller and the sink are often decoupled. An HTTP server framework provides the sink; the user provides the body producer. The framework cannot know at compile time whether the user will call `prepare`/`commit` or `write`/`write_eof`. +- Requiring two wrapper types forces the framework to either pick one (losing the other pattern) or expose both (complicating the API). +- A single wrapper that satisfies both concepts lets the framework hand one object to the body producer, which uses whichever interface is natural. No choice is imposed on the framework or the user. + +=== Always Synthesizing WriteSink + +A design where the `WriteSink` operations were always synthesized from `prepare` + `buffer_copy` + `commit`, even when the concrete type natively supports `WriteSink`, was considered. This would simplify the vtable by removing the nullable write-forwarding slots. This was rejected because: + +- The buffer copy is measurable. 
For a concrete type that can accept caller-owned buffers directly (e.g., a socket wrapper with `writev` support), the synthesized path adds a copy that native forwarding avoids. +- The `write_eof(buffers)` atomicity guarantee is lost. The synthesized path must decompose it into `prepare` + `buffer_copy` + `commit_eof`, which the concrete type cannot distinguish from a non-final commit followed by an empty `commit_eof`. This prevents optimizations like coalescing the last data chunk with a chunked-encoding terminator. + +== Summary + +`any_buffer_sink` satisfies both `BufferSink` and `WriteSink` behind a single type-erased interface. The dual API lets each data-production pattern use the interface designed for it: + +[cols="1,2,2"] +|=== +| Producer type | Interface | Data path + +| Generator (produces on demand) +| `prepare` / `commit` / `commit_eof` +| Writes directly into sink's internal storage. Zero copy. + +| Buffered source (data already in memory) +| `write_some` / `write` / `write_eof` +| Buffers propagate through the vtable. Zero copy when the concrete + type natively supports `WriteSink`. One copy (synthesized) when + it does not. +|=== + +The dispatch is determined at construction time through nullable vtable slots. At runtime, a single pointer check selects the native or synthesized path. Neither pattern pays for the other's existence. diff --git a/doc/modules/ROOT/pages/8.design/8k.Executor.adoc b/doc/modules/ROOT/pages/8.design/8k.Executor.adoc new file mode 100644 index 00000000..82468506 --- /dev/null +++ b/doc/modules/ROOT/pages/8.design/8k.Executor.adoc @@ -0,0 +1,390 @@ += Executor Concept Design + +== Overview + +This document describes the design of the `Executor` concept: the interface through which coroutines are scheduled for execution. 
It explains the relationship to Asio's executor model, why `dispatch` returns `void`, why `defer` was dropped, how `executor_ref` achieves zero-allocation type erasure, and the I/O completion pattern that motivates the entire design.
+
+The `Executor` concept exists to answer one question: when a coroutine is ready to run, _where_ does it run? The concept captures the rules for scheduling coroutine resumption, tracking outstanding work for graceful shutdown, and accessing the execution context that owns the executor. Every I/O awaitable in Corosio -- sockets, acceptors, timers, resolvers -- depends on this concept to dispatch completions back to the correct executor.
+
+== Definition
+
+[source,cpp]
+----
+template<class E>
+concept Executor =
+    std::is_nothrow_copy_constructible_v<E> &&
+    std::is_nothrow_move_constructible_v<E> &&
+    requires(E& e, E const& ce, E const& ce2,
+        std::coroutine_handle<> h)
+    {
+        { ce == ce2 } noexcept -> std::convertible_to<bool>;
+        { ce.context() } noexcept;
+        requires std::is_lvalue_reference_v<
+            decltype(ce.context())> &&
+            std::derived_from<
+                std::remove_reference_t<
+                    decltype(ce.context())>,
+                execution_context>;
+        { ce.on_work_started() } noexcept;
+        { ce.on_work_finished() } noexcept;
+
+        { ce.dispatch(h) };
+        { ce.post(h) };
+    };
+----
+
+An `Executor` provides exactly two operations on a coroutine handle:
+
+=== `dispatch(h)` -- Execute If Safe
+
+If the executor determines it is safe (e.g., the current thread is already associated with the executor's context), resumes the coroutine inline via `h.resume()`. Otherwise, posts the coroutine for later execution. Returns `void`.
+
+=== `post(h)` -- Always Queue
+
+Queues the coroutine for later execution without ever executing it inline. Never blocks. Use when guaranteed asynchrony is required.
+
+The remaining operations support context access, lifecycle management, and identity:
+
+=== `context()` -- Access the Execution Context
+
+Returns an lvalue reference to the `execution_context` that created this executor. The context provides service infrastructure, frame allocators, and shutdown coordination.
+
+=== `on_work_started()` / `on_work_finished()` -- Track Work
+
+Paired calls that track outstanding work. When the count reaches zero, the context's event loop (`run()`) returns. These calls must be balanced: each `on_work_started` must have a matching `on_work_finished`.
+
+=== `operator==` -- Equality Comparison
+
+Two executors are equal if they submit work to the same destination. This enables the same-executor optimization: when a completion's executor matches the caller's, the dispatch can skip the indirection.
+
+== Relationship to Asio
+
+Kohlhoff's Asio library established the executor-as-policy model that Capy inherits. As described in https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0113r0.html[P0113R0]:
+
+[quote]
+____
+An executor is to function execution as an allocator is to
+allocation.
+____
+
+Capy retains the core elements of this model:
+
+- **Work tracking.** `on_work_started` / `on_work_finished` for graceful shutdown.
+- **`dispatch` / `post` duality.** Execute-if-safe versus always-queue.
+- **`execution_context` base class.** Service infrastructure and context lifetime management.
+- **Equality comparison.** Same-executor optimization.
+
+Capy removes or changes:
+
+- **`defer`.** Dropped entirely. See <<why-not-defer>>.
+- **Function object submission.** Asio executors accept arbitrary callables. Capy executors accept only `std::coroutine_handle<>`. This removes the need for allocator-aware function erasure and enables a simpler, cheaper type-erased wrapper (`executor_ref`).
+- **`dispatch` return type.** Asio's `dispatch` returns void for the same reason Capy's does, but Capy also considered and rejected a `coroutine_handle<>` return for symmetric transfer. See <<why-dispatch-returns-void>>.
+
+The result is a concept that preserves Asio's proven execution model while removing the machinery that a coroutine-native library does not need.
+
+[[why-dispatch-returns-void]]
+== Why `dispatch` Returns `void`
+
+An earlier design had `dispatch` return `std::coroutine_handle<>` so that callers could use it for symmetric transfer from `await_suspend`. This was rejected because it violates a fundamental constraint of the I/O layer.
+
+=== The Problem: Synchronous Completion During `await_suspend`
+
+When an I/O awaitable initiates an operation inside `await_suspend`, the I/O might complete immediately. If it does, the completion path would call `dispatch(h)` while the caller's `await_suspend` is still on the call stack. If `dispatch` resumed the coroutine inline via `h.resume()`, the coroutine would execute while `await_suspend` has not yet returned -- resuming a coroutine from inside `await_suspend` before the suspension machinery completes risks undefined behavior.
+
+The {cpp} standard describes the sequencing in https://eel.is/c++draft/expr.await[[expr.await]/5.1]:
+
+[quote]
+____
+If the result of await-ready is false, the coroutine is
+considered suspended. Then, await-suspend is evaluated.
+____
+
+Although the standard considers the coroutine suspended before `await_suspend` is called, resuming it from _within_ `await_suspend` creates a nested resumption on the same call stack. The resumed coroutine runs, potentially suspends again or completes, and then control returns into the middle of `await_suspend`. If the coroutine was destroyed during resumption, `await_suspend` returns into a destroyed frame.
+ +=== Why I/O Awaitables Return `void` or `std::noop_coroutine()` + +To avoid this, all I/O awaitables return `void` or `std::noop_coroutine()` from `await_suspend`. Both forms guarantee that the caller is fully suspended and the call stack has unwound before any completion handler can resume the coroutine. The I/O operation is initiated during `await_suspend`, but the completion is dispatched later -- from the event loop, after `await_suspend` has returned. + +https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0913r1.html[P0913R1] introduced the `coroutine_handle` return type for symmetric transfer, which is the correct mechanism for coroutine-to-coroutine control transfer (as used by `task` internally). But I/O awaitables cannot use it because the I/O completion is asynchronous relative to `await_suspend` -- it comes from the reactor or proactor, not from the awaitable itself. + +=== Consequence for `dispatch` + +Since the primary consumer of `dispatch` is I/O completion -- called _after_ the coroutine is suspended, from the event loop -- `dispatch` does not need to participate in symmetric transfer. It calls `h.resume()` inline when safe and returns `void`. A conforming implementation looks like: + +[source,cpp] +---- +void dispatch(std::coroutine_handle<> h) const +{ + if(ctx_.running_in_this_thread()) + h.resume(); + else + post(h); +} +---- + +After `dispatch` returns, the state of `h` is unspecified. The coroutine may have completed, been destroyed, or suspended at a different point. Callers must not use `h` after calling `dispatch`. + +[[why-not-defer]] +== Why Two Operations, Not Three + +Asio provides three submission methods: `dispatch`, `post`, and `defer`. Capy provides only `dispatch` and `post`. + +=== What `defer` Does + +https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0113r0.html[P0113R0] describes `defer`: + +[quote] +____ +A defer operation is similar to a post operation... 
However, a +defer operation also implies a relationship between the caller +and the function object being submitted. It is intended for use +when submitting a function object that represents a continuation +of the caller. +____ + +The optimization this enables is thread-local queuing. When the caller is already executing within the executor's context, `defer` saves the continuation to a thread-local queue instead of the shared work queue. From P0113R0: + +[quote] +____ +If the caller is executing within the thread pool, saves the +function object to a thread-local queue. Once control returns to +the thread pool, the function object is scheduled for execution +as soon as possible. +____ + +=== Why Coroutines Make `defer` Redundant + +In a callback-based library, when an asynchronous operation completes, the completion handler must be submitted to the executor as a function object. If the handler is the caller's continuation, `defer` tells the executor "this is my next step; optimize accordingly." + +In a coroutine-native library, this optimization is provided by the language itself. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0913r1.html[P0913R1] introduced symmetric transfer specifically to eliminate the need for queues and schedulers in coroutine-to-coroutine control transfer: + +[quote] +____ +Currently Coroutines TS only supports asymmetric control transfer +where suspend always returns control back to the current +coroutine caller or resumer. In order to emulate symmetric +coroutine to coroutine control transfer, one needs to build a +queue and a scheduler. +____ + +When `task::await_suspend` returns the parent's coroutine handle, the compiler performs a tail-call-like transfer directly to the parent. No queue, no executor submission, no `defer`. The optimization that `defer` provides through a runtime hint, symmetric transfer provides through a compile-time guarantee. 
+
+Corosio confirms this in practice: its entire I/O layer -- sockets, acceptors, timers, resolvers, signals -- across all three backends (epoll, IOCP, select) uses only `dispatch` and `post`. No code path requires `defer`.
+
+== Why `std::coroutine_handle<>`, Not Typed Handles
+
+The executor accepts `std::coroutine_handle<>` -- the type-erased handle -- rather than `std::coroutine_handle<P>` for a specific promise type `P`.
+
+This decision has three consequences:
+
+- **Type erasure is possible.** `executor_ref` wraps any executor behind a uniform interface. If `dispatch` and `post` were templated on the promise type, the vtable would need to be generic over all promise types, making type erasure impractical.
+
+- **Executor implementations are independent of coroutine internals.** An executor schedules resumption. It does not need to know what the coroutine's promise type is, what value it produces, or how it handles exceptions. The type-erased handle provides exactly the right interface: `resume()` and nothing else.
+
+- **I/O operation structures stay simple.** Every pending I/O operation in Corosio stores two fields: `std::coroutine_handle<> h` (a typedef for `std::coroutine_handle<>`) and `capy::executor_ref ex`. Both are type-erased. The operation structure does not need to be templated on the coroutine's promise type, which keeps the I/O backend code non-generic and out of headers.
+
+== Why Nothrow Copy and Move
+
+The concept requires `std::is_nothrow_copy_constructible_v<E>` and `std::is_nothrow_move_constructible_v<E>`.
+
+Executors propagate through coroutine machinery at points where exceptions cannot be handled: inside `await_suspend`, during promise construction, and through type-erased wrappers like `executor_ref`. An exception thrown from an executor copy at any of these points would leave the coroutine in an unrecoverable state -- suspended but with no executor to resume it through.
+
+The nothrow requirement eliminates this failure mode entirely. In practice, executors are lightweight handles -- a pointer to the execution context and perhaps a strand pointer or a priority value. Nothrow copy and move are natural for such types. The requirement does not impose a burden; it documents what is already true of every reasonable executor implementation.
+ +== Work Tracking, Shutdown, and Executor Validity + +The `on_work_started` and `on_work_finished` operations serve three roles. + +=== Event Loop Lifetime + +Work tracking is the mechanism by which the event loop knows when to stop. When outstanding work reaches zero, `run()` returns. This is not bookkeeping -- it is the event loop's termination signal. + +In Corosio, `on_work_finished` triggers `stop()` when the atomic work count reaches zero: + +[source,cpp] +---- +void on_work_finished() noexcept +{ + if(outstanding_work_.fetch_sub( + 1, std::memory_order_acq_rel) == 1) + stop(); +} +---- + +Every `run_async` call increments the count. When the launched task completes, the count decrements. When no tasks remain, `run()` returns. Without work tracking in the executor, the event loop would need a separate signaling mechanism or would spin indefinitely. + +=== Public Visibility + +These operations are public, not private with friendship. The reason is extensibility: `work_guard` is the library's RAII wrapper for work tracking, but users may define their own guards with additional behaviors (logging, metrics, timeout detection). Making work tracking private would require the library to grant friendship to types it cannot anticipate. + +=== Executor Validity + +An executor becomes invalid when its context's `shutdown()` returns. After shutdown: + +- `dispatch`, `post`, `on_work_started`, `on_work_finished`: undefined behavior. +- Copy, comparison, `context()`: valid until the context is destroyed. + +This two-phase model exists because shutdown drains outstanding work. During the drain, executors must still be copyable (they are stored in pending operations) and comparable (for same-executor checks). Only the work-submission operations become invalid, because the context has stopped accepting new work. 
+
+== Why `context()` Returns `execution_context&`
+
+The `context()` operation returns a reference to the `execution_context` base class, not the concrete derived type.
+
+This serves two purposes:
+
+- **Type erasure.** `executor_ref` can wrap any executor without knowing its context type. If `context()` returned a concrete type, the vtable would need a different return type per executor type.
+
+- **Service lookup.** The `execution_context` base class provides `use_service()` and `make_service()`, which is sufficient for all runtime service discovery. I/O objects do not need the concrete context type to find their services.
+
+Corosio demonstrates this pattern throughout its public API. I/O objects accept any executor and extract the context via the base class reference:
+
+[source,cpp]
+----
+template<class Ex>
+    requires capy::Executor<Ex>
+explicit tcp_socket(Ex const& ex)
+    : tcp_socket(ex.context())
+{
+}
+----
+
+The socket constructor receives `execution_context&` and looks up the socket service. The concrete context type -- `epoll_context`, `iocp_context`, `select_context` -- is irrelevant to the socket.
+
+== The `executor_ref` Design
+
+`executor_ref` is a non-owning, type-erased wrapper for any executor satisfying the `Executor` concept. It is the mechanism by which I/O operations store and use executors without templates.
+
+=== Two Pointers
+
+The entire object is two pointers:
+
+[source,cpp]
+----
+class executor_ref
+{
+    void const* ex_;                     // pointer to the executor
+    detail::executor_vtable const* vt_;  // pointer to the vtable
+};
+----
+
+Two pointers fit in two registers. `executor_ref` can be passed by value as cheaply as passing a pointer. No heap allocation, no small-buffer optimization, no reference counting.
+ +=== Why Not `std::function` or `std::any` + +`std::function` and small-buffer-optimized type erasure wrappers have overhead that executor usage cannot tolerate: + +- **Heap allocation.** `std::function` may allocate when the callable exceeds the SBO threshold. Executor dispatch happens on every I/O completion -- allocation on the hot path is unacceptable. + +- **Reference counting.** `std::shared_ptr`-based wrappers add atomic reference count operations on every copy. Executors are copied frequently as they propagate through coroutine chains. + +- **Indirection.** SBO wrappers store either inline data or a heap pointer, adding a branch on every operation. + +`executor_ref` avoids all three. The vtable pointer goes directly to a `static constexpr` structure in `.rodata`. One indirection, no branches, no allocation. + +=== Why Not {cpp} Virtual Functions + +{cpp} virtual dispatch places the vtable pointer inside each heap-allocated object. Every virtual call chases a pointer from the object to its vtable, which may reside at an unpredictable address in memory. When objects of different types are interleaved on the heap, their vtable pointers point to different locations in `.rodata`, defeating spatial prefetch and polluting the instruction cache. + +`executor_ref` separates the vtable from the object. The vtable is a `static constexpr` structure -- one per executor type, shared by all instances of that type. Because most programs use only one or two executor types (a thread pool executor and perhaps a strand), the vtable stays hot in L1 cache. The executor pointer and the vtable pointer sit adjacent in the `executor_ref` object, so both are loaded in a single cache line. + +=== Reference Semantics + +`executor_ref` stores a pointer to the executor, not a copy. The executor must outlive the `executor_ref`. 
This matches how executors propagate through coroutine chains: the executor is owned by the execution context (which outlives all coroutines running on it), and `executor_ref` is a lightweight handle passed through `await_suspend` and stored in I/O operation structures.
+
+== The I/O Completion Pattern
+
+The executor concept is designed around a single use case: I/O completion dispatch. This pattern is the reason the concept exists.
+
+=== Capture at Initiation
+
+When a coroutine `co_await`s an I/O awaitable, the awaitable's `await_suspend` receives the caller's executor and stores it as `executor_ref`:
+
+[source,cpp]
+----
+template<class Ex>
+auto await_suspend(
+    std::coroutine_handle<> h,
+    Ex const& ex) -> std::coroutine_handle<>
+{
+    // ex is captured as executor_ref in the operation
+    return socket_.connect(h, ex, endpoint_, token_, &ec_);
+}
+----
+
+The operation structure stores both the coroutine handle and the executor reference:
+
+[source,cpp]
+----
+struct io_op : scheduler_op
+{
+    std::coroutine_handle<> h;
+    capy::executor_ref ex;
+    // ... error codes, buffers, etc.
+};
+----
+
+=== Dispatch at Completion
+
+When the I/O completes (from the reactor thread for epoll, the completion port for IOCP, or the select loop), the operation uses the stored executor to resume the coroutine:
+
+[source,cpp]
+----
+void operator()() override
+{
+    // ... set error codes ...
+    capy::executor_ref saved_ex(std::move(ex));
+    std::coroutine_handle<> saved_h(std::move(h));
+    impl_ptr.reset();
+    saved_ex.dispatch(saved_h);
+}
+----
+
+`dispatch` checks whether the current thread is already running on the executor's context. If so, the coroutine resumes inline. If not, the coroutine is posted for later execution on the correct context.
+
+=== Platform Independence
+
+This pattern is identical across all three Corosio backends: epoll (Linux), IOCP (Windows), and select (POSIX fallback). The executor concept and `executor_ref` provide the abstraction that makes this possible.
The backend-specific code deals with I/O readiness or completion notification. The executor-specific code deals with coroutine scheduling. The two concerns are cleanly separated. + +== Why Not `std::execution` (P2300) + +https://wg21.link/P2300[P2300] defines a sender/receiver model where execution context flows _backward_ from receiver to sender via queries after `connect()`: + +---- +task async_work(); // Frame allocated NOW +auto sndr = async_work(); +auto op = connect(sndr, receiver); // Allocator available NOW +start(op); // -- too late +---- + +For coroutines, this ordering is fatal. Coroutine frame allocation happens _before_ the coroutine body executes. The compiler calls `operator new` first, then constructs the promise, then begins execution. Any mechanism that provides the allocator _after_ the coroutine call -- receiver queries, `await_transform`, explicit method calls -- arrives after the frame is already allocated with the wrong (or default) allocator. + +Capy's model flows context _forward_ from launcher to task. The `run_async(ex, alloc)(my_task())` two-phase invocation sets the thread-local allocator _before_ the task expression is evaluated, so `operator new` reads it in time. This is described in detail in xref:8l.RunApi.adoc[Run API]. + +The same forward-flowing model applies to executors. The launcher binds the executor before the task runs. The task's promise stores the executor and propagates it to nested awaitables via `await_transform`. Context flows from caller to callee at every level, never backward. 
+ +== Conforming Signatures + +A minimal executor implementation: + +[source,cpp] +---- +class my_executor +{ +public: + execution_context& context() const noexcept; + + void on_work_started() const noexcept; + void on_work_finished() const noexcept; + + void dispatch(std::coroutine_handle<> h) const; + void post(std::coroutine_handle<> h) const; + + bool operator==(my_executor const&) const noexcept; +}; +---- + +== Summary + +The `Executor` concept provides `dispatch` and `post` for coroutine scheduling, work tracking for event loop lifetime, and `context()` for service access. The design descends from Asio's executor model but is adapted for coroutines: `defer` is replaced by symmetric transfer, function objects are replaced by `std::coroutine_handle<>`, and `dispatch` returns `void` because I/O completions are dispatched after suspension, not during it. + +`executor_ref` type-erases any executor into two pointers, enabling platform-independent I/O completion dispatch with zero allocation and predictable cache behavior. The capture-at-initiation / dispatch-at-completion pattern is the fundamental use case the concept serves. diff --git a/doc/modules/ROOT/pages/8.design/8g.RunApi.adoc b/doc/modules/ROOT/pages/8.design/8l.RunApi.adoc similarity index 52% rename from doc/modules/ROOT/pages/8.design/8g.RunApi.adoc rename to doc/modules/ROOT/pages/8.design/8l.RunApi.adoc index 83d5bbfe..2c9fb6db 100644 --- a/doc/modules/ROOT/pages/8.design/8g.RunApi.adoc +++ b/doc/modules/ROOT/pages/8.design/8l.RunApi.adoc @@ -2,23 +2,13 @@ == Overview -This document explains the naming conventions and call syntax of the -two launcher functions: `run_async` (fire-and-forget from non-coroutine -code) and `run` (awaitable within a coroutine). Both accept any type -satisfying _IoRunnable_ -- not just `task` -- and use a -deliberate **two-phase invocation** pattern -- `f(context)(task)` -- that -exists for a mechanical reason rooted in coroutine frame allocation -timing. 
+This document explains the naming conventions and call syntax of the two launcher functions: `run_async` (fire-and-forget from non-coroutine code) and `run` (awaitable within a coroutine). Both accept any type satisfying _IoRunnable_ -- not just `task` -- and use a deliberate **two-phase invocation** pattern -- `f(context)(task)` -- that exists for a mechanical reason rooted in coroutine frame allocation timing. == Usage === `run_async` -- Fire-and-Forget Launch -`run_async` launches any _IoRunnable_ from non-coroutine code: -`main()`, callback handlers, event loops. `task` is the most common -conforming type, but any user-defined type satisfying the concept works. -The function does not return a value to the caller. Handlers receive the -task's result or exception after completion. +`run_async` launches any _IoRunnable_ from non-coroutine code: `main()`, callback handlers, event loops. `task` is the most common conforming type, but any user-defined type satisfying the concept works. The function does not return a value to the caller. Handlers receive the task's result or exception after completion. [source,cpp] ---- @@ -49,11 +39,7 @@ run_async(ex, st, alloc, h1, h2)(my_task()); === `run` -- Awaitable Launch Within a Coroutine -`run` is the coroutine-side counterpart. It binds any -_IoRunnable_ to a (possibly different) executor and returns -the result to the caller via `co_await`. It also supports overloads -that customize stop token or allocator while inheriting the caller's -executor. +`run` is the coroutine-side counterpart. It binds any _IoRunnable_ to a (possibly different) executor and returns the result to the caller via `co_await`. It also supports overloads that customize stop token or allocator while inheriting the caller's executor. 
[source,cpp] ---- @@ -86,8 +72,7 @@ task full_control() === `run_async` on a Strand -A common pattern for launching per-connection coroutines on a -strand, ensuring serialized access to connection state: +A common pattern for launching per-connection coroutines on a strand, ensuring serialized access to connection state: [source,cpp] ---- @@ -100,9 +85,7 @@ void on_accept(tcp::socket sock) == Alternatives Considered -Several alternative naming and syntax proposals were evaluated and -discarded. The following table shows each rejected form alongside the -chosen form. +Several alternative naming and syntax proposals were evaluated and discarded. The following table shows each rejected form alongside the chosen form. === Builder Pattern: `on` / `with` / `spawn` / `call` @@ -141,8 +124,7 @@ co_await run(worker_ex)(compute()); co_await run(my_alloc)(subtask()); ---- -The builder pattern reads well as English, but it creates problems -in {cpp} practice. See <> below for the full analysis. +The builder pattern reads well as English, but it creates problems in {cpp} practice. See <> below for the full analysis. === Single-Call with Named Method @@ -152,11 +134,7 @@ in {cpp} practice. See <> below for the full analysis. run_async(ex, my_task()); ---- -This fails the allocator timing constraint entirely. The task -argument `my_task()` is evaluated _before_ `run_async` can set -the thread-local allocator. The coroutine frame is allocated with -the wrong (or no) allocator. This is not a style preference -- it -is a correctness bug. +This fails the allocator timing constraint entirely. The task argument `my_task()` is evaluated _before_ `run_async` can set the thread-local allocator. The coroutine frame is allocated with the wrong (or no) allocator. This is not a style preference -- it is a correctness bug. 
=== Named Method on Wrapper @@ -167,15 +145,7 @@ run_async(ex).spawn(my_task()); co_await run(ex).call(compute()); ---- -This preserves the two-phase timing guarantee and avoids the -namespace collision problems of `on`/`with`. The objection is minor: -`.spawn()` and `.call()` add vocabulary without adding clarity. The -wrapper already has exactly one purpose -- accepting a task. A named -method implies the wrapper has a richer interface than it does. -`operator()` is the conventional {cpp} way to express "this object -does exactly one thing." That said, this alternative has legs and -could be revisited if the `()()` syntax proves too confusing in -practice. +This preserves the two-phase timing guarantee and avoids the namespace collision problems of `on`/`with`. The objection is minor: `.spawn()` and `.call()` add vocabulary without adding clarity. The wrapper already has exactly one purpose -- accepting a task. A named method implies the wrapper has a richer interface than it does. `operator()` is the conventional {cpp} way to express "this object does exactly one thing." That said, this alternative has legs and could be revisited if the `()()` syntax proves too confusing in practice. == The Names @@ -183,38 +153,20 @@ practice. The `run` prefix was chosen for several reasons: -- **Greppability.** Searching for `run_async(` or `run(` in a - codebase produces unambiguous results. Short, common English words - like `on` or `with` collide with local variable names, parameter - names, and other libraries. A `using namespace capy;` combined with - a local variable named `on` produces silent shadowing bugs. - -- **Verb clarity.** `run` tells you what happens: something executes. - `run_async` tells you it executes without waiting. `run` inside a - coroutine tells you control transfers and returns. Prepositions like - `on` and `with` say nothing about the action -- they are sentence - fragments waiting for a verb. 
- -- **Discoverability.** The `run_*` family groups together in - documentation, autocompletion, and alphabetical listings. Users - searching for "how do I launch a task" find `run_async` and `run` - as a coherent pair. - -- **Consistency.** The naming follows the established pattern from - `io_context::run()`, `std::jthread`, and other {cpp} APIs where - `run` means "begin executing work." - -- **No false promises.** A builder-pattern syntax like - `on(ex).spawn(t)` implies composability -- `on(ex).with(alloc).call(t)` -- - that the API does not deliver. The `f(x)(t)` pattern is honest about - being exactly two steps, no more. It does not invite users to chain - methods that do not exist. +- **Greppability.** Searching for `run_async(` or `run(` in a codebase produces unambiguous results. Short, common English words like `on` or `with` collide with local variable names, parameter names, and other libraries. A `using namespace capy;` combined with a local variable named `on` produces silent shadowing bugs. + +- **Verb clarity.** `run` tells you what happens: something executes. `run_async` tells you it executes without waiting. `run` inside a coroutine tells you control transfers and returns. Prepositions like `on` and `with` say nothing about the action -- they are sentence fragments waiting for a verb. + +- **Discoverability.** The `run_*` family groups together in documentation, autocompletion, and alphabetical listings. Users searching for "how do I launch a task" find `run_async` and `run` as a coherent pair. + +- **Consistency.** The naming follows the established pattern from `io_context::run()`, `std::jthread`, and other {cpp} APIs where `run` means "begin executing work." + +- **No false promises.** A builder-pattern syntax like `on(ex).spawn(t)` implies composability -- `on(ex).with(alloc).call(t)` -- that the API does not deliver. The `f(x)(t)` pattern is honest about being exactly two steps, no more. 
It does not invite users to chain methods that do not exist. [[why-not-builder]] === Why Not a Builder Pattern -An alternative proposal suggested replacing the two-call syntax with a -builder-style API: +An alternative proposal suggested replacing the two-call syntax with a builder-style API: [source,cpp] ---- @@ -226,13 +178,9 @@ co_await capy::with(alloc).call(subtask()); capy::on(ex).block(my_task()); ---- -While the English readability of `on(ex).spawn(t)` is genuinely -appealing, the approach has practical problems in a Boost library: +While the English readability of `on(ex).spawn(t)` is genuinely appealing, the approach has practical problems in a Boost library: -- **Namespace pollution.** `on` and `with` are among the most - common English words in programming. In a Boost library used - alongside dozens of other namespaces, these names invite collisions. - Consider what happens with `using namespace capy;`: +- **Namespace pollution.** `on` and `with` are among the most common English words in programming. In a Boost library used alongside dozens of other namespaces, these names invite collisions. Consider what happens with `using namespace capy;`: + [source,cpp] ---- @@ -243,14 +191,9 @@ void handle(auto with) { // parameter name with(alloc).call(sub()); // won't compile } ---- -+ -The names `run` and `run_async` do not have this problem. No one -names their variables `run_async`. ++ The names `run` and `run_async` do not have this problem. No one names their variables `run_async`. -- **Semantic ambiguity.** `with(st)` versus `with(alloc)` -- with - _what_, exactly? The current API uses `run(st)` and `run(alloc)` - where overload resolution disambiguates naturally because the verb - `run` provides context. A bare preposition provides none. +- **Semantic ambiguity.** `with(st)` versus `with(alloc)` -- with _what_, exactly? 
The current API uses `run(st)` and `run(alloc)` where overload resolution disambiguates naturally because the verb `run` provides context. A bare preposition provides none. + [source,cpp] ---- @@ -261,8 +204,7 @@ co_await capy::with(x).call(subtask()); co_await run(x)(subtask()); ---- -- **Builder illusion.** Dot-chaining suggests composability that does - not exist. Users will naturally try: +- **Builder illusion.** Dot-chaining suggests composability that does not exist. Users will naturally try: + [source,cpp] ---- @@ -270,46 +212,27 @@ co_await run(x)(subtask()); capy::on(ex).with(alloc).call(my_task()); capy::on(ex).with(st).with(alloc).spawn(my_task(), h1, h2); ---- -+ -The current syntax makes the interface boundary explicit: the first -call captures _all_ context, the second call accepts the task. There -is no dot-chain to extend. - -- **Erases the test boundary.** `run_blocking` lives in - `capy::test` deliberately -- it is a test utility, not a - production API. The proposed `on(ex).block(t)` places it alongside - `.spawn()` and `.call()` as if it were a first-class production - method. That is a promotion this API has not earned. - -- **Hidden critical ordering.** The two-phase invocation exists for - a mechanical reason (allocator timing, described below). With - `on(ex).spawn(t)`, the critical sequencing guarantee is buried - behind what looks like a casual method call. The `()()` syntax is - pedagogically valuable -- it signals that something important - happens in two distinct steps. - -- **Overload count does not shrink.** `run_async` has 18 overloads - for good reason (executor x stop_token x allocator x handlers). - The builder pattern still needs all those combinations -- they - just move from free function overloads to constructor or method - overloads. The complexity does not vanish; it relocates. ++ The current syntax makes the interface boundary explicit: the first call captures _all_ context, the second call accepts the task. 
There is no dot-chain to extend. + +- **Erases the test boundary.** `run_blocking` lives in `capy::test` deliberately -- it is a test utility, not a production API. The proposed `on(ex).block(t)` places it alongside `.spawn()` and `.call()` as if it were a first-class production method. That is a promotion this API has not earned. + +- **Hidden critical ordering.** The two-phase invocation exists for a mechanical reason (allocator timing, described below). With `on(ex).spawn(t)`, the critical sequencing guarantee is buried behind what looks like a casual method call. The `()()` syntax is pedagogically valuable -- it signals that something important happens in two distinct steps. + +- **Overload count does not shrink.** `run_async` has 18 overloads for good reason (executor x stop_token x allocator x handlers). The builder pattern still needs all those combinations -- they just move from free function overloads to constructor or method overloads. The complexity does not vanish; it relocates. == The Two-Phase Invocation === The Problem: Allocator Timing -Coroutine frame allocation happens _before_ the coroutine body -executes. When the compiler encounters a coroutine call, it: +Coroutine frame allocation happens _before_ the coroutine body executes. When the compiler encounters a coroutine call, it: 1. Calls `operator new` to allocate the frame 2. Constructs the promise object 3. Begins execution of the coroutine body -Any mechanism that injects the allocator _after_ the call -- receiver -queries, `await_transform`, explicit method calls -- arrives too late. -The frame is already allocated. +Any mechanism that injects the allocator _after_ the call -- receiver queries, `await_transform`, explicit method calls -- arrives too late. The frame is already allocated. 
-This is the fundamental tension identified in D4003 §3.3: +This is the fundamental tension identified in D4003 §3.3: [quote] ____ @@ -323,8 +246,7 @@ ____ === The Solution: {cpp}17 Postfix Evaluation Order -{cpp}17 guarantees that in a postfix-expression call, the -postfix-expression is sequenced before the argument expressions: +{cpp}17 guarantees that in a postfix-expression call, the postfix-expression is sequenced before the argument expressions: [quote] ____ @@ -334,13 +256,9 @@ ____ In the expression `run_async(ex)(my_task())`: -1. `run_async(ex)` evaluates first. This returns a wrapper object - (`run_async_wrapper`) whose constructor calls `set_current_frame_allocator()` - -- storing a thread-local pointer to the memory resource. -2. `my_task()` evaluates second. The coroutine's `operator new` reads - the thread-local pointer and allocates the frame from it. -3. `operator()` on the wrapper takes ownership of the task and - dispatches it to the executor. +1. `run_async(ex)` evaluates first. This returns a wrapper object (`run_async_wrapper`) whose constructor calls `set_current_frame_allocator()` -- storing a thread-local pointer to the memory resource. +2. `my_task()` evaluates second. The coroutine's `operator new` reads the thread-local pointer and allocates the frame from it. +3. `operator()` on the wrapper takes ownership of the task and dispatches it to the executor. [source,cpp] ---- @@ -351,9 +269,7 @@ In the expression `run_async(ex)(my_task())`: // Step 2: task frame allocated using TLS allocator ---- -This sequencing is not an implementation detail -- it is the -_only correct way_ to inject an allocator into a coroutine's frame -allocation when the allocator is not known at compile time. +This sequencing is not an implementation detail -- it is the _only correct way_ to inject an allocator into a coroutine's frame allocation when the allocator is not known at compile time. 
=== How It Works in the Code @@ -384,8 +300,7 @@ static void* operator new(std::size_t size) } ---- -The wrapper is `[[nodiscard]]` and its `operator()` is -rvalue-ref-qualified, preventing misuse: +The wrapper is `[[nodiscard]]` and its `operator()` is rvalue-ref-qualified, preventing misuse: [source,cpp] ---- @@ -399,22 +314,13 @@ w(my_task()); // Error: requires rvalue === The `run` Variant -The `run` function uses the same two-phase pattern inside coroutines. -An additional subtlety arises: the wrapper is a temporary that dies -before `co_await` suspends the caller. The wrapper's -`frame_memory_resource` would be destroyed before the child task -executes. +The `run` function uses the same two-phase pattern inside coroutines. An additional subtlety arises: the wrapper is a temporary that dies before `co_await` suspends the caller. The wrapper's `frame_memory_resource` would be destroyed before the child task executes. -The solution is to store a _copy_ of the allocator in the awaitable -returned by `operator()`. Since standard allocator copies are -equivalent -- memory allocated with one copy can be deallocated with -another -- this preserves correctness while keeping the allocator -alive for the task's duration. +The solution is to store a _copy_ of the allocator in the awaitable returned by `operator()`. Since standard allocator copies are equivalent -- memory allocated with one copy can be deallocated with another -- this preserves correctness while keeping the allocator alive for the task's duration. 
=== Comparison with `std::execution` -In `std::execution` (P2300), context flows _backward_ from receiver -to sender via queries _after_ `connect()`: +In `std::execution` (P2300), context flows _backward_ from receiver to sender via queries _after_ `connect()`: ---- task async_work(); // Frame allocated NOW @@ -423,8 +329,7 @@ auto op = connect(sndr, receiver); // Allocator available NOW -- too late start(op); ---- -In the _IoAwaitable_ model, context flows _forward_ from launcher to -task: +In the _IoAwaitable_ model, context flows _forward_ from launcher to task: ---- 1. Set TLS allocator --> 2. Call task() @@ -432,8 +337,7 @@ task: 4. await_suspend ---- -The allocator is ready before the frame is created. No query -machinery can retroactively fix an allocation that already happened. +The allocator is ready before the frame is created. No query machinery can retroactively fix an allocation that already happened. == Summary @@ -443,10 +347,4 @@ machinery can retroactively fix an allocation that already happened. | `co_await run(ctx)(task)` | Awaitable launch within a coroutine |=== -The `run` name is greppable, unambiguous, and won't collide with -local variables in a namespace-heavy Boost codebase. The `f(ctx)(task)` -syntax exists because coroutine frame allocation requires the -allocator to be set _before_ the task expression is evaluated, and -{cpp}17 postfix sequencing guarantees exactly that ordering. The syntax -is intentionally explicit about its two steps -- it tells the reader -that something important happens between them. +The `run` name is greppable, unambiguous, and won't collide with local variables in a namespace-heavy Boost codebase. The `f(ctx)(task)` syntax exists because coroutine frame allocation requires the allocator to be set _before_ the task expression is evaluated, and {cpp}17 postfix sequencing guarantees exactly that ordering. 
The syntax is intentionally explicit about its two steps -- it tells the reader that something important happens between them. diff --git a/doc/modules/ROOT/pages/why-not-cobalt.adoc b/doc/modules/ROOT/pages/8.design/8m.WhyNotCobalt.adoc similarity index 98% rename from doc/modules/ROOT/pages/why-not-cobalt.adoc rename to doc/modules/ROOT/pages/8.design/8m.WhyNotCobalt.adoc index 20d001b4..a0b1635d 100644 --- a/doc/modules/ROOT/pages/why-not-cobalt.adoc +++ b/doc/modules/ROOT/pages/8.design/8m.WhyNotCobalt.adoc @@ -134,7 +134,7 @@ Templates can achieve this by type-erasing every customization point. The cost m == Stream Concepts -Capy defines seven coroutine-only stream concepts. Cobalt inherits Asio's `AsyncReadStream` and `AsyncWriteStream`, which are hybrid concepts supporting callbacks, futures, and coroutines. Cobalt's `cobalt::io` wrappers simplify the API and Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes, a distinct approach from Capy's concept-based hierarchy. Cobalt's wrappers still include full Asio headers. See xref:why-not-cobalt-2.adoc[Write Stream Design] for a detailed comparison of the two approaches. +Capy defines seven coroutine-only stream concepts. Cobalt inherits Asio's `AsyncReadStream` and `AsyncWriteStream`, which are hybrid concepts supporting callbacks, futures, and coroutines. Cobalt's `cobalt::io` wrappers simplify the API and Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes, a distinct approach from Capy's concept-based hierarchy. Cobalt's wrappers still include full Asio headers. See xref:8.design/8n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a detailed comparison of the two approaches. Capy's concepts form a refinement hierarchy that emerged naturally from use-case-first design: @@ -192,7 +192,7 @@ Traditional approaches to type erasure in Asio focus on the lowest-level element Capy type-erases the stream itself. 
This is possible because coroutines provide structural type erasure — the continuation is always a handle, not a template parameter. When the library is coroutines-only, one virtual call per I/O operation is the total cost. The completion handler, executor, and allocator do not need individual erasure because they are not part of the stream's operation signature. -Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes in `cobalt/io/stream.hpp`, taking a different approach from Capy's concept + type-erased wrapper model. See xref:why-not-cobalt-2.adoc[Write Stream Design] for a side-by-side analysis. +Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes in `cobalt/io/stream.hpp`, taking a different approach from Capy's concept + type-erased wrapper model. See xref:8.design/8n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a side-by-side analysis. The wrappers compose. `any_buffer_source` also satisfies `ReadSource` — natively if the wrapped type supports both, synthesized otherwise. `any_buffer_sink` also satisfies `WriteSink`. You pick the abstraction level you need. @@ -245,7 +245,7 @@ This is how the Dimovian Ideal is mechanically achieved. == Mock Streams and Testability -When algorithms operate on type-erased interfaces, testing becomes deterministic. Capy provides mock implementations for every stream concept. Cobalt defines stream abstractions as abstract base classes but does not provide mock implementations for testing. See xref:why-not-cobalt-2.adoc[Write Stream Design] for a comparison of the two stream designs. +When algorithms operate on type-erased interfaces, testing becomes deterministic. Capy provides mock implementations for every stream concept. Cobalt defines stream abstractions as abstract base classes but does not provide mock implementations for testing. 
See xref:8.design/8n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a comparison of the two stream designs. Capy's mock types: diff --git a/doc/modules/ROOT/pages/why-not-cobalt-2.adoc b/doc/modules/ROOT/pages/8.design/8n.WhyNotCobaltConcepts.adoc similarity index 100% rename from doc/modules/ROOT/pages/why-not-cobalt-2.adoc rename to doc/modules/ROOT/pages/8.design/8n.WhyNotCobaltConcepts.adoc diff --git a/doc/modules/ROOT/pages/why-not-tmc.adoc b/doc/modules/ROOT/pages/8.design/8o.WhyNotTMC.adoc similarity index 100% rename from doc/modules/ROOT/pages/why-not-tmc.adoc rename to doc/modules/ROOT/pages/8.design/8o.WhyNotTMC.adoc diff --git a/doc/unlisted/library-executors.adoc b/doc/unlisted/library-executors.adoc index 2069f37f..6bad016e 100644 --- a/doc/unlisted/library-executors.adoc +++ b/doc/unlisted/library-executors.adoc @@ -141,8 +141,11 @@ async_mutex cm; task modify() { - auto lock = co_await cm.lock(); + auto [ec] = co_await cm.lock(); + if(ec) + co_return; shared_data.modify(); + cm.unlock(); } ---- diff --git a/doc/unlisted/synchronization-coro-lock.adoc b/doc/unlisted/synchronization-coro-lock.adoc index 2a7f4bd5..eb11d11b 100644 --- a/doc/unlisted/synchronization-coro-lock.adoc +++ b/doc/unlisted/synchronization-coro-lock.adoc @@ -36,7 +36,9 @@ coro_lock cm; task good_example() { - co_await cm.lock(); // Coroutine suspends, thread is free + auto [ec] = co_await cm.lock(); + if(ec) + co_return; // ... critical section ... 
cm.unlock(); co_return; @@ -57,7 +59,9 @@ coro_lock cm; task protected_operation() { - co_await cm.lock(); + auto [ec] = co_await cm.lock(); + if(ec) + co_return; // Only one coroutine executes this section at a time do_work(); cm.unlock(); @@ -85,7 +89,9 @@ coro_lock cm; task example() { - co_await cm.lock(); + auto [ec] = co_await cm.lock(); + if(ec) + co_return; // Critical section cm.unlock(); } @@ -103,7 +109,9 @@ coro_lock cm; task example() { - auto guard = co_await cm.scoped_lock(); + auto [ec, guard] = co_await cm.scoped_lock(); + if(ec) + co_return; // Critical section // Guard unlocks automatically on scope exit } @@ -117,7 +125,7 @@ The `lock_guard` class provides RAII semantics: [source,cpp] ---- -coro_lock::lock_guard guard = co_await cm.scoped_lock(); +auto [ec, guard] = co_await cm.scoped_lock(); // Move to extend lifetime coro_lock::lock_guard g2 = std::move(guard); @@ -157,7 +165,9 @@ coro_lock cm; task multi_threaded_safe() { co_await run(s)([&]() -> task { - auto guard = co_await cm.scoped_lock(); + auto [ec, guard] = co_await cm.scoped_lock(); + if(ec) + co_return; // Now safe: strand serializes, mutex excludes co_return; }()); @@ -176,13 +186,17 @@ class shared_counter public: task increment() { - auto guard = co_await cm_.scoped_lock(); + auto [ec, guard] = co_await cm_.scoped_lock(); + if(ec) + co_return; ++value_; } task get() { - auto guard = co_await cm_.scoped_lock(); + auto [ec, guard] = co_await cm_.scoped_lock(); + if(ec) + co_return 0; co_return value_; } }; @@ -202,7 +216,9 @@ public: task write(std::string_view data) { - auto guard = co_await cm_.scoped_lock(); + auto [ec, guard] = co_await cm_.scoped_lock(); + if(ec) + co_return; // Only one write at a time co_await file_.async_write(data); } diff --git a/example/allocation/CMakeLists.txt b/example/allocation/CMakeLists.txt index 15a2aa94..cb3eeec9 100644 --- a/example/allocation/CMakeLists.txt +++ b/example/allocation/CMakeLists.txt @@ -7,6 +7,14 @@ # Official repository: 
https://github.com/cppalliance/capy # +include(FetchContent) +FetchContent_Declare(mimalloc + GIT_REPOSITORY https://github.com/microsoft/mimalloc + GIT_TAG v2.2.7 + GIT_SHALLOW TRUE) +set(MI_BUILD_TESTS OFF CACHE BOOL "Disable mimalloc tests" FORCE) +FetchContent_MakeAvailable(mimalloc) + file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp CMakeLists.txt Jamfile) @@ -19,4 +27,5 @@ set_property(TARGET capy_example_allocation PROPERTY FOLDER "examples") target_link_libraries(capy_example_allocation - Boost::capy) + Boost::capy + mimalloc-static) diff --git a/example/allocation/Jamfile b/example/allocation/Jamfile index b62a4e87..937f690b 100644 --- a/example/allocation/Jamfile +++ b/example/allocation/Jamfile @@ -7,6 +7,9 @@ # Official repository: https://github.com/cppalliance/capy # +# Requires mimalloc (https://github.com/microsoft/mimalloc) +# installed where the compiler can find it. + project : requirements /boost/capy//boost_capy @@ -15,4 +18,6 @@ project exe allocation : [ glob *.cpp ] + : + -lmimalloc ; diff --git a/example/allocation/allocation.cpp b/example/allocation/allocation.cpp index 1cd49b8a..c34831d4 100644 --- a/example/allocation/allocation.cpp +++ b/example/allocation/allocation.cpp @@ -10,20 +10,21 @@ // // Allocation Example // -// Compares the performance of the default recycling frame allocator -// against std::allocator (no recycling). A 4-deep coroutine chain -// is invoked 20 million times using test::run_blocking, once with -// each allocator. +// Compares the performance of three frame allocators: the default +// recycling allocator, mimalloc, and std::allocator (no recycling). +// A 4-deep coroutine chain is invoked 2 million times with each. 
// #include #include +#include #include #include #include #include #include #include +#include // Prevent HALO from eliding coroutine frame allocations #if defined(_MSC_VER) @@ -38,6 +39,39 @@ using namespace boost::capy; std::atomic counter{0}; +// Adapts mimalloc to std::pmr::memory_resource +class mi_memory_resource + : public std::pmr::memory_resource +{ +protected: + void* + do_allocate( + std::size_t bytes, + std::size_t alignment) override + { + void* p = mi_malloc_aligned(bytes, alignment); + if(! p) + throw std::bad_alloc(); + return p; + } + + void + do_deallocate( + void* p, + std::size_t, + std::size_t alignment) override + { + mi_free_aligned(p, alignment); + } + + bool + do_is_equal( + memory_resource const& other) const noexcept override + { + return this == &other; + } +}; + // These coroutines simulate a "composed operation" // consisting of layered APIs. For example a user's // business logic awaiting an HTTP client, awaiting @@ -90,35 +124,59 @@ int main() } auto t1 = std::chrono::steady_clock::now(); - // With std::allocator (no recycling) + // With mimalloc counter.store(0); + mi_memory_resource mi_mr; auto t2 = std::chrono::steady_clock::now(); { test::blocking_context ctx; - run_async(ctx.get_executor(), std::allocator{}, + ctx.set_frame_allocator(&mi_mr); + run_async(ctx.get_executor(), [&] { ctx.signal_done(); })( bench_loop(iterations)); ctx.run(); } auto t3 = std::chrono::steady_clock::now(); + // With std::allocator (no recycling) + counter.store(0); + auto t4 = std::chrono::steady_clock::now(); + { + test::blocking_context ctx; + run_async(ctx.get_executor(), std::allocator{}, + [&] { ctx.signal_done(); })( + bench_loop(iterations)); + ctx.run(); + } + auto t5 = std::chrono::steady_clock::now(); + auto ms_recycling = std::chrono::duration(t1 - t0).count(); - auto ms_standard = + auto ms_mimalloc = std::chrono::duration(t3 - t2).count(); + auto ms_standard = + std::chrono::duration(t5 - t4).count(); - auto pct = std::round((ms_standard / 
ms_recycling - 1.0) * 1000.0) / 10.0; + auto pct_rc_std = std::round( + (ms_standard / ms_recycling - 1.0) * 1000.0) / 10.0; + auto pct_mi_std = std::round( + (ms_standard / ms_mimalloc - 1.0) * 1000.0) / 10.0; + auto pct_rc_mi = std::round( + (ms_mimalloc / ms_recycling - 1.0) * 1000.0) / 10.0; std::cout << iterations << " iterations, " << "4-deep coroutine chain\n\n" + << std::fixed << std::setprecision(1) << " Recycling allocator: " - << ms_recycling << " ms\n" + << ms_recycling << " ms (+" + << pct_rc_std << "% vs std, +" + << pct_rc_mi << "% vs mimalloc)\n" + << " mimalloc: " + << ms_mimalloc << " ms (+" + << pct_mi_std << "% vs std)\n" << " std::allocator: " - << ms_standard << " ms\n" - << " Speedup: " - << std::fixed << std::setprecision(1) - << pct << "%\n"; + << ms_standard << " ms\n"; return 0; } diff --git a/test/unit/Jamfile b/test/unit/Jamfile index df4c0a51..76418db8 100644 --- a/test/unit/Jamfile +++ b/test/unit/Jamfile @@ -26,12 +26,33 @@ project for local f in [ glob-tree-ex . : *.cpp : file*.cpp ] { - run $(f) ; + local parts = [ SPLIT_BY_CHARACTERS $(f:S=) : / ] ; + local name = $(parts:J=_) ; + if $(name) = test_stream + { + # GCC false positive: structured bindings used inside + # coroutine frames trigger -Wmaybe-uninitialized because + # the coroutine lowering pass splits the code into a state + # machine, and GCC's dataflow analysis cannot prove that + # the anonymous temporary from + # `auto [a, b] = make_stream_pair(f)` is fully initialized. + run $(f) + : target-name $(name) + : requirements + gcc:-Wno-maybe-uninitialized + ; + } + else + { + run $(f) : target-name $(name) ; + } } for local f in [ glob-tree-ex . 
: file*.cpp ] { + local parts = [ SPLIT_BY_CHARACTERS $(f:S=) : / ] ; run $(f) + : target-name $(parts:J=_) : requirements off norecover:static ; diff --git a/test/unit/test/stream.cpp b/test/unit/test/stream.cpp index 77b0d4a9..29459421 100644 --- a/test/unit/test/stream.cpp +++ b/test/unit/test/stream.cpp @@ -737,7 +737,7 @@ class stream_pair_test auto [a, b] = make_stream_pair(f); co_await when_all( - [&a]() -> task<> { + [](stream a) -> task<> { char buf[32] = {}; auto [ec, n] = co_await a.read_some( make_buffer(buf)); @@ -747,14 +747,14 @@ class stream_pair_test BOOST_TEST_EQ( std::string_view(buf, n), "hello"); - }(), - [&b]() -> task<> { + }(std::move(a)), + [](stream b) -> task<> { auto [ec, n] = co_await b.write_some( make_buffer("hello", 5)); if(ec) co_return; BOOST_TEST_EQ(n, 5u); - }() + }(std::move(b)) ); }); BOOST_TEST(r.success); @@ -954,7 +954,7 @@ class stream_pair_test auto [a, b] = make_stream_pair(f); co_await when_all( - [&a]() -> task<> { + [](stream a) -> task<> { char buf[3] = {}; auto [ec, n] = co_await a.read_some( make_buffer(buf)); @@ -964,14 +964,14 @@ class stream_pair_test BOOST_TEST_EQ( std::string_view(buf, n), "hel"); - }(), - [&b]() -> task<> { + }(std::move(a)), + [](stream b) -> task<> { auto [ec, n] = co_await b.write_some( make_buffer("hello", 5)); if(ec) co_return; BOOST_TEST_EQ(n, 5u); - }() + }(std::move(b)) ); }); BOOST_TEST(r.success); @@ -1043,17 +1043,17 @@ class stream_pair_test auto [a, b] = make_stream_pair(f); co_await when_all( - [&a]() -> task<> { + [](stream a) -> task<> { char buf[32] = {}; auto [ec, n] = co_await a.read_some( make_buffer(buf)); BOOST_TEST(ec == cond::eof); BOOST_TEST_EQ(n, 0u); - }(), - [&b]() -> task<> { + }(std::move(a)), + [](stream b) -> task<> { b.close(); co_return; - }() + }(std::move(b)) ); }()); } @@ -1100,7 +1100,7 @@ class stream_pair_test auto [a, b] = make_stream_pair(f); co_await when_all( - [&a]() -> task<> { + [](stream a) -> task<> { // Reader suspends waiting for data. 
// Gets data, eof from peer's guard, // or its own fuse error on resume. @@ -1110,8 +1110,8 @@ class stream_pair_test if(ec) co_return; BOOST_TEST_EQ(n, 5u); - }(), - [&b]() -> task<> { + }(std::move(a)), + [](stream b) -> task<> { // Writer may get fuse error, which // closes the peer via the guard auto [ec, n] = co_await b.write_some( @@ -1119,7 +1119,7 @@ class stream_pair_test if(ec) co_return; BOOST_TEST_EQ(n, 5u); - }() + }(std::move(b)) ); }); BOOST_TEST(r.success);