diff options
110 files changed, 8957 insertions, 2021 deletions
diff --git a/.ci/woodpecker/10-build.yaml b/.ci/woodpecker/10-build.yaml index 0a82c469..31b9b9b4 100644 --- a/.ci/woodpecker/10-build.yaml +++ b/.ci/woodpecker/10-build.yaml @@ -88,13 +88,6 @@ steps: done done - for rxm_heap in TRUE FALSE; do - for rxm_block in TRUE FALSE; do - echo "--- HEAP=$rxm_heap BLOCKING=$rxm_block ---" - run_build \ - -DRXM_BUFFER_ON_HEAP=$rxm_heap \ - -DRXM_BLOCKING=$rxm_block - done - done + run_build @@ -1,3 +1,4 @@ *~ *# build/ +/tags diff --git a/CMakeLists.txt b/CMakeLists.txt index c886146d..bfabd711 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,5 +69,6 @@ add_subdirectory(src/ipcpd) add_subdirectory(src/tools) setup_coverage_target() include(doc) +include(tags) include(install) diff --git a/cmake/config/ipcp/broadcast.cmake b/cmake/config/ipcp/broadcast.cmake index 79f41d10..f521ed8e 100644 --- a/cmake/config/ipcp/broadcast.cmake +++ b/cmake/config/ipcp/broadcast.cmake @@ -4,3 +4,6 @@ set(IPCP_BROADCAST_TARGET ipcpd-broadcast) set(IPCP_BROADCAST_MPL 100 CACHE STRING "Default maximum packet lifetime for the Broadcast IPCP, in ms") + +set(IPCP_BROADCAST_MTU 1400 CACHE STRING + "Layer MTU advertised by the Broadcast IPCP, in bytes") diff --git a/cmake/config/ipcp/common.cmake b/cmake/config/ipcp/common.cmake index ffd5dc32..0c873b76 100644 --- a/cmake/config/ipcp/common.cmake +++ b/cmake/config/ipcp/common.cmake @@ -41,3 +41,13 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux") set(IPCP_LINUX_TIMERSLACK_NS 100 CACHE STRING "Slack value for high resolution timers on Linux systems.") endif() + +# ipcpd-eth flow statistics (requires FUSE - eth.c relies on +# IPCP_ETH_FLOW_STATS implying HAVE_FUSE for rib_reg/rib_unreg) +if(HAVE_FUSE) + set(IPCP_ETH_FLOW_STATS FALSE CACHE BOOL + "Enable ipcpd-eth flow statistics via RIB") + if(IPCP_ETH_FLOW_STATS) + message(STATUS "ipcpd-eth flow statistics enabled") + endif() +endif() diff --git a/cmake/config/ipcp/eth.cmake b/cmake/config/ipcp/eth.cmake index 4b9007d2..d336d647 100644 --- a/cmake/config/ipcp/eth.cmake +++ b/cmake/config/ipcp/eth.cmake @@ -10,6 +10,10 @@ set(IPCP_ETH_WR_THR 1 CACHE STRING "Number of writer threads in Ethernet IPCP") set(IPCP_ETH_QDISC_BYPASS false CACHE BOOL "Bypass the Qdisc in the kernel when using raw sockets") +set(IPCP_ETH_SNDBUF 0 CACHE STRING + "Raw socket SO_SNDBUF in bytes; 0 = leave kernel default (wmem_default)") +set(IPCP_ETH_RCVBUF 0 CACHE STRING + "Raw socket SO_RCVBUF in bytes; 0 = leave kernel default (rmem_default)") set(IPCP_ETH_LO_MTU 9000 CACHE STRING "Restrict Ethernet MTU over loopback interfaces") set(IPCP_ETH_MGMT_FRAME_SIZE 9000 CACHE STRING diff --git a/cmake/config/ipcp/local.cmake b/cmake/config/ipcp/local.cmake index 88ee8998..70423cd1 100644 --- a/cmake/config/ipcp/local.cmake +++ b/cmake/config/ipcp/local.cmake @@ -2,8 +2,38 @@ set(IPCP_LOCAL_TARGET ipcpd-local) -set(IPCP_LOCAL_MPL 100 CACHE STRING +set(IPCP_LOCAL_MPL 50 CACHE STRING "Default maximum packet lifetime for the Local IPCP, in ms") +set(IPCP_LOCAL_MTU 65000 CACHE STRING + "Layer MTU advertised by the Local IPCP, in bytes") + set(IPCP_LOCAL_POLLING FALSE CACHE BOOL "Enable active polling in the Local IPCP for low-latency mode") + +# IPCP_LOCAL_MTU must fit in the largest enabled GSPP and PUP class +# (sender-side allocation: daemons use GSPP, apps use PUP). Reserve a +# margin for sizeof(struct ssm_pk_buff) + HEADSPACE + TAILSPACE. +math(EXPR _ssm_pk_overhead + "${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE} + 64") + +foreach(_pool GSPP PUP) + set(_largest 0) + foreach(_pair "256;256" "512;512" "1K;1024" "2K;2048" "4K;4096" + "16K;16384" "64K;65536" "256K;262144" "1M;1048576") + list(GET _pair 0 _name) + list(GET _pair 1 _bytes) + if(SSM_${_pool}_${_name}_BLOCKS GREATER 0 + AND _bytes GREATER _largest) + set(_largest ${_bytes}) + endif() + endforeach() + math(EXPR _avail "${_largest} - ${_ssm_pk_overhead}") + if(IPCP_LOCAL_MTU GREATER _avail) + message(FATAL_ERROR + "IPCP_LOCAL_MTU (${IPCP_LOCAL_MTU}) exceeds largest enabled " + "SSM_${_pool} class minus per-block overhead " + "(${_largest} - ${_ssm_pk_overhead} = ${_avail} bytes). " + "Lower IPCP_LOCAL_MTU or enable a larger SSM_${_pool}_*_BLOCKS.") + endif() +endforeach() diff --git a/cmake/config/ipcp/udp.cmake b/cmake/config/ipcp/udp.cmake index 0124c261..af84a844 100644 --- a/cmake/config/ipcp/udp.cmake +++ b/cmake/config/ipcp/udp.cmake @@ -10,3 +10,7 @@ set(IPCP_UDP_WR_THR 3 CACHE STRING "Number of writer threads in UDP IPCPs") set(IPCP_UDP_MPL 5000 CACHE STRING "Default maximum packet lifetime for the UDP IPCPs, in ms") +set(IPCP_UDP4_MTU 1472 CACHE STRING + "Fallback UDP4 layer MTU when getsockopt(IP_MTU) is unavailable, in bytes") +set(IPCP_UDP6_MTU 1452 CACHE STRING + "Fallback UDP6 layer MTU when getsockopt(IPV6_MTU) is unavailable, in bytes") diff --git a/cmake/config/ipcp/unicast.cmake b/cmake/config/ipcp/unicast.cmake index 3b5b0ce7..b8d4d516 100644 --- a/cmake/config/ipcp/unicast.cmake +++ b/cmake/config/ipcp/unicast.cmake @@ -4,6 +4,8 @@ set(IPCP_UNICAST_TARGET ipcpd-unicast) set(IPCP_UNICAST_MPL 100 CACHE STRING "Default maximum packet lifetime for the Unicast IPCP, in ms") +set(IPCP_UNICAST_MTU 1400 CACHE STRING + "Layer MTU advertised by the Unicast IPCP, in bytes (TODO: derive per-flow from n-1 path MTU minus DT PCI)") set(PFT_SIZE 256 CACHE STRING "Prefix forwarding table size for the Unicast IPCP") diff --git a/cmake/config/irmd.cmake b/cmake/config/irmd.cmake index b86a40c5..45d9e73d 100644 --- a/cmake/config/irmd.cmake +++ b/cmake/config/irmd.cmake @@ -10,8 +10,8 @@ set(ENROLL_TIMEOUT 20000 CACHE STRING "Timeout for an IPCP to enroll (ms)") set(REG_TIMEOUT 20000 CACHE STRING "Timeout for registering a name (ms)") -set(QUERY_TIMEOUT 200 CACHE STRING - "Timeout to query a name with an IPCP (ms)") +set(QUERY_TIMEOUT 2000 CACHE STRING + "Timeout to query a name with an IPCP (ms); must exceed shim retry budget") set(CONNECT_TIMEOUT 20000 CACHE STRING "Timeout to connect an IPCP to another IPCP (ms)") set(FLOW_ALLOC_TIMEOUT 20000 CACHE STRING diff --git a/cmake/config/lib.cmake b/cmake/config/lib.cmake index 287f30dc..25130519 100644 --- a/cmake/config/lib.cmake +++ b/cmake/config/lib.cmake @@ -4,11 +4,11 @@ # Flow limits set(SYS_MAX_FLOWS 10240 CACHE STRING "Maximum number of total flows for this system") -set(PROG_MAX_FLOWS 4096 CACHE STRING +set(PROC_MAX_FLOWS 4096 CACHE STRING "Maximum number of flows in an application") -set(PROG_RES_FDS 64 CACHE STRING +set(PROC_RES_FDS 64 CACHE STRING "Number of reserved flow descriptors per application") -set(PROG_MAX_FQUEUES 32 CACHE STRING +set(PROC_MAX_FQUEUES 32 CACHE STRING "Maximum number of flow sets per application") # Threading @@ -28,18 +28,28 @@ set(SOCKET_TIMEOUT 500 CACHE STRING set(QOS_DISABLE_CRC TRUE CACHE BOOL "Ignores ber setting on all QoS cubes") -# Delta-t protocol timers -set(DELTA_T_MPL 60 CACHE STRING - "Maximum packet lifetime (s)") -set(DELTA_T_ACK 10 CACHE STRING - "Maximum time to acknowledge a packet (s)") -set(DELTA_T_RTX 120 CACHE STRING - "Maximum time to retransmit a packet (s)") +include(utils/CPUUtils) +detect_pclmul() +detect_pmull() +if(HAVE_PCLMUL) + message(STATUS "CRC-64/NVMe backend: PCLMUL (x86 SSE4.1+PCLMUL)") +elseif(HAVE_PMULL) + message(STATUS "CRC-64/NVMe backend: PMULL (aarch64 crypto)") +else() + message(STATUS "CRC-64/NVMe backend: byte table (no acceleration)") +endif() + +# Delta-t protocol timers (Watson bound: 3*MPL + A + R). +# MPL is reported per IPCP (IPCP_*_MPL); A and R are FRCT-wide. +set(DELTA_T_ACK 1000 CACHE STRING + "Maximum time to acknowledge a packet (ms)") +set(DELTA_T_RTX 30000 CACHE STRING + "Maximum time to retransmit a packet (ms)") # FRCT configuration -set(FRCT_REORDER_QUEUE_SIZE 256 CACHE STRING +set(FRCT_REORDER_QUEUE_SIZE 128 CACHE STRING "Size of the reordering queue, must be a power of 2") -set(FRCT_START_WINDOW 64 CACHE STRING +set(FRCT_START_WINDOW 128 CACHE STRING "Start window, must be a power of 2") set(FRCT_LINUX_RTT_ESTIMATOR TRUE CACHE BOOL "Use Linux RTT estimator formula instead of the TCP RFC formula") @@ -48,15 +58,13 @@ set(FRCT_RTO_MDEV_MULTIPLIER 2 CACHE STRING set(FRCT_RTO_INC_FACTOR 0 CACHE STRING "Divisor for RTO increase after timeout: RTO += RTX >> X, 0: Karn/Partridge") set(FRCT_RTO_MIN 250 CACHE STRING - "Minimum Retransmission Timeout (RTO) for FRCT (us)") + "Hard floor for Retransmission Timeout (RTO) for FRCT (us)") set(FRCT_TICK_TIME 5000 CACHE STRING "Tick time for FRCT activity (retransmission, acknowledgments) (us)") +set(FRCT_DEBUG_STDOUT FALSE CACHE BOOL + "Print FRCT final counters to stdout at flow teardown") # Retransmission (RXM) configuration -set(RXM_BUFFER_ON_HEAP FALSE CACHE BOOL - "Store packets for retransmission on the heap instead of in packet buffer") -set(RXM_BLOCKING TRUE CACHE BOOL - "Use blocking writes for retransmission") set(RXM_MIN_RESOLUTION 20 CACHE STRING "Minimum retransmission delay (ns), as a power to 2") set(RXM_WHEEL_MULTIPLIER 4 CACHE STRING @@ -92,3 +100,4 @@ if(HAVE_FUSE) message(STATUS "Application flow statistics disabled") endif() endif() + diff --git a/cmake/config/ssm.cmake b/cmake/config/ssm.cmake index c1f34655..913396ec 100644 --- a/cmake/config/ssm.cmake +++ b/cmake/config/ssm.cmake @@ -15,14 +15,22 @@ set(SSM_PUP_NAME_FMT "/${SSM_PREFIX}.pup.%d" CACHE INTERNAL # Packet buffer configuration set(SSM_POOL_NAME "/${SHM_PREFIX}.pool" CACHE INTERNAL "Name for the main POSIX shared memory pool") -set(SSM_POOL_BLOCKS 16384 CACHE STRING - "Number of blocks in SSM packet pool, must be a power of 2") set(SSM_PK_BUFF_HEADSPACE 256 CACHE STRING "Bytes of headspace to reserve for future headers") set(SSM_PK_BUFF_TAILSPACE 32 CACHE STRING "Bytes of tailspace to reserve for future tails") -set(SSM_RBUFF_SIZE 1024 CACHE STRING +# Sized to absorb burst arrivals from fragmented SDUs without +# overflowing at the eth->FRCT boundary. Must hold at least one +# full FRCT reorder window plus margin for transient app-thread +# unavailability; 4x FRCT_REORDER_QUEUE_SIZE leaves comfortable +# burst headroom. Floor at 1024 for small RQ configs. +math(EXPR _SSM_RBUFF_DEFAULT "${FRCT_REORDER_QUEUE_SIZE} * 4") +if(_SSM_RBUFF_DEFAULT LESS 1024) + set(_SSM_RBUFF_DEFAULT 1024) +endif() +set(SSM_RBUFF_SIZE ${_SSM_RBUFF_DEFAULT} CACHE STRING "Number of blocks in rbuff buffer, must be a power of 2") +unset(_SSM_RBUFF_DEFAULT) set(SSM_RBUFF_PREFIX "/${SHM_PREFIX}.rbuff." CACHE INTERNAL "Prefix for rbuff POSIX shared memory filenames") set(SSM_FLOW_SET_PREFIX "/${SHM_PREFIX}.set." CACHE INTERNAL @@ -36,7 +44,7 @@ set(SSM_POOL_SHARDS 4 CACHE STRING # Shared by all processes in 'ouroboros' group (~60 MB total) set(SSM_GSPP_256_BLOCKS 1024 CACHE STRING "GSPP: Number of 256B blocks") -set(SSM_GSPP_512_BLOCKS 768 CACHE STRING +set(SSM_GSPP_512_BLOCKS 2048 CACHE STRING "GSPP: Number of 512B blocks") set(SSM_GSPP_1K_BLOCKS 512 CACHE STRING "GSPP: Number of 1KB blocks") @@ -55,13 +63,13 @@ set(SSM_GSPP_1M_BLOCKS 16 CACHE STRING # Per-User Pool (PUP) - for unprivileged applications # Each unprivileged app gets its own smaller pool (~7.5 MB total) -set(SSM_PUP_256_BLOCKS 128 CACHE STRING +set(SSM_PUP_256_BLOCKS 512 CACHE STRING "PUP: Number of 256B blocks") -set(SSM_PUP_512_BLOCKS 96 CACHE STRING +set(SSM_PUP_512_BLOCKS 512 CACHE STRING "PUP: Number of 512B blocks") -set(SSM_PUP_1K_BLOCKS 64 CACHE STRING +set(SSM_PUP_1K_BLOCKS 512 CACHE STRING "PUP: Number of 1KB blocks") -set(SSM_PUP_2K_BLOCKS 48 CACHE STRING +set(SSM_PUP_2K_BLOCKS 512 CACHE STRING "PUP: Number of 2KB blocks") set(SSM_PUP_4K_BLOCKS 32 CACHE STRING "PUP: Number of 4KB blocks") @@ -74,6 +82,23 @@ set(SSM_PUP_256K_BLOCKS 2 CACHE STRING set(SSM_PUP_1M_BLOCKS 0 CACHE STRING "PUP: Number of 1MB blocks") +# Zero classes too small for spb header + HEADSPACE + TAILSPACE + 1 B. +math(EXPR _SSM_MIN_USEFUL_CLASS + "32 + ${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE}") +foreach(_pair "256:256" "512:512" "1K:1024" "2K:2048") + string(REPLACE ":" ";" _p "${_pair}") + list(GET _p 0 _suffix) + list(GET _p 1 _size) + if(_size LESS _SSM_MIN_USEFUL_CLASS) + set(SSM_GSPP_${_suffix}_BLOCKS 0) + set(SSM_PUP_${_suffix}_BLOCKS 0) + endif() +endforeach() +unset(_SSM_MIN_USEFUL_CLASS) +unset(_p) +unset(_suffix) +unset(_size) + # SSM pool size calculations include(utils/HumanReadable) @@ -129,3 +154,23 @@ message(STATUS " Blocks: ${SSM_PUP_256_BLOCKS}, ${SSM_PUP_512_BLOCKS}, " "${SSM_PUP_1K_BLOCKS}, ${SSM_PUP_2K_BLOCKS}, ${SSM_PUP_4K_BLOCKS}, " "${SSM_PUP_16K_BLOCKS}, ${SSM_PUP_64K_BLOCKS}, ${SSM_PUP_256K_BLOCKS}, " "${SSM_PUP_1M_BLOCKS}") + +# FRCT reorder queue must fit in every enabled size class. If RQ_SIZE +# >= any backing pool, the receiver advertises a window the pool +# cannot back; np1_flow_write fails under load and a single dropped +# fragment wedges the flow. Auto-zeroed classes are skipped. +foreach(_class 256 512 1K 2K) + if(SSM_PUP_${_class}_BLOCKS GREATER 0 + AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_PUP_${_class}_BLOCKS) + message(FATAL_ERROR + "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be " + "< SSM_PUP_${_class}_BLOCKS (${SSM_PUP_${_class}_BLOCKS}): " + "the FC window cannot exceed the pool that backs OOO stashing.") + endif() + if(SSM_GSPP_${_class}_BLOCKS GREATER 0 + AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_GSPP_${_class}_BLOCKS) + message(FATAL_ERROR + "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be " + "< SSM_GSPP_${_class}_BLOCKS (${SSM_GSPP_${_class}_BLOCKS}).") + endif() +endforeach() diff --git a/cmake/tags.cmake b/cmake/tags.cmake new file mode 100644 index 00000000..00e6f0d6 --- /dev/null +++ b/cmake/tags.cmake @@ -0,0 +1,21 @@ +find_program(CTAGS_EXECUTABLE + NAMES ctags-universal universal-ctags ctags + DOC "Generate a ctags index for source navigation: make tags") +mark_as_advanced(CTAGS_EXECUTABLE) + +if(CTAGS_EXECUTABLE) + add_custom_target(tags + COMMAND ${CTAGS_EXECUTABLE} + -R + --languages=C + --c-kinds=+p + --fields=+S + --exclude=build + --exclude=build-claude + --exclude=build_tmp + --exclude=.git + -f ${CMAKE_SOURCE_DIR}/tags + ${CMAKE_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMENT "Generating ctags index at ${CMAKE_SOURCE_DIR}/tags") +endif() diff --git a/cmake/utils/CPUUtils.cmake b/cmake/utils/CPUUtils.cmake new file mode 100644 index 00000000..8ca7683a --- /dev/null +++ b/cmake/utils/CPUUtils.cmake @@ -0,0 +1,82 @@ +include(CheckCSourceRuns) + +# Compile + run a probe so we only enable a feature the host CPU +# actually implements (toolchains accept flags the silicon may lack). +# Cross-compile without an emulator: feature off. +function(detect_cpu_feature _result_var _flags _source) + set(_save_flags "${CMAKE_REQUIRED_FLAGS}") + set(_save_quiet "${CMAKE_REQUIRED_QUIET}") + set(CMAKE_REQUIRED_FLAGS "${_save_flags} ${_flags}") + set(CMAKE_REQUIRED_QUIET TRUE) + if(CMAKE_CROSSCOMPILING AND NOT CMAKE_CROSSCOMPILING_EMULATOR) + set(${_result_var} FALSE CACHE INTERNAL + "${_result_var} (cross-compile without emulator: off)") + else() + check_c_source_runs("${_source}" ${_result_var}) + endif() + set(CMAKE_REQUIRED_FLAGS "${_save_flags}") + set(CMAKE_REQUIRED_QUIET "${_save_quiet}") +endfunction() + +# x86 PCLMULQDQ + SSE4.1. argc-derived input defeats constant folding; +# SIGILL handler exits cleanly so the kernel skips the core dump. +function(detect_pclmul) + detect_cpu_feature(_HAVE_PCLMUL "-mpclmul" +"#include <wmmintrin.h> +#include <signal.h> +#include <unistd.h> +static void on_sigill(int sig) { (void) sig; _exit(1); } +int main(int argc, char ** argv) { + __m128i a; + __m128i b; + (void) argv; + signal(SIGILL, on_sigill); + a = _mm_set1_epi32(argc); + b = _mm_clmulepi64_si128(a, a, 0); + return _mm_cvtsi128_si32(b) & 0; +}") + detect_cpu_feature(_HAVE_SSE41 "-msse4.1" +"#include <smmintrin.h> +#include <signal.h> +#include <unistd.h> +static void on_sigill(int sig) { (void) sig; _exit(1); } +int main(int argc, char ** argv) { + __m128i a; + (void) argv; + signal(SIGILL, on_sigill); + a = _mm_set1_epi32(argc); + return _mm_extract_epi32(a, 0) & 0; +}") + if(_HAVE_PCLMUL AND _HAVE_SSE41) + set(HAVE_PCLMUL TRUE CACHE INTERNAL + "x86 PCLMUL + SSE4.1 intrinsics available") + else() + unset(HAVE_PCLMUL CACHE) + endif() +endfunction() + +# aarch64 FEAT_PMULL (vmull_p64). Pi 4's BCM2711 accepts +crypto at +# compile time but lacks the hardware — the runtime probe catches that. +function(detect_pmull) + detect_cpu_feature(_HAVE_PMULL "-march=armv8-a+crypto" +"#include <arm_neon.h> +#include <signal.h> +#include <stdint.h> +#include <unistd.h> +static void on_sigill(int sig) { (void) sig; _exit(1); } +int main(int argc, char ** argv) { + poly64_t a; + poly128_t c; + (void) argv; + signal(SIGILL, on_sigill); + a = (poly64_t) (uint64_t) argc; + c = vmull_p64(a, a); + return (int) (vgetq_lane_u64((uint64x2_t) c, 0) & 0); +}") + if(_HAVE_PMULL) + set(HAVE_PMULL TRUE CACHE INTERNAL + "aarch64 PMULL intrinsics available") + else() + unset(HAVE_PMULL CACHE) + endif() +endfunction() diff --git a/doc/man/flow_alloc.3 b/doc/man/flow_alloc.3 index dbe5323c..8a9b5f5b 100644 --- a/doc/man/flow_alloc.3 +++ b/doc/man/flow_alloc.3 @@ -62,10 +62,60 @@ The \fBflow_dealloc\fR() function will release any resources associated with the flow. This call may block and keep reliable flows active until all packets are acknowledged. -A \fBqosspec_t\fR specifies the following QoS characteristics of a -flow: - -TODO: specify a qosspec_t +A \fBqosspec_t\fR specifies the QoS characteristics of a flow. +The fields are: + +.TP +\fBdelay\fR (ms) +Maximum one-way delay. +.TP +\fBbandwidth\fR (bits/s) +Minimum bandwidth. +.TP +\fBavailability\fR +Class of 9s (e.g. 5 = 99.999%). +.TP +\fBloss\fR +Tolerated packet loss; 0 selects reliable delivery. +.TP +\fBber\fR +Tolerated bit error rate (errors per billion bits); 0 enables an +end-to-end integrity check (corrupted packets are dropped). +.TP +\fBservice\fR +Framing / reliability class: \fBSVC_RAW\fR (0) disables FRCT; +\fBSVC_MESSAGE\fR (1) preserves SDU boundaries; \fBSVC_STREAM\fR (2) is +a byte stream with no SDU boundaries. \fBSVC_STREAM\fR requires +\fIloss\fR = 0; otherwise +\fBflow_alloc\fR()/\fBflow_accept\fR() returns \fB-EINVAL\fR. +.TP +\fBmax_gap\fR (ms) +Maximum tolerated inter-packet gap. Packets exceeding the gap +budget are dropped under the real-time cubes. +.TP +\fBtimeout\fR (ms) +Peer-liveness timeout; 0 disables. Only applies when FRCT is +enabled (service > 0). + +.PP +The library provides predefined cubes: + +.TP +\fBqos_raw\fR +No guarantees, no integrity check. +.TP +\fBqos_raw_safe\fR +Best-effort with end-to-end integrity (ber = 0). +.TP +\fBqos_rt\fR / \fBqos_rt_safe\fR +Real-time messages, optimised for latency over reliability; +\fBqos_rt_safe\fR adds an end-to-end integrity check. +.TP +\fBqos_msg\fR +Reliable, SDU-preserving delivery. +.TP +\fBqos_stream\fR +Reliable byte stream; no SDU boundaries are preserved. .SH RETURN VALUE @@ -117,13 +167,39 @@ _ \fBflow_dealloc\fR() & Thread safety & MT-Safe .TE +.SH NOTES +The returned file descriptor is subject to a single-reader and +single-writer discipline \(em at most one thread may call +.BR flow_read () +(or monitor the fd via +.BR fevent ()) +and at most one thread may call +.BR flow_write () +concurrently. See +.BR flow_read (3), +.BR flow_write (3), +and +.BR fevent (3) +for details. +.PP +.BR flow_dealloc () +must not be called concurrently with any thread that is inside +.BR flow_read (), +.BR flow_write (), +.BR fevent (), +or any other Ouroboros library call on the same fd; the result is +undefined behaviour. Applications must serialise teardown with +in-flight use, e.g. by signalling worker threads to drop the fd +before calling +.BR flow_dealloc (). + .SH TERMINOLOGY Please see \fBouroboros-glossary\fR(7). .SH SEE ALSO -.BR fccntl "(3), " flow_read "(3), " fqueue "(3), " fset "(3), " \ -ouroboros (8) +.BR fccntl "(3), " fevent "(3), " flow_read "(3), " flow_write "(3), " \ +fqueue "(3), " fset "(3), " ouroboros (8) .SH COLOPHON This page is part of the Ouroboros project, found at diff --git a/doc/man/flow_read.3 b/doc/man/flow_read.3 index acc1f61e..d4a5e883 100644 --- a/doc/man/flow_read.3 +++ b/doc/man/flow_read.3 @@ -39,8 +39,7 @@ end of the datagram. On success, \fBflow_write\fR() returns the number of bytes written. On failure, a negative value indicating the error will be returned. -Partial writes needs to be explicitly enabled. Passing a -NULL pointer for \fIbuf\fR returns 0 with no other effects. +Passing a NULL pointer for \fIbuf\fR returns 0 with no other effects. .SH ERRORS .B -EINVAL @@ -62,7 +61,8 @@ The flow has been reported down. The flow's peer is unresponsive (flow timed out). .B -EMSGSIZE -The buffer was too large to be written. +The received packet does not fit in the caller's buffer and partial +reads are disabled (see \fBfccntl\fR(3), \fBFLOWFRNOPART\fR). .SH ATTRIBUTES @@ -74,11 +74,47 @@ LB|LB|LB L|L|L. Interface & Attribute & Value _ -\fBflow_read\fR() & Thread safety & MT-Safe +\fBflow_read\fR() & Thread safety & MT-Safe race:fd _ -\fBflow_write\fR() & Thread safety & MT-Safe +\fBflow_write\fR() & Thread safety & MT-Safe race:fd .TE +.SH THREAD SAFETY +Only one thread may call +.BR flow_read () +on a given file descriptor at any time. Partial-read state kept +across calls assumes a single logical reader; two threads racing +.BR flow_read () +on the same fd is undefined behaviour. Likewise, only one thread +may call +.BR flow_write () +on a given fd at a time; two writer threads on the same fd is +undefined behaviour. +.PP +Combining a writer thread with a reader thread (one thread calling +.BR flow_write (), +another calling +.BR flow_read () +or +.BR fevent ()) +is permitted and safe. The writer does not need a dedicated reader +thread \(em when the FRCT send window fills, +.BR flow_write () +drives its own inbound rx draining internally to process incoming +ACKs and reopen the window, clamped by the caller's +.BR fccntl (3) +send-timeout if any. +.PP +Monitoring the same fd via +.BR fevent () +from a different thread is well-defined but races: events reported +by +.BR fevent () +may already have been consumed by the racing +.BR flow_read (), +so the second reader may then block. See +.BR fevent (3). + .SH TERMINOLOGY Please see \fBouroboros-glossary\fR(7). diff --git a/doc/man/fqueue.3 b/doc/man/fqueue.3 index 72a0bc25..f2fb8c9f 100644 --- a/doc/man/fqueue.3 +++ b/doc/man/fqueue.3 @@ -116,6 +116,27 @@ _ \fBfevent\fR() & Thread safety & MT-Safe .TE +.SH THREAD SAFETY +.BR fevent () +and +.BR flow_read () +on the same fd from distinct threads is well-defined but races: +events reported by +.BR fevent () +may already have been consumed by the racing +.BR flow_read (), +so the reader may then block. Same shape as +.BR select (2) ++ +.BR read (2) +from distinct threads. The intended pattern is that the thread +invoking +.BR fevent () +is the same thread that calls +.BR flow_read () +on the fds returned by +.BR fqueue_next (). + .SH TERMINOLOGY Please see \fBouroboros-glossary\fR(7). diff --git a/include/ouroboros/atomics.h b/include/ouroboros/atomics.h new file mode 100644 index 00000000..8e667522 --- /dev/null +++ b/include/ouroboros/atomics.h @@ -0,0 +1,39 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Atomic helpers + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#ifndef OUROBOROS_LIB_ATOMICS_H +#define OUROBOROS_LIB_ATOMICS_H + +#define LOAD_RELAXED(p) (__atomic_load_n(p, __ATOMIC_RELAXED)) +#define LOAD_ACQUIRE(p) (__atomic_load_n(p, __ATOMIC_ACQUIRE)) +#define LOAD(p) (__atomic_load_n(p, __ATOMIC_SEQ_CST)) + +#define STORE_RELAXED(p, v) (__atomic_store_n(p, v, __ATOMIC_RELAXED)) +#define STORE_RELEASE(p, v) (__atomic_store_n(p, v, __ATOMIC_RELEASE)) +#define STORE(p, v) (__atomic_store_n(p, v, __ATOMIC_SEQ_CST)) + +#define FETCH_ADD_RELAXED(p, v) (__atomic_fetch_add(p, v, __ATOMIC_RELAXED)) +#define FETCH_SUB_RELAXED(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_RELAXED)) +#define FETCH_ADD(p, v) (__atomic_fetch_add(p, v, __ATOMIC_SEQ_CST)) +#define FETCH_SUB(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_SEQ_CST)) + +#endif /* OUROBOROS_LIB_ATOMICS_H */ diff --git a/include/ouroboros/crc16.h b/include/ouroboros/crc16.h new file mode 100644 index 00000000..df4d4f57 --- /dev/null +++ b/include/ouroboros/crc16.h @@ -0,0 +1,43 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * Polynomial: ITU-T V.41 / CCITT-FALSE, CRC-16/IBM-3740. + * reveng catalog: https://reveng.sourceforge.io/crc-catalogue + * + * Intended for medium-size header check sequences (typ. <= 4 KiB). + * Hamming distance HD=4 up to 32751 message bits. + */ + +#ifndef OUROBOROS_LIB_CRC16_H +#define OUROBOROS_LIB_CRC16_H + +#include <stddef.h> +#include <stdint.h> + +#define CRC16_HASH_LEN 2 + +void crc16_ccitt_false(uint16_t * crc, + const void * buf, + size_t len); + +#endif /* OUROBOROS_LIB_CRC16_H */ diff --git a/include/ouroboros/crc64.h b/include/ouroboros/crc64.h new file mode 100644 index 00000000..f6e407a0 --- /dev/null +++ b/include/ouroboros/crc64.h @@ -0,0 +1,44 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 64-bit Cyclic Redundancy Check (NVMe variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * Polynomial: NVM Express Base Spec, CRC-64/NVMe. + * reveng catalog: https://reveng.sourceforge.io/crc-catalogue + * + * Fold-by-N (PCLMUL/PMULL) algorithm: + * V. Gopal et al., "Fast CRC Computation for Generic Polynomials + * Using PCLMULQDQ", Intel white paper, 2009. + */ + +#ifndef OUROBOROS_LIB_CRC64_H +#define OUROBOROS_LIB_CRC64_H + +#include <stddef.h> +#include <stdint.h> + +#define CRC64_HASH_LEN 8 + +void crc64_nvme(uint64_t * crc, + const void * buf, + size_t len); + +#endif /* OUROBOROS_LIB_CRC64_H */ diff --git a/include/ouroboros/crc8.h b/include/ouroboros/crc8.h new file mode 100644 index 00000000..97502a25 --- /dev/null +++ b/include/ouroboros/crc8.h @@ -0,0 +1,43 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 8-bit Cyclic Redundancy Check (AUTOSAR variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * Polynomial: AUTOSAR SWS_CRC, CRC-8/AUTOSAR. + * reveng catalog: https://reveng.sourceforge.io/crc-catalogue + * + * Intended for short header check sequences (typ. <= 32 bytes). + * Hamming distance HD=4 up to 119 message bits, HD=3 up to 247. + */ + +#ifndef OUROBOROS_LIB_CRC8_H +#define OUROBOROS_LIB_CRC8_H + +#include <stddef.h> +#include <stdint.h> + +#define CRC8_HASH_LEN 1 + +void crc8_autosar(uint8_t * crc, + const void * buf, + size_t len); + +#endif /* OUROBOROS_LIB_CRC8_H */ diff --git a/include/ouroboros/crypt.h b/include/ouroboros/crypt.h index 806d39ab..5e082bb9 100644 --- a/include/ouroboros/crypt.h +++ b/include/ouroboros/crypt.h @@ -33,7 +33,6 @@ #define MAX_HASH_SIZE 64 /* SHA-512/BLAKE2b max */ #define KEX_ALGO_BUFSZ 32 #define KEX_CIPHER_BUFSZ 32 -#define MSGBUFSZ 2048 /* * On OSX the OpenSSL NIDs are automatically loaded with evp.h. @@ -95,6 +94,8 @@ #define X448MLKEM1024_PKSZ 1624 /* 56 + 1568 */ #define X448MLKEM1024_SKSZ 3224 /* 56 + 3168 */ +#define CRYPT_KEY_BUFSZ 4096 /* Safe buffer for key material */ + #define KEM_MODE_SERVER_ENCAP 0 /* Server encapsulates (default) */ #define KEM_MODE_CLIENT_ENCAP 1 /* Client encapsulates */ #define IS_KEX_ALGO_SET(cfg) ((cfg)->x.nid != NID_undef) @@ -358,6 +359,8 @@ int crypt_check_crt_name(void * crt, int crypt_get_crt_name(void * crt, char * name); +void crypt_cleanup(void); + /* Secure memory allocation for sensitive data (keys, secrets) */ int crypt_secure_malloc_init(size_t max); diff --git a/include/ouroboros/errno.h b/include/ouroboros/errno.h index 9d84df88..eedd978f 100644 --- a/include/ouroboros/errno.h +++ b/include/ouroboros/errno.h @@ -37,5 +37,6 @@ #ifndef EAUTH /* Exists on BSD */ #define EAUTH 1009 /* Authentication error */ #endif +#define EREPLAY 1010 /* OAP replay detected */ #endif /* OUROBOROS_ERRNO_H */ diff --git a/include/ouroboros/fccntl.h b/include/ouroboros/fccntl.h index d3baea8f..e91e91dd 100644 --- a/include/ouroboros/fccntl.h +++ b/include/ouroboros/fccntl.h @@ -50,6 +50,12 @@ #define FRCTFRESCNTL 00000002 /* Feedback from receiver */ #define FRCTFLINGER 00000004 /* Send unsent data */ +/* All user-visible bits (readable via FRCTGFLAGS). */ +#define FRCTFMASK (FRCTFRTX | FRCTFRESCNTL | FRCTFLINGER) + +/* Subset writable via FRCTSFLAGS; FRCTFRTX is fixed at flow_alloc. */ +#define FRCTFSETMASK (FRCTFRESCNTL | FRCTFLINGER) + /* Flow operations */ #define FLOWSRCVTIMEO 00000001 /* Set read timeout */ #define FLOWGRCVTIMEO 00000002 /* Get read timeout */ @@ -60,10 +66,17 @@ #define FLOWGFLAGS 00000007 /* Get flags for flow */ #define FLOWGRXQLEN 00000010 /* Get queue length on rx */ #define FLOWGTXQLEN 00000011 /* Get queue length on tx */ +#define FLOWGMTU 00000012 /* Get per-packet MTU */ /* FRCT operations */ #define FRCTSFLAGS 00001000 /* Set flags for FRCT */ #define FRCTGFLAGS 00002000 /* Get flags for FRCT */ +#define FRCTSMAXSDU 00003000 /* Set max recv SDU size */ +#define FRCTGMAXSDU 00004000 /* Get max recv SDU size */ +#define FRCTSRRINGSZ 00005000 /* Set stream rcv ring sz */ +#define FRCTGRRINGSZ 00006000 /* Get stream rcv ring sz */ +#define FRCTSRTOMIN 00007000 /* Set RTO floor (ns) */ +#define FRCTGRTOMIN 00010000 /* Get RTO floor (ns) */ __BEGIN_DECLS diff --git a/include/ouroboros/flow.h b/include/ouroboros/flow.h index fe4582e7..8b096410 100644 --- a/include/ouroboros/flow.h +++ b/include/ouroboros/flow.h @@ -25,6 +25,7 @@ #include <ouroboros/qos.h> +#include <stdint.h> #include <sys/types.h> #define SYMMKEYSZ 32 @@ -50,6 +51,8 @@ struct flow_info { time_t mpl; + uint32_t mtu; /* n-1 layer MTU in bytes, 0 = unknown */ + struct qos_spec qs; enum flow_state state; diff --git a/include/ouroboros/hash.h b/include/ouroboros/hash.h index 0838df97..17ab98ac 100644 --- a/include/ouroboros/hash.h +++ b/include/ouroboros/hash.h @@ -38,6 +38,9 @@ enum hash_algo { HASH_SHA3_512 = DIR_HASH_SHA3_512, HASH_CRC32, HASH_MD5, + HASH_CRC64, + HASH_CRC8, + HASH_CRC16, }; #define HASH_FMT32 "%02x%02x%02x%02x" diff --git a/include/ouroboros/ipcp-dev.h b/include/ouroboros/ipcp-dev.h index 93236271..d23f757e 100644 --- a/include/ouroboros/ipcp-dev.h +++ b/include/ouroboros/ipcp-dev.h @@ -28,16 +28,20 @@ #include <ouroboros/ssm_pool.h> #include <ouroboros/utils.h> +#include <stdint.h> + int ipcp_create_r(const struct ipcp_info * info); int ipcp_flow_req_arr(const buffer_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data); int ipcp_flow_alloc_reply(int fd, int response, time_t mpl, + uint32_t mtu, const buffer_t * data); int ipcp_flow_read(int fd, diff --git a/include/ouroboros/irm.h b/include/ouroboros/irm.h index d5e4f1ab..7cb71c21 100644 --- a/include/ouroboros/irm.h +++ b/include/ouroboros/irm.h @@ -53,13 +53,13 @@ int irm_bootstrap_ipcp(pid_t pid, const struct ipcp_config * conf); int irm_connect_ipcp(pid_t pid, - const char * component, const char * dst, + const char * component, qosspec_t qs); int irm_disconnect_ipcp(pid_t pid, - const char * component, - const char * dst); + const char * dst, + const char * component); int irm_bind_program(const char * prog, const char * name, diff --git a/include/ouroboros/np1_flow.h b/include/ouroboros/np1_flow.h index 6f341cfc..309d01c2 100644 --- a/include/ouroboros/np1_flow.h +++ b/include/ouroboros/np1_flow.h @@ -37,12 +37,12 @@ int np1_flow_dealloc(int flow_id, time_t timeo); static const qosspec_t qos_np1 = { + .service = SVC_RAW, .delay = UINT32_MAX, .bandwidth = 0, .availability = 0, .loss = UINT32_MAX, .ber = UINT32_MAX, - .in_order = 0, .max_gap = UINT32_MAX, .timeout = 0 }; diff --git a/include/ouroboros/qos.h b/include/ouroboros/qos.h index 6b0bbc17..7980ad00 100644 --- a/include/ouroboros/qos.h +++ b/include/ouroboros/qos.h @@ -28,79 +28,88 @@ #define DEFAULT_PEER_TIMEOUT 120000 +/* qos_spec.service: framing / reliability class. */ +enum qos_service { + SVC_RAW = 0, /* No FRCT; best-effort raw messages */ + SVC_MESSAGE = 1, /* FRCT, reliable ordered messages */ + SVC_STREAM = 2, /* FRCT, reliable ordered byte stream */ +}; + typedef struct qos_spec { + uint8_t service; /* enum qos_service; gates FRCT (>0). */ uint32_t delay; /* In ms. */ uint64_t bandwidth; /* In bits/s. */ uint8_t availability; /* Class of 9s. */ uint32_t loss; /* Packet loss. */ uint32_t ber; /* Bit error rate, errors per billion bits. */ - uint8_t in_order; /* In-order delivery, enables FRCT. */ uint32_t max_gap; /* In ms. */ uint32_t timeout; /* Peer timeout time, in ms, 0 = no timeout. */ } qosspec_t; +/* "_safe" = integrity check (ber=0). "rt" = latency over reliability. */ + static const qosspec_t qos_raw = { + .service = SVC_RAW, .delay = UINT32_MAX, .bandwidth = 0, .availability = 0, .loss = 1, .ber = 1, - .in_order = 0, .max_gap = UINT32_MAX, - .timeout = DEFAULT_PEER_TIMEOUT + .timeout = 0 }; -static const qosspec_t qos_raw_no_errors = { +static const qosspec_t qos_raw_safe = { + .service = SVC_RAW, .delay = UINT32_MAX, .bandwidth = 0, .availability = 0, .loss = 1, .ber = 0, - .in_order = 0, .max_gap = UINT32_MAX, - .timeout = DEFAULT_PEER_TIMEOUT + .timeout = 0 }; -static const qosspec_t qos_best_effort = { - .delay = UINT32_MAX, - .bandwidth = 0, - .availability = 0, +static const qosspec_t qos_rt = { + .service = SVC_MESSAGE, + .delay = 100, + .bandwidth = UINT64_MAX, + .availability = 3, .loss = 1, - .ber = 0, - .in_order = 1, - .max_gap = UINT32_MAX, + .ber = 1, + .max_gap = 100, .timeout = DEFAULT_PEER_TIMEOUT }; -static const qosspec_t qos_video = { +static const qosspec_t qos_rt_safe = { + .service = SVC_MESSAGE, .delay = 100, .bandwidth = UINT64_MAX, .availability = 3, .loss = 1, .ber = 0, - .in_order = 1, .max_gap = 100, .timeout = DEFAULT_PEER_TIMEOUT }; -static const qosspec_t qos_voice = { - .delay = 50, - .bandwidth = 100000, - .availability = 5, - .loss = 1, +static const qosspec_t qos_msg = { + .service = SVC_MESSAGE, + .delay = 1000, + .bandwidth = 0, + .availability = 0, + .loss = 0, .ber = 0, - .in_order = 1, - .max_gap = 50, + .max_gap = 2000, .timeout = DEFAULT_PEER_TIMEOUT }; -static const qosspec_t qos_data = { +static const qosspec_t qos_stream = { + .service = SVC_STREAM, .delay = 1000, .bandwidth = 0, .availability = 0, .loss = 0, .ber = 0, - .in_order = 1, .max_gap = 2000, .timeout = DEFAULT_PEER_TIMEOUT }; diff --git a/include/ouroboros/ssm_pk_buff.h b/include/ouroboros/ssm_pk_buff.h index 1b779ad1..1d5597c7 100644 --- a/include/ouroboros/ssm_pk_buff.h +++ b/include/ouroboros/ssm_pk_buff.h @@ -28,25 +28,25 @@ struct ssm_pk_buff; -size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb); +size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb); -uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb); +uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb); -uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb); +uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb); -size_t ssm_pk_buff_len(struct ssm_pk_buff * spb); +size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb); -uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb, + size_t size); -uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb, + size_t size); -uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb, + size_t size); -uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb, + size_t size); void ssm_pk_buff_truncate(struct ssm_pk_buff * spb, size_t len); diff --git a/include/ouroboros/ssm_pool.h b/include/ouroboros/ssm_pool.h index 89eff8eb..bba76798 100644 --- a/include/ouroboros/ssm_pool.h +++ b/include/ouroboros/ssm_pool.h @@ -32,7 +32,7 @@ struct ssm_pool; -/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user) */ +/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user). */ struct ssm_pool * ssm_pool_create(uid_t uid, gid_t gid); @@ -46,13 +46,13 @@ int ssm_pool_mlock(struct ssm_pool * pool); void ssm_pool_gspp_purge(void); -/* Alloc count bytes, returns block index, a ptr and pk_buff. */ +/* Alloc count bytes, returns block offset, a ptr and pk_buff. */ ssize_t ssm_pool_alloc(struct ssm_pool * pool, size_t count, uint8_t ** ptr, struct ssm_pk_buff ** spb); -ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, +ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, size_t count, uint8_t ** ptr, struct ssm_pk_buff ** spb, @@ -60,13 +60,13 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, ssize_t ssm_pool_read(uint8_t ** dst, struct ssm_pool * pool, - size_t idx); + size_t off); struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool, - size_t idx); + size_t off); int ssm_pool_remove(struct ssm_pool * pool, - size_t idx); + size_t off); void ssm_pool_reclaim_orphans(struct ssm_pool * pool, pid_t pid); diff --git a/include/ouroboros/ssm_rbuff.h b/include/ouroboros/ssm_rbuff.h index ffa10b8e..2443b63d 100644 --- a/include/ouroboros/ssm_rbuff.h +++ b/include/ouroboros/ssm_rbuff.h @@ -55,10 +55,10 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb); int ssm_rbuff_mlock(struct ssm_rbuff * rb); int ssm_rbuff_write(struct ssm_rbuff * rb, - size_t idx); + size_t off); int ssm_rbuff_write_b(struct ssm_rbuff * rb, - size_t idx, + size_t off, const struct timespec * abstime); ssize_t ssm_rbuff_read(struct ssm_rbuff * rb); diff --git a/include/ouroboros/time.h b/include/ouroboros/time.h index 3d037a3c..a4136e8e 100644 --- a/include/ouroboros/time.h +++ b/include/ouroboros/time.h @@ -46,6 +46,12 @@ #define TS_TO_UINT64(ts) \ ((uint64_t)(ts).tv_sec * BILLION + (uint64_t)(ts).tv_nsec) +#define UINT64_TO_TS(ns, ts) \ + do { \ + (ts)->tv_sec = (time_t)((ns) / BILLION); \ + (ts)->tv_nsec = (long)((ns) % BILLION); \ + } while (0) + #define TIMEVAL_INIT_S(s) {(s), 0} #define TIMEVAL_INIT_MS(ms) {(ms) / 1000, ((ms) % 1000) * 1000} #define TIMEVAL_INIT_US(us) {(us) / MILLION, ((us) % MILLION)} diff --git a/include/ouroboros/tpm.h b/include/ouroboros/tpm.h index c01a235c..56c04701 100644 --- a/include/ouroboros/tpm.h +++ b/include/ouroboros/tpm.h @@ -24,6 +24,7 @@ #define OUROBOROS_LIB_TPM_H #include <stdbool.h> +#include <sys/types.h> struct tpm; diff --git a/include/ouroboros/tw.h b/include/ouroboros/tw.h new file mode 100644 index 00000000..156f99db --- /dev/null +++ b/include/ouroboros/tw.h @@ -0,0 +1,77 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Generic deadline-ordered callback queue (timing wheel) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#ifndef OUROBOROS_TW_H +#define OUROBOROS_TW_H + +#include <ouroboros/cdefs.h> +#include <ouroboros/list.h> + +#include <stddef.h> +#include <stdint.h> +#include <time.h> + +typedef void (*tw_fire_fn_t)(void * arg); + +struct tw_entry { + struct list_head next; + uint64_t deadline_ns; + tw_fire_fn_t fire; + void * arg; + size_t lvl; +}; + +__BEGIN_DECLS + +int tw_init(void); + +void tw_fini(void); + +void tw_init_entry(struct tw_entry * e); + +/* + * Schedule e to fire at deadline_ns. If e is already posted, + * the previous schedule is cancelled and replaced. + */ +void tw_post(struct tw_entry * e, + uint64_t deadline_ns, + tw_fire_fn_t fire, + void * arg); + +void tw_cancel(struct tw_entry * e); + +/* + * Advance the wheel and fire due callbacks. Callbacks run with the wheel + * unlocked and may call tw_post / tw_cancel on any entry, including the one + * currently firing. Concurrent tw_move from a second thread is a no-op. + */ +void tw_move(void); + +/* + * Write the absolute deadline of the earliest pending entry to *out. + * Empty wheel is signalled by out->tv_nsec == -1. + */ +void tw_next_expiry(struct timespec * out); + +__END_DECLS + +#endif /* OUROBOROS_TW_H */ diff --git a/include/test/test.h b/include/test/test.h index 99681384..a76fe62a 100644 --- a/include/test/test.h +++ b/include/test/test.h @@ -30,6 +30,9 @@ #include <sys/wait.h> #include <sys/types.h> #include <sys/resource.h> +#ifdef __linux__ +#include <sys/prctl.h> +#endif #define TEST_RC_SUCCESS 0 #define TEST_RC_SKIP 1 @@ -86,6 +89,9 @@ static int __attribute__((unused)) test_assert_fail(int(* testfunc)(void)) #ifdef DISABLE_TESTS_CORE_DUMPS struct rlimit rl = { .rlim_cur = 0, .rlim_max = 0 }; setrlimit(RLIMIT_CORE, &rl); +#ifdef __linux__ + prctl(PR_SET_DUMPABLE, 0); +#endif #endif return testfunc(); /* should abort */ } diff --git a/src/ipcpd/broadcast/dt.c b/src/ipcpd/broadcast/dt.c index 30e89a4f..95483e33 100644 --- a/src/ipcpd/broadcast/dt.c +++ b/src/ipcpd/broadcast/dt.c @@ -28,7 +28,7 @@ #include "config.h" -#define BROADCAST_MTU 1400 /* FIXME: avoid packet copy. */ +#define BROADCAST_MTU IPCP_BROADCAST_MTU /* FIXME: avoid packet copy. */ #define DT "dt" #define OUROBOROS_PREFIX DT diff --git a/src/ipcpd/broadcast/main.c b/src/ipcpd/broadcast/main.c index b3cbdc79..77e22531 100644 --- a/src/ipcpd/broadcast/main.c +++ b/src/ipcpd/broadcast/main.c @@ -242,7 +242,7 @@ static int broadcast_ipcp_join(int fd, notifier_event(NOTIFY_DT_CONN_ADD, &conn); - ipcp_flow_alloc_reply(fd, 0, mpl, &data); + ipcp_flow_alloc_reply(fd, 0, mpl, IPCP_BROADCAST_MTU, &data); return 0; } diff --git a/src/ipcpd/config.h.in b/src/ipcpd/config.h.in index 0b4252e5..7edec526 100644 --- a/src/ipcpd/config.h.in +++ b/src/ipcpd/config.h.in @@ -23,8 +23,8 @@ #define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@ #define SYS_MAX_FLOWS @SYS_MAX_FLOWS@ -#define PROG_RES_FDS @PROG_RES_FDS@ -#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@ +#define PROC_RES_FDS @PROC_RES_FDS@ +#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@ #define SOCKET_TIMEOUT @SOCKET_TIMEOUT@ #define CONNECT_TIMEOUT @CONNECT_TIMEOUT@ @@ -46,11 +46,13 @@ #define IPCP_SCHED_THR_MUL @IPCP_SCHED_THR_MUL@ #define PFT_SIZE @PFT_SIZE@ #define IPCP_UNICAST_MPL @IPCP_UNICAST_MPL@ +#define IPCP_UNICAST_MTU @IPCP_UNICAST_MTU@ #define CONNMGR_RCV_TIMEOUT @CONNMGR_RCV_TIMEOUT@ #cmakedefine DISABLE_CORE_LOCK #cmakedefine BUILD_CONTAINER #cmakedefine IPCP_FLOW_STATS +#cmakedefine IPCP_ETH_FLOW_STATS #cmakedefine IPCP_DEBUG_LOCAL #ifdef CONFIG_OUROBOROS_DEBUG #cmakedefine DEBUG_PROTO_DHT @@ -65,6 +67,8 @@ #define IPCP_UDP_RD_THR @IPCP_UDP_RD_THR@ #define IPCP_UDP_WR_THR @IPCP_UDP_WR_THR@ #define IPCP_UDP_MPL @IPCP_UDP_MPL@ +#define IPCP_UDP4_MTU @IPCP_UDP4_MTU@ +#define IPCP_UDP6_MTU @IPCP_UDP6_MTU@ /* eth */ #cmakedefine HAVE_NETMAP @@ -76,10 +80,13 @@ #define IPCP_ETH_LO_MTU @IPCP_ETH_LO_MTU@ #define IPCP_ETH_MGMT_FRAME_SIZE @IPCP_ETH_MGMT_FRAME_SIZE@ #define IPCP_ETH_MPL @IPCP_ETH_MPL@ +#define IPCP_ETH_SNDBUF @IPCP_ETH_SNDBUF@ +#define IPCP_ETH_RCVBUF @IPCP_ETH_RCVBUF@ /* local */ #define IPCP_LOCAL_MPL @IPCP_LOCAL_MPL@ +#define IPCP_LOCAL_MTU @IPCP_LOCAL_MTU@ /* broadcast */ -/* local */ #define IPCP_BROADCAST_MPL @IPCP_BROADCAST_MPL@ +#define IPCP_BROADCAST_MTU @IPCP_BROADCAST_MTU@ diff --git a/src/ipcpd/eth/eth.c b/src/ipcpd/eth/eth.c index 4be7775e..103ba881 100644 --- a/src/ipcpd/eth/eth.c +++ b/src/ipcpd/eth/eth.c @@ -37,12 +37,14 @@ #include "config.h" +#include <ouroboros/atomics.h> #include <ouroboros/endian.h> #include <ouroboros/hash.h> #include <ouroboros/errno.h> #include <ouroboros/list.h> #include <ouroboros/utils.h> #include <ouroboros/bitmap.h> +#include <ouroboros/crc8.h> #include <ouroboros/dev.h> #include <ouroboros/ipcp-dev.h> #include <ouroboros/fqueue.h> @@ -50,6 +52,14 @@ #include <ouroboros/time.h> #include <ouroboros/fccntl.h> #include <ouroboros/pthread.h> +#include <ouroboros/rib.h> + +#ifndef IPCP_ETH_FLOW_STATS +#undef FETCH_ADD_RELAXED +#define FETCH_ADD_RELAXED(p, v) ((void) 0) +#undef FETCH_SUB_RELAXED +#define FETCH_SUB_RELAXED(p, v) ((void) 0) +#endif #include "ipcp.h" #include "np1.h" @@ -122,7 +132,8 @@ #define MGMT_EID 0 #define DIX_EID_SIZE sizeof(uint16_t) #define DIX_LENGTH_SIZE sizeof(uint16_t) -#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE) +#define DIX_HCS_SIZE CRC8_HASH_LEN +#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE + DIX_HCS_SIZE) #define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + DIX_HEADER_SIZE) #define MAX_EIDS (1 << (8 * DIX_EID_SIZE)) #define ETH_MAX_PACKET_SIZE (ETH_MTU - DIX_HEADER_SIZE) @@ -130,16 +141,20 @@ #elif defined(BUILD_ETH_LLC) #define THIS_TYPE IPCP_ETH_LLC #define MGMT_SAP 0x01 -#define LLC_HEADER_SIZE 3 +#define LLC_FIELDS_SIZE 3 +#define LLC_HCS_SIZE CRC8_HASH_LEN +#define LLC_HEADER_SIZE (LLC_FIELDS_SIZE + LLC_HCS_SIZE) #define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + LLC_HEADER_SIZE) #define MAX_SAPS 64 #define ETH_MAX_PACKET_SIZE (ETH_MTU - LLC_HEADER_SIZE) #define ETH_FRAME_SIZE (ETH_HEADER_SIZE + ETH_MTU_MAX) #endif -#define NAME_QUERY_TIMEO 2000 /* ms */ -#define MGMT_TIMEO 100 /* ms */ +#define NAME_QUERY_TIMEO 1900 /* ms total budget */ +#define NAME_QUERY_RETRIES 3 /* retransmits, 4 attempts total */ +#define MGMT_TIMEO 100 /* ms */ #define MGMT_FRAME_SIZE IPCP_ETH_MGMT_FRAME_SIZE +#define ETH_RIB_PATH "eth" #define FLOW_REQ 0 #define FLOW_REPLY 1 @@ -165,7 +180,7 @@ struct mgmt_msg { uint32_t delay; uint32_t timeout; int32_t response; - uint8_t in_order; + uint8_t service; #if defined (BUILD_ETH_DIX) uint8_t code; uint8_t availability; @@ -185,6 +200,7 @@ struct eth_frame { uint8_t ssap; uint8_t cf; #endif + uint8_t hcs; uint8_t payload; } __attribute__((packed)); @@ -196,6 +212,17 @@ struct ef { int8_t r_sap; #endif uint8_t r_addr[MAC_SIZE]; +#ifdef IPCP_ETH_FLOW_STATS + struct { + time_t stamp; + size_t p_rcv; + size_t b_rcv; + size_t p_dlv_f; + size_t p_snd; + size_t b_snd; + size_t p_snd_f; + } stat; +#endif }; struct mgmt_frame { @@ -233,6 +260,22 @@ struct { struct ef * fd_to_ef; fset_t * np1_flows; pthread_rwlock_t flows_lock; +#ifdef IPCP_ETH_FLOW_STATS + struct { + size_t n_flows; + size_t n_rcv; + size_t n_snd; + size_t n_mgmt_rcv; + size_t n_mgmt_snd; + size_t n_bad_id; + size_t n_dlv_f; + size_t n_buf_f; + size_t n_rcv_f; + size_t n_snd_f; + size_t kern_rcv; + size_t kern_drp; + } stat; +#endif pthread_t packet_writer[IPCP_ETH_WR_THR]; pthread_t packet_reader[IPCP_ETH_RD_THR]; @@ -284,7 +327,14 @@ static int eth_data_init(void) eth_data.fd_to_ef[i].r_sap = -1; #endif memset(ð_data.fd_to_ef[i].r_addr, 0, MAC_SIZE); +#ifdef IPCP_ETH_FLOW_STATS + memset(ð_data.fd_to_ef[i].stat, 0, + sizeof(eth_data.fd_to_ef[i].stat)); +#endif } +#ifdef IPCP_ETH_FLOW_STATS + memset(ð_data.stat, 0, sizeof(eth_data.stat)); +#endif eth_data.shim_data = shim_data_create(); if (eth_data.shim_data == NULL) @@ -357,6 +407,227 @@ static void eth_data_fini(void) free(eth_data.fd_to_ef); } +#ifdef IPCP_ETH_FLOW_STATS +static int eth_rib_read(const char * path, + char * buf, + size_t len) +{ + struct ef * flow; + int fd; + char tmstr[RIB_TM_STRLEN]; + struct tm * tm; + time_t stamp; + char * entry; + + entry = strstr(path, RIB_SEPARATOR) + 1; + assert(entry); + + if (len < 2048) + return 0; + + buf[0] = '\0'; + + if (strcmp(entry, "summary") == 0) { + int n; +#if defined(HAVE_RAW_SOCKETS) + int rcvbuf = 0; + int sndbuf = 0; + int queued = 0; + socklen_t optlen = sizeof(rcvbuf); +# if defined(__linux__) + struct tpacket_stats tp_stats; + socklen_t tp_len = sizeof(tp_stats); +# endif + + getsockopt(eth_data.s_fd, SOL_SOCKET, + SO_RCVBUF, &rcvbuf, &optlen); + optlen = sizeof(sndbuf); + getsockopt(eth_data.s_fd, SOL_SOCKET, + SO_SNDBUF, &sndbuf, &optlen); + ioctl(eth_data.s_fd, FIONREAD, &queued); +# if defined(__linux__) + if (getsockopt(eth_data.s_fd, SOL_PACKET, + PACKET_STATISTICS, + &tp_stats, &tp_len) == 0) { + FETCH_ADD_RELAXED(ð_data.stat.kern_rcv, + tp_stats.tp_packets); + FETCH_ADD_RELAXED(ð_data.stat.kern_drp, + tp_stats.tp_drops); + } +# endif +#endif + n = sprintf(buf, + "Active flows: %20zu\n" + "Total frames received: %20zu\n" + "Total frames sent: %20zu\n" + "Management frames received: %20zu\n" + "Management frames sent: %20zu\n" + "Bad EID/SAP frames: %20zu\n" + "Delivery (N+1) failures: %20zu\n" + "Buffer alloc failures: %20zu\n" + "Frame read failures: %20zu\n" + "Frame send failures: %20zu\n", + LOAD_RELAXED(ð_data.stat.n_flows), + LOAD_RELAXED(ð_data.stat.n_rcv), + LOAD_RELAXED(ð_data.stat.n_snd), + LOAD_RELAXED(ð_data.stat.n_mgmt_rcv), + LOAD_RELAXED(ð_data.stat.n_mgmt_snd), + LOAD_RELAXED(ð_data.stat.n_bad_id), + LOAD_RELAXED(ð_data.stat.n_dlv_f), + LOAD_RELAXED(ð_data.stat.n_buf_f), + LOAD_RELAXED(ð_data.stat.n_rcv_f), + LOAD_RELAXED(ð_data.stat.n_snd_f)); +#if defined(HAVE_RAW_SOCKETS) + n += sprintf(buf + n, + "Socket rcvbuf (bytes): %20d\n" + "Socket sndbuf (bytes): %20d\n" + "Socket queued (bytes): %20d\n", + rcvbuf, sndbuf, queued); +# if defined(__linux__) + n += sprintf(buf + n, + "Kernel frames received: %20zu\n" + "Kernel frames dropped: %20zu\n", + LOAD_RELAXED(ð_data.stat.kern_rcv), + LOAD_RELAXED(ð_data.stat.kern_drp)); +# endif +#endif + return n; + } + + fd = atoi(entry); + + if (fd < 0 || fd >= SYS_MAX_FLOWS) + return -1; + + flow = ð_data.fd_to_ef[fd]; + + pthread_rwlock_rdlock(ð_data.flows_lock); + + stamp = flow->stat.stamp; + if (stamp == 0) { + pthread_rwlock_unlock(ð_data.flows_lock); + return 0; + } + + pthread_rwlock_unlock(ð_data.flows_lock); + + tm = gmtime(&stamp); + strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm); + + sprintf(buf, + "Flow established at: %20s\n" + "Sent (packets): %20zu\n" + "Sent (bytes): %20zu\n" + "Send failed (packets): %20zu\n" + "Received (packets): %20zu\n" + "Received (bytes): %20zu\n" + "Delivery (N+1) failures: %20zu\n", + tmstr, + LOAD_RELAXED(&flow->stat.p_snd), + LOAD_RELAXED(&flow->stat.b_snd), + LOAD_RELAXED(&flow->stat.p_snd_f), + LOAD_RELAXED(&flow->stat.p_rcv), + LOAD_RELAXED(&flow->stat.b_rcv), + LOAD_RELAXED(&flow->stat.p_dlv_f)); + + return strlen(buf); +} + +static int eth_rib_readdir(char *** buf) +{ + char entry[RIB_PATH_LEN + 1]; + size_t i; + int idx = 0; + int n_entries; + + pthread_rwlock_rdlock(ð_data.flows_lock); + + n_entries = (int) LOAD_RELAXED(ð_data.stat.n_flows) + 1; + + *buf = malloc(sizeof(**buf) * n_entries); + if (*buf == NULL) + goto fail_entries; + + (*buf)[idx] = malloc(strlen("summary") + 1); + if ((*buf)[idx] == NULL) + goto fail_entry; + + strcpy((*buf)[idx++], "summary"); + + for (i = 0; i < SYS_MAX_FLOWS && idx < n_entries; ++i) { + if (eth_data.fd_to_ef[i].stat.stamp == 0) + continue; + + sprintf(entry, "%zu", i); + + (*buf)[idx] = malloc(strlen(entry) + 1); + if ((*buf)[idx] == NULL) + goto fail_entry; + + strcpy((*buf)[idx++], entry); + } + + pthread_rwlock_unlock(ð_data.flows_lock); + + return idx; + + fail_entry: + while (idx-- > 0) + free((*buf)[idx]); + free(*buf); + fail_entries: + pthread_rwlock_unlock(ð_data.flows_lock); + return -ENOMEM; +} + +static int eth_rib_getattr(const char * path, + struct rib_attr * attr) +{ + int fd; + char * entry; + struct ef * flow; + + entry = strstr(path, RIB_SEPARATOR) + 1; + assert(entry); + + if (strcmp(entry, "summary") == 0) { + attr->size = 2048; + attr->mtime = 0; + return 0; + } + + fd = atoi(entry); + + if (fd < 0 || fd >= SYS_MAX_FLOWS) { + attr->size = 0; + attr->mtime = 0; + return 0; + } + + flow = ð_data.fd_to_ef[fd]; + + pthread_rwlock_rdlock(ð_data.flows_lock); + + if (flow->stat.stamp != 0) { + attr->size = 2048; + attr->mtime = flow->stat.stamp; + } else { + attr->size = 0; + attr->mtime = 0; + } + + pthread_rwlock_unlock(ð_data.flows_lock); + + return 0; +} + +static struct rib_ops eth_r_ops = { + .read = eth_rib_read, + .readdir = eth_rib_readdir, + .getattr = eth_rib_getattr +}; +#endif /* IPCP_ETH_FLOW_STATS */ + #ifdef BUILD_ETH_LLC static uint8_t reverse_bits(uint8_t b) { @@ -409,12 +680,18 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr, e_frame->ethertype = eth_data.ethertype; e_frame->eid = htons(deid); e_frame->length = htons(len); + mem_hash(HASH_CRC8, &e_frame->hcs, + (uint8_t *) &e_frame->eid, + DIX_EID_SIZE + DIX_LENGTH_SIZE); frame_len = ETH_HEADER_TOT_SIZE + len; #elif defined(BUILD_ETH_LLC) e_frame->length = htons(LLC_HEADER_SIZE + len); e_frame->dsap = dsap; e_frame->ssap = ssap; e_frame->cf = cf; + mem_hash(HASH_CRC8, &e_frame->hcs, + (uint8_t *) &e_frame->dsap, + LLC_FIELDS_SIZE); frame_len = ETH_HEADER_TOT_SIZE + len; #endif @@ -440,10 +717,7 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr, } assert(FD_ISSET(eth_data.s_fd, &fds)); - if (sendto(eth_data.s_fd, - frame, - frame_len, - 0, + if (sendto(eth_data.s_fd, frame, frame_len, 0, (struct sockaddr *) ð_data.device, sizeof(eth_data.device)) <= 0) { log_dbg("Failed to send message: %s.", strerror(errno)); @@ -451,6 +725,8 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr, } #endif /* HAVE_NETMAP */ + FETCH_ADD_RELAXED(ð_data.stat.n_snd, 1); + return 0; } @@ -490,7 +766,7 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr, msg->availability = qs.availability; msg->loss = hton32(qs.loss); msg->ber = hton32(qs.ber); - msg->in_order = qs.in_order; + msg->service = qs.service; msg->max_gap = hton32(qs.max_gap); msg->timeout = hton32(qs.timeout); @@ -508,6 +784,9 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr, buf, len + data->len); free(buf); + if (ret == 0) + FETCH_ADD_RELAXED(ð_data.stat.n_mgmt_snd, 1); + return ret; } @@ -558,6 +837,8 @@ static int eth_ipcp_alloc_resp(uint8_t * dst_addr, return -1; } + FETCH_ADD_RELAXED(ð_data.stat.n_mgmt_snd, 1); + free(buf); return 0; @@ -575,7 +856,8 @@ static int eth_ipcp_req(uint8_t * r_addr, { int fd; - fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL, data); + fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL, + ETH_MAX_PACKET_SIZE, data); if (fd < 0) { log_err("Could not get new flow from IRMd."); return -1; @@ -622,7 +904,7 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr, fd = eth_data.ef_to_fd[dsap]; #endif if (fd < 0) { - pthread_rwlock_unlock(& eth_data.flows_lock); + pthread_rwlock_unlock(ð_data.flows_lock); log_err("No flow found with that SAP."); return -1; /* -EFLOWNOTFOUND */ } @@ -647,7 +929,8 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr, #elif defined(BUILD_ETH_LLC) log_dbg("Flow reply, fd %d, SSAP %d, DSAP %d.", fd, ssap, dsap); #endif - if ((ret = ipcp_flow_alloc_reply(fd, response, mpl, data)) < 0) { + if ((ret = ipcp_flow_alloc_reply(fd, response, mpl, + ETH_MAX_PACKET_SIZE, data)) < 0) { log_err("Failed to reply to flow allocation."); return -1; } @@ -689,6 +972,8 @@ static int eth_ipcp_name_query_req(const uint8_t * hash, return -1; } + FETCH_ADD_RELAXED(ð_data.stat.n_mgmt_snd, 1); + free(buf); } @@ -718,20 +1003,24 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf, qosspec_t qs; buffer_t data; + if (len < sizeof(*msg)) + return -1; + msg = (struct mgmt_msg *) buf; switch (msg->code) { case FLOW_REQ: msg_len = sizeof(*msg) + ipcp_dir_hash_len(); - assert(len >= msg_len); + if (len < msg_len) + return -1; qs.delay = ntoh32(msg->delay); qs.bandwidth = ntoh64(msg->bandwidth); qs.availability = msg->availability; qs.loss = ntoh32(msg->loss); qs.ber = ntoh32(msg->ber); - qs.in_order = msg->in_order; + qs.service = msg->service; qs.max_gap = ntoh32(msg->max_gap); qs.timeout = ntoh32(msg->timeout); @@ -752,8 +1041,6 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf, } break; case FLOW_REPLY: - assert(len >= sizeof(*msg)); - data.data = (uint8_t *) buf + sizeof(*msg); data.len = len - sizeof(*msg); @@ -769,9 +1056,13 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf, &data); break; case NAME_QUERY_REQ: + if (len < sizeof(*msg) + ipcp_dir_hash_len()) + return -1; eth_ipcp_name_query_req(buf + sizeof(*msg), r_addr); break; case NAME_QUERY_REPLY: + if (len < sizeof(*msg) + ipcp_dir_hash_len()) + return -1; eth_ipcp_name_query_reply(buf + sizeof(*msg), r_addr); break; default: @@ -844,6 +1135,12 @@ static void * eth_ipcp_packet_reader(void * o) fd_set fds; int frame_len; #endif +#if defined(HAVE_RAW_SOCKETS) + struct sockaddr_ll src; + socklen_t slen; +#endif + size_t eth_len; + uint8_t hcs; struct eth_frame * e_frame; struct mgmt_frame * frame; @@ -881,24 +1178,58 @@ static void * eth_ipcp_packet_reader(void * o) if (select(eth_data.s_fd + 1, &fds, NULL, NULL, NULL) < 0) continue; assert(FD_ISSET(eth_data.s_fd, &fds)); - if (ipcp_spb_reserve(&spb, ETH_MTU)) + if (ipcp_spb_reserve(&spb, ETH_MTU)) { + FETCH_ADD_RELAXED(ð_data.stat.n_buf_f, 1); continue; - buf = ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE); + } + buf = ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE); if (buf == NULL) { log_dbg("Failed to allocate header."); ipcp_spb_release(spb); + FETCH_ADD_RELAXED(ð_data.stat.n_buf_f, 1); continue; } - frame_len = recv(eth_data.s_fd, buf, - ETH_MTU + ETH_HEADER_TOT_SIZE, 0); + slen = sizeof(src); + /* MSG_DONTWAIT: RD_THR>1 race-loser bails with EAGAIN. */ + frame_len = recvfrom(eth_data.s_fd, buf, + ETH_MTU + ETH_HEADER_TOT_SIZE, + MSG_DONTWAIT, + (struct sockaddr *) &src, &slen); #endif - if (frame_len <= 0) { - log_dbg("Failed to receive frame."); + if (frame_len == 0) { + ipcp_spb_release(spb); + continue; /* Spurious */ + } + + if (frame_len < 0) { ipcp_spb_release(spb); + + if (errno == EAGAIN || errno == EWOULDBLOCK) + continue; + + log_dbg("Failed to rcv frame: %s.", strerror(errno)); + FETCH_ADD_RELAXED(ð_data.stat.n_rcv_f, 1); continue; } #endif +#if defined(HAVE_NETMAP) + eth_len = hdr.len; +#elif defined(HAVE_BPF) + eth_len = ((struct bpf_hdr *) buf)->bh_caplen; +#else + eth_len = (size_t) frame_len; +#endif + /* Defense in depth: reject before parsing dereferences. */ + if (eth_len < ETH_HEADER_TOT_SIZE) + goto fail_frame; + +#if defined(HAVE_RAW_SOCKETS) + /* Drop our own egress. */ + if (src.sll_pkttype == PACKET_OUTGOING) + goto fail_frame; +#endif + #if defined(HAVE_BPF) && !defined(HAVE_NETMAP) e_frame = (struct eth_frame *) (buf + ((struct bpf_hdr *) buf)->bh_hdrlen); @@ -916,6 +1247,8 @@ static void * eth_ipcp_packet_reader(void * o) e_frame->dst_hwaddr, MAC_SIZE) && memcmp(br_addr, e_frame->dst_hwaddr, MAC_SIZE)) { + FETCH_ADD_RELAXED(ð_data.stat.n_bad_id, 1); + goto fail_frame; } #endif length = ntohs(e_frame->length); @@ -923,17 +1256,41 @@ static void * eth_ipcp_packet_reader(void * o) if (e_frame->ethertype != eth_data.ethertype) goto fail_frame; + if (length > ETH_MTU) + goto fail_frame; + deid = ntohs(e_frame->eid); - if (deid == MGMT_EID) { #elif defined (BUILD_ETH_LLC) if (length > 0x05FF) /* DIX */ goto fail_frame; + if (length < LLC_HEADER_SIZE || length > ETH_MTU) + goto fail_frame; + length -= LLC_HEADER_SIZE; dsap = reverse_bits(e_frame->dsap); ssap = reverse_bits(e_frame->ssap); +#endif + + if (eth_len < ETH_HEADER_TOT_SIZE + (size_t) length) + goto fail_frame; +#if defined(BUILD_ETH_DIX) + mem_hash(HASH_CRC8, &hcs, + (uint8_t *) &e_frame->eid, + DIX_EID_SIZE + DIX_LENGTH_SIZE); +#elif defined(BUILD_ETH_LLC) + mem_hash(HASH_CRC8, &hcs, + (uint8_t *) &e_frame->dsap, + LLC_FIELDS_SIZE); +#endif + if (hcs != e_frame->hcs) + goto fail_frame; + +#if defined(BUILD_ETH_DIX) + if (deid == MGMT_EID) { +#elif defined (BUILD_ETH_LLC) if (ssap == MGMT_SAP && dsap == MGMT_SAP) { #endif ipcp_spb_release(spb); /* No need for the N+1 buffer. */ @@ -958,6 +1315,8 @@ static void * eth_ipcp_packet_reader(void * o) list_add(&frame->next, ð_data.mgmt_frames); pthread_cond_signal(ð_data.mgmt_cond); pthread_mutex_unlock(ð_data.mgmt_lock); + FETCH_ADD_RELAXED(ð_data.stat.n_rcv, 1); + FETCH_ADD_RELAXED(ð_data.stat.n_mgmt_rcv, 1); } else { pthread_rwlock_rdlock(ð_data.flows_lock); @@ -968,6 +1327,7 @@ static void * eth_ipcp_packet_reader(void * o) #endif if (fd < 0) { pthread_rwlock_unlock(ð_data.flows_lock); + FETCH_ADD_RELAXED(ð_data.stat.n_bad_id, 1); goto fail_frame; } @@ -976,13 +1336,18 @@ static void * eth_ipcp_packet_reader(void * o) || memcmp(eth_data.fd_to_ef[fd].r_addr, e_frame->src_hwaddr, MAC_SIZE)) { pthread_rwlock_unlock(ð_data.flows_lock); + FETCH_ADD_RELAXED(ð_data.stat.n_bad_id, 1); goto fail_frame; } #endif + FETCH_ADD_RELAXED(ð_data.fd_to_ef[fd].stat.p_rcv, 1); + FETCH_ADD_RELAXED(ð_data.fd_to_ef[fd].stat.b_rcv, + length); + FETCH_ADD_RELAXED(ð_data.stat.n_rcv, 1); pthread_rwlock_unlock(ð_data.flows_lock); #ifndef HAVE_NETMAP - ssm_pk_buff_head_release(spb, ETH_HEADER_TOT_SIZE); + ssm_pk_buff_pop(spb, ETH_HEADER_TOT_SIZE); ssm_pk_buff_truncate(spb, length); #else if (ipcp_spb_reserve(&spb, length)) @@ -991,8 +1356,13 @@ static void * eth_ipcp_packet_reader(void * o) buf = ssm_pk_buff_head(spb); memcpy(buf, &e_frame->payload, length); #endif - if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0) + if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0) { ipcp_spb_release(spb); + FETCH_ADD_RELAXED( + ð_data.fd_to_ef[fd].stat.p_dlv_f, + 1); + FETCH_ADD_RELAXED(ð_data.stat.n_dlv_f, 1); + } continue; fail_frame: @@ -1048,10 +1418,11 @@ static void * eth_ipcp_packet_writer(void * o) len = ssm_pk_buff_len(spb); - if (ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE) + if (ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE) == NULL) { log_dbg("Failed to allocate header."); ipcp_spb_release(spb); + FETCH_ADD_RELAXED(ð_data.stat.n_buf_f, 1); continue; } @@ -1075,8 +1446,20 @@ static void * eth_ipcp_packet_writer(void * o) dsap, ssap, #endif ssm_pk_buff_head(spb), - len)) + len)) { log_dbg("Failed to send frame."); + FETCH_ADD_RELAXED( + ð_data.fd_to_ef[fd].stat.p_snd_f, + 1); + FETCH_ADD_RELAXED(ð_data.stat.n_snd_f, 1); + } else { + FETCH_ADD_RELAXED( + ð_data.fd_to_ef[fd].stat.p_snd, + 1); + FETCH_ADD_RELAXED( + ð_data.fd_to_ef[fd].stat.b_snd, + len); + } ipcp_spb_release(spb); } } @@ -1424,12 +1807,14 @@ static int eth_init_bpf(struct ifreq * ifr) return -1; } #elif defined(HAVE_RAW_SOCKETS) +#define SOCKOPT() static int eth_init_raw_socket(struct ifreq * ifr) { int idx; - int flags; + int sndbuf; + int rcvbuf; #if defined(IPCP_ETH_QDISC_BYPASS) - int qdisc_bypass = 1; + int qdisc_bypass = 1; #endif /* ENABLE_QDISC_BYPASS */ idx = if_nametoindex(ifr->ifr_name); @@ -1437,6 +1822,7 @@ static int eth_init_raw_socket(struct ifreq * ifr) log_err("Failed to retrieve interface index."); return -1; } + memset(&(eth_data.device), 0, sizeof(eth_data.device)); eth_data.device.sll_ifindex = idx; eth_data.device.sll_family = AF_PACKET; @@ -1453,17 +1839,6 @@ static int eth_init_raw_socket(struct ifreq * ifr) goto fail_socket; } - flags = fcntl(eth_data.s_fd, F_GETFL, 0); - if (flags < 0) { - log_err("Failed to get flags."); - goto fail_device; - } - - if (fcntl(eth_data.s_fd, F_SETFL, flags | O_NONBLOCK)) { - log_err("Failed to set socket non-blocking."); - goto fail_device; - } - #if defined(IPCP_ETH_QDISC_BYPASS) if (setsockopt(eth_data.s_fd, SOL_PACKET, PACKET_QDISC_BYPASS, &qdisc_bypass, sizeof(qdisc_bypass))) { @@ -1471,6 +1846,18 @@ static int eth_init_raw_socket(struct ifreq * ifr) } #endif + sndbuf = IPCP_ETH_SNDBUF; + if (sndbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_SNDBUF, + &sndbuf, sizeof(sndbuf))) { + log_info("Failed to set SO_SNDBUF to %d.", sndbuf); + } + + rcvbuf = IPCP_ETH_RCVBUF; + if (rcvbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_RCVBUF, + &rcvbuf, sizeof(rcvbuf))) { + log_info("Failed to set SO_RCVBUF to %d.", rcvbuf); + } + if (bind(eth_data.s_fd, (struct sockaddr *) ð_data.device, sizeof(eth_data.device)) < 0) { log_err("Failed to bind socket to interface."); @@ -1543,6 +1930,12 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf) return -1; } #endif /* HAVE_NETMAP */ +#ifdef IPCP_ETH_FLOW_STATS + if (rib_reg(ETH_RIB_PATH, ð_r_ops)) { + log_err("Failed to register RIB."); + goto fail_rib_reg; + } +#endif #if defined(__linux__) if (pthread_create(ð_data.if_monitor, NULL, eth_ipcp_if_monitor, NULL)) { @@ -1606,6 +1999,10 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf) #if defined(__linux__) fail_monitor: #endif +#ifdef IPCP_ETH_FLOW_STATS + rib_unreg(ETH_RIB_PATH); + fail_rib_reg: +#endif #if defined(HAVE_NETMAP) nm_close(eth_data.nmd); #elif defined(HAVE_BPF) @@ -1637,12 +2034,14 @@ static int eth_ipcp_unreg(const uint8_t * hash) static int eth_ipcp_query(const uint8_t * hash) { uint8_t r_addr[MAC_SIZE]; - struct timespec timeout = TIMESPEC_INIT_MS(NAME_QUERY_TIMEO); + struct timespec timeout; struct dir_query * query; int ret; + int attempt; uint8_t * buf; struct mgmt_msg * msg; size_t len; + long per_ms; if (shim_data_dir_has(eth_data.shim_data, hash)) return 0; @@ -1662,32 +2061,46 @@ static int eth_ipcp_query(const uint8_t * hash) memset(r_addr, 0xff, MAC_SIZE); - query = shim_data_dir_query_create(eth_data.shim_data, hash); - if (query == NULL) { - free(buf); - return -1; - } + per_ms = NAME_QUERY_TIMEO / (NAME_QUERY_RETRIES + 1); + + ret = -1; + for (attempt = 0; attempt <= NAME_QUERY_RETRIES; ++attempt) { + query = shim_data_dir_query_create(eth_data.shim_data, hash); + if (query == NULL) { + ret = -1; + break; + } - if (eth_ipcp_send_frame(r_addr, + if (eth_ipcp_send_frame(r_addr, #if defined(BUILD_ETH_DIX) - MGMT_EID, + MGMT_EID, #elif defined(BUILD_ETH_LLC) - reverse_bits(MGMT_SAP), - reverse_bits(MGMT_SAP), + reverse_bits(MGMT_SAP), + reverse_bits(MGMT_SAP), #endif - buf, len)) { - log_err("Failed to send management frame."); + buf, len)) { + log_err("Failed to send management frame."); + shim_data_dir_query_destroy(eth_data.shim_data, + query); + ret = -1; + break; + } + + FETCH_ADD_RELAXED(ð_data.stat.n_mgmt_snd, 1); + + timeout.tv_sec = per_ms / 1000; + timeout.tv_nsec = (per_ms % 1000) * 1000000L; + + ret = shim_data_dir_query_wait(query, &timeout); + shim_data_dir_query_destroy(eth_data.shim_data, query); - free(buf); - return -1; + + if (ret != -ETIMEDOUT) + break; } free(buf); - ret = shim_data_dir_query_wait(query, &timeout); - - shim_data_dir_query_destroy(eth_data.shim_data, query); - return ret; } @@ -1748,6 +2161,14 @@ static int eth_ipcp_flow_alloc(int fd, } fset_add(eth_data.np1_flows, fd); +#ifdef IPCP_ETH_FLOW_STATS + pthread_rwlock_wrlock(ð_data.flows_lock); + memset(ð_data.fd_to_ef[fd].stat, 0, + sizeof(eth_data.fd_to_ef[fd].stat)); + eth_data.fd_to_ef[fd].stat.stamp = time(NULL); + FETCH_ADD_RELAXED(ð_data.stat.n_flows, 1); + pthread_rwlock_unlock(ð_data.flows_lock); +#endif #if defined(BUILD_ETH_LLC) log_dbg("Assigned SAP %d for fd %d.", ssap, fd); #endif @@ -1808,6 +2229,14 @@ static int eth_ipcp_flow_alloc_resp(int fd, } fset_add(eth_data.np1_flows, fd); +#ifdef IPCP_ETH_FLOW_STATS + pthread_rwlock_wrlock(ð_data.flows_lock); + memset(ð_data.fd_to_ef[fd].stat, 0, + sizeof(eth_data.fd_to_ef[fd].stat)); + eth_data.fd_to_ef[fd].stat.stamp = time(NULL); + FETCH_ADD_RELAXED(ð_data.stat.n_flows, 1); + pthread_rwlock_unlock(ð_data.flows_lock); +#endif #if defined(BUILD_ETH_LLC) log_dbg("Assigned SAP %d for fd %d.", ssap, fd); #endif @@ -1836,6 +2265,12 @@ static int eth_ipcp_flow_dealloc(int fd) #endif memset(ð_data.fd_to_ef[fd].r_addr, 0, MAC_SIZE); +#ifdef IPCP_ETH_FLOW_STATS + memset(ð_data.fd_to_ef[fd].stat, 0, + sizeof(eth_data.fd_to_ef[fd].stat)); + FETCH_SUB_RELAXED(ð_data.stat.n_flows, 1); +#endif + pthread_rwlock_unlock(ð_data.flows_lock); ipcp_flow_dealloc(fd); @@ -1902,6 +2337,9 @@ int main(int argc, #ifdef __linux__ pthread_join(eth_data.if_monitor, NULL); #endif +#ifdef IPCP_ETH_FLOW_STATS + rib_unreg(ETH_RIB_PATH); +#endif } ipcp_stop(); diff --git a/src/ipcpd/ipcp.c b/src/ipcpd/ipcp.c index 5ad2401f..1052a686 100644 --- a/src/ipcpd/ipcp.c +++ b/src/ipcpd/ipcp.c @@ -363,6 +363,7 @@ static void * acceptloop(void * o) int ipcp_wait_flow_req_arr(const uint8_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data) { struct timespec ts = TIMESPEC_INIT_MS(ALLOC_TIMEOUT); @@ -392,7 +393,7 @@ int ipcp_wait_flow_req_arr(const uint8_t * dst, assert(ipcpd.alloc_id == -1); - fd = ipcp_flow_req_arr(&hash, qs, mpl, data); + fd = ipcp_flow_req_arr(&hash, qs, mpl, mtu, data); if (fd < 0) { pthread_mutex_unlock(&ipcpd.alloc_lock); log_err("Failed to get fd for flow."); diff --git a/src/ipcpd/ipcp.h b/src/ipcpd/ipcp.h index 26a780a3..0adcc694 100644 --- a/src/ipcpd/ipcp.h +++ b/src/ipcpd/ipcp.h @@ -98,6 +98,7 @@ enum ipcp_state ipcp_get_state(void); int ipcp_wait_flow_req_arr(const uint8_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data); int ipcp_wait_flow_resp(const int fd); diff --git a/src/ipcpd/local/main.c b/src/ipcpd/local/main.c index 377a7df3..eb9836f2 100644 --- a/src/ipcpd/local/main.c +++ b/src/ipcpd/local/main.c @@ -203,7 +203,8 @@ static int local_ipcp_flow_alloc(int fd, HASH_VAL32(dst), fd); assert(dst); - out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL, data); + out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL, + IPCP_LOCAL_MTU, data); if (out_fd < 0) { log_dbg("Flow allocation failed: %d", out_fd); return -1; @@ -236,15 +237,6 @@ static int local_ipcp_flow_alloc_resp(int fd, return -1; } - if (response < 0) { - pthread_rwlock_wrlock(&local_data.lock); - if (local_data.in_out[fd] != -1) - local_data.in_out[local_data.in_out[fd]] = fd; - local_data.in_out[fd] = -1; - pthread_rwlock_unlock(&local_data.lock); - return 0; - } - pthread_rwlock_rdlock(&local_data.lock); out_fd = local_data.in_out[fd]; @@ -263,9 +255,17 @@ static int local_ipcp_flow_alloc_resp(int fd, return -1; } + if (response < 0) { + ipcp_flow_alloc_reply(out_fd, response, mpl, + IPCP_LOCAL_MTU, data); + log_info("Flow allocation rejected, fds (%d, %d).", out_fd, fd); + return 0; + } + fset_add(local_data.flows, fd); - if (ipcp_flow_alloc_reply(out_fd, response, mpl, data) < 0) { + if (ipcp_flow_alloc_reply(out_fd, response, mpl, + IPCP_LOCAL_MTU, data) < 0) { log_err("Failed to reply to allocation"); fset_del(local_data.flows, fd); return -1; diff --git a/src/ipcpd/udp/udp.c b/src/ipcpd/udp/udp.c index 452bbc1a..93e88b9b 100644 --- a/src/ipcpd/udp/udp.c +++ b/src/ipcpd/udp/udp.c @@ -47,6 +47,10 @@ #include <stdlib.h> #include <sys/wait.h> #include <fcntl.h> +#include <unistd.h> +#if defined(__linux__) +#include <netinet/ip.h> +#endif #define FLOW_REQ 1 #define FLOW_REPLY 2 @@ -87,7 +91,7 @@ struct mgmt_msg { uint8_t code; /* QoS parameters from spec */ uint8_t availability; - uint8_t in_order; + uint8_t service; } __attribute__((packed)); struct mgmt_frame { @@ -130,6 +134,53 @@ static const char * __inet_ntop(const struct __ADDR * addr, return inet_ntop(__AF, addr, buf, __ADDRSTRLEN); } +#if defined(BUILD_IPCP_UDP4) +#define UDP_MTU_FALLBACK IPCP_UDP4_MTU +#define UDP_IP_OVERHEAD 28U /* IPv4 + UDP */ +#else +#define UDP_MTU_FALLBACK IPCP_UDP6_MTU +#define UDP_IP_OVERHEAD 48U /* IPv6 + UDP */ +#endif + +static uint32_t udp_query_mtu(const struct __SOCKADDR * saddr) +{ +#if defined(__linux__) && (defined(IP_MTU) || defined(IPV6_MTU)) + int sock; + int mtu = 0; + socklen_t len = sizeof(mtu); + + sock = socket(__AF, SOCK_DGRAM, IPPROTO_UDP); + if (sock < 0) + return UDP_MTU_FALLBACK; + + if (connect(sock, (const struct sockaddr *) saddr, + sizeof(*saddr)) < 0) + goto fallback; + +#if defined(BUILD_IPCP_UDP4) && defined(IP_MTU) + if (getsockopt(sock, IPPROTO_IP, IP_MTU, &mtu, &len) < 0) + goto fallback; +#elif defined(BUILD_IPCP_UDP6) && defined(IPV6_MTU) + if (getsockopt(sock, IPPROTO_IPV6, IPV6_MTU, &mtu, &len) < 0) + goto fallback; +#else + goto fallback; +#endif + close(sock); + + if (mtu <= (int) UDP_IP_OVERHEAD) + return UDP_MTU_FALLBACK; + + return (uint32_t) mtu - UDP_IP_OVERHEAD; + + fallback: + close(sock); +#else + (void) saddr; +#endif + return UDP_MTU_FALLBACK; +} + static int udp_data_init(void) { int i; @@ -220,7 +271,7 @@ static int udp_ipcp_port_alloc(const struct __SOCKADDR * r_saddr, msg->availability = qs.availability; msg->loss = hton32(qs.loss); msg->ber = hton32(qs.ber); - msg->in_order = qs.in_order; + msg->service = qs.service; msg->max_gap = hton32(qs.max_gap); msg->timeout = hton32(qs.timeout); @@ -285,7 +336,8 @@ static int udp_ipcp_port_req(struct __SOCKADDR * c_saddr, { int fd; - fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL, data); + fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL, + udp_query_mtu(c_saddr), data); if (fd < 0) { log_err("Could not get new flow from IRMd."); return -1; @@ -332,7 +384,8 @@ static int udp_ipcp_port_alloc_reply(const struct __SOCKADDR * saddr, pthread_rwlock_unlock(&udp_data.flows_lock); - if (ipcp_flow_alloc_reply(s_eid, response, mpl, data) < 0) { + if (ipcp_flow_alloc_reply(s_eid, response, mpl, + udp_query_mtu(saddr), data) < 0) { log_err("Failed to reply to flow allocation."); return -1; } @@ -352,13 +405,18 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, qosspec_t qs; buffer_t data; + /* Defence against malformed/corrupted wire input. */ + if (len < sizeof(*msg)) + return -1; + msg = (struct mgmt_msg *) buf; switch (msg->code) { case FLOW_REQ: msg_len = sizeof(*msg) + ipcp_dir_hash_len(); - assert(len >= msg_len); + if (len < msg_len) + return -1; data.len = len - msg_len; data.data = (uint8_t *) buf + msg_len; @@ -369,7 +427,7 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, qs.availability = msg->availability; qs.loss = ntoh32(msg->loss); qs.ber = ntoh32(msg->ber); - qs.in_order = msg->in_order; + qs.service = msg->service; qs.max_gap = ntoh32(msg->max_gap); qs.timeout = ntoh32(msg->timeout); @@ -377,8 +435,6 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, (uint8_t *) (msg + 1), qs, &data); case FLOW_REPLY: - assert(len >= sizeof(*msg)); - data.len = len - sizeof(*msg); data.data = (uint8_t *) buf + sizeof(*msg); @@ -549,7 +605,7 @@ static void * udp_ipcp_packet_writer(void * o) continue; } - buf = ssm_pk_buff_head_alloc(spb, OUR_HEADER_LEN); + buf = ssm_pk_buff_push(spb, OUR_HEADER_LEN); if (buf == NULL) { log_dbg("Failed to allocate header."); ipcp_spb_release(spb); diff --git a/src/ipcpd/unicast/dt.c b/src/ipcpd/unicast/dt.c index 252477f4..cc54efa1 100644 --- a/src/ipcpd/unicast/dt.c +++ b/src/ipcpd/unicast/dt.c @@ -139,7 +139,7 @@ static void dt_pci_shrink(struct ssm_pk_buff * spb) { assert(spb); - ssm_pk_buff_head_release(spb, dt_pci_info.head_size); + ssm_pk_buff_pop(spb, dt_pci_info.head_size); } struct { @@ -168,12 +168,12 @@ struct { size_t f_nhp_pkt[QOS_CUBE_MAX]; size_t f_nhp_bytes[QOS_CUBE_MAX]; pthread_mutex_t lock; - } stat[PROG_MAX_FLOWS]; + } stat[PROC_MAX_FLOWS]; size_t n_flows; #endif struct bmp * res_fds; - struct comp_info comps[PROG_RES_FDS]; + struct comp_info comps[PROC_RES_FDS]; pthread_rwlock_t lock; pthread_t listener; @@ -220,7 +220,7 @@ static int dt_rib_read(const char * path, tm = gmtime(&dt.stat[fd].stamp); strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm); - if (fd >= PROG_RES_FDS) { + if (fd >= PROC_RES_FDS) { fccntl(fd, FLOWGRXQLEN, &rxqlen); fccntl(fd, FLOWGTXQLEN, &txqlen); } @@ -296,7 +296,7 @@ static int dt_rib_readdir(char *** buf) if (*buf == NULL) goto fail_entries; - for (i = 0; i < PROG_MAX_FLOWS; ++i) { + for (i = 0; i < PROC_MAX_FLOWS; ++i) { pthread_mutex_lock(&dt.stat[i].lock); if (dt.stat[i].stamp == 0) { @@ -514,7 +514,7 @@ static void packet_handler(int fd, #endif } else { dt_pci_shrink(spb); - if (dt_pci.eid >= PROG_RES_FDS) { + if (dt_pci.eid >= PROC_RES_FDS) { uint8_t ecn = *(head + dt_pci_info.ecn_o); fa_np1_rcv(dt_pci.eid, ecn, spb); return; @@ -636,13 +636,13 @@ int dt_init(struct dt_config cfg) goto fail_rwlock_init; } - dt.res_fds = bmp_create(PROG_RES_FDS, 0); + dt.res_fds = bmp_create(PROC_RES_FDS, 0); if (dt.res_fds == NULL) goto fail_res_fds; #ifdef IPCP_FLOW_STATS memset(dt.stat, 0, sizeof(dt.stat)); - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) if (pthread_mutex_init(&dt.stat[i].lock, NULL)) { log_err("Failed to init mutex for flow %d.", i); for (j = 0; j < i; ++j) @@ -662,7 +662,7 @@ int dt_init(struct dt_config cfg) fail_rib_reg: #ifdef IPCP_FLOW_STATS - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) pthread_mutex_destroy(&dt.stat[i].lock); fail_stat_lock: #endif @@ -691,7 +691,7 @@ void dt_fini(void) sprintf(dtstr, "%s.%" PRIu64, DT, dt.addr); rib_unreg(dtstr); #ifdef IPCP_FLOW_STATS - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) pthread_mutex_destroy(&dt.stat[i].lock); #endif bmp_destroy(dt.res_fds); @@ -791,7 +791,7 @@ int dt_reg_comp(void * comp, void dt_unreg_comp(int eid) { - assert(eid >= 0 && eid < PROG_RES_FDS); + assert(eid >= 0 && eid < PROC_RES_FDS); pthread_rwlock_wrlock(&dt.lock); @@ -823,7 +823,7 @@ int dt_write_packet(uint64_t dst_addr, #ifdef IPCP_FLOW_STATS len = ssm_pk_buff_len(spb); - if (eid < PROG_RES_FDS) { + if (eid < PROC_RES_FDS) { pthread_mutex_lock(&dt.stat[eid].lock); ++dt.stat[eid].lcl_r_pkt[qc]; @@ -837,7 +837,7 @@ int dt_write_packet(uint64_t dst_addr, log_dbg("Could not get nhop for " ADDR_FMT32 ".", ADDR_VAL32(&dst_addr)); #ifdef IPCP_FLOW_STATS - if (eid < PROG_RES_FDS) { + if (eid < PROC_RES_FDS) { pthread_mutex_lock(&dt.stat[eid].lock); ++dt.stat[eid].lcl_r_pkt[qc]; @@ -849,7 +849,7 @@ int dt_write_packet(uint64_t dst_addr, return -EPERM; } - head = ssm_pk_buff_head_alloc(spb, dt_pci_info.head_size); + head = ssm_pk_buff_push(spb, dt_pci_info.head_size); if (head == NULL) { log_dbg("Failed to allocate DT header."); goto fail_write; @@ -876,7 +876,7 @@ int dt_write_packet(uint64_t dst_addr, #ifdef IPCP_FLOW_STATS pthread_mutex_lock(&dt.stat[fd].lock); - if (dt_pci.eid < PROG_RES_FDS) { + if (dt_pci.eid < PROC_RES_FDS) { ++dt.stat[fd].lcl_w_pkt[qc]; dt.stat[fd].lcl_w_bytes[qc] += len; } @@ -891,7 +891,7 @@ int dt_write_packet(uint64_t dst_addr, #ifdef IPCP_FLOW_STATS pthread_mutex_lock(&dt.stat[fd].lock); - if (eid < PROG_RES_FDS) { + if (eid < PROC_RES_FDS) { ++dt.stat[fd].lcl_w_pkt[qc]; dt.stat[fd].lcl_w_bytes[qc] += len; } diff --git a/src/ipcpd/unicast/fa.c b/src/ipcpd/unicast/fa.c index ddf78e22..c0447885 100644 --- a/src/ipcpd/unicast/fa.c +++ b/src/ipcpd/unicast/fa.c @@ -58,12 +58,12 @@ #define CLOCK_REALTIME_COARSE CLOCK_REALTIME #endif -#define TIMEOUT 10 * MILLION /* nanoseconds */ +#define TIMEOUT 10 * MILLION /* nanoseconds */ +#define MSGBUFSZ 32768 #define FLOW_REQ 0 #define FLOW_REPLY 1 #define FLOW_UPDATE 2 -#define MSGBUFSZ 2048 #define STAT_FILE_LEN 0 @@ -81,7 +81,7 @@ struct fa_msg { uint16_t ece; uint8_t code; uint8_t availability; - uint8_t in_order; + uint8_t service; } __attribute__((packed)); struct cmd { @@ -111,7 +111,7 @@ struct fa_flow { struct { pthread_rwlock_t flows_lock; - struct fa_flow flows[PROG_MAX_FLOWS]; + struct fa_flow flows[PROC_MAX_FLOWS]; #ifdef IPCP_FLOW_STATS size_t n_flows; #endif @@ -145,7 +145,7 @@ static int fa_rib_read(const char * path, fd = atoi(entry); - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -1; if (len < 1536) @@ -225,7 +225,7 @@ static int fa_rib_readdir(char *** buf) if (*buf == NULL) goto fail_entries; - for (i = 0; i < PROG_MAX_FLOWS; ++i) { + for (i = 0; i < PROC_MAX_FLOWS; ++i) { struct fa_flow * flow; flow = &fa.flows[i]; @@ -306,7 +306,7 @@ static int eid_to_fd(uint64_t eid) fd = eid & 0xFFFFFFFF; - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -1; flow = &fa.flows[fd]; @@ -496,11 +496,12 @@ static int fa_handle_flow_req(struct fa_msg * msg, qs.availability = msg->availability; qs.loss = ntoh32(msg->loss); qs.ber = ntoh32(msg->ber); - qs.in_order = msg->in_order; + qs.service = msg->service; qs.max_gap = ntoh32(msg->max_gap); qs.timeout = ntoh32(msg->timeout); - fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL, &data); + fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL, + IPCP_UNICAST_MTU, &data); if (fd < 0) return fd; @@ -528,7 +529,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg, time_t mpl = IPCP_UNICAST_MPL; int response; - assert(len >= sizeof(*msg)); + if (len < sizeof(*msg)) + return -EINVAL; data.data = (uint8_t *) msg + sizeof(*msg); data.len = len - sizeof(*msg); @@ -558,7 +560,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg, pthread_rwlock_unlock(&fa.flows_lock); - if (ipcp_flow_alloc_reply(fd, response, mpl, &data) < 0) { + if (ipcp_flow_alloc_reply(fd, response, mpl, + IPCP_UNICAST_MTU, &data) < 0) { log_err("Failed to reply for flow allocation on fd %d.", fd); return -EIRMD; } @@ -572,8 +575,8 @@ static int fa_handle_flow_update(struct fa_msg * msg, struct fa_flow * flow; int fd; - (void) len; - assert(len >= sizeof(*msg)); + if (len < sizeof(*msg)) + return -EINVAL; pthread_rwlock_wrlock(&fa.flows_lock); @@ -789,7 +792,7 @@ int fa_alloc(int fd, msg->availability = qs.availability; msg->loss = hton32(qs.loss); msg->ber = hton32(qs.ber); - msg->in_order = qs.in_order; + msg->service = qs.service; msg->max_gap = hton32(qs.max_gap); msg->timeout = hton32(qs.timeout); diff --git a/src/ipcpd/unicast/routing/graph.c b/src/ipcpd/unicast/routing/graph.c index 0226c762..c168eb7d 100644 --- a/src/ipcpd/unicast/routing/graph.c +++ b/src/ipcpd/unicast/routing/graph.c @@ -603,9 +603,9 @@ static int graph_routing_table_lfa(struct graph * graph, struct list_head * table, int ** dist) { - int * n_dist[PROG_MAX_FLOWS]; - uint64_t addrs[PROG_MAX_FLOWS]; - int n_index[PROG_MAX_FLOWS]; + int * n_dist[PROC_MAX_FLOWS]; + uint64_t addrs[PROC_MAX_FLOWS]; + int n_index[PROC_MAX_FLOWS]; struct list_head * p; struct list_head * q; struct vertex * v; @@ -618,7 +618,7 @@ static int graph_routing_table_lfa(struct graph * graph, if (graph_routing_table_simple(graph, s_addr, table, dist)) goto fail_table; - for (j = 0; j < PROG_MAX_FLOWS; j++) { + for (j = 0; j < PROC_MAX_FLOWS; j++) { n_dist[j] = NULL; n_index[j] = -1; addrs[j] = -1; diff --git a/src/ipcpd/unicast/routing/link-state.c b/src/ipcpd/unicast/routing/link-state.c index 051dd98d..c4ea9e1c 100644 --- a/src/ipcpd/unicast/routing/link-state.c +++ b/src/ipcpd/unicast/routing/link-state.c @@ -415,7 +415,7 @@ static void calculate_pff(struct routing_i * instance) struct list_head table; struct list_head * p; struct list_head * q; - int fds[PROG_MAX_FLOWS]; + int fds[PROC_MAX_FLOWS]; assert(instance); diff --git a/src/irmd/main.c b/src/irmd/main.c index e610a015..4808934f 100644 --- a/src/irmd/main.c +++ b/src/irmd/main.c @@ -86,7 +86,9 @@ #define TIMESYNC_SLACK 100 /* ms */ #define OAP_SEEN_TIMER 20 /* s */ #define DEALLOC_TIME 300 /* s */ -#define DIRECT_MPL 1 /* s */ +#define DIRECT_MPL 20 /* ms */ +/* bytes; in-process, bounded only by PUP/GSPP. */ +#define DIRECT_MTU 65000 enum irm_state { IRMD_NULL = 0, @@ -910,6 +912,10 @@ static int flow_accept(struct flow_info * flow, flow->uid = reg_get_proc_uid(flow->n_pid); err = oap_srv_process(&info, req_hdr, &resp_hdr, data, sk); + if (err == -EREPLAY) { + log_warn("Dropping replayed alloc request for %s.", name); + goto fail_replay; + } if (err < 0) { log_err("OAP processing failed for %s.", name); goto fail_oap; @@ -938,6 +944,9 @@ static int flow_accept(struct flow_info * flow, fail_oap: if (!reg_flow_is_direct(flow->id)) ipcp_flow_alloc_resp(flow, err, resp_hdr); + fail_replay: + freebuf(req_hdr); + freebuf(resp_hdr); fail_wait: reg_destroy_flow(flow->id); fail_flow: @@ -1209,6 +1218,7 @@ static int flow_alloc_direct(const char * dst, acc.n_1_pid = flow->n_pid; acc.mpl = DIRECT_MPL; + acc.mtu = DIRECT_MTU; acc.qs = flow->qs; acc.state = FLOW_ALLOCATED; @@ -1244,6 +1254,7 @@ static int flow_alloc_direct(const char * dst, flow->id = acc.id; flow->n_1_pid = acc.n_pid; flow->mpl = DIRECT_MPL; + flow->mtu = DIRECT_MTU; flow->state = FLOW_ALLOCATED; log_info("Flow %d allocated (direct) for %d to %s.", @@ -2416,6 +2427,7 @@ int main(int argc, pthread_sigmask(SIG_UNBLOCK, &sigset, NULL); crypt_secure_malloc_fini(); + crypt_cleanup(); reg_clear(); diff --git a/src/irmd/oap.c b/src/irmd/oap.c deleted file mode 100644 index 1831f533..00000000 --- a/src/irmd/oap.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Ouroboros - Copyright (C) 2016 - 2026 - * - * OAP - Shared credential and configuration loading - * - * Dimitri Staessens <dimitri@ouroboros.rocks> - * Sander Vrijders <sander@ouroboros.rocks> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., http://www.fsf.org/about/contact/. - */ - -#if defined(__linux__) || defined(__CYGWIN__) - #define _DEFAULT_SOURCE -#else - #define _POSIX_C_SOURCE 200809L -#endif - -#define OUROBOROS_PREFIX "irmd/oap" - -#include <ouroboros/crypt.h> -#include <ouroboros/errno.h> -#include <ouroboros/logs.h> - -#include "config.h" - -#include <assert.h> -#include <string.h> -#include <sys/stat.h> - -/* - * Shared credential and configuration loading helpers - */ - -#ifndef OAP_TEST_MODE - -static bool file_exists(const char * path) -{ - struct stat s; - - if (stat(path, &s) < 0 && errno == ENOENT) { - log_dbg("File %s does not exist.", path); - return false; - } - - return true; -} - -int load_credentials(const char * name, - const struct name_sec_paths * paths, - void ** pkp, - void ** crt) -{ - assert(paths != NULL); - assert(pkp != NULL); - assert(crt != NULL); - - *pkp = NULL; - *crt = NULL; - - if (!file_exists(paths->crt) || !file_exists(paths->key)) { - log_info("No authentication certificates for %s.", name); - return 0; - } - - if (crypt_load_crt_file(paths->crt, crt) < 0) { - log_err("Failed to load %s for %s.", paths->crt, name); - goto fail_crt; - } - - if (crypt_load_privkey_file(paths->key, pkp) < 0) { - log_err("Failed to load %s for %s.", paths->key, name); - goto fail_key; - } - - log_info("Loaded authentication certificates for %s.", name); - - return 0; - - fail_key: - crypt_free_crt(*crt); - *crt = NULL; - fail_crt: - return -EAUTH; -} - -int load_kex_config(const char * name, - const char * path, - struct sec_config * cfg) -{ - assert(name != NULL); - assert(cfg != NULL); - - memset(cfg, 0, sizeof(*cfg)); - - /* Load encryption config */ - if (!file_exists(path)) - log_dbg("No encryption %s for %s.", path, name); - - if (load_sec_config_file(cfg, path) < 0) { - log_warn("Failed to load %s for %s.", path, name); - return -1; - } - - if (!IS_KEX_ALGO_SET(cfg)) { - log_info("Key exchange not configured for %s.", name); - return 0; - } - - if (cfg->c.nid == NID_undef || crypt_nid_to_str(cfg->c.nid) == NULL) { - log_err("Invalid cipher NID %d for %s.", cfg->c.nid, name); - return -ECRYPT; - } - - log_info("Encryption enabled for %s.", name); - - return 0; -} - -#endif /* OAP_TEST_MODE */ diff --git a/src/irmd/oap/auth.c b/src/irmd/oap/auth.c index a11ab158..d165de73 100644 --- a/src/irmd/oap/auth.c +++ b/src/irmd/oap/auth.c @@ -174,6 +174,7 @@ int oap_check_hdr(const struct oap_hdr * hdr) fail_replay: pthread_mutex_unlock(&oap_auth.replay.mtx); free(new); + return -EREPLAY; fail_stamp: return -EAUTH; } @@ -183,7 +184,7 @@ int oap_auth_peer(char * name, const struct oap_hdr * peer_hdr) { void * crt; - void * pk; + void * pk = NULL; buffer_t sign; /* Signed region */ uint8_t * id = peer_hdr->id.data; @@ -244,8 +245,8 @@ int oap_auth_peer(char * name, return 0; fail_check_sig: - crypt_free_key(pk); fail_crt: + crypt_free_key(pk); crypt_free_crt(crt); fail_check: return -EAUTH; diff --git a/src/irmd/oap/cli.c b/src/irmd/oap/cli.c index 8ecd317d..7a202da7 100644 --- a/src/irmd/oap/cli.c +++ b/src/irmd/oap/cli.c @@ -50,7 +50,7 @@ struct oap_cli_ctx { uint8_t __id[OAP_ID_SIZE]; buffer_t id; - uint8_t kex_buf[MSGBUFSZ]; + uint8_t kex_buf[CRYPT_KEY_BUFSZ]; uint8_t req_hash[MAX_HASH_SIZE]; size_t req_hash_len; int req_md_nid; diff --git a/src/irmd/oap/srv.c b/src/irmd/oap/srv.c index 36391e50..587a8f9f 100644 --- a/src/irmd/oap/srv.c +++ b/src/irmd/oap/srv.c @@ -180,11 +180,7 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr, cli_rank = crypt_kdf_rank(peer_hdr->kdf_nid); srv_rank = crypt_kdf_rank(kcfg->k.nid); - /* - * For client-encap KEM, the KDF is baked into - * the ciphertext. The server must use the client's - * KDF and can only verify the minimum. - */ + /* Client-encap KEM bakes KDF into ciphertext; verify min. */ if (OAP_KEX_ROLE(peer_hdr) == KEM_MODE_CLIENT_ENCAP) { if (srv_rank > cli_rank) { log_err_id(id, "Client KDF too weak."); @@ -384,15 +380,16 @@ int oap_srv_process(const struct name_info * info, struct oap_hdr peer_hdr; struct oap_hdr local_hdr; struct sec_config kcfg; - uint8_t kex_buf[MSGBUFSZ]; + uint8_t kex_buf[CRYPT_KEY_BUFSZ]; uint8_t hash_buf[MAX_HASH_SIZE]; buffer_t req_hash = BUF_INIT; ssize_t hash_ret; - char cli_name[NAME_SIZE + 1]; /* TODO */ + char cli_name[NAME_SIZE + 1]; uint8_t * id; void * pkp = NULL; void * crt = NULL; int req_md_nid; + int ret; assert(info != NULL); assert(rsp_buf != NULL); @@ -427,8 +424,13 @@ int oap_srv_process(const struct name_info * info, id = peer_hdr.id.data; /* Logging */ - if (oap_check_hdr(&peer_hdr) < 0) { - log_err_id(id, "OAP header failed replay check."); + ret = oap_check_hdr(&peer_hdr); + if (ret == -EREPLAY) { + log_warn_id(id, "OAP header failed replay check."); + goto fail_replay; + } + if (ret < 0) { + log_err_id(id, "OAP header check failed."); goto fail_auth; } @@ -491,6 +493,11 @@ int oap_srv_process(const struct name_info * info, fail_cred: return -EAUTH; + fail_replay: + crypt_free_crt(crt); + crypt_free_key(pkp); + return -EREPLAY; + fail_kex: crypt_free_crt(crt); crypt_free_key(pkp); diff --git a/src/irmd/oap/tests/oap_test.c b/src/irmd/oap/tests/oap_test.c index 2f0f0b4d..a525d988 100644 --- a/src/irmd/oap/tests/oap_test.c +++ b/src/irmd/oap/tests/oap_test.c @@ -32,6 +32,7 @@ #include <ouroboros/crypt.h> #include <ouroboros/endian.h> +#include <ouroboros/errno.h> #include <ouroboros/flow.h> #include <ouroboros/name.h> #include <ouroboros/random.h> @@ -1053,9 +1054,9 @@ static int test_oap_replay_packet(void) freebuf(ctx.req_hdr); ctx.req_hdr = saved_req; - /* Replayed request should fail */ - if (oap_srv_process_ctx(&ctx) == 0) { - printf("Server should reject replayed packet.\n"); + /* Replay must return -EREPLAY so callers can drop silently. */ + if (oap_srv_process_ctx(&ctx) != -EREPLAY) { + printf("Replayed packet rejection != -EREPLAY.\n"); goto fail_cleanup; } @@ -1071,6 +1072,74 @@ static int test_oap_replay_packet(void) return TEST_RC_FAIL; } +/* Server rejects client certificate when root CA is missing from store */ +static int test_oap_missing_root_ca(void) +{ + struct oap_test_ctx ctx; + void * im_ca = NULL; + + test_default_cfg(); + + TEST_START(); + + memset(&ctx, 0, sizeof(ctx)); + + strcpy(ctx.srv.info.name, "test-1.unittest.o7s"); + strcpy(ctx.cli.info.name, "test-1.unittest.o7s"); + + if (oap_auth_init() < 0) { + printf("Failed to init OAP.\n"); + goto fail; + } + + /* Load intermediate CA but intentionally omit the root CA */ + if (crypt_load_crt_str(im_ca_crt_ec, &im_ca) < 0) { + printf("Failed to load intermediate CA cert.\n"); + goto fail_fini; + } + + ctx.im_ca = im_ca; + + if (oap_auth_add_ca_crt(im_ca) < 0) { + printf("Failed to add intermediate CA cert to store.\n"); + goto fail_fini; + } + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_fini; + } + + /* Server processes and signs response - succeeds without root CA */ + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_teardown; + } + + /* Client verifies server certificate against trust store: + * should reject because root CA is not in the store */ + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Client should reject without root CA.\n"); + goto fail_teardown; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + + fail_teardown: + oap_test_teardown(&ctx); + TEST_FAIL(); + return TEST_RC_FAIL; + fail_fini: + crypt_free_crt(im_ca); + oap_auth_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + /* Test that client rejects server with wrong certificate name */ static int test_oap_server_name_mismatch(void) { @@ -1149,6 +1218,7 @@ int oap_test(int argc, ret |= test_oap_outdated_packet(); ret |= test_oap_future_packet(); ret |= test_oap_replay_packet(); + ret |= test_oap_missing_root_ca(); ret |= test_oap_server_name_mismatch(); #else (void) test_oap_roundtrip_auth_only; @@ -1173,9 +1243,12 @@ int oap_test(int argc, (void) test_oap_outdated_packet; (void) test_oap_future_packet; (void) test_oap_replay_packet; + (void) test_oap_missing_root_ca; (void) test_oap_server_name_mismatch; ret = TEST_RC_SKIP; #endif + crypt_cleanup(); + return ret; } diff --git a/src/irmd/oap/tests/oap_test_ml_dsa.c b/src/irmd/oap/tests/oap_test_ml_dsa.c index f9e6bdb2..81b307ab 100644 --- a/src/irmd/oap/tests/oap_test_ml_dsa.c +++ b/src/irmd/oap/tests/oap_test_ml_dsa.c @@ -442,6 +442,7 @@ int oap_test_ml_dsa(int argc, ret = TEST_RC_SKIP; #endif + crypt_cleanup(); return ret; } diff --git a/src/irmd/reg/flow.c b/src/irmd/reg/flow.c index 93c3e128..5c709dea 100644 --- a/src/irmd/reg/flow.c +++ b/src/irmd/reg/flow.c @@ -42,6 +42,7 @@ struct reg_flow * reg_flow_create(const struct flow_info * info) assert(info->n_pid != 0); assert(info->n_1_pid == 0); assert(info->mpl == 0); + assert(info->mtu == 0); assert(info->state == FLOW_INIT); flow = malloc(sizeof(*flow)); @@ -160,6 +161,7 @@ int reg_flow_update(struct reg_flow * flow, assert(info->mpl != 0); flow->info.mpl = info->mpl; + flow->info.mtu = info->mtu; if (flow->info.state == FLOW_ALLOC_PENDING) break; diff --git a/src/irmd/reg/reg.c b/src/irmd/reg/reg.c index 9ee47221..365064e5 100644 --- a/src/irmd/reg/reg.c +++ b/src/irmd/reg/reg.c @@ -1820,7 +1820,11 @@ int reg_respond_alloc(struct flow_info * info, goto fail_flow; } - assert(flow->info.state == FLOW_ALLOC_PENDING); + if (flow->info.state != FLOW_ALLOC_PENDING) { + log_warn("Flow %d already responded.", info->id); + goto fail_flow; + } + assert(flow->rsp_data.len == 0); assert(flow->rsp_data.data == NULL); @@ -2008,11 +2012,7 @@ int reg_respond_accept(struct flow_info * info, info->n_pid = flow->info.n_pid; -<<<<<<< HEAD flow->req_data = *pbuf; -======= - reg_flow_set_data(flow, pbuf); ->>>>>>> c82f0de4 (irmd: Fix memleak in reg tests) clrbuf(*pbuf); if (reg_flow_update(flow, info) < 0) { diff --git a/src/irmd/reg/tests/flow_test.c b/src/irmd/reg/tests/flow_test.c index 7e1c1360..18214078 100644 --- a/src/irmd/reg/tests/flow_test.c +++ b/src/irmd/reg/tests/flow_test.c @@ -122,6 +122,21 @@ static int test_reg_flow_create_has_mpl(void) { return TEST_RC_SUCCESS; } +static int test_reg_flow_create_has_mtu(void) { + struct flow_info info = { + .id = 1, + .n_pid = 1, + .n_1_pid = 0, + .mtu = 1400, + .qs = qos_raw, + .state = FLOW_ALLOC_PENDING + }; + + reg_flow_create(&info); /* assert fail */ + + return TEST_RC_SUCCESS; +} + static int test_reg_flow_update(void) { struct reg_flow * f; @@ -136,7 +151,7 @@ static int test_reg_flow_update(void) struct flow_info upd = { .id = 1, .n_pid = 1, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_DEALLOCATED }; @@ -179,7 +194,7 @@ static int test_reg_flow_update_wrong_id(void) struct flow_info upd = { .id = 2, .n_pid = 1, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_DEALLOCATED }; @@ -210,6 +225,7 @@ static int test_reg_flow_assert_fails(void) ret |= test_assert_fail(test_reg_flow_create_has_n_1_pid); ret |= test_assert_fail(test_reg_flow_create_wrong_state); ret |= test_assert_fail(test_reg_flow_create_has_mpl); + ret |= test_assert_fail(test_reg_flow_create_has_mtu); ret |= test_assert_fail(test_reg_flow_update_wrong_id); return ret; diff --git a/src/irmd/reg/tests/reg_test.c b/src/irmd/reg/tests/reg_test.c index b426c0dd..0b1014f9 100644 --- a/src/irmd/reg/tests/reg_test.c +++ b/src/irmd/reg/tests/reg_test.c @@ -31,6 +31,7 @@ #define TEST_N_1_PID 3999 #define TEST_FAKE_ID 9128349 #define TEST_MPL 5 +#define TEST_MTU 1400 #define TEST_PROG "reg_test" /* own binary for binary check */ #define TEST_IPCP "testipcp" #define TEST_NAME "testname" @@ -239,7 +240,7 @@ static int test_reg_accept_flow_success(void) struct flow_info n_1_info = { .n_1_pid = TEST_N_1_PID, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */ }; @@ -266,6 +267,7 @@ static int test_reg_accept_flow_success(void) n_1_info.id = info.id; n_1_info.mpl = 1; + n_1_info.mtu = TEST_MTU; pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info); @@ -284,6 +286,11 @@ static int test_reg_accept_flow_success(void) goto fail; } + if (info.mtu != TEST_MTU) { + printf("MTU not propagated.\n"); + goto fail; + } + if (rbuf.data == NULL) { printf("rbuf data not returned.\n"); goto fail; @@ -336,7 +343,7 @@ static int test_reg_accept_flow_success_no_crypt(void) struct flow_info n_1_info = { .n_1_pid = TEST_N_1_PID, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */ }; @@ -363,6 +370,7 @@ static int test_reg_accept_flow_success_no_crypt(void) n_1_info.id = info.id; n_1_info.mpl = 1; + n_1_info.mtu = TEST_MTU; pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info); @@ -381,6 +389,11 @@ static int test_reg_accept_flow_success_no_crypt(void) goto fail; } + if (info.mtu != TEST_MTU) { + printf("MTU not propagated.\n"); + goto fail; + } + if (rbuf.data == NULL) { printf("rbuf data was not returned.\n"); goto fail; @@ -431,7 +444,7 @@ static int test_reg_allocate_flow_fail(void) struct flow_info n_1_info = { .n_1_pid = TEST_N_1_PID, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_DEALLOCATED /* RESPONSE FAIL */ }; @@ -489,6 +502,93 @@ static int test_reg_allocate_flow_fail(void) return TEST_RC_FAIL; } +static int test_reg_respond_alloc_duplicate(void) +{ + pthread_t thr; + struct timespec abstime; + struct timespec timeo = TIMESPEC_INIT_S(1); + buffer_t rbuf = BUF_INIT; + buffer_t empty = BUF_INIT; + struct flow_info dup_info; + + struct flow_info info = { + .n_pid = TEST_PID, + .qs = qos_raw + }; + + struct flow_info n_1_info = { + .n_1_pid = TEST_N_1_PID, + .qs = qos_msg, + .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */ + }; + + TEST_START(); + + clock_gettime(PTHREAD_COND_CLOCK, &abstime); + ts_add(&abstime, &timeo, &abstime); + + if (reg_init() < 0) { + printf("Failed to init registry.\n"); + goto fail; + } + + if (reg_create_flow(&info) < 0) { + printf("Failed to add flow.\n"); + goto fail; + } + + info.n_1_pid = TEST_N_1_PID; + + if (reg_prepare_flow_alloc(&info) < 0) { + printf("Failed to prepare flow for alloc.\n"); + goto fail; + } + + n_1_info.id = info.id; + n_1_info.mpl = 1; + n_1_info.mtu = TEST_MTU; + + pthread_create(&thr, NULL, test_flow_respond_alloc, &n_1_info); + + if (reg_wait_flow_allocated(&info, &rbuf, &abstime) < 0) { + printf("Flow allocation failed.\n"); + pthread_join(thr, NULL); + reg_destroy_flow(info.id); + reg_fini(); + goto fail; + } + + pthread_join(thr, NULL); + freebuf(rbuf); + + if (info.mtu != TEST_MTU) { + printf("MTU not propagated.\n"); + goto fail; + } + + /* Duplicate reply on an already-ALLOCATED flow must not assert. */ + dup_info = n_1_info; + dup_info.state = FLOW_DEALLOCATED; + + if (reg_respond_alloc(&dup_info, &empty, -EREPLAY) != -1) { + printf("Duplicate respond_alloc should return -1.\n"); + goto fail; + } + + reg_dealloc_flow(&info); + reg_dealloc_flow_resp(&info); + reg_destroy_flow(n_1_info.id); + + reg_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + REG_TEST_FAIL(); + return TEST_RC_FAIL; +} + struct direct_alloc_info { struct flow_info info; buffer_t rsp; @@ -564,7 +664,7 @@ static int test_reg_direct_flow_success(void) dai.info.id = info.id; dai.info.n_1_pid = TEST_N_1_PID; dai.info.mpl = TEST_MPL; - dai.info.qs = qos_data; + dai.info.qs = qos_msg; dai.info.state = FLOW_ALLOCATED; dai.rsp.len = 0; dai.rsp.data = NULL; @@ -679,6 +779,7 @@ static int test_reg_flow(void) { rc |= test_reg_accept_flow_success(); rc |= test_reg_accept_flow_success_no_crypt(); rc |= test_reg_allocate_flow_fail(); + rc |= test_reg_respond_alloc_duplicate(); rc |= test_reg_direct_flow_success(); return rc; @@ -1491,7 +1592,7 @@ static int test_wait_accepting_fail_name(void) static void * test_call_flow_accept(void * o) { struct timespec abstime; - struct timespec timeo = TIMESPEC_INIT_MS(10); + struct timespec timeo = TIMESPEC_INIT_MS(30); buffer_t pbuf = BUF_INIT; struct proc_info pinfo = TEST_PROC_INFO; diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 79263924..6cd3a8a4 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -17,7 +17,10 @@ protobuf_generate_c(IPCP_PROTO_SRCS IPCP_PROTO_HDRS set(SOURCE_FILES_COMMON bitmap.c btree.c - crc32.c + crc/crc8.c + crc/crc16.c + crc/crc32.c + crc/crc64.c crypt.c hash.c lockfile.c @@ -36,6 +39,7 @@ set(SOURCE_FILES_COMMON ssm/pool.c sockets.c tpm.c + tw.c utils.c ) @@ -155,5 +159,6 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/ssm/ssm.h.in" if(BUILD_TESTS) add_subdirectory(tests) + add_subdirectory(crc/tests) add_subdirectory(ssm/tests) endif() diff --git a/src/lib/config.h.in b/src/lib/config.h.in index 08e9baf6..7124a974 100644 --- a/src/lib/config.h.in +++ b/src/lib/config.h.in @@ -20,6 +20,14 @@ * Foundation, Inc., http://www.fsf.org/about/contact/. */ +#ifndef MILLION +#define MILLION 1000000LL +#endif + +#ifndef BILLION +#define BILLION 1000000000LL +#endif + #cmakedefine HAVE_SYS_RANDOM #cmakedefine HAVE_EXPLICIT_BZERO #cmakedefine HAVE_LIBGCRYPT @@ -37,6 +45,8 @@ #cmakedefine QOS_DISABLE_CRC #cmakedefine HAVE_OPENSSL_RNG +#cmakedefine HAVE_PCLMUL +#cmakedefine HAVE_PMULL #define SHM_LOCKFILE_NAME "@SHM_LOCKFILE_NAME@" #define FLOW_ALLOC_TIMEOUT @FLOW_ALLOC_TIMEOUT@ @@ -60,16 +70,18 @@ #cmakedefine PROC_FLOW_STATS #endif +#cmakedefine FRCT_DEBUG_STDOUT + #define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@ -#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@ -#define PROG_RES_FDS @PROG_RES_FDS@ -#define PROG_MAX_FQUEUES @PROG_MAX_FQUEUES@ +#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@ +#define PROC_RES_FDS @PROC_RES_FDS@ +#define PROC_MAX_FQUEUES @PROC_MAX_FQUEUES@ /* Default Delta-t parameters */ #cmakedefine FRCT_LINUX_RTT_ESTIMATOR -#define DELT_A (@DELTA_T_ACK@) /* ns */ -#define DELT_R (@DELTA_T_RTX@) /* ns */ +#define DELT_A (@DELTA_T_ACK@) /* ms */ +#define DELT_R (@DELTA_T_RTX@) /* ms */ #define RQ_SIZE (@FRCT_REORDER_QUEUE_SIZE@) #define START_WINDOW (@FRCT_START_WINDOW@) @@ -80,9 +92,6 @@ #define TICTIME (@FRCT_TICK_TIME@ * 1000) /* ns */ /* Retransmission tuning */ -#cmakedefine RXM_BUFFER_ON_HEAP -#cmakedefine RXM_BLOCKING - #define RXMQ_RES (@RXM_MIN_RESOLUTION@) /* 2^N ns */ #define RXMQ_BUMP (@RXM_WHEEL_MULTIPLIER@) #define RXMQ_LVLS (@RXM_WHEEL_LEVELS@) diff --git a/src/lib/crc/crc16.c b/src/lib/crc/crc16.c new file mode 100644 index 00000000..9dc59429 --- /dev/null +++ b/src/lib/crc/crc16.c @@ -0,0 +1,61 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * CRC-16/CCITT-FALSE (reveng catalog, alias CRC-16/IBM-3740): + * poly = 0x1021 + * init = 0xffff + * refin = false + * refout = false + * xorout = 0x0000 + * check = crc16_ccitt_false("123456789") == 0x29b1 + */ + +#include "config.h" + +#include <ouroboros/crc16.h> + +/* Bit-by-bit MSB-first CRC. */ +void crc16_ccitt_false(uint16_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint16_t c; + size_t n; + int i; + + p = (const uint8_t *) buf; + c = *crc ^ 0xffff; + + for (n = 0; n < len; n++) { + c ^= ((uint16_t) p[n]) << 8; + for (i = 0; i < 8; i++) { + if (c & 0x8000) + c = (uint16_t) ((c << 1) ^ 0x1021); + else + c = (uint16_t) (c << 1); + } + } + + *crc = c; +} diff --git a/src/lib/crc32.c b/src/lib/crc/crc32.c index 0fdb62b1..0fdb62b1 100644 --- a/src/lib/crc32.c +++ b/src/lib/crc/crc32.c diff --git a/src/lib/crc/crc64.c b/src/lib/crc/crc64.c new file mode 100644 index 00000000..1b6fb5f6 --- /dev/null +++ b/src/lib/crc/crc64.c @@ -0,0 +1,363 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 64-bit Cyclic Redundancy Check (NVMe variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * CRC-64/NVMe (reveng catalog): + * poly = 0xad93d23594c93659 + * init = 0xffffffffffffffff + * refin = true + * refout = true + * xorout = 0xffffffffffffffff + * check = crc64_nvme("123456789") == 0xae8b14860a799888 + */ + +#include "config.h" + +#include <ouroboros/crc64.h> + +/* + * Reflected CRC-64/NVMe table. Polynomial in reflected form: + * 0x9a6c9329ac4bc9b5 (bitrev of 0xad93d23594c93659). + */ +static const uint64_t crc64_nvme_tab[256] = { + 0x0000000000000000ULL, 0x7f6ef0c830358979ULL, + 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL, + 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL, + 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL, + 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL, + 0x58c10d24087fec87ULL, 0x27affdec384a65feULL, + 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL, + 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL, + 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL, + 0x863d1eabe8d57d73ULL, 0xf953ee63d8e0f40aULL, + 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL, + 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL, + 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL, + 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL, + 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL, + 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL, + 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL, + 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL, + 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL, + 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL, + 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL, + 0xa900f35319033385ULL, 0xd66e039b2936bafcULL, + 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL, + 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL, + 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL, + 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL, + 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL, + 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL, + 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL, + 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL, + 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL, + 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL, + 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL, + 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL, + 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL, + 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL, + 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL, + 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL, + 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL, + 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL, + 0xafba2586f2d042eeULL, 0xd0d4d54ec2e5cb97ULL, + 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL, + 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL, + 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL, + 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL, + 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL, + 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL, + 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL, + 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL, + 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL, + 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL, + 0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL, + 0x8087c87e03060c18ULL, 0xffe938b633338561ULL, + 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL, + 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL, + 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL, + 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL, + 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL, + 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL, + 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL, + 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL, + 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL, + 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL, + 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL, + 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL, + 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL, + 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL, + 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL, + 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL, + 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL, + 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL, + 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL, + 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL, + 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL, + 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL, + 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL, + 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL, + 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL, + 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL, + 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL, + 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL, + 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL, + 0xa2cf882d2576a038ULL, 0xdda178e515432941ULL, + 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL, + 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL, + 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL, + 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL, + 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL, + 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL, + 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL, + 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL, + 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL, + 0xb5517ed15d9d8743ULL, 0xca3f8e196da80e3aULL, + 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL, + 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL, + 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL, + 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL, + 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL, + 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL, + 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL, + 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL, + 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL, + 0x224840532670ac20ULL, 0x5d26b09b16452559ULL, + 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL, + 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL, + 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL, + 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL, + 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL, + 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL, + 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL, + 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL, + 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL, + 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL, + 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL, + 0x759552905f188d57ULL, 0x0afba2586f2d042eULL, + 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL, + 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL, + 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL, + 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL, + 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL, + 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL, + 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL, + 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL, + 0xf3a84c3bb7cdf024ULL, 0x8cc6bcf387f8795dULL, + 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL, + 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL, + 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL, + 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL +}; + +static __inline__ uint64_t crc64_nvme_step(uint64_t c, + const uint8_t * p, + size_t len) +{ + size_t n; + + for (n = 0; n < len; n++) + c = crc64_nvme_tab[(c ^ p[n]) & 0xff] ^ (c >> 8); + + return c; +} + +void crc64_nvme_table(uint64_t * crc, + const void * buf, + size_t len) +{ + uint64_t c; + + c = crc64_nvme_step(*crc ^ UINT64_MAX, + (const uint8_t *) buf, len); + + *crc = c ^ UINT64_MAX; +} + +#ifdef HAVE_PCLMUL + +#include <smmintrin.h> +#include <wmmintrin.h> + +/* + * Fold-by-16 constants for reflected CRC-64/NVMe. Properties of the + * polynomial; identical between the PCLMUL and PMULL backends. + * k3 = bitrev64(x^(128+64) mod P) << 1 + * k4 = bitrev64(x^(128+0) mod P) << 1 + */ +static const uint64_t k3_clmul = 0xeadc41fd2ba3d420ULL; +static const uint64_t k4_clmul = 0x21e9761e252621acULL; + +__attribute__((target("pclmul,sse4.1"))) +static __m128i fold16(__m128i x, + __m128i k) +{ + __m128i lo; + __m128i hi; + + lo = _mm_clmulepi64_si128(x, k, 0x00); + hi = _mm_clmulepi64_si128(x, k, 0x11); + return _mm_xor_si128(lo, hi); +} + +/* + * Fold-by-16 over 16-byte chunks; the 128-bit folded state is then + * emitted as 16 little-endian bytes and run through the byte-table + * loop together with any tail (<=15 bytes). The 16-byte minimum on + * the bulk loop is why the short-input path uses the table directly. + */ +__attribute__((target("pclmul,sse4.1"))) +static void crc64_nvme_clmul(uint64_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint64_t seed; + uint64_t c; + size_t off; + __m128i x; + __m128i k; + uint8_t post[16]; + + p = (const uint8_t *) buf; + seed = *crc; + + if (len < 16) { + c = crc64_nvme_step(seed ^ UINT64_MAX, p, len); + *crc = c ^ UINT64_MAX; + return; + } + + x = _mm_loadu_si128((const __m128i *) p); + x = _mm_xor_si128(x, _mm_cvtsi64_si128((int64_t) + (seed ^ UINT64_MAX))); + + k = _mm_set_epi64x((int64_t) k4_clmul, (int64_t) k3_clmul); + + off = 16; + while (off + 16 <= len) { + __m128i d; + + d = _mm_loadu_si128((const __m128i *) (p + off)); + x = _mm_xor_si128(fold16(x, k), d); + off += 16; + } + + _mm_storeu_si128((__m128i *) post, x); + + c = crc64_nvme_step(0, post, 16); + c = crc64_nvme_step(c, p + off, len - off); + + *crc = c ^ UINT64_MAX; +} + +#endif /* HAVE_PCLMUL */ + +#ifdef HAVE_PMULL + +#include <arm_neon.h> + +/* Same fold-by-16 constants as the PCLMUL path (poly properties). */ +static const uint64_t k3_pmull = 0xeadc41fd2ba3d420ULL; +static const uint64_t k4_pmull = 0x21e9761e252621acULL; + +__attribute__((target("+crypto"))) +static uint64x2_t fold16_pmull(uint64x2_t x, + uint64x2_t k) +{ + poly64x2_t xp; + poly64x2_t kp; + uint64x2_t lo; + uint64x2_t hi; + + xp = vreinterpretq_p64_u64(x); + kp = vreinterpretq_p64_u64(k); + lo = vreinterpretq_u64_p128( + vmull_p64((poly64_t) vgetq_lane_u64(x, 0), + (poly64_t) vgetq_lane_u64(k, 0))); + hi = vreinterpretq_u64_p128(vmull_high_p64(xp, kp)); + return veorq_u64(lo, hi); +} + +__attribute__((target("+crypto"))) +static void crc64_nvme_pmull(uint64_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint64_t seed; + uint64_t c; + size_t off; + uint64x2_t x; + uint64x2_t k; + uint64_t seed_lane[2]; + uint64_t k_lanes[2]; + uint8_t post[16]; + + p = (const uint8_t *) buf; + seed = *crc; + + if (len < 16) { + c = crc64_nvme_step(seed ^ UINT64_MAX, p, len); + *crc = c ^ UINT64_MAX; + return; + } + + x = vld1q_u64((const uint64_t *) p); + seed_lane[0] = seed ^ UINT64_MAX; + seed_lane[1] = 0; + x = veorq_u64(x, vld1q_u64(seed_lane)); + + k_lanes[0] = k3_pmull; + k_lanes[1] = k4_pmull; + k = vld1q_u64(k_lanes); + + off = 16; + while (off + 16 <= len) { + uint64x2_t d; + + d = vld1q_u64((const uint64_t *) (p + off)); + x = veorq_u64(fold16_pmull(x, k), d); + off += 16; + } + + vst1q_u8(post, vreinterpretq_u8_u64(x)); + + c = crc64_nvme_step(0, post, 16); + c = crc64_nvme_step(c, p + off, len - off); + + *crc = c ^ UINT64_MAX; +} +#endif /* HAVE_PMULL */ + +void crc64_nvme(uint64_t * crc, + const void * buf, + size_t len) +{ +#ifdef HAVE_PCLMUL + crc64_nvme_clmul(crc, buf, len); +#elif defined(HAVE_PMULL) + crc64_nvme_pmull(crc, buf, len); +#else + crc64_nvme_table(crc, buf, len); +#endif +} diff --git a/src/lib/crc/crc8.c b/src/lib/crc/crc8.c new file mode 100644 index 00000000..20976b29 --- /dev/null +++ b/src/lib/crc/crc8.c @@ -0,0 +1,62 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 8-bit Cyclic Redundancy Check (AUTOSAR variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * CRC-8/AUTOSAR (reveng catalog): + * poly = 0x2f + * init = 0xff + * refin = false + * refout = false + * xorout = 0xff + * check = crc8_autosar("123456789") == 0xdf + */ + +#include "config.h" + +#include <ouroboros/crc8.h> + + + /* Bit-by-bit MSB-first CRC. */ +void crc8_autosar(uint8_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint8_t c; + size_t n; + int i; + + p = (const uint8_t *) buf; + c = *crc ^ 0xff; + + for (n = 0; n < len; n++) { + c ^= p[n]; + for (i = 0; i < 8; i++) { + if (c & 0x80) + c = (uint8_t) ((c << 1) ^ 0x2f); + else + c = (uint8_t) (c << 1); + } + } + + *crc = c ^ 0xff; +} diff --git a/src/lib/crc/tests/CMakeLists.txt b/src/lib/crc/tests/CMakeLists.txt new file mode 100644 index 00000000..11daca5a --- /dev/null +++ b/src/lib/crc/tests/CMakeLists.txt @@ -0,0 +1,21 @@ +get_filename_component(PARENT_PATH ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +get_filename_component(PARENT_DIR ${PARENT_PATH} NAME) + +compute_test_prefix() + +create_test_sourcelist(${PARENT_DIR}_tests test_suite.c + # Add new tests here + crc8_test.c + crc16_test.c + crc32_test.c + crc64_test.c + ) + +add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests}) + +disable_test_logging_for_target(${PARENT_DIR}_test) +target_link_libraries(${PARENT_DIR}_test ouroboros-common) + +add_dependencies(build_tests ${PARENT_DIR}_test) + +ouroboros_register_tests(TARGET ${PARENT_DIR}_test TESTS ${${PARENT_DIR}_tests}) diff --git a/src/lib/crc/tests/crc16_test.c b/src/lib/crc/tests/crc16_test.c new file mode 100644 index 00000000..03a5b504 --- /dev/null +++ b/src/lib/crc/tests/crc16_test.c @@ -0,0 +1,67 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the CRC-16/CCITT-FALSE function + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#include "config.h" + +#include <ouroboros/crc16.h> + +#include <test/test.h> + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +/* reveng-catalog smoke vectors. */ +static int test_crc16_ccitt_false_basic(void) +{ + uint16_t crc; + + TEST_START(); + + crc = 0; + crc16_ccitt_false(&crc, "", 0); + if (crc != 0xffff) + goto fail; + + crc = 0; + crc16_ccitt_false(&crc, "123456789", 9); + if (crc != 0x29b1) + goto fail; + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int crc16_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_crc16_ccitt_false_basic(); + return ret; +} diff --git a/src/lib/tests/crc32_test.c b/src/lib/crc/tests/crc32_test.c index 5a1ddd87..5a1ddd87 100644 --- a/src/lib/tests/crc32_test.c +++ b/src/lib/crc/tests/crc32_test.c diff --git a/src/lib/crc/tests/crc64_test.c b/src/lib/crc/tests/crc64_test.c new file mode 100644 index 00000000..cf3f5ca3 --- /dev/null +++ b/src/lib/crc/tests/crc64_test.c @@ -0,0 +1,126 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the CRC-64/NVMe function + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#include "config.h" + +#include <ouroboros/crc64.h> +#include <ouroboros/random.h> + +#include <test/test.h> + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +/* Reference impl, internal to libouroboros-common. */ +extern void crc64_nvme_table(uint64_t * crc, + const void * buf, + size_t len); + +/* reveng-catalog smoke vectors plus a 16-byte fold-boundary check. */ +static int test_crc64_nvme_basic(void) +{ + uint64_t crc; + + TEST_START(); + + crc = 0; + crc64_nvme(&crc, "", 0); + if (crc != 0x0000000000000000ULL) + goto fail; + + crc = 0; + crc64_nvme(&crc, "123456789", 9); + if (crc != 0xae8b14860a799888ULL) + goto fail; + + crc = 0; + crc64_nvme(&crc, "0123456789abcdef", 16); + if (crc != 0x091485ca7018730eULL) + goto fail; + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL) +/* Cross-check the accelerated dispatcher path against the byte-table. */ +static int test_crc64_nvme_random(void) +{ + static const size_t lens[] = { + 0, 1, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, + 129, 255, 256, 257, 1023, 1024, 1025, 4096 + }; + uint8_t buf[4096]; + size_t i; + uint64_t ref; + uint64_t got; + + TEST_START(); + + if (random_buffer(buf, sizeof(buf)) < 0) { + printf("Failed to generate random data.\n"); + goto fail; + } + + for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) { + ref = 0; + crc64_nvme_table(&ref, buf, lens[i]); + + got = 0; + crc64_nvme(&got, buf, lens[i]); + + if (ref == got) + continue; + + printf("Mismatch at len=%zu: table=0x%016lx disp=0x%016lx\n", + lens[i], + (unsigned long) ref, + (unsigned long) got); + goto fail; + } + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +#endif +} + +int crc64_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_crc64_nvme_basic(); +#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL) + ret |= test_crc64_nvme_random(); +#endif + return ret; +} diff --git a/src/lib/crc/tests/crc8_test.c b/src/lib/crc/tests/crc8_test.c new file mode 100644 index 00000000..f7bb33b8 --- /dev/null +++ b/src/lib/crc/tests/crc8_test.c @@ -0,0 +1,67 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the CRC-8/AUTOSAR function + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#include "config.h" + +#include <ouroboros/crc8.h> + +#include <test/test.h> + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +/* reveng-catalog smoke vectors. */ +static int test_crc8_autosar_basic(void) +{ + uint8_t crc; + + TEST_START(); + + crc = 0; + crc8_autosar(&crc, "", 0); + if (crc != 0x00) + goto fail; + + crc = 0; + crc8_autosar(&crc, "123456789", 9); + if (crc != 0xdf) + goto fail; + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int crc8_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_crc8_autosar_basic(); + return ret; +} diff --git a/src/lib/crypt.c b/src/lib/crypt.c index cd3421dd..71197f6e 100644 --- a/src/lib/crypt.c +++ b/src/lib/crypt.c @@ -1094,6 +1094,13 @@ void crypt_secure_malloc_fini(void) #endif } +void crypt_cleanup(void) +{ +#ifdef HAVE_OPENSSL + openssl_cleanup(); +#endif +} + void * crypt_secure_malloc(size_t size) { #ifdef HAVE_OPENSSL diff --git a/src/lib/crypt/openssl.c b/src/lib/crypt/openssl.c index 573bc0b3..5916e3cb 100644 --- a/src/lib/crypt/openssl.c +++ b/src/lib/crypt/openssl.c @@ -629,7 +629,7 @@ ssize_t openssl_pkp_create(const char * algo, return (ssize_t) raw.len; } else { /* DER encode standard algorithms */ - pos = pk; /* i2d_PUBKEY increments the pointer, don't use pk! */ + pos = pk; /* i2d_PUBKEY increments the ptr, don't use pk! */ len = i2d_PUBKEY(*pkp, &pos); if (len < 0) goto fail_pubkey; @@ -666,7 +666,7 @@ static ssize_t __openssl_kem_encap(EVP_PKEY * pub, /* Get required lengths */ ret = EVP_PKEY_encapsulate(ctx, NULL, &ct_len, NULL, &secret_len); - if (ret != 1 || ct_len > MSGBUFSZ) + if (ret != 1 || ct_len > CRYPT_KEY_BUFSZ) goto fail_encap; /* Allocate buffer for secret */ @@ -1283,24 +1283,14 @@ int openssl_load_privkey_file(const char * path, { FILE * fp; EVP_PKEY * pkey; - unsigned long err; - char errbuf[256]; fp = fopen(path, "r"); - if (fp == NULL) { - fprintf(stderr, "Failed to open %s\n", path); + if (fp == NULL) goto fail_file; - } pkey = PEM_read_PrivateKey(fp, NULL, NULL, ""); - if (pkey == NULL) { - err = ERR_get_error(); - ERR_error_string_n(err, errbuf, sizeof(errbuf)); - fprintf(stderr, - "OpenSSL error loading privkey from %s: %s\n", - path, errbuf); + if (pkey == NULL) goto fail_key; - } fclose(fp); @@ -1442,7 +1432,7 @@ int openssl_load_pubkey_raw_file(const char * path, buffer_t * buf) { FILE * fp; - uint8_t tmp_buf[MSGBUFSZ]; + uint8_t tmp_buf[CRYPT_KEY_BUFSZ]; size_t bytes_read; const char * algo; @@ -1453,7 +1443,7 @@ int openssl_load_pubkey_raw_file(const char * path, if (fp == NULL) goto fail_file; - bytes_read = fread(tmp_buf, 1, MSGBUFSZ, fp); + bytes_read = fread(tmp_buf, 1, CRYPT_KEY_BUFSZ, fp); if (bytes_read == 0) goto fail_read; @@ -1658,25 +1648,33 @@ int openssl_crt_str(const void * crt, int openssl_crt_der(const void * crt, buffer_t * buf) { - int len; + uint8_t * p; + int len; assert(crt != NULL); assert(buf != NULL); - len = i2d_X509((X509 *) crt, &buf->data); + /* Get the size by encoding to NULL */ + len = i2d_X509((X509 *) crt, NULL); if (len < 0) - goto fail_der; + goto fail_len; + buf->data = malloc((size_t) len); + if (buf->data == NULL) + goto fail_malloc; + + p = buf->data; /* i2d_X509 increments p */ + i2d_X509((X509 *) crt, &p); buf->len = (size_t) len; return 0; - fail_der: + fail_malloc: + fail_len: clrbuf(*buf); return -1; } - void * openssl_auth_create_store(void) { return X509_STORE_new(); @@ -1878,3 +1876,7 @@ void openssl_secure_clear(void * ptr, { OPENSSL_cleanse(ptr, size); } +void openssl_cleanup(void) +{ + OPENSSL_cleanup(); +} diff --git a/src/lib/crypt/openssl.h b/src/lib/crypt/openssl.h index b95d1b0b..af285232 100644 --- a/src/lib/crypt/openssl.h +++ b/src/lib/crypt/openssl.h @@ -169,4 +169,6 @@ void openssl_secure_free(void * ptr, void openssl_secure_clear(void * ptr, size_t size); +void openssl_cleanup(void); + #endif /* OUROBOROS_LIB_CRYPT_OPENSSL_H */ diff --git a/src/lib/dev.c b/src/lib/dev.c index 9cfc24ee..ae0401b7 100644 --- a/src/lib/dev.c +++ b/src/lib/dev.c @@ -29,10 +29,13 @@ #include "config.h" #include "ssm.h" +#include <ouroboros/atomics.h> #include <ouroboros/bitmap.h> #include <ouroboros/cep.h> +#include <ouroboros/crc16.h> #include <ouroboros/crypt.h> #include <ouroboros/dev.h> +#include <ouroboros/endian.h> #include <ouroboros/errno.h> #include <ouroboros/fccntl.h> #include <ouroboros/flow.h> @@ -45,32 +48,33 @@ #include <ouroboros/np1_flow.h> #include <ouroboros/pthread.h> #include <ouroboros/random.h> +#ifdef PROC_FLOW_STATS +#include <ouroboros/rib.h> +#endif #include <ouroboros/serdes-irm.h> +#include <ouroboros/sockets.h> #include <ouroboros/ssm_flow_set.h> #include <ouroboros/ssm_pool.h> #include <ouroboros/ssm_rbuff.h> -#include <ouroboros/sockets.h> +#include <ouroboros/tw.h> #include <ouroboros/utils.h> -#ifdef PROC_FLOW_STATS -#include <ouroboros/rib.h> -#endif +#include <assert.h> #ifdef HAVE_LIBGCRYPT #include <gcrypt.h> #endif -#include <assert.h> -#include <stdlib.h> -#include <string.h> -#include <stdio.h> #include <stdarg.h> #include <stdbool.h> +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include <sys/types.h> #ifndef CLOCK_REALTIME_COARSE #define CLOCK_REALTIME_COARSE CLOCK_REALTIME #endif -/* Partial read information. */ #define NO_PART -1 #define DONE_PART -2 @@ -78,19 +82,12 @@ #define SECMEMSZ 16384 #define MSGBUFSZ 2048 -/* map flow_ids to flow descriptors; track state of the flow */ struct fmap { int fd; - /* TODO: use actual flow state */ enum flow_state state; }; -#define frcti_to_flow(frcti) \ - ((struct flow *)((uint8_t *) frcti - offsetof(struct flow, frcti))) - struct flow { - struct list_head next; - struct flow_info info; struct ssm_rbuff * rx_rb; @@ -135,16 +132,10 @@ struct { struct flow * flows; struct fmap * id_to_fd; - struct list_head flow_list; pthread_mutex_t mtx; pthread_cond_t cond; - pthread_t tx; - pthread_t rx; - size_t n_frcti; - fset_t * frct_set; - pthread_rwlock_t lock; } proc; @@ -243,7 +234,7 @@ static int proc_announce(const struct proc_info * proc) return irm__irm_result_des(&msg); } -/* IRMd will clean up the mess if this fails */ +/* IRMd cleans up on failure. */ static void proc_exit(void) { uint8_t buf[SOCK_BUF_SIZE]; @@ -264,7 +255,7 @@ static int spb_encrypt(struct flow * flow, uint8_t * tail; if (flow->crypt == NULL) - return 0; /* No encryption */ + return 0; in.data = ssm_pk_buff_head(spb); in.len = ssm_pk_buff_len(spb); @@ -272,11 +263,11 @@ static int spb_encrypt(struct flow * flow, if (crypt_encrypt(flow->crypt, in, &out) < 0) goto fail_encrypt; - head = ssm_pk_buff_head_alloc(spb, flow->headsz); + head = ssm_pk_buff_push(spb, flow->headsz); if (head == NULL) goto fail_alloc; - tail = ssm_pk_buff_tail_alloc(spb, flow->tailsz); + tail = ssm_pk_buff_push_tail(spb, flow->tailsz); if (tail == NULL) goto fail_alloc; @@ -299,7 +290,7 @@ static int spb_decrypt(struct flow * flow, uint8_t * head; if (flow->crypt == NULL) - return 0; /* No decryption */ + return 0; in.data = ssm_pk_buff_head(spb); in.len = ssm_pk_buff_len(spb); @@ -308,8 +299,8 @@ static int spb_decrypt(struct flow * flow, return -ENOMEM; - head = ssm_pk_buff_head_release(spb, flow->headsz) + flow->headsz; - ssm_pk_buff_tail_release(spb, flow->tailsz); + head = ssm_pk_buff_pop(spb, flow->headsz) + flow->headsz; + ssm_pk_buff_pop_tail(spb, flow->tailsz); memcpy(head, out.data, out.len); @@ -318,130 +309,280 @@ static int spb_decrypt(struct flow * flow, return 0; } -#include "frct.c" - -void * flow_tx(void * o) +/* tw_move under proc.lock rdlock; gates teardown vs in-flight fires. */ +static void tw_move_safe(void) { - struct timespec tic = TIMESPEC_INIT_NS(TICTIME); - - (void) o; + pthread_rwlock_rdlock(&proc.lock); - while (true) { - timerwheel_move(); + pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock); - nanosleep(&tic, NULL); - } + tw_move(); - return (void *) 0; + pthread_cleanup_pop(1); } -static void flow_send_keepalive(struct flow * flow, - struct timespec now) +static int crc_add(struct ssm_pk_buff * spb, + size_t head_skip) { - struct ssm_pk_buff * spb; - ssize_t idx; - uint8_t * ptr; - - idx = ssm_pool_alloc(proc.pool, 0, &ptr, &spb); - if (idx < 0) - return; + uint8_t * head; + uint8_t * tail; - pthread_rwlock_wrlock(&proc.lock); + tail = ssm_pk_buff_push_tail(spb, CRCLEN); + if (tail == NULL) + return -ENOMEM; - flow->snd_act = now; + head = ssm_pk_buff_head(spb) + head_skip; - if (ssm_rbuff_write(flow->tx_rb, idx)) - ssm_pool_remove(proc.pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + mem_hash(HASH_CRC32, tail, head, tail - head); - pthread_rwlock_unlock(&proc.lock); + return 0; } -/* Needs rdlock on proc. */ -static void _flow_keepalive(struct flow * flow) +static int crc_check(struct ssm_pk_buff * spb, + size_t head_skip) { - struct timespec now; - struct timespec s_act; - struct timespec r_act; - int flow_id; - time_t timeo; - uint32_t acl; + uint32_t crc; + uint8_t * head = ssm_pk_buff_head(spb) + head_skip; + uint8_t * tail = ssm_pk_buff_pop_tail(spb, CRCLEN); + + mem_hash(HASH_CRC32, &crc, head, tail - head); - s_act = flow->snd_act; - r_act = flow->rcv_act; + return !(crc == *((uint32_t *) tail)); +} - flow_id = flow->info.id; - timeo = flow->info.qs.timeout; +/* FRCT included here so it can use proc and dev.c statics directly. */ +#include "frct.c" - acl = ssm_rbuff_get_acl(flow->rx_rb); - if (timeo == 0 || acl & (ACL_FLOWPEER | ACL_FLOWDOWN)) - return; +/* + * SACK / DATA carry trailer CRC32; HCS protects the headers on every + * FRCT packet. Decrypt before any check so plaintext is authoritative. + */ +static bool invalid_pkt(struct flow * flow, + struct ssm_pk_buff * spb) +{ + const struct frct_pci * pci; + uint16_t flags; + size_t pci_total; - clock_gettime(PTHREAD_COND_CLOCK, &now); + if (spb == NULL || ssm_pk_buff_len(spb) == 0) + return true; - if (ts_diff_ns(&now, &r_act) > (int64_t) timeo * MILLION) { - ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWPEER); - ssm_flow_set_notify(proc.fqset, flow_id, FLOW_PEER); - return; + if (spb_decrypt(flow, spb) < 0) + return true; + + if (flow->frcti == NULL) { + if (flow->info.qs.ber == 0 && crc_check(spb, 0) != 0) + return true; + return false; } - if (ts_diff_ns(&now, &s_act) > (int64_t) timeo * (MILLION >> 2)) { - pthread_rwlock_unlock(&proc.lock); + if (ssm_pk_buff_len(spb) < FRCT_PCILEN) + return true; - flow_send_keepalive(flow, now); + pci = (const struct frct_pci *) ssm_pk_buff_head(spb); + flags = ntoh16(pci->flags); - pthread_rwlock_rdlock(&proc.lock); + /* Untrusted flag read; mismatch on HCS will drop on corrupt. */ + if (flags & FRCT_DATA) + pci_total = frcti_data_hdr_len(flow->frcti); + else + pci_total = frcti_ctrl_hdr_len(flow->frcti); + + if (ssm_pk_buff_len(spb) < pci_total) + return true; + + if (frct_hcs_check(pci, flow->frcti) != 0) + return true; + + /* HCS valid: CRC32 on SACK; or on DATA if ber = 0. */ + if (flags & FRCT_SACK) { + if (crc_check(spb, pci_total) != 0) + return true; + + } else if ((flags & FRCT_DATA) && flow->info.qs.ber == 0) { + if (crc_check(spb, pci_total) != 0) + return true; } + + return false; } -static void handle_keepalives(void) +static bool deadline_passed(const struct timespec * abs) { - struct list_head * p; - struct list_head * h; + struct timespec now; - pthread_rwlock_rdlock(&proc.lock); + if (abs == NULL) + return false; - list_for_each_safe(p, h, &proc.flow_list) { - struct flow * flow; - flow = list_entry(p, struct flow, next); - _flow_keepalive(flow); - } + clock_gettime(PTHREAD_COND_CLOCK, &now); - pthread_rwlock_unlock(&proc.lock); + return ts_diff_ns(&now, abs) >= 0; } -static void __cleanup_fqueue_destroy(void * fq) +/* Clamp the wait by min(dl, next tw expiry, now + TICTIME). */ +static void compute_wait_deadline(const struct timespec * dl, + struct timespec * out) { - fqueue_destroy((fqueue_t *) fq); + struct timespec now; + struct timespec cap; + struct timespec expiry; + struct timespec tic = TIMESPEC_INIT_NS(TICTIME); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, &tic, &cap); + + tw_next_expiry(&expiry); + + *out = (ts_diff_ns(&cap, &expiry) < 0) ? expiry : cap; + if (dl != NULL && ts_diff_ns(out, dl) > 0) + *out = *dl; } -void * flow_rx(void * o) +/* + * proc.lock rdlock held across each iteration so flow_fini's wrlock + * waits for us to finish; FLOWDOWN already set means we exit promptly. + */ +static void flow_drain_rx_nb(struct flow * flow) { - struct timespec tic = TIMESPEC_INIT_NS(TICTIME); - int ret; - struct fqueue * fq; + ssize_t idx; + struct ssm_pk_buff * spb; + struct ssm_rbuff * rx_rb; + struct frcti * frcti; +#ifdef PROC_FLOW_STATS + struct timespec t_a; + struct timespec t_b; +#endif + + if (flow->frcti != NULL) + STAT_BUMP(flow->frcti, drain_calls); - (void) o; + while (true) { + pthread_rwlock_rdlock(&proc.lock); - fq = fqueue_create(); + rx_rb = flow->rx_rb; + if (rx_rb == NULL) { + pthread_rwlock_unlock(&proc.lock); + return; + } - pthread_cleanup_push(__cleanup_fqueue_destroy, fq); + idx = ssm_rbuff_read(rx_rb); + if (idx < 0) { + pthread_rwlock_unlock(&proc.lock); + return; + } - /* fevent will filter all FRCT packets for us */ - while ((ret = fevent(proc.frct_set, fq, &tic)) != 0) { - if (ret == -ETIMEDOUT) { - handle_keepalives(); + spb = ssm_pool_get(proc.pool, idx); + if (invalid_pkt(flow, spb)) { + ssm_pool_remove(proc.pool, idx); + pthread_rwlock_unlock(&proc.lock); continue; } - while (fqueue_next(fq) >= 0) - ; /* no need to act */ + frcti = flow->frcti; + if (frcti != NULL) { +#ifdef PROC_FLOW_STATS + clock_gettime(CLOCK_MONOTONIC, &t_a); + FRCTI_RCV(frcti, spb); + clock_gettime(CLOCK_MONOTONIC, &t_b); + STAT_ADD(frcti, rcv_proc_ns, + (size_t) ts_diff_ns(&t_b, &t_a)); +#else + FRCTI_RCV(frcti, spb); +#endif + } else { + ssm_pool_remove(proc.pool, idx); + } + + pthread_rwlock_unlock(&proc.lock); + + /* Per-packet so the delayed-ACK fires on time in a burst. */ +#ifdef PROC_FLOW_STATS + clock_gettime(CLOCK_MONOTONIC, &t_a); + tw_move_safe(); + clock_gettime(CLOCK_MONOTONIC, &t_b); + if (frcti != NULL) + STAT_ADD(frcti, tw_move_ns, + (size_t) ts_diff_ns(&t_b, &t_a)); +#else + tw_move_safe(); +#endif } +} - pthread_cleanup_pop(true); +/* + * Wait clamped by caller deadline, next tw expiry, and TICTIME; + * a clamp-timeout means tw work is due, not caller-deadline. + */ +static int flow_rx_one(struct flow * flow, + struct timespec * abs) +{ + struct timespec wait_abs; + struct ssm_pk_buff * spb; + struct ssm_rbuff * rx_rb; + ssize_t idx; + + while (true) { + compute_wait_deadline(abs, &wait_abs); + + /* rdlock gates flow_fini; FLOWDOWN preempts the block. */ + pthread_rwlock_rdlock(&proc.lock); + + rx_rb = flow->rx_rb; + if (rx_rb == NULL) { + pthread_rwlock_unlock(&proc.lock); + return -EFLOWDOWN; + } + + idx = ssm_rbuff_read_b(rx_rb, &wait_abs); + if (idx == -ETIMEDOUT) { + pthread_rwlock_unlock(&proc.lock); + if (deadline_passed(abs)) + return -ETIMEDOUT; + tw_move_safe(); + continue; + } + if (idx < 0) { + pthread_rwlock_unlock(&proc.lock); + return idx; + } + + spb = ssm_pool_get(proc.pool, idx); + if (invalid_pkt(flow, spb)) { + ssm_pool_remove(proc.pool, idx); + pthread_rwlock_unlock(&proc.lock); + continue; + } + + if (flow->frcti != NULL) + FRCTI_RCV(flow->frcti, spb); + else + ssm_pool_remove(proc.pool, idx); + + pthread_rwlock_unlock(&proc.lock); - return (void *) 0; + tw_move_safe(); + return 0; + } +} + +/* 0 = window open; -EAGAIN = !block and would block; else flow_rx_one rc. */ +static __inline__ int flow_wait_window(struct flow * flow, + size_t n, + bool block, + struct timespec * dl) +{ + int rc; + + while (true) { + flow_drain_rx_nb(flow); + if (FRCTI_IS_WINDOW_OPEN_N(flow->frcti, n)) + return 0; + if (!block) + return -EAGAIN; + rc = flow_rx_one(flow, dl); + if (rc < 0) + return rc; + } } static void flow_clear(int fd) @@ -451,36 +592,40 @@ static void flow_clear(int fd) proc.flows[fd].info.id = -1; } -static void __flow_fini(int fd) +/* + * Set ACL_FLOWDOWN on rx/tx so any in-flight blocking reads or writes + * wake up and drop their proc.lock rdlock. Must run BEFORE flow_fini's + * wrlock, else the wrlock blocks on those rdlock holders and the + * in-flight calls never see the FLOWDOWN signal. + */ +static void flow_quiesce(int fd) { - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + struct ssm_rbuff * rx_rb = proc.flows[fd].rx_rb; + struct ssm_rbuff * tx_rb = proc.flows[fd].tx_rb; - if (proc.flows[fd].frcti != NULL) { - proc.n_frcti--; - if (proc.n_frcti == 0) { - pthread_cancel(proc.tx); - pthread_join(proc.tx, NULL); - } + if (rx_rb != NULL) + ssm_rbuff_set_acl(rx_rb, ACL_FLOWDOWN); + if (tx_rb != NULL) + ssm_rbuff_set_acl(tx_rb, ACL_FLOWDOWN); +} - ssm_flow_set_del(proc.fqset, 0, proc.flows[fd].info.id); +static void do_flow_fini(int fd) +{ + assert(fd >= 0 && fd < PROC_MAX_FLOWS); + if (proc.flows[fd].frcti != NULL) frcti_destroy(proc.flows[fd].frcti); - } if (proc.flows[fd].info.id != -1) { flow_destroy(&proc.id_to_fd[proc.flows[fd].info.id]); bmp_release(proc.fds, fd); } - if (proc.flows[fd].rx_rb != NULL) { - ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN); + if (proc.flows[fd].rx_rb != NULL) ssm_rbuff_close(proc.flows[fd].rx_rb); - } - if (proc.flows[fd].tx_rb != NULL) { - ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN); + if (proc.flows[fd].tx_rb != NULL) ssm_rbuff_close(proc.flows[fd].tx_rb); - } if (proc.flows[fd].set != NULL) { ssm_flow_set_notify(proc.flows[fd].set, @@ -491,24 +636,40 @@ static void __flow_fini(int fd) crypt_destroy_ctx(proc.flows[fd].crypt); - list_del(&proc.flows[fd].next); - flow_clear(fd); } static void flow_fini(int fd) { + flow_quiesce(fd); + pthread_rwlock_wrlock(&proc.lock); - __flow_fini(fd); + do_flow_fini(fd); pthread_rwlock_unlock(&proc.lock); } #define IS_ENCRYPTED(crypt) ((crypt)->nid != NID_undef) -#define IS_ORDERED(flow) (flow.qs.in_order != 0) +#define IS_ORDERED(info) ((info)->qs.service != SVC_RAW) +#define IS_STREAM(info) ((info)->qs.service == SVC_STREAM) + +/* Raw MTU minus the wrapping (IV/Tag + optional CRC) dev.c adds. */ +static __inline__ size_t flow_user_mtu(const struct flow * flow, + size_t raw) +{ + size_t hdr; + + hdr = flow->headsz + flow->tailsz; + if (flow->info.qs.ber == 0 && flow->crypt == NULL) + hdr += CRCLEN; + + return raw > hdr ? raw - hdr : 0; +} + static int flow_init(struct flow_info * info, - struct crypt_sk * sk) + struct crypt_sk * sk, + time_t rtt_hint) { struct timespec now; struct flow * flow; @@ -550,7 +711,6 @@ static int flow_init(struct flow_info * info, flow->tailsz = 0; if (IS_ENCRYPTED(sk)) { - /* Set to lower value in tests, should we make configurable? */ sk->rot_bit = KEY_ROTATION_BIT; flow->crypt = crypt_create_ctx(sk); if (flow->crypt == NULL) @@ -561,22 +721,16 @@ static int flow_init(struct flow_info * info, assert(flow->frcti == NULL); - if (IS_ORDERED(flow->info)) { - flow->frcti = frcti_create(fd, DELT_A, DELT_R, info->mpl); + if (IS_ORDERED(&flow->info)) { + uint32_t frct_mtu = flow_user_mtu(flow, info->mtu); + + flow->frcti = frcti_create(fd, DELT_A, DELT_R, + info->mpl, rtt_hint, + info->qs, frct_mtu); if (flow->frcti == NULL) goto fail_frcti; - - if (ssm_flow_set_add(proc.fqset, 0, info->id)) - goto fail_flow_set_add; - - ++proc.n_frcti; - if (proc.n_frcti == 1 && - pthread_create(&proc.tx, NULL, flow_tx, NULL) < 0) - goto fail_tx_thread; } - list_add_tail(&flow->next, &proc.flow_list); - proc.id_to_fd[info->id].fd = fd; flow_set_state(&proc.id_to_fd[info->id], FLOW_ALLOCATED); @@ -585,10 +739,6 @@ static int flow_init(struct flow_info * info, return fd; - fail_tx_thread: - ssm_flow_set_del(proc.fqset, 0, info->id); - fail_flow_set_add: - frcti_destroy(flow->frcti); fail_frcti: crypt_destroy_ctx(flow->crypt); fail_crypt: @@ -655,13 +805,13 @@ static void init(int argc, gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0); } #endif - proc.fds = bmp_create(PROG_MAX_FLOWS - PROG_RES_FDS, PROG_RES_FDS); + proc.fds = bmp_create(PROC_MAX_FLOWS - PROC_RES_FDS, PROC_RES_FDS); if (proc.fds == NULL) { fprintf(stderr, "FATAL: Could not create fd bitmap.\n"); goto fail_fds; } - proc.fqueues = bmp_create(PROG_MAX_FQUEUES, 0); + proc.fqueues = bmp_create(PROC_MAX_FQUEUES, 0); if (proc.fqueues == NULL) { fprintf(stderr, "FATAL: Could not create fqueue bitmap.\n"); goto fail_fqueues; @@ -677,13 +827,13 @@ static void init(int argc, goto fail_rdrb; } - proc.flows = malloc(sizeof(*proc.flows) * PROG_MAX_FLOWS); + proc.flows = malloc(sizeof(*proc.flows) * PROC_MAX_FLOWS); if (proc.flows == NULL) { fprintf(stderr, "FATAL: Could not malloc flows.\n"); goto fail_flows; } - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) flow_clear(i); proc.id_to_fd = malloc(sizeof(*proc.id_to_fd) * SYS_MAX_FLOWS); @@ -716,20 +866,14 @@ static void init(int argc, goto fail_fqset; } - proc.frct_set = fset_create(); - if (proc.frct_set == NULL || proc.frct_set->idx != 0) { - fprintf(stderr, "FATAL: Could not create FRCT set.\n"); - goto fail_frct_set; - } - - if (timerwheel_init() < 0) { + if (tw_init() < 0) { fprintf(stderr, "FATAL: Could not initialize timerwheel.\n"); goto fail_timerwheel; } if (crypt_secure_malloc_init(PROC_SECMEM_MAX) < 0) { fprintf(stderr, "FATAL: Could not init secure malloc.\n"); - goto fail_timerwheel; + goto fail_secmem; } #if defined PROC_FLOW_STATS @@ -741,24 +885,15 @@ static void init(int argc, } } #endif - if (pthread_create(&proc.rx, NULL, flow_rx, NULL) < 0) { - fprintf(stderr, "FATAL: Could not start monitor thread.\n"); - goto fail_monitor; - } - - list_head_init(&proc.flow_list); - return; - fail_monitor: #if defined PROC_FLOW_STATS - rib_fini(); fail_rib_init: + crypt_secure_malloc_fini(); #endif - timerwheel_fini(); + fail_secmem: + tw_fini(); fail_timerwheel: - fset_destroy(proc.frct_set); - fail_frct_set: ssm_flow_set_close(proc.fqset); fail_fqset: pthread_rwlock_destroy(&proc.lock); @@ -789,19 +924,20 @@ static void fini(void) if (proc.fds == NULL) return; - pthread_cancel(proc.rx); - pthread_join(proc.rx, NULL); + /* Wake all in-flight readers/writers BEFORE wrlock acquire. */ + for (i = 0; i < PROC_MAX_FLOWS; ++i) + if (proc.flows[i].info.id != -1) + flow_quiesce(i); pthread_rwlock_wrlock(&proc.lock); - for (i = 0; i < PROG_MAX_FLOWS; ++i) { + for (i = 0; i < PROC_MAX_FLOWS; ++i) { struct flow * flow = &proc.flows[i]; if (flow->info.id != -1) { ssize_t idx; - ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWDOWN); while ((idx = ssm_rbuff_read(flow->rx_rb)) >= 0) ssm_pool_remove(proc.pool, idx); - __flow_fini(i); + do_flow_fini(i); } } @@ -813,9 +949,9 @@ static void fini(void) #ifdef PROC_FLOW_STATS rib_fini(); #endif - timerwheel_fini(); + crypt_secure_malloc_fini(); - fset_destroy(proc.frct_set); + tw_fini(); ssm_flow_set_close(proc.fqset); @@ -860,6 +996,10 @@ int flow_accept(qosspec_t * qs, if (qs != NULL) qs->ber = 1; #endif + /* STREAM cannot tolerate loss: drops create silent gaps. */ + if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0) + return -EINVAL; + memset(&flow, 0, sizeof(flow)); flow.n_pid = getpid(); @@ -878,7 +1018,8 @@ int flow_accept(qosspec_t * qs, if (err < 0) return err; - fd = flow_init(&flow, &crypt); + /* No RTT in accept; rtt_hint=0 bootstraps from first ACK. */ + fd = flow_init(&flow, &crypt, 0); crypt_secure_clear(key, SYMMKEYSZ); @@ -899,11 +1040,16 @@ int flow_alloc(const char * dst, uint8_t key[SYMMKEYSZ]; int fd; int err; + struct timespec t0; + struct timespec t1; #ifdef QOS_DISABLE_CRC if (qs != NULL) qs->ber = 1; #endif + /* STREAM cannot tolerate loss: drops create silent gaps. */ + if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0) + return -EINVAL; memset(&flow, 0, sizeof(flow)); @@ -913,11 +1059,13 @@ int flow_alloc(const char * dst, if (flow_alloc__irm_req_ser(&msg, &flow, dst, timeo)) return -ENOMEM; + clock_gettime(PTHREAD_COND_CLOCK, &t0); + err = send_recv_msg(&msg); - if (err < 0) { - printf("send_recv_msg error %d\n", err); + if (err < 0) return err; - } + + clock_gettime(PTHREAD_COND_CLOCK, &t1); crypt.key = key; @@ -925,7 +1073,7 @@ int flow_alloc(const char * dst, if (err < 0) return err; - fd = flow_init(&flow, &crypt); + fd = flow_init(&flow, &crypt, ts_diff_ns(&t1, &t0)); crypt_secure_clear(key, SYMMKEYSZ); @@ -964,7 +1112,7 @@ int flow_join(const char * dst, if (err < 0) return err; - fd = flow_init(&flow, &crypt); + fd = flow_init(&flow, &crypt, 0); crypt_secure_clear(key, SYMMKEYSZ); @@ -983,10 +1131,10 @@ int flow_dealloc(int fd) struct flow * flow; int err; - if (fd < 0 || fd >= SYS_MAX_FLOWS ) + if (fd < 0 || fd >= PROC_MAX_FLOWS ) return -EINVAL; - memset(&info, 0, sizeof(flow)); + memset(&info, 0, sizeof(info)); flow = &proc.flows[fd]; @@ -1008,9 +1156,8 @@ int flow_dealloc(int fd) pthread_rwlock_rdlock(&proc.lock); - timeo.tv_sec = frcti_dealloc(flow->frcti); - while (timeo.tv_sec < 0) { /* keep the flow active for rtx */ - ssize_t ret; + while (FRCTI_LINGERING(flow->frcti)) { + ssize_t ret; pthread_rwlock_unlock(&proc.lock); @@ -1018,12 +1165,12 @@ int flow_dealloc(int fd) pthread_rwlock_rdlock(&proc.lock); - timeo.tv_sec = frcti_dealloc(flow->frcti); - - if (ret == -EFLOWDOWN && timeo.tv_sec < 0) - timeo.tv_sec = -timeo.tv_sec; + if (ret == -EFLOWDOWN) + break; } + timeo.tv_sec = FRCTI_DEALLOC(flow->frcti); + pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock); ssm_rbuff_fini(flow->tx_rb); @@ -1033,15 +1180,18 @@ int flow_dealloc(int fd) info.id = flow->info.id; info.n_pid = getpid(); - if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) - return -ENOMEM; + if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) { + err = -ENOMEM; + goto out; + } err = send_recv_msg(&msg); if (err < 0) - return err; + goto out; err = irm__irm_result_des(&msg); + out: flow_fini(fd); return err; @@ -1055,12 +1205,12 @@ int ipcp_flow_dealloc(int fd) struct flow * flow; int err; - if (fd < 0 || fd >= SYS_MAX_FLOWS ) + if (fd < 0 || fd >= PROC_MAX_FLOWS ) return -EINVAL; flow = &proc.flows[fd]; - memset(&info, 0, sizeof(flow)); + memset(&info, 0, sizeof(info)); pthread_rwlock_rdlock(&proc.lock); @@ -1074,15 +1224,18 @@ int ipcp_flow_dealloc(int fd) pthread_rwlock_unlock(&proc.lock); - if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) - return -ENOMEM; + if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) { + err = -ENOMEM; + goto out; + } err = send_recv_msg(&msg); if (err < 0) - return err; + goto out; err = irm__irm_result_des(&msg); + out: flow_fini(fd); return err; @@ -1102,8 +1255,18 @@ int fccntl(int fd, uint32_t tx_acl; size_t * qlen; struct flow * flow; - - if (fd < 0 || fd >= SYS_MAX_FLOWS) + uint16_t old_acc; + uint16_t new_acc; + size_t max; + size_t * maxp; + size_t rsz; + size_t * rszp; + time_t rto; + time_t * rtop; + int rc; + bool emit_eos = false; + + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -EBADF; flow = &proc.flows[fd]; @@ -1167,14 +1330,27 @@ int fccntl(int fd, qlen = va_arg(l, size_t *); *qlen = ssm_rbuff_queued(flow->tx_rb); break; + case FLOWGMTU: + maxp = va_arg(l, size_t *); + if (maxp == NULL) + goto einval; + *maxp = flow_user_mtu(flow, flow->info.mtu); + break; case FLOWSFLAGS: + old_acc = flow->oflags & FLOWFACCMODE; flow->oflags = va_arg(l, uint32_t); + new_acc = flow->oflags & FLOWFACCMODE; + + /* Defer EOS emit until after proc.lock is dropped: */ + /* frcti_fin_snd may block on shm-pool/tx-rb. */ + if (new_acc == FLOWFRDONLY + && old_acc != FLOWFRDONLY + && flow->frcti != NULL) + emit_eos = true; + rx_acl = ssm_rbuff_get_acl(flow->rx_rb); - tx_acl = ssm_rbuff_get_acl(flow->rx_rb); - /* - * Making our own flow write only means making the - * the other side of the flow read only. - */ + tx_acl = ssm_rbuff_get_acl(flow->tx_rb); + /* Our flow write-only -> peer's read-only. */ if (flow->oflags & FLOWFWRONLY) rx_acl |= ACL_RDONLY; if (flow->oflags & FLOWFRDWR) @@ -1218,6 +1394,59 @@ int fccntl(int fd, goto eperm; *cflags = frcti_getflags(flow->frcti); break; + case FRCTSMAXSDU: + max = va_arg(l, size_t); + if (flow->frcti == NULL) + goto eperm; + if (frcti_set_max_rcv_sdu(flow->frcti, max) < 0) + goto einval; + break; + case FRCTGMAXSDU: + maxp = va_arg(l, size_t *); + if (maxp == NULL) + goto einval; + if (flow->frcti == NULL) + goto eperm; + *maxp = frcti_get_max_rcv_sdu(flow->frcti); + break; + case FRCTSRRINGSZ: + rsz = va_arg(l, size_t); + if (flow->frcti == NULL) + goto eperm; + rc = frcti_set_rcv_ring_sz(flow->frcti, rsz); + if (rc < 0) { + pthread_rwlock_unlock(&proc.lock); + va_end(l); + return rc; + } + break; + case FRCTGRRINGSZ: + rszp = va_arg(l, size_t *); + if (rszp == NULL) + goto einval; + if (flow->frcti == NULL) + goto eperm; + *rszp = frcti_get_rcv_ring_sz(flow->frcti); + break; + case FRCTSRTOMIN: + if (flow->frcti == NULL) + goto eperm; + rto = va_arg(l, time_t); + rc = frcti_set_rto_min(flow->frcti, rto); + if (rc < 0) { + pthread_rwlock_unlock(&proc.lock); + va_end(l); + return rc; + } + break; + case FRCTGRTOMIN: + if (flow->frcti == NULL) + goto eperm; + rtop = va_arg(l, time_t *); + if (rtop == NULL) + goto einval; + *rtop = frcti_get_rto_min(flow->frcti); + break; default: pthread_rwlock_unlock(&proc.lock); va_end(l); @@ -1227,6 +1456,9 @@ int fccntl(int fd, pthread_rwlock_unlock(&proc.lock); + if (emit_eos) + frcti_fin_snd(flow->frcti); + va_end(l); return 0; @@ -1241,86 +1473,195 @@ int fccntl(int fd, return -EPERM; } -static int chk_crc(struct ssm_pk_buff * spb) -{ - uint32_t crc; - uint8_t * head = ssm_pk_buff_head(spb); - uint8_t * tail = ssm_pk_buff_tail_release(spb, CRCLEN); - - mem_hash(HASH_CRC32, &crc, head, tail - head); - - return !(crc == *((uint32_t *) tail)); -} - -static int add_crc(struct ssm_pk_buff * spb) -{ - uint8_t * head; - uint8_t * tail; - - tail = ssm_pk_buff_tail_alloc(spb, CRCLEN); - if (tail == NULL) - return -ENOMEM; - - head = ssm_pk_buff_head(spb); - mem_hash(HASH_CRC32, tail, head, tail - head); - - return 0; -} - static int flow_tx_spb(struct flow * flow, struct ssm_pk_buff * spb, + uint16_t flags, bool block, struct timespec * abstime) { struct timespec now; ssize_t idx; + size_t pci_total; int ret; clock_gettime(PTHREAD_COND_CLOCK, &now); - - pthread_rwlock_wrlock(&proc.lock); - flow->snd_act = now; - pthread_rwlock_unlock(&proc.lock); - - idx = ssm_pk_buff_get_idx(spb); - - pthread_rwlock_rdlock(&proc.lock); + idx = ssm_pk_buff_get_off(spb); if (ssm_pk_buff_len(spb) > 0) { - if (frcti_snd(flow->frcti, spb) < 0) + if (FRCTI_SND(flow->frcti, spb, flags) < 0) goto enomem; - if (spb_encrypt(flow, spb) < 0) - goto enomem; + if (flow->info.qs.ber == 0) { + pci_total = flow->frcti != NULL + ? frcti_data_hdr_len(flow->frcti) : 0; + if (crc_add(spb, pci_total) != 0) + goto enomem; + } - if (flow->info.qs.ber == 0 && add_crc(spb) != 0) + if (spb_encrypt(flow, spb) < 0) goto enomem; } - pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock); - if (!block) ret = ssm_rbuff_write(flow->tx_rb, idx); else ret = ssm_rbuff_write_b(flow->tx_rb, idx, abstime); - if (ret < 0) + if (ret < 0) { ssm_pool_remove(proc.pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); - - pthread_cleanup_pop(true); + return ret; + } + ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); return 0; -enomem: - pthread_rwlock_unlock(&proc.lock); + enomem: ssm_pool_remove(proc.pool, idx); return -ENOMEM; } +/* Per-fragment role for fragment i out of n; n == 1 yields SOLE. */ +static __inline__ uint16_t flow_frag_role(size_t i, size_t n) +{ + if (n == 1) + return FRCT_FR_SOLE; + if (i == 0) + return FRCT_FR_FIRST; + if (i + 1 == n) + return FRCT_FR_LAST; + + return FRCT_FR_MID; +} + +/* + * Stream-mode write: split buf into chunks of + * (frag_mtu - PCI - PCI_STREAM) bytes; each chunk goes through the + * normal tx path. frcti_snd injects the [start,end) extension and + * advances snd_byte_next under its wrlock. No FFGM/LFGM role bits. + */ +static ssize_t flow_write_stream(struct flow * flow, + const void * buf, + size_t count, + int oflags, + struct timespec * dl) +{ + const uint8_t * src = buf; + size_t payload; + size_t off = 0; + bool block = !(oflags & FLOWFWNOBLOCK); + + if (!FRCTI_IS_FRTX(flow->frcti)) + return -EMSGSIZE; + + payload = FRCTI_PAYLOAD_CAP(flow->frcti); + + while (off < count) { + struct ssm_pk_buff * spb; + uint8_t * ptr; + ssize_t idx; + size_t clen; + int ret; + + ret = flow_wait_window(flow, 1, block, dl); + if (ret < 0) + return off > 0 ? (ssize_t) off : (ssize_t) ret; + + clen = MIN(count - off, payload); + + if (block) + idx = ssm_pool_alloc_b(proc.pool, clen, &ptr, + &spb, dl); + else + idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb); + if (idx < 0) + return off > 0 ? (ssize_t) off : idx; + + memcpy(ptr, src + off, clen); + + ret = flow_tx_spb(flow, spb, 0, block, dl); + if (ret < 0) + return off > 0 ? (ssize_t) off : (ssize_t) ret; + + off += clen; + } + + return (ssize_t) count; +} + +/* Per-fragment flow_tx_spb loop. Raw flows refuse; FRCT splits the SDU. */ +static ssize_t flow_write_frag(struct flow * flow, + const void * buf, + size_t count, + int oflags, + struct timespec * dl) +{ + const uint8_t * src = buf; + size_t frag_payload; + size_t n; + size_t off = 0; + size_t i; + int ret; + bool block = !(oflags & FLOWFWNOBLOCK); + + /* Raw flows carry no PCI; cannot fragment. */ + if (flow->frcti == NULL) + return -EMSGSIZE; + + frag_payload = FRCTI_PAYLOAD_CAP(flow->frcti); + + /* Guard the ceil-divide against size_t overflow. */ + if (count > SIZE_MAX - frag_payload + 1) + return -EMSGSIZE; + n = (count + frag_payload - 1) / frag_payload; + + /* SDU larger than the FC window can ever offer would deadlock. */ + if (n > RQ_SIZE) + return -EMSGSIZE; + + /* SDU-atomic FC: wait for n seqnos to avoid overshoot mid-SDU. */ + ret = flow_wait_window(flow, n, block, dl); + if (ret < 0) + return (ssize_t) ret; + + STAT_BUMP(flow->frcti, sdu_snd_frag); + + for (i = 0; i < n; ++i) { + struct ssm_pk_buff * spb; + uint8_t * ptr; + ssize_t idx; + size_t clen; + + clen = (i + 1 == n) ? (count - off) : frag_payload; + + if (block) + idx = ssm_pool_alloc_b(proc.pool, clen, &ptr, + &spb, dl); + else + idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb); + if (idx < 0) { + if (off > 0) + STAT_BUMP(flow->frcti, sdu_snd_alloc); + return off > 0 ? (ssize_t) off : idx; + } + + memcpy(ptr, src + off, clen); + + ret = flow_tx_spb(flow, spb, flow_frag_role(i, n), + block, dl); + if (ret < 0) { + if (off > 0) + STAT_BUMP(flow->frcti, sdu_snd_tx); + return off > 0 ? (ssize_t) off : (ssize_t) ret; + } + + off += clen; + } + + return (ssize_t) count; +} + ssize_t flow_write(int fd, const void * buf, size_t count) @@ -1330,76 +1671,75 @@ ssize_t flow_write(int fd, int ret; int flags; struct timespec abs; - struct timespec * abstime = NULL; + struct timespec now; + struct timespec * dl = NULL; struct ssm_pk_buff * spb; uint8_t * ptr; if (buf == NULL && count != 0) return -EINVAL; - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -EBADF; flow = &proc.flows[fd]; - clock_gettime(PTHREAD_COND_CLOCK, &abs); - - pthread_rwlock_wrlock(&proc.lock); + pthread_rwlock_rdlock(&proc.lock); if (flow->info.id < 0) { pthread_rwlock_unlock(&proc.lock); return -ENOTALLOC; } + flags = flow->oflags; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + if (flow->snd_timesout) { - ts_add(&abs, &flow->snd_timeo, &abs); - abstime = &abs; + ts_add(&now, &flow->snd_timeo, &abs); + dl = &abs; } - flags = flow->oflags; - pthread_rwlock_unlock(&proc.lock); if ((flags & FLOWFACCMODE) == FLOWFRDONLY) return -EPERM; - if (flags & FLOWFWNOBLOCK) { - if (!frcti_is_window_open(flow->frcti)) - return -EAGAIN; - idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb); - } else { - ret = frcti_window_wait(flow->frcti, abstime); + tw_move_safe(); + + if (flow->frcti != NULL) { + /* Pump rx_rb so a pure-writer processes ACKs. */ + ret = flow_wait_window(flow, 1, !(flags & FLOWFWNOBLOCK), dl); if (ret < 0) return ret; - idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, abstime); + + if (count > 0 && FRCTI_IS_STREAM(flow->frcti)) + return flow_write_stream(flow, buf, count, flags, dl); + + if (FRCTI_NEEDS_FRAG(flow->frcti, count)) + return flow_write_frag(flow, buf, count, flags, dl); + } else if (flow->info.mtu > 0 + && count > flow_user_mtu(flow, flow->info.mtu)) { + /* Raw flows carry no PCI; refuse anything > one n-1 frame. */ + return -EMSGSIZE; } + if (flags & FLOWFWNOBLOCK) + idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb); + else + idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, dl); if (idx < 0) return idx; if (count > 0) memcpy(ptr, buf, count); - ret = flow_tx_spb(flow, spb, !(flags & FLOWFWNOBLOCK), abstime); + ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, + !(flags & FLOWFWNOBLOCK), dl); return ret < 0 ? (ssize_t) ret : (ssize_t) count; } -static bool invalid_pkt(struct flow * flow, - struct ssm_pk_buff * spb) -{ - if (spb == NULL || ssm_pk_buff_len(spb) == 0) - return true; - - if (flow->info.qs.ber == 0 && chk_crc(spb) != 0) - return true; - - if (spb_decrypt(flow, spb) < 0) - return true; - - return false; -} - static ssize_t flow_rx_spb(struct flow * flow, struct ssm_pk_buff ** spb, bool block, @@ -1408,19 +1748,14 @@ static ssize_t flow_rx_spb(struct flow * flow, ssize_t idx; struct timespec now; - idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime) : - ssm_rbuff_read(flow->rx_rb); + idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime) + : ssm_rbuff_read(flow->rx_rb); if (idx < 0) return idx; clock_gettime(PTHREAD_COND_CLOCK, &now); - - pthread_rwlock_wrlock(&proc.lock); - flow->rcv_act = now; - pthread_rwlock_unlock(&proc.lock); - *spb = ssm_pool_get(proc.pool, idx); if (invalid_pkt(flow, *spb)) { @@ -1431,28 +1766,124 @@ static ssize_t flow_rx_spb(struct flow * flow, return idx; } +static ssize_t raw_flow_read_pkt(struct flow * flow, + bool block, + struct timespec * dl) +{ + struct ssm_pk_buff * spb; + struct timespec wait_abs; + ssize_t idx; + + while (true) { + if (!block) { + idx = ssm_rbuff_read(flow->rx_rb); + if (idx < 0) + return -EAGAIN; + } else { + compute_wait_deadline(dl, &wait_abs); + idx = ssm_rbuff_read_b(flow->rx_rb, &wait_abs); + if (idx == -ETIMEDOUT) { + if (deadline_passed(dl)) + return -ETIMEDOUT; + continue; + } + if (idx < 0) + return idx; + } + + spb = ssm_pool_get(proc.pool, idx); + if (!invalid_pkt(flow, spb)) + return idx; + + ssm_pool_remove(proc.pool, idx); + if (!block) + return -EAGAIN; + } +} + +static ssize_t deliver_pkt(struct flow * flow, + struct ssm_pk_buff * spb, + ssize_t idx, + void * buf, + size_t count, + bool partrd) +{ + uint8_t * packet = ssm_pk_buff_head(spb); + ssize_t n = ssm_pk_buff_len(spb); + + assert(n >= 0); + + if (n <= (ssize_t) count) { + memcpy(buf, packet, n); + ipcp_spb_release(spb); + if (partrd && n == (ssize_t) count) + flow->part_idx = DONE_PART; + else + flow->part_idx = NO_PART; + + return n; + } + + if (partrd) { + memcpy(buf, packet, count); + ssm_pk_buff_pop(spb, n); + flow->part_idx = idx; + return count; + } + + ipcp_spb_release(spb); + return -EMSGSIZE; +} + +/* Drive frcti_consume until it delivers or errors. */ +static ssize_t flow_read_frcti(struct flow * flow, + void * buf, + size_t count, + bool block, + struct timespec * dl) +{ + struct timespec now; + ssize_t bytes; + int rc; + + while (true) { + flow_drain_rx_nb(flow); + bytes = FRCTI_CONSUME(flow->frcti, buf, count); + if (bytes >= 0) + break; + if (bytes != -EAGAIN) + return bytes; + if (!block) + return -EAGAIN; + rc = flow_rx_one(flow, dl); + if (rc < 0) + return rc; + } + + clock_gettime(PTHREAD_COND_CLOCK, &now); + flow->rcv_act = now; + + return bytes; +} + ssize_t flow_read(int fd, void * buf, size_t count) { - ssize_t idx; - ssize_t n; - uint8_t * packet; + struct flow * flow; struct ssm_pk_buff * spb; struct timespec abs; struct timespec now; - struct timespec * abstime = NULL; - struct flow * flow; + struct timespec * dl = NULL; + ssize_t idx; bool block; bool partrd; - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -EBADF; flow = &proc.flows[fd]; - clock_gettime(PTHREAD_COND_CLOCK, &now); - pthread_rwlock_rdlock(&proc.lock); if (flow->info.id < 0) { @@ -1461,8 +1892,8 @@ ssize_t flow_read(int fd, } if (flow->part_idx == DONE_PART) { - pthread_rwlock_unlock(&proc.lock); flow->part_idx = NO_PART; + pthread_rwlock_unlock(&proc.lock); return 0; } @@ -1470,75 +1901,33 @@ ssize_t flow_read(int fd, partrd = !(flow->oflags & FLOWFRNOPART); if (flow->rcv_timesout) { + clock_gettime(PTHREAD_COND_CLOCK, &now); ts_add(&now, &flow->rcv_timeo, &abs); - abstime = &abs; + dl = &abs; } - idx = flow->part_idx; - if (idx < 0) { - while ((idx = frcti_queued_pdu(flow->frcti)) < 0) { - pthread_rwlock_unlock(&proc.lock); - - idx = flow_rx_spb(flow, &spb, block, abstime); - if (idx < 0) { - if (block && idx != -EAGAIN) - return idx; - if (!block) - return idx; + pthread_rwlock_unlock(&proc.lock); - pthread_rwlock_rdlock(&proc.lock); - continue; - } + tw_move_safe(); - pthread_rwlock_rdlock(&proc.lock); + idx = flow->part_idx; + if (idx < 0 && flow->frcti != NULL) + return flow_read_frcti(flow, buf, count, block, dl); - frcti_rcv(flow->frcti, spb); - } + if (idx < 0) { + idx = raw_flow_read_pkt(flow, block, dl); + if (idx < 0) + return idx; } spb = ssm_pool_get(proc.pool, idx); - pthread_rwlock_unlock(&proc.lock); - - packet = ssm_pk_buff_head(spb); - - n = ssm_pk_buff_len(spb); - - assert(n >= 0); - - if (n <= (ssize_t) count) { - memcpy(buf, packet, n); - ipcp_spb_release(spb); - - pthread_rwlock_wrlock(&proc.lock); - - flow->part_idx = (partrd && n == (ssize_t) count) ? - DONE_PART : NO_PART; - - flow->rcv_act = now; - - pthread_rwlock_unlock(&proc.lock); - return n; - } else { - if (partrd) { - memcpy(buf, packet, count); - ssm_pk_buff_head_release(spb, n); - pthread_rwlock_wrlock(&proc.lock); - flow->part_idx = idx; - - flow->rcv_act = now; + clock_gettime(PTHREAD_COND_CLOCK, &now); + flow->rcv_act = now; - pthread_rwlock_unlock(&proc.lock); - return count; - } else { - ipcp_spb_release(spb); - return -EMSGSIZE; - } - } + return deliver_pkt(flow, spb, idx, buf, count, partrd); } -/* fqueue functions. */ - struct flow_set * fset_create(void) { struct flow_set * set; @@ -1614,7 +2003,7 @@ int fset_add(struct flow_set * set, struct flow * flow; int ret; - if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS) + if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS) return -EINVAL; flow = &proc.flows[fd]; @@ -1650,7 +2039,7 @@ void fset_del(struct flow_set * set, { struct flow * flow; - if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS) + if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS) return; flow = &proc.flows[fd]; @@ -1661,7 +2050,7 @@ void fset_del(struct flow_set * set, ssm_flow_set_del(proc.fqset, set->idx, flow->info.id); if (flow->frcti != NULL) - ssm_flow_set_add(proc.fqset, 0, proc.flows[fd].info.id); + ssm_flow_set_add(proc.fqset, 0, flow->info.id); pthread_rwlock_unlock(&proc.lock); } @@ -1672,7 +2061,7 @@ bool fset_has(const struct flow_set * set, struct flow * flow; bool ret; - if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS) + if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS) return false; flow = &proc.flows[fd]; @@ -1691,61 +2080,59 @@ bool fset_has(const struct flow_set * set, return ret; } -/* Filter fqueue events for non-data packets */ static int fqueue_filter(struct fqueue * fq) { struct ssm_pk_buff * spb; int fd; ssize_t idx; struct frcti * frcti; + int ret = 0; - while (fq->next < fq->fqsize) { - if (fq->fqueue[fq->next].event != FLOW_PKT) - return 1; + /* proc.lock rdlock gates frcti_destroy via flow_fini wrlock. */ + pthread_rwlock_rdlock(&proc.lock); - pthread_rwlock_rdlock(&proc.lock); + while (fq->next < fq->fqsize) { + if (fq->fqueue[fq->next].event != FLOW_PKT) { + ret = 1; + goto out; + } fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd; if (fd < 0) { ++fq->next; - pthread_rwlock_unlock(&proc.lock); continue; } frcti = proc.flows[fd].frcti; if (frcti == NULL) { - pthread_rwlock_unlock(&proc.lock); - return 1; + ret = 1; + goto out; } - if (__frcti_pdu_ready(frcti) >= 0) { - pthread_rwlock_unlock(&proc.lock); - return 1; + if (FRCTI_PDU_READY(frcti)) { + ret = 1; + goto out; } - pthread_rwlock_unlock(&proc.lock); - idx = flow_rx_spb(&proc.flows[fd], &spb, false, NULL); if (idx < 0) - return 0; - - pthread_rwlock_rdlock(&proc.lock); + goto out; spb = ssm_pool_get(proc.pool, idx); - __frcti_rcv(frcti, spb); + FRCTI_RCV(frcti, spb); - if (__frcti_pdu_ready(frcti) >= 0) { - pthread_rwlock_unlock(&proc.lock); - return 1; + if (FRCTI_PDU_READY(frcti)) { + ret = 1; + goto out; } - pthread_rwlock_unlock(&proc.lock); - ++fq->next; } - return 0; + out: + pthread_rwlock_unlock(&proc.lock); + return ret; } int fqueue_next(struct fqueue * fq) @@ -1792,7 +2179,8 @@ ssize_t fevent(struct flow_set * set, { ssize_t ret = 0; struct timespec abs; - struct timespec * t = NULL; + struct timespec * dl = NULL; + struct timespec wait_abs; if (set == NULL || fq == NULL) return -EINVAL; @@ -1800,17 +2188,26 @@ ssize_t fevent(struct flow_set * set, if (fq->fqsize > 0 && fq->next != fq->fqsize) return 1; - clock_gettime(PTHREAD_COND_CLOCK, &abs); - if (timeo != NULL) { - ts_add(&abs, timeo, &abs); - t = &abs; + struct timespec now; + clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, timeo, &abs); + dl = &abs; } while (ret == 0) { - ret = ssm_flow_set_wait(proc.fqset, set->idx, fq->fqueue, t); - if (ret == -ETIMEDOUT) - return -ETIMEDOUT; + tw_move_safe(); + + compute_wait_deadline(dl, &wait_abs); + + ret = ssm_flow_set_wait(proc.fqset, set->idx, + fq->fqueue, &wait_abs); + if (ret == -ETIMEDOUT) { + if (deadline_passed(dl)) + return -ETIMEDOUT; + ret = 0; + continue; + } fq->fqsize = ret; fq->next = 0; @@ -1823,8 +2220,6 @@ ssize_t fevent(struct flow_set * set, return 1; } -/* ipcp-dev functions. */ - int np1_flow_alloc(pid_t n_pid, int flow_id) { @@ -1837,9 +2232,10 @@ int np1_flow_alloc(pid_t n_pid, flow.n_pid = getpid(); flow.qs = qos_np1; flow.mpl = 0; - flow.n_1_pid = n_pid; /* This "flow" is upside-down! */ + /* np1 flow: n_1_pid is the upper. */ + flow.n_1_pid = n_pid; - return flow_init(&flow, &crypt); + return flow_init(&flow, &crypt, 0); } int np1_flow_dealloc(int flow_id, @@ -1847,12 +2243,7 @@ int np1_flow_dealloc(int flow_id, { int fd; - /* - * TODO: Don't pass timeo to the IPCP but wait in IRMd. - * This will need async ops, waiting until we bootstrap - * the IRMd over ouroboros. - */ - + /* TODO: wait in IRMd, not here; needs async ops. */ sleep(timeo); pthread_rwlock_rdlock(&proc.lock); @@ -1900,6 +2291,7 @@ int ipcp_create_r(const struct ipcp_info * info) int ipcp_flow_req_arr(const buffer_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data) { struct flow_info flow; @@ -1916,6 +2308,7 @@ int ipcp_flow_req_arr(const buffer_t * dst, flow.n_1_pid = getpid(); flow.qs = qs; flow.mpl = mpl; + flow.mtu = mtu; if (ipcp_flow_req_arr__irm_req_ser(&msg, dst, &flow, data) < 0) return -ENOMEM; @@ -1930,22 +2323,25 @@ int ipcp_flow_req_arr(const buffer_t * dst, if (err < 0) return err; - assert(crypt.nid == NID_undef); /* np1 flows are not encrypted */ + /* np1 flows are not encrypted. */ + assert(crypt.nid == NID_undef); - /* inverted for np1_flow */ + /* Inverted for np1_flow. */ flow.n_1_pid = flow.n_pid; flow.n_pid = getpid(); flow.mpl = 0; + flow.mtu = 0; flow.qs = qos_np1; crypt.nid = NID_undef; - return flow_init(&flow, &crypt); + return flow_init(&flow, &crypt, 0); } int ipcp_flow_alloc_reply(int fd, int response, time_t mpl, + uint32_t mtu, const buffer_t * data) { struct flow_info flow; @@ -1953,7 +2349,7 @@ int ipcp_flow_alloc_reply(int fd, buffer_t msg = {SOCK_BUF_SIZE, buf}; int err; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); pthread_rwlock_rdlock(&proc.lock); @@ -1962,6 +2358,7 @@ int ipcp_flow_alloc_reply(int fd, pthread_rwlock_unlock(&proc.lock); flow.mpl = mpl; + flow.mtu = mtu; if (ipcp_flow_alloc_reply__irm_msg_ser(&msg, &flow, response, data) < 0) return -ENOMEM; @@ -1979,7 +2376,7 @@ int ipcp_flow_read(int fd, struct flow * flow; ssize_t idx = -1; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; @@ -1988,7 +2385,14 @@ int ipcp_flow_read(int fd, assert(flow->info.id >= 0); - while (frcti_queued_pdu(flow->frcti) < 0) { + /* Raw flow: deliver the popped pkt directly (no FRCT rq). */ + if (flow->frcti == NULL) { + pthread_rwlock_unlock(&proc.lock); + idx = flow_rx_spb(flow, spb, false, NULL); + return idx < 0 ? (int) idx : 0; + } + + while (!FRCTI_PDU_READY(flow->frcti)) { pthread_rwlock_unlock(&proc.lock); idx = flow_rx_spb(flow, spb, false, NULL); @@ -1997,7 +2401,7 @@ int ipcp_flow_read(int fd, pthread_rwlock_rdlock(&proc.lock); - frcti_rcv(flow->frcti, *spb); + FRCTI_RCV(flow->frcti, *spb); } pthread_rwlock_unlock(&proc.lock); @@ -2011,12 +2415,12 @@ int ipcp_flow_write(int fd, struct flow * flow; int ret; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; - pthread_rwlock_wrlock(&proc.lock); + pthread_rwlock_rdlock(&proc.lock); if (flow->info.id < 0) { pthread_rwlock_unlock(&proc.lock); @@ -2030,30 +2434,28 @@ int ipcp_flow_write(int fd, pthread_rwlock_unlock(&proc.lock); - ret = flow_tx_spb(flow, spb, true, NULL); + ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, true, NULL); return ret; } -static int pool_copy_spb(struct ssm_pool * src_pool, - ssize_t src_idx, - struct ssm_pool * dst_pool, - struct ssm_pk_buff ** dst_spb) +/* Copy src into dst_pool without consuming src. Caller owns both halves. */ +static int pool_dup_spb(struct ssm_pool * src_pool, + size_t src_off, + struct ssm_pool * dst_pool, + struct ssm_pk_buff ** dst_spb) { struct ssm_pk_buff * src; uint8_t * ptr; size_t len; - src = ssm_pool_get(src_pool, src_idx); + src = ssm_pool_get(src_pool, src_off); len = ssm_pk_buff_len(src); - if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) { - ssm_pool_remove(src_pool, src_idx); + if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) return -ENOMEM; - } memcpy(ptr, ssm_pk_buff_head(src), len); - ssm_pool_remove(src_pool, src_idx); return 0; } @@ -2063,9 +2465,9 @@ int np1_flow_read(int fd, struct ssm_pool * pool) { struct flow * flow; - ssize_t idx = -1; + ssize_t off; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; @@ -2074,20 +2476,23 @@ int np1_flow_read(int fd, pthread_rwlock_rdlock(&proc.lock); - idx = ssm_rbuff_read(flow->rx_rb); - if (idx < 0) { + off = ssm_rbuff_read(flow->rx_rb); + if (off < 0) { pthread_rwlock_unlock(&proc.lock); - return idx; + return off; } pthread_rwlock_unlock(&proc.lock); if (pool == NULL) { - *spb = ssm_pool_get(proc.pool, idx); + *spb = ssm_pool_get(proc.pool, off); } else { /* Cross-pool copy: PUP -> GSPP */ - if (pool_copy_spb(pool, idx, proc.pool, spb) < 0) + if (pool_dup_spb(pool, off, proc.pool, spb) < 0) { + ssm_pool_remove(pool, off); return -ENOMEM; + } + ssm_pool_remove(pool, off); } return 0; @@ -2100,9 +2505,10 @@ int np1_flow_write(int fd, struct flow * flow; struct ssm_pk_buff * dst; int ret; - ssize_t idx; + size_t off; + size_t dst_off; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; @@ -2121,45 +2527,47 @@ int np1_flow_write(int fd, pthread_rwlock_unlock(&proc.lock); - idx = ssm_pk_buff_get_idx(spb); + off = ssm_pk_buff_get_off(spb); if (pool == NULL) { - ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL); + ret = ssm_rbuff_write_b(flow->tx_rb, off, NULL); if (ret < 0) - ssm_pool_remove(proc.pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + return ret; + ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); } else { - /* Cross-pool copy: GSPP -> PUP */ - if (pool_copy_spb(proc.pool, idx, pool, &dst) < 0) + /* Cross-pool copy: GSPP -> PUP. Src kept on error. */ + if (pool_dup_spb(proc.pool, off, pool, &dst) < 0) return -ENOMEM; - idx = ssm_pk_buff_get_idx(dst); - ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL); - if (ret < 0) - ssm_pool_remove(pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + dst_off = ssm_pk_buff_get_off(dst); + ret = ssm_rbuff_write_b(flow->tx_rb, dst_off, NULL); + if (ret < 0) { + ssm_pool_remove(pool, dst_off); + return ret; + } + ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + ssm_pool_remove(proc.pool, off); } - return ret; + return 0; } int ipcp_spb_reserve(struct ssm_pk_buff ** spb, size_t len) { - return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 ? -1 : 0; + return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 + ? -1 : 0; } void ipcp_spb_release(struct ssm_pk_buff * spb) { - ssm_pool_remove(proc.pool, ssm_pk_buff_get_idx(spb)); + ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); } int ipcp_flow_fini(int fd) { struct ssm_rbuff * rx_rb; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); pthread_rwlock_rdlock(&proc.lock); @@ -2188,7 +2596,7 @@ int ipcp_flow_fini(int fd) int ipcp_flow_get_qoscube(int fd, qoscube_t * cube) { - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(cube); pthread_rwlock_rdlock(&proc.lock); @@ -2227,7 +2635,7 @@ int local_flow_transfer(int src_fd, struct ssm_pk_buff * dst_spb; struct ssm_pool * sp; struct ssm_pool * dp; - ssize_t idx; + ssize_t off; int ret; assert(src_fd >= 0); @@ -2241,15 +2649,15 @@ int local_flow_transfer(int src_fd, pthread_rwlock_rdlock(&proc.lock); - idx = ssm_rbuff_read(src_flow->rx_rb); - if (idx < 0) { + off = ssm_rbuff_read(src_flow->rx_rb); + if (off < 0) { pthread_rwlock_unlock(&proc.lock); - return idx; + return off; } if (dst_flow->info.id < 0) { pthread_rwlock_unlock(&proc.lock); - ssm_pool_remove(sp, idx); + ssm_pool_remove(sp, off); return -ENOTALLOC; } @@ -2257,21 +2665,24 @@ int local_flow_transfer(int src_fd, if (sp == dp) { /* Same pool: zero-copy */ - ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL); + ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL); if (ret < 0) - ssm_pool_remove(sp, idx); + ssm_pool_remove(sp, off); else ssm_flow_set_notify(dst_flow->set, dst_flow->info.id, FLOW_PKT); } else { /* Different pools: single copy */ - if (pool_copy_spb(sp, idx, dp, &dst_spb) < 0) + if (pool_dup_spb(sp, off, dp, &dst_spb) < 0) { + ssm_pool_remove(sp, off); return -ENOMEM; + } - idx = ssm_pk_buff_get_idx(dst_spb); - ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL); + ssm_pool_remove(sp, off); + off = ssm_pk_buff_get_off(dst_spb); + ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL); if (ret < 0) - ssm_pool_remove(dp, idx); + ssm_pool_remove(dp, off); else ssm_flow_set_notify(dst_flow->set, dst_flow->info.id, FLOW_PKT); diff --git a/src/lib/frct.c b/src/lib/frct.c index fad2cf69..2e8955e3 100644 --- a/src/lib/frct.c +++ b/src/lib/frct.c @@ -1,7 +1,7 @@ /* * Ouroboros - Copyright (C) 2016 - 2026 * - * Flow and Retransmission Control + * Flow and Retransmission Control Task (FRCT) * * Dimitri Staessens <dimitri@ouroboros.rocks> * Sander Vrijders <sander@ouroboros.rocks> @@ -20,97 +20,416 @@ * Foundation, Inc., http://www.fsf.org/about/contact/. */ -#include <ouroboros/endian.h> +/* Included by dev.c; uses dev.c statics (proc, spb_encrypt, ...). */ #define DELT_RDV (100 * MILLION) /* ns */ -#define MAX_RDV (1 * BILLION) /* ns */ +#define MAX_RDV (1 * BILLION) /* ns */ + +#define MAX_RTO_MUL 8 /* caps the RTO backoff shift */ +#define MAX_TLP_PER_EP 2 /* RFC 8985 §7.3: up to 2 TLPs */ +#define INITIAL_RTO (1 * BILLION) /* RFC 6298 §2.1: 1 s default */ +#define RTT_BOOT_NS (10 * MILLION) /* rtt_hint floor + initial mdev */ +#define SRTT_FLOOR_NS 1000L /* 1 us; smoothed RTT floor */ +#define MDEV_FLOOR_NS 100L /* 100 ns; mdev sanity floor */ +#define RTT_CLAMP_MUL 16 /* probe sample cap = N * srtt */ +#define MIN_RTT_WIN_NS (300ULL * BILLION) /* 5 min, Linux tcp default */ +#define NACK_COOLDOWN_NS (100 * MILLION) /* pre-DRF NACK cooldown */ +#define FRCT_TX_TIMEO_NS (250 * 1000) /* tx ring write deadline */ +#define ACK_DELAY_NS (2ULL * TICTIME) /* delayed-ACK fire delay */ #define FRCT "frct" #define FRCT_PCILEN (sizeof(struct frct_pci)) #define FRCT_NAME_STRLEN 32 -struct frct_cr { - uint32_t lwe; /* Left window edge */ - uint32_t rwe; /* Right window edge */ +/* Wire-protocol cap on SACK blocks per packet; binds both peers. */ +#define SACK_MAX_BLOCKS 2048 +#define SACK_BLOCK_SIZE (2 * sizeof(uint32_t)) +/* 2B count + 2B pad to 4-byte align the block list. */ +#define SACK_HDR_SIZE (sizeof(uint32_t)) +#define SACK_MIN_GAP_NS (250u * 1000u) /* 250 us SACK gap */ +#define MIN_REORDER_NS (250u * 1000u) /* 250 us RACK floor */ +#define SACK_RXM_MAX 32 /* Cap on retransmits staged from single SACK.*/ +#define DUP_THRESH 3 /* RFC 8985 §6.2 step 2.2 SACK count gate. */ + +/* RFC 8985 §7.2 RACK reorder-window scaling cap. */ +#define REO_WND_MULT_MAX 20 +/* RFC 8985 §7.2 step 5: round trips of no DSACK before halving. */ +#define REO_DECAY_PKTS 16 +/* DSACK seqno sanity: reject reports older/farther than one rcv window. */ +#define MAX_DSACK_LAG RQ_SIZE + +/* Signed ns elapsed; negative under concurrent update (no underflow). */ +static __inline__ int64_t ts_age_ns(uint64_t now_ns, + uint64_t then_ns) +{ + return (int64_t)(now_ns - then_ns); +} - uint8_t cflags; - uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */ +/* True iff strictly more than thr_ns elapsed since then_ns. */ +static __inline__ bool ts_aged_ns(uint64_t now_ns, + uint64_t then_ns, + uint64_t thr_ns) +{ + return ts_age_ns(now_ns, then_ns) > (int64_t) thr_ns; +} - struct timespec act; /* Last seen activity */ - time_t inact; /* Inactivity (s) */ -}; +/* FRCT r-timer: do not retransmit packet older than t_r (from first send). */ +#define RXM_AGED_OUT(t0, now_ns, t_r) \ + ts_aged_ns((now_ns), (t0), (uint64_t)(t_r)) -struct frcti { - int fd; +/* FRCT a-timer: do not (re)transmit ACK after t_a from last data receive. */ +#define ACK_AGED_OUT(act, now_ns, t_a) \ + ts_aged_ns((now_ns), (act), (uint64_t)(t_a)) - time_t mpl; - time_t a; - time_t r; - time_t rdv; - - time_t srtt; /* Smoothed rtt */ - time_t mdev; /* Deviation */ - time_t rto; /* Retransmission timeout */ - uint32_t rttseq; - struct timespec t_probe; /* Probe time */ - bool probe; /* Probe active */ -#ifdef PROC_FLOW_STATS - size_t n_rtx; /* Number of rxm packets */ - size_t n_prb; /* Number of rtt probes */ - size_t n_rtt; /* Number of estimates */ - size_t n_dup; /* Duplicates received */ - size_t n_dak; /* Delayed ACKs received */ - size_t n_rdv; /* Number of rdv packets */ - size_t n_out; /* Packets out of window */ - size_t n_rqo; /* Packets out of rqueue */ -#endif - struct frct_cr snd_cr; - struct frct_cr rcv_cr; +struct sack_args { + uint16_t n; + bool dsack; /* RFC 2883: block[0] is a DSACK report */ + uint32_t ack; + uint32_t rwe; + uint32_t blocks[][2]; /* flexible — sized at alloc time */ +}; +/* NewReno-careful (RFC 6582) exit pad; gates RTT samples post-signal. */ +#define RTT_QUARANTINE 32 +#define RTTP_NONCE_LEN 16 - ssize_t rq[RQ_SIZE]; - pthread_rwlock_t lock; +/* RTT-probe wire payload (after the FRCT PCI). */ +struct frct_rttp { + uint32_t probe_id; /* sender counter; 0 on reply */ + uint32_t echo_id; /* peer's probe_id; 0 outbound */ + uint8_t nonce[RTTP_NONCE_LEN]; /* random; echoed verbatim */ +} __attribute__((packed)); - bool open; /* Window open/closed */ - struct timespec t_wnd; /* Window closed time */ - struct timespec t_rdvs; /* Last rendez-vous sent */ - pthread_cond_t cond; - pthread_mutex_t mtx; -}; +#define RTTP_PAYLOAD sizeof(struct frct_rttp) +#define RTTP_POS(id) ((id) & (RTTP_RING - 1)) +/* + * Flag values are assigned MSB-first on the wire (RFC convention): + * bit 0 = 0x8000 occupies wire-position 0 of the 16-bit flags + * field, bit 12 = 0x0008 is the last assigned bit, and the three + * LSBs (0x0007) are reserved. + */ enum frct_flags { - FRCT_DATA = 0x01, /* PDU carries data */ - FRCT_DRF = 0x02, /* Data run flag */ - FRCT_ACK = 0x04, /* ACK field valid */ - FRCT_FC = 0x08, /* FC window valid */ - FRCT_RDVS = 0x10, /* Rendez-vous */ - FRCT_FFGM = 0x20, /* First Fragment */ - FRCT_MFGM = 0x40, /* More fragments */ + FRCT_DATA = 0x8000, /* PDU carries data */ + FRCT_DRF = 0x4000, /* Data run flag */ + FRCT_ACK = 0x2000, /* ACK field valid */ + FRCT_NACK = 0x1000, /* Neg-ACK: pci->seqno is arrival_seqno - 1 */ + FRCT_FC = 0x0800, /* FC window valid */ + FRCT_RDVS = 0x0400, /* Rendez-vous */ + FRCT_FFGM = 0x0200, /* First fragment (begin) */ + FRCT_LFGM = 0x0100, /* Last fragment (end) */ + FRCT_RXM = 0x0080, /* Retransmission */ + FRCT_SACK = 0x0040, /* SACK block list follows */ + FRCT_RTTP = 0x0020, /* RTT probe / echo */ + FRCT_KA = 0x0010, /* Keepalive */ + FRCT_FIN = 0x0008, /* End of stream */ }; -struct frct_pci { - uint8_t flags; +/* + * DATA-packet fragment role (FFGM = begin, LFGM = end), SCTP-style: + * 1 1 = sole / un-fragmented SDU (begin AND end) + * 1 0 = first fragment of a multi-fragment SDU + * 0 0 = middle fragment + * 0 1 = last fragment + */ +#define FRCT_FR_MASK (FRCT_FFGM | FRCT_LFGM) +#define FRCT_FR_SOLE (FRCT_FFGM | FRCT_LFGM) +#define FRCT_FR_FIRST (FRCT_FFGM) +#define FRCT_FR_MID (0) +#define FRCT_FR_LAST (FRCT_LFGM) + +/* Default cap on a single reassembled SDU. App can raise via FRCTSMAXSDU */ +#define FRCT_MAX_SDU (1U << 20) + +/* Stream-mode PCI extension: [start, end) byte range on every DATA pkt. */ +struct frct_pci_stream { + uint32_t start; + uint32_t end; +} __attribute__((packed)); + +#define FRCT_PCI_STREAM_LEN (sizeof(struct frct_pci_stream)) - uint8_t pad; /* 24 bit window! */ - uint16_t window; +/* Bytes following PCI: SACK list / RTTP nonce / control payload. */ +#define FRCT_BODY(pci) ((uint8_t *) (pci) + FRCT_PCILEN) +/* Typed access to the stream PCI extension on stream DATA packets. */ +#define FRCT_SPCI(pci) \ + ((struct frct_pci_stream *) ((uint8_t *) (pci) + FRCT_PCILEN)) +/* Push the FRCT header onto spb's head. */ +#define FRCT_HDR_PUSH(spb, frcti) \ + ((struct frct_pci *) ssm_pk_buff_push((spb), \ + frcti_data_hdr_len(frcti))) + +/* Pop a fixed-size header off spb's head; cast to type *. */ +#define FRCT_HDR_POP(spb, type) \ + ((struct type *) ssm_pk_buff_pop((spb), sizeof(struct type))) + +/* Default / max per-flow stream rx ring (pow2); min N * per_pkt. */ +#define FRCT_STREAM_RING_MIN_PKTS 4 +#define FRCT_STREAM_RING_SZ (1U << 20) /* 1 MiB default */ +#define FRCT_STREAM_RING_SZ_MAX (1U << 27) /* 128 MiB */ + +struct frct_pci { + uint16_t flags; + uint16_t hcs; + + uint32_t window; uint32_t seqno; uint32_t ackno; } __attribute__((packed)); +/* Stat counters; fold to no-ops without PROC_FLOW_STATS. */ #ifdef PROC_FLOW_STATS +struct frcti_stat { + size_t rxm_rto; /* RTO-timer driven retransmits */ + size_t rxm_rcv; /* RXM packets received (all) */ + size_t rxm_dup_rcv; /* RXM dups (peer already had it) */ + size_t rxm_sack; /* SACK-mechanism retransmits */ + size_t rxm_rack; /* RACK-driven retransmits */ + size_t rxm_dupthresh; /* DupThresh-driven retransmits */ + size_t rxm_nack; /* NACK-pulled retransmits */ + size_t rxm_due_count; /* rxm_due entries (pre-bail) */ + size_t rxm_due_acked; /* bail: seqno < snd_lwe */ + size_t rxm_due_unowned; /* bail: slot.rxm replaced */ + size_t rxm_due_aged; /* bail: r->t0 + t_r < now */ + size_t rxm_due_defer; /* bail: non-HoL, deferred to HoL */ + size_t rxm_arm_fail; /* rxm_arm: malloc failed */ + size_t rxm_cancel; /* entries cancelled at teardown */ + size_t rxm_tx_dead; /* RXM tx into terminal flow */ + size_t tx_drop; /* frct_tx fail (any cause) */ + size_t tx_drop_ack; /* bare ACK dropped */ + size_t tx_drop_sack; /* SACK dropped */ + size_t tx_drop_ka; /* keepalive dropped */ + size_t tx_drop_rttp; /* RTT probe/echo dropped */ + size_t tx_drop_nack; /* pre-DRF NACK dropped */ + size_t tx_drop_rdv; /* rendez-vous dropped */ + size_t tx_drop_other; /* anything not matched above */ + size_t ack_snd; /* ACK packets sent (bare + SACK) */ + size_t ack_fire; /* delayed-ACK timer fires */ + size_t ack_supp_seqno; /* fire suppressed: seqno */ + size_t ack_supp_inact; /* fire suppressed: inact */ + size_t ack_supp_rate; /* fire suppressed: rate */ + size_t ack_rcv; /* ACK packets received */ + size_t ack_rtt; /* ACKs that fed RTT estimator */ + size_t ack_dup_rcv; /* ACK packet wire dups dropped */ + size_t dup_rcv; /* duplicates received */ + size_t out_rcv; /* pkts out of window */ + size_t rqo_rcv; /* pkts out of rqueue */ + size_t ooo_rcv; /* OOO arrivals */ + size_t sack_snd; /* SACK packets sent */ + size_t sack_rcv; /* SACK packets received */ + size_t dsack_snd; /* SACK pkts carrying a DSACK */ + size_t dsack_rcv; /* DSACK blocks parsed */ + size_t dsack_drop; /* DSACK blocks past MAX_DSACK_LAG */ + size_t nack_snd; /* pre-DRF NACKs sent */ + size_t nack_rcv; /* pre-DRF NACKs received */ + size_t tlp_snd; /* tail loss probes sent */ + size_t inact_drop; /* inactivity drop (NACK on cd) */ + size_t drf_rebase; /* DRF-triggered window rebase */ + size_t rq_released; /* slots cleared by release_rq */ + size_t rttp_snd; /* RTT probes sent */ + size_t rttp_rcv; /* RTT probe replies rcvd */ + size_t rtt_smpl; /* RTT estimator samples */ + size_t rdv_snd; /* rendez-vous packets sent */ + size_t rdv_rcv; /* rendez-vous packets rcvd */ + size_t ka_snd; /* keepalives sent */ + size_t ka_rcv; /* keepalives received */ + size_t sdu_snd_frag; /* writes that fragmented */ + size_t sdu_snd_alloc; /* alloc fail truncated SDU send */ + size_t sdu_snd_tx; /* tx fail truncated SDU send */ + size_t frag_snd; /* fragments sent: FIRST/MID/LAST */ + size_t frag_rcv; /* fragments stashed in rq[] */ + size_t sdu_reasm; /* SDUs delivered reassembled */ + size_t sdu_sole; /* SOLE SDUs delivered (n==1) */ + size_t frag_drop; /* dropped at malformed run */ + size_t strm_snd_byte; /* bytes sent on stream */ + size_t strm_rcv_byte; /* bytes copied to ring */ + size_t strm_dlv_byte; /* bytes delivered to reader */ + size_t strm_drop; /* stream rcvs dropped */ + size_t strm_fin_drop; /* stream FIN packets rejected */ + /* Profiling instrumentation. */ + size_t rcv_proc_ns; /* time inside FRCTI_RCV (ns) */ + size_t tw_move_ns; /* time inside tw_move (ns) */ + size_t drain_calls; /* flow_drain_rx_nb invocations */ +}; + +#define STAT_BUMP(frcti, field) FETCH_ADD_RELAXED(&(frcti)->stat.field, 1) +#define STAT_ADD(frcti, field, v) FETCH_ADD_RELAXED(&(frcti)->stat.field, (v)) +#define STAT_LOAD(frcti, field) LOAD_RELAXED(&(frcti)->stat.field) +#else +#define STAT_BUMP(frcti, field) ((void) (frcti)) +#define STAT_ADD(frcti, field, v) ((void) (frcti)) +#define STAT_LOAD(frcti, field) ((void) (frcti), (size_t) 0) +#endif + +#define frcti_to_flow(f) (&proc.flows[(f)->fd]) +#define RTTP_RING 8 +#define RTTP_COLD_NS (100 * MILLION) /* cold-probe cadence */ +#define RQ_SLOT(seqno) ((seqno) & (RQ_SIZE - 1)) + +struct rxm_entry; + +enum snd_slot_flags { + SND_RTX = 0x01, /* Any retransmit; Karn skips next RTT sample. */ + SND_FAST_RXM = 0x02, /* Fast-retx one-shot gate per loss event. */ + SND_TLP = 0x04, /* Tail loss probe; ACK resets rto_mul. */ +}; + +struct snd_slot { + struct rxm_entry * rxm; /* RXM entry, NULL if none. */ + uint64_t time; /* ts_to_ns of last send (any kind). */ + uint8_t flags; /* SND_* bits above. */ +}; + +/* Per-seqno reorder slot (FRTX) and stream-mode byte/FIN metadata. */ +struct rcv_slot { + ssize_t idx; /* spb idx; -1 = empty */ + uint32_t start; /* stream byte start */ + uint32_t end; /* stream byte end */ + uint8_t fin; /* stream FIN bit */ +}; + +struct frct_cr { + uint32_t lwe; /* Left window edge */ + uint32_t rwe; /* Right window edge */ + + uint8_t cflags; + uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */ + uint32_t ackno; /* snd: ACK-pkt seqno; rcv: dedup */ + + uint64_t act; /* ts_to_ns of last activity */ + uint64_t inact; /* Inactivity threshold (ns) */ +}; + +struct frcti { + /* IMM: set once in frcti_create; read-only thereafter. */ + int fd; + uint64_t t_mpl; /* MPL (ns) */ + uint64_t t_a; /* a-timer (ns) */ + uint64_t t_r; /* r-timer (ns) */ + uint64_t t_rdv; /* RDV cooldown (ns) */ + time_t ber; /* cached qs.ber */ + bool lossy; /* qs.loss != 0 */ + time_t qs_timeout; /* cached qs.timeout (ms) */ + size_t frag_mtu; /* max FRCT pkt: PCI + payload */ + uint16_t sack_n_max; /* SACK blocks that fit MTU */ + bool stream; + + /* All fields below are protected by lock (rwlock/LOAD_ACQUIRE). */ + struct { + struct frct_cr snd_cr; + struct frct_cr rcv_cr; + + /* RTT/RACK estimator */ + time_t srtt; /* smoothed RTT */ + time_t mdev; /* mean deviation */ + time_t min_rtt; /* RACK base, ns */ + uint64_t t_min_rtt; /* min_rtt last set */ + time_t rto; /* retransmit TO */ + time_t rto_min; /* RTO floor (ns) */ + uint8_t rto_mul; /* RTO backoff bits */ + uint32_t rtt_lwe; /* RTT-sample fence */ + uint64_t t_rcv_rtt; /* last RTT feed */ + uint64_t t_snd_probe; /* last probe sent */ + uint64_t t_latest_ack; /* RACK.fack snd-ts */ + uint32_t probe_id_next; + struct { + uint32_t id; + uint64_t ts; /* ts_to_ns send */ + uint8_t nonce[RTTP_NONCE_LEN]; /* echoed back */ + } probes[RTTP_RING]; + + /* rcv reassembly */ + size_t max_rcv_sdu; /* max reasm bytes */ + uint8_t * rcv_ring; /* lazy alloc */ + size_t rcv_ring_sz; /* power of 2 */ + uint32_t ring_seq_cap; /* ring/per_pkt */ + + uint32_t snd_byte_next; + bool snd_fin_sent; + uint32_t snd_fin_seqno; + uint32_t rcv_byte_next; + uint32_t rcv_byte_high; /* contiguous high */ + uint32_t rcv_byte_fin; /* set when FIN */ + bool rcv_fin_seen; + + struct rcv_slot rcv_slots[RQ_SIZE]; + struct snd_slot snd_slots[RQ_SIZE]; /* .rxm is ATOM */ + + /* rcv SACK dedup */ + uint64_t t_snd_sack; + uint32_t sack_lwe; /* rcv lwe at SACK */ + uint16_t sack_n; /* SACK block count */ + + /* RFC 2883 D-SACK: pending report (single-slot, latest). */ + uint32_t dsack_seqno; + bool dsack_valid; + + /* RFC 8985 §7.2 RACK reorder-window scaling. */ + uint8_t reo_wnd_mult; /* REO_WND_MULT_MAX */ + uint32_t dsack_lwe_snap; /* lwe @ last DSACK */ + uint64_t t_last_reo_widen; /* once-per-RTT */ + + uint32_t dup_thresh; /* RFC 8985 */ + uint32_t tlp_high_seq; /* §7.3: 0 = none */ + uint8_t tlp_count; /* §7.3 per-episode */ + uint64_t t_nack; + bool open; /* FC window state */ + bool in_recovery; + uint32_t recovery_high; /* seqno @ entry */ + uint32_t rack_fired_lwe; /* lwe @ last RACK */ + struct timespec t_wnd; /* window-closed ts */ + struct timespec t_last_rdv; /* last RDV sent */ + struct list_head rxm_list; /* live rxm entries */ + + pthread_rwlock_t lock; + }; + + /* Read/written via __atomic without holding lock. */ + uint64_t t_ka_rcv; /* ts_to_ns of last KA rx */ + uint8_t ack_pending; /* delayed-ACK dedup */ + uint8_t tlp_pending; /* TLP arm dedup (lazy) */ + + /* Timer entries; ownership belongs to the tw module. */ + struct tw_entry ack_tw; /* delayed-ACK timer */ + struct tw_entry ka_tw; /* keepalive timer */ + struct tw_entry tlp_tw; /* tail-loss probe timer */ + +#ifdef PROC_FLOW_STATS + /* STAT: lock-free relaxed atomic counters. */ + struct frcti_stat stat; +#endif +}; + +#ifdef PROC_FLOW_STATS + +__attribute__((cold)) static int frct_rib_read(const char * path, char * buf, size_t len) { + struct frcti * frcti; struct timespec now; + uint64_t now_ns; char * entry; - struct flow * flow; - struct frcti * frcti; int fd; - - (void) len; + int written; + /* Snapshot under the locks; format outside (pure userspace). */ + struct { + uint64_t t_mpl; + uint64_t t_a; + uint64_t t_r; + time_t srtt; + time_t mdev; + time_t rto; + time_t min_rtt; + struct frct_cr snd_cr; + struct frct_cr rcv_cr; + size_t rx_q_now; + size_t tx_q_now; + struct frcti_stat stat; + } s; entry = strstr(path, RIB_SEPARATOR); assert(entry); @@ -118,23 +437,50 @@ static int frct_rib_read(const char * path, fd = atoi(path); - flow = &proc.flows[fd]; - clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + if (fd < 0 || fd >= PROC_MAX_FLOWS) + return 0; pthread_rwlock_rdlock(&proc.lock); - frcti = flow->frcti; + frcti = proc.flows[fd].frcti; + if (frcti == NULL) { + pthread_rwlock_unlock(&proc.lock); + return 0; + } + + s.t_mpl = frcti->t_mpl; + s.t_a = frcti->t_a; + s.t_r = frcti->t_r; + + s.rx_q_now = proc.flows[fd].rx_rb != NULL + ? ssm_rbuff_queued(proc.flows[fd].rx_rb) : 0; + s.tx_q_now = proc.flows[fd].tx_rb != NULL + ? ssm_rbuff_queued(proc.flows[fd].tx_rb) : 0; pthread_rwlock_rdlock(&frcti->lock); - sprintf(buf, - "Maximum packet lifetime (ns): %20ld\n" - "Max time to Ack (ns): %20ld\n" - "Max time to Retransmit (ns): %20ld\n" + s.srtt = frcti->srtt; + s.mdev = frcti->mdev; + s.rto = frcti->rto; + s.min_rtt = frcti->min_rtt; + s.snd_cr = frcti->snd_cr; + s.rcv_cr = frcti->rcv_cr; + s.stat = frcti->stat; + + pthread_rwlock_unlock(&frcti->lock); + pthread_rwlock_unlock(&proc.lock); + + written = snprintf(buf, len, + "Maximum packet lifetime (ns): %20" PRIu64 "\n" + "Max time to Ack (ns): %20" PRIu64 "\n" + "Max time to Retransmit (ns): %20" PRIu64 "\n" "Smoothed rtt (ns): %20ld\n" "RTT standard deviation (ns): %20ld\n" "Retransmit timeout RTO (ns): %20ld\n" + "Minimum RTT (RACK base, ns): %20ld\n" "Sender left window edge: %20u\n" "Sender right window edge: %20u\n" "Sender inactive (ns): %20lld\n" @@ -143,44 +489,132 @@ static int frct_rib_read(const char * path, "Receiver right window edge: %20u\n" "Receiver inactive (ns): %20lld\n" "Receiver last ack: %20u\n" - "Number of pkt retransmissions: %20zu\n" - "Number of rtt probes: %20zu\n" - "Number of rtt estimates: %20zu\n" - "Number of duplicates received: %20zu\n" - "Number of delayed acks received: %20zu\n" - "Number of rendez-vous sent: %20zu\n" - "Number of packets out of window: %20zu\n" - "Number of packets out of rqueue: %20zu\n", - frcti->mpl, - frcti->a, - frcti->r, - frcti->srtt, - frcti->mdev, - frcti->rto, - frcti->snd_cr.lwe, - frcti->snd_cr.rwe, - ts_diff_ns(&now, &frcti->snd_cr.act), - frcti->snd_cr.seqno, - frcti->rcv_cr.lwe, - frcti->rcv_cr.rwe, - ts_diff_ns(&now, &frcti->rcv_cr.act), - frcti->rcv_cr.seqno, - frcti->n_rtx, - frcti->n_prb, - frcti->n_rtt, - frcti->n_dup, - frcti->n_dak, - frcti->n_rdv, - frcti->n_out, - frcti->n_rqo); - - pthread_rwlock_unlock(&flow->frcti->lock); + "RXM (RTO-driven) sent: %20zu\n" + "RXM packets received: %20zu\n" + " duplicates received: %20zu\n" + "RXM (SACK mechanism) sent: %20zu\n" + "RXM (RACK-driven) sent: %20zu\n" + "RXM (DupThresh-driven) sent: %20zu\n" + "RXM (NACK-driven) sent: %20zu\n" + "ACK packets sent: %20zu\n" + "Delayed-ACK timer fires: %20zu\n" + " suppressed (seqno): %20zu\n" + " suppressed (inact): %20zu\n" + " suppressed (rate): %20zu\n" + "ACK packets received: %20zu\n" + " fed RTT estimator: %20zu\n" + " wire dups dropped: %20zu\n" + "Duplicates received: %20zu\n" + "Out-of-window pkts received: %20zu\n" + "Out-of-rqueue pkts received: %20zu\n" + "OOO arrivals: %20zu\n" + "SACKs sent: %20zu\n" + "SACKs received: %20zu\n" + "D-SACKs sent: %20zu\n" + "D-SACKs received: %20zu\n" + "D-SACK out-of-range dropped: %20zu\n" + "Pre-DRF NACKs sent: %20zu\n" + "Pre-DRF NACKs received: %20zu\n" + "Tail loss probes sent: %20zu\n" + "Inactivity drops (silent): %20zu\n" + "DRF window rebases: %20zu\n" + "rq slots cleared by release_rq: %20zu\n" + "RTT probes sent: %20zu\n" + "RTT probe replies received: %20zu\n" + "RTT estimator samples: %20zu\n" + "Rendez-vous packets sent: %20zu\n" + "Rendez-vous packets received: %20zu\n" + "Keepalives sent: %20zu\n" + "Keepalives received: %20zu\n" + "SDU writes fragmented: %20zu\n" + " alloc fail mid-SDU: %20zu\n" + " tx fail mid-SDU: %20zu\n" + "Fragments sent: %20zu\n" + "Fragments received: %20zu\n" + "SDUs delivered reassembled: %20zu\n" + "SDUs delivered (SOLE): %20zu\n" + "Fragments dropped (malformed): %20zu\n" + "Stream bytes sent: %20zu\n" + "Stream bytes received: %20zu\n" + "Stream bytes delivered: %20zu\n" + "Stream packets dropped: %20zu\n" + "Stream FINs dropped: %20zu\n" + "FRCTI_RCV time (ns): %20zu\n" + "tw_move time (ns): %20zu\n" + "drain_rx_nb calls: %20zu\n" + "RX rbuff queued: %20zu\n" + "TX rbuff queued: %20zu\n" + "RXM-due entries: %20zu\n" + " bail (acked): %20zu\n" + " bail (unowned): %20zu\n" + " bail (aged): %20zu\n" + " bail (defer): %20zu\n" + "RXM-arm malloc failures: %20zu\n" + "RXM cancels (teardown): %20zu\n" + "RXM tx into dead flow: %20zu\n" + "Tx ring drops (any cause): %20zu\n" + " ack: %20zu\n" + " sack: %20zu\n" + " ka: %20zu\n" + " rttp: %20zu\n" + " nack: %20zu\n" + " rdv: %20zu\n" + " other: %20zu\n", + /* Check getattr size below when adding stats. */ + s.t_mpl, s.t_a, s.t_r, + s.srtt, s.mdev, s.rto, s.min_rtt, + s.snd_cr.lwe, s.snd_cr.rwe, + (long long)(now_ns - s.snd_cr.act), + s.snd_cr.seqno, + s.rcv_cr.lwe, s.rcv_cr.rwe, + (long long)(now_ns - s.rcv_cr.act), + s.rcv_cr.seqno, + s.stat.rxm_rto, s.stat.rxm_rcv, s.stat.rxm_dup_rcv, + s.stat.rxm_sack, s.stat.rxm_rack, s.stat.rxm_dupthresh, + s.stat.rxm_nack, + s.stat.ack_snd, s.stat.ack_fire, + s.stat.ack_supp_seqno, s.stat.ack_supp_inact, + s.stat.ack_supp_rate, + s.stat.ack_rcv, s.stat.ack_rtt, s.stat.ack_dup_rcv, + s.stat.dup_rcv, s.stat.out_rcv, s.stat.rqo_rcv, + s.stat.ooo_rcv, + s.stat.sack_snd, s.stat.sack_rcv, + s.stat.dsack_snd, s.stat.dsack_rcv, s.stat.dsack_drop, + s.stat.nack_snd, s.stat.nack_rcv, s.stat.tlp_snd, + s.stat.inact_drop, s.stat.drf_rebase, s.stat.rq_released, + s.stat.rttp_snd, s.stat.rttp_rcv, s.stat.rtt_smpl, + s.stat.rdv_snd, s.stat.rdv_rcv, + s.stat.ka_snd, s.stat.ka_rcv, + s.stat.sdu_snd_frag, s.stat.sdu_snd_alloc, s.stat.sdu_snd_tx, + s.stat.frag_snd, s.stat.frag_rcv, + s.stat.sdu_reasm, s.stat.sdu_sole, s.stat.frag_drop, + s.stat.strm_snd_byte, s.stat.strm_rcv_byte, + s.stat.strm_dlv_byte, + s.stat.strm_drop, s.stat.strm_fin_drop, + s.stat.rcv_proc_ns, s.stat.tw_move_ns, + s.stat.drain_calls, + s.rx_q_now, s.tx_q_now, + s.stat.rxm_due_count, + s.stat.rxm_due_acked, s.stat.rxm_due_unowned, + s.stat.rxm_due_aged, s.stat.rxm_due_defer, + s.stat.rxm_arm_fail, + s.stat.rxm_cancel, + s.stat.rxm_tx_dead, s.stat.tx_drop, + s.stat.tx_drop_ack, s.stat.tx_drop_sack, + s.stat.tx_drop_ka, s.stat.tx_drop_rttp, + s.stat.tx_drop_nack, s.stat.tx_drop_rdv, + s.stat.tx_drop_other); + + if (written < 0) + return 0; - pthread_rwlock_unlock(&proc.lock); + if ((size_t) written >= len) + return (int) (len - 1); - return strlen(buf); + return written; } +__attribute__((cold)) static int frct_rib_readdir(char *** buf) { *buf = malloc(sizeof(**buf)); @@ -199,13 +633,14 @@ static int frct_rib_readdir(char *** buf) return -ENOMEM; } +__attribute__((cold)) static int frct_rib_getattr(const char * path, struct rib_attr * attr) { (void) path; - (void) attr; - attr->size = 1189; + /* Must be >= the sprintf output in frct_rib_read. */ + attr->size = 8192; attr->mtime = 0; return 0; @@ -220,128 +655,1172 @@ static struct rib_ops r_ops = { #endif /* PROC_FLOW_STATS */ -static bool before(uint32_t seq1, - uint32_t seq2) +static __inline__ bool before(uint32_t s1, uint32_t s2) { - return (int32_t)(seq1 - seq2) < 0; + return (int32_t)(s1 - s2) < 0; } -static bool after(uint32_t seq1, - uint32_t seq2) +static __inline__ bool after(uint32_t s1, uint32_t s2) { - return (int32_t)(seq2 - seq1) < 0; + return (int32_t)(s2 - s1) < 0; } -static void __send_frct_pkt(int fd, - uint8_t flags, - uint32_t ackno, - uint32_t rwe) +static __inline__ bool within(uint32_t seq, uint32_t lo, uint32_t hi) { - struct ssm_pk_buff * spb; - struct frct_pci * pci; - ssize_t idx; - struct flow * f; + return after(seq, lo) && !after(seq, hi); +} - /* Raw calls needed to bypass frcti. */ -#ifdef RXM_BLOCKING - idx = ssm_pool_alloc_b(proc.pool, sizeof(*pci), NULL, &spb, NULL); -#else - idx = ssm_pool_alloc(proc.pool, sizeof(*pci), NULL, &spb); -#endif - if (idx < 0) +static __inline__ bool in_window(uint32_t seq, const struct frct_cr * cr) +{ + return !before(seq, cr->lwe) && before(seq, cr->rwe); +} + +/* DRF arrival that stays within the current receive epoch. */ +static __inline__ bool same_epoch_drf(uint32_t seq, + uint16_t flags, + const struct frct_cr * cr) +{ + if (cr->lwe == cr->rwe) + return false; + + return (flags & FRCT_RXM) || in_window(seq, cr); +} + +/* + * RACK reorder window R (RFC 8985 §6.2): + * R = MIN(reo_wnd_mult * RACK.min_RTT / 4, SRTT) + * reo_wnd_mult scales on D-SACK evidence of under-tolerance (§7.2). + * Fall back to srtt when no min_rtt sample exists yet; MIN_REORDER_NS + * floor guards collapse below the timer-tick resolution. + */ +static __inline__ uint64_t rack_reorder_window(struct frcti * frcti) +{ + uint64_t mult = frcti->reo_wnd_mult > 0 ? frcti->reo_wnd_mult : 1; + uint64_t base = frcti->min_rtt > 0 ? (uint64_t) frcti->min_rtt + : (uint64_t) frcti->srtt; + uint64_t R = mult * (base / 4); + + R = MAX(R, (uint64_t) MIN_REORDER_NS); + R = MIN(R, (uint64_t) frcti->srtt); + + return R; +} + +static __inline__ int frct_spb_reserve(size_t len, + struct ssm_pk_buff ** spb) +{ + ssize_t idx = ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL); + + return idx < 0 ? (int) idx : 0; +} + +static __inline__ void frct_spb_release(struct ssm_pk_buff * spb) +{ + ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); +} + +static __inline__ void frct_spb_release_idx(size_t idx) +{ + ssm_pool_remove(proc.pool, idx); +} + +/* Fetch the spb stashed at the rq slot for seqno. */ +static __inline__ struct ssm_pk_buff * rq_frag(const struct frcti * frcti, + uint32_t seqno) +{ + return ssm_pool_get(proc.pool, frcti->rcv_slots[RQ_SLOT(seqno)].idx); +} + +static __inline__ size_t frcti_data_hdr_len(const struct frcti * frcti) +{ + return FRCT_PCILEN + (frcti->stream ? FRCT_PCI_STREAM_LEN : 0); +} + +static __inline__ size_t frcti_ctrl_hdr_len(const struct frcti * frcti) +{ + (void) frcti; + + return FRCT_PCILEN; +} + +/* + * HCS at offset 2 inside PCI. Covers flags (bytes 0..1) and + * window/seqno/ackno (bytes 4..15), plus SPCI for stream DATA. + */ +static void frct_hcs_set(struct frct_pci * pci, + bool stream) +{ + uint16_t hcs = 0; + size_t tail; + + tail = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs); + if (stream) + tail += FRCT_PCI_STREAM_LEN; + + crc16_ccitt_false(&hcs, pci, sizeof(pci->flags)); + crc16_ccitt_false(&hcs, &pci->window, tail); + + pci->hcs = hton16(hcs); +} + +static int frct_hcs_check(const struct frct_pci * pci, + const struct frcti * frcti) +{ + uint16_t hcs = 0; + uint16_t flags; + size_t tail; + + /* Untrusted flag read; mismatch on HCS will drop on corrupt. */ + flags = ntoh16(pci->flags); + + tail = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs); + if (frcti->stream && (flags & FRCT_DATA)) + tail += FRCT_PCI_STREAM_LEN; + + crc16_ccitt_false(&hcs, pci, sizeof(pci->flags)); + crc16_ccitt_false(&hcs, &pci->window, tail); + + return hcs != ntoh16(pci->hcs); +} + +/* Bump tx_drop plus the per-frame-type counter matching `flags`. */ +static void frct_tx_drop_bump(struct frcti * frcti, + uint16_t flags) +{ + STAT_BUMP(frcti, tx_drop); + + if (flags & FRCT_SACK) { + STAT_BUMP(frcti, tx_drop_sack); return; + } - pci = (struct frct_pci *) ssm_pk_buff_head(spb); - memset(pci, 0, sizeof(*pci)); + if (flags & FRCT_KA) { + STAT_BUMP(frcti, tx_drop_ka); + return; + } - *((uint32_t *) pci) = hton32(rwe); + if (flags & FRCT_RTTP) { + STAT_BUMP(frcti, tx_drop_rttp); + return; + } - pci->flags = flags; - pci->ackno = hton32(ackno); + if (flags & FRCT_NACK) { + STAT_BUMP(frcti, tx_drop_nack); + return; + } - f = &proc.flows[fd]; + if (flags & FRCT_RDVS) { + STAT_BUMP(frcti, tx_drop_rdv); + return; + } + + if (flags & FRCT_ACK) { + STAT_BUMP(frcti, tx_drop_ack); + return; + } + + STAT_BUMP(frcti, tx_drop_other); +} + +static int frct_tx(struct frcti * frcti, struct ssm_pk_buff * spb) +{ + struct flow * f = frcti_to_flow(frcti); + const struct frct_pci * pci; + const struct timespec * dl = NULL; + struct timespec now; + struct timespec intv = TIMESPEC_INIT_NS(FRCT_TX_TIMEO_NS); + struct timespec deadline; + uint16_t flags; + ssize_t idx; + int ret = -ENOMEM; + + pci = (const struct frct_pci *) ssm_pk_buff_head(spb); + flags = ntoh16(pci->flags); + + /* CRC32 covers plaintext body; PCI is in HCS. Pre-encrypt. */ + if (flags & FRCT_SACK) { + if (crc_add(spb, frcti_ctrl_hdr_len(frcti)) != 0) + goto fail; + } else if ((flags & FRCT_DATA) && f->info.qs.ber == 0) { + if (crc_add(spb, frcti_data_hdr_len(frcti)) != 0) + goto fail; + } if (spb_encrypt(f, spb) < 0) goto fail; -#ifdef RXM_BLOCKING - if (ssm_rbuff_write_b(f->tx_rb, idx, NULL)) -#else - if (ssm_rbuff_write(f->tx_rb, idx)) -#endif + idx = ssm_pk_buff_get_off(spb); + + /* DATA blocks; control times out so a full ring can't stall wheel. */ + if (!(flags & FRCT_DATA)) { + clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, &intv, &deadline); + dl = &deadline; + } + + ret = ssm_rbuff_write_b(f->tx_rb, idx, dl); + if (ret < 0) goto fail; ssm_flow_set_notify(f->set, f->info.id, FLOW_PKT); - return; + return 0; fail: - ipcp_spb_release(spb); - return; + frct_tx_drop_bump(frcti, flags); + ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); + return ret; +} + +__attribute__((cold)) +static void frct_mark_flow_down(struct frcti * frcti) +{ + struct flow * f = frcti_to_flow(frcti); + + if (f->rx_rb != NULL) + ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN); + + if (f->tx_rb != NULL) + ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN); +} + +__attribute__((cold)) +static void frct_mark_peer_dead(struct frcti * frcti) +{ + struct flow * f = frcti_to_flow(frcti); + + if (f->rx_rb != NULL) + ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWPEER); + + if (proc.fqset != NULL) + ssm_flow_set_notify(proc.fqset, f->info.id, FLOW_PEER); +} + +static __inline__ int frct_ctrl_alloc(struct ssm_pk_buff ** spb, + struct frct_pci ** pci, + size_t payload_len) +{ + if (frct_spb_reserve(FRCT_PCILEN + payload_len, spb) < 0) + return -1; + + *pci = (struct frct_pci *) ssm_pk_buff_head(*spb); + memset(*pci, 0, FRCT_PCILEN); + + return 0; +} + +/* + * Advertised rwe. Stream mode clamps to lwe + ring_seq_cap so the + * byte-equivalent fits the rx ring. Caller holds at least the rdlock. + */ +static __inline__ uint32_t frcti_advert_rwe(struct frcti * frcti) +{ + uint32_t rwe; + uint32_t cap; + + rwe = frcti->rcv_cr.rwe; + + if (!frcti->stream) + return rwe; + + cap = frcti->rcv_cr.lwe + frcti->ring_seq_cap; + + return before(cap, rwe) ? cap : rwe; +} + +static void frcti_pkt_snd(struct frcti * frcti, + uint16_t flags, + uint32_t ackno, + uint32_t rwe) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + + if (frct_ctrl_alloc(&spb, &pci, 0) < 0) + return; + + pci->flags = hton16(flags); + pci->window = hton32(rwe); + pci->ackno = hton32(ackno); + if (flags & FRCT_ACK) { + /* reuse ackno for the sequence number of delayed ACK */ + ackno = FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1); + pci->seqno = hton32(ackno + 1); + } + + frct_hcs_set(pci, false); + + frct_tx(frcti, spb); +} + +/* RTO floor scales with srtt; hard floor rto_min guards sub-ms RTT. */ +static void rtt_init(struct frcti * frcti, + time_t rtt_hint) +{ + time_t floor; + + if (rtt_hint > 0) { + rtt_hint = MAX(rtt_hint, (time_t) RTT_BOOT_NS); + frcti->srtt = rtt_hint; + frcti->mdev = rtt_hint >> 3; + floor = MAX(frcti->rto_min, 2 * frcti->srtt); + frcti->rto = MAX(floor, rtt_hint + (frcti->mdev << MDEV_MUL)); + frcti->min_rtt = rtt_hint; + } else { + /* Boot from first ACK. */ + frcti->srtt = 0; + frcti->mdev = RTT_BOOT_NS; + frcti->rto = MAX((time_t) INITIAL_RTO, frcti->rto_min); + frcti->min_rtt = 0; + } + + frcti->rto_mul = 0; +} + +/* RFC 8985 §6.2: replace min_RTT on unset, smaller sample, or expiry. */ +static __inline__ bool min_rtt_stale(struct frcti * frcti, + time_t mrtt, + uint64_t now_ns) +{ + if (frcti->min_rtt == 0) + return true; + + if (mrtt < frcti->min_rtt) + return true; + + return ts_aged_ns(now_ns, frcti->t_min_rtt, MIN_RTT_WIN_NS); +} + +/* Linux-style windowed-min refresh of RACK.min_RTT. */ +static __inline__ void min_rtt_update(struct frcti * frcti, + time_t mrtt, + uint64_t now_ns) +{ + if (!min_rtt_stale(frcti, mrtt, now_ns)) + return; + + frcti->min_rtt = mrtt; + frcti->t_min_rtt = now_ns; +} + +static void rtt_update(struct frcti * frcti, + time_t mrtt, + uint64_t now_ns) +{ + time_t srtt = frcti->srtt; + time_t rttvar = frcti->mdev; + time_t floor; + time_t rto; + + if (srtt == 0) { + srtt = mrtt; + rttvar = mrtt >> 1; + } else { + /* RFC 6298 symmetric EWMA. */ + time_t delta = mrtt - srtt; + srtt += (delta >> 3); + delta = (ABS(delta) - rttvar) >> 2; +#ifdef FRCT_LINUX_RTT_ESTIMATOR + if (delta < 0) + delta >>= 3; +#endif + rttvar += delta; + } + STAT_BUMP(frcti, rtt_smpl); + frcti->srtt = MAX(SRTT_FLOOR_NS, srtt); + frcti->mdev = MAX(MDEV_FLOOR_NS, rttvar); + + min_rtt_update(frcti, mrtt, now_ns); + + floor = MAX(frcti->rto_min, 2 * frcti->srtt); + rto = MAX(floor, frcti->srtt + (frcti->mdev << MDEV_MUL)); + + STORE_RELEASE(&frcti->rto, rto); + STORE_RELEASE(&frcti->rto_mul, 0); +} + +/* Fill probes[pos], return new probe_id; 0 on entropy failure. Wrlock. */ +static uint32_t rttp_alloc_probe(struct frcti * frcti, + uint64_t now_ns, + uint8_t nonce[RTTP_NONCE_LEN]) +{ + uint32_t probe_id; + size_t pos; + + if (random_buffer(nonce, RTTP_NONCE_LEN) < 0) + return 0; + + probe_id = frcti->probe_id_next++; + if (probe_id == 0) + probe_id = frcti->probe_id_next++; + + pos = RTTP_POS(probe_id); + frcti->probes[pos].id = probe_id; + frcti->probes[pos].ts = now_ns; + memcpy(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN); + frcti->t_snd_probe = now_ns; + + STAT_BUMP(frcti, rttp_snd); + + return probe_id; +} + +/* Caller wrlock; out args valid on true (caller emits post-unlock). */ +static bool rtt_probe_arm(struct frcti * frcti, + uint64_t now_ns, + uint32_t * probe_id, + uint8_t nonce[RTTP_NONCE_LEN]) +{ + if (frcti->srtt == 0) + return false; + + if (!after(frcti->snd_cr.seqno, frcti->snd_cr.lwe)) + return false; + + if (!ts_aged_ns(now_ns, frcti->t_rcv_rtt, + 2u * (uint64_t) frcti->srtt)) + return false; + + if (!ts_aged_ns(now_ns, frcti->t_snd_probe, + (uint64_t) frcti->srtt)) + return false; + + *probe_id = rttp_alloc_probe(frcti, now_ns, nonce); + + return *probe_id != 0; +} + +static void frcti_rttp_snd(struct frcti * frcti, + uint32_t probe_id, + uint32_t echo_id, + const uint8_t * nonce) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + struct frct_rttp * rttp; + + if (frct_ctrl_alloc(&spb, &pci, RTTP_PAYLOAD) < 0) + return; + + pci->flags = hton16(FRCT_RTTP); + + frct_hcs_set(pci, false); + + rttp = (struct frct_rttp *) FRCT_BODY(pci); + rttp->probe_id = hton32(probe_id); + rttp->echo_id = hton32(echo_id); + memcpy(rttp->nonce, nonce, sizeof(rttp->nonce)); + + frct_tx(frcti, spb); +} + +struct rxm_entry { + struct tw_entry tw; + struct list_head next; /* in frcti->rxm_list */ + struct frcti * frcti; + uint32_t seqno; + uint64_t t0; + size_t len; + uint8_t pkt[]; /* flexible — sized at alloc time */ +}; + +static struct rxm_entry * rxm_entry_create(struct frcti * frcti, + uint32_t seqno, + const struct ssm_pk_buff * spb) +{ + struct rxm_entry * r; + struct timespec now; + size_t len = ssm_pk_buff_len(spb); + + r = malloc(sizeof(*r) + len); + if (r == NULL) { + STAT_BUMP(frcti, rxm_arm_fail); + return NULL; + } + + memcpy(r->pkt, ssm_pk_buff_head(spb), len); + r->len = len; + r->frcti = frcti; + r->seqno = seqno; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + r->t0 = TS_TO_UINT64(now); + + tw_init_entry(&r->tw); + + return r; +} + +static void rxm_entry_destroy(struct rxm_entry * r) +{ + free(r); +} + +static bool rxm_still_owned(struct frcti * frcti, + size_t pos, + struct rxm_entry * r) +{ + return LOAD_ACQUIRE(&frcti->snd_slots[pos].rxm) == r; +} + +/* + * All in-flight slots share the HoL backoff; otherwise non-HoL timers + * cycle at base RTO and storm the wire while HoL is still backing off. + */ +static uint64_t rxm_next_deadline(struct frcti * frcti, + uint64_t now_ns) +{ + time_t rto = LOAD_RELAXED(&frcti->rto); + uint8_t rto_mul = LOAD_RELAXED(&frcti->rto_mul); + + return now_ns + ((uint64_t) rto << rto_mul); +} + +/* Copy pkt, set FRCT_RXM, refresh ackno, re-seal HCS. */ +static struct ssm_pk_buff * rxm_pkt_prepare(const void * pkt, + size_t len, + uint32_t rcv_lwe, + bool stream) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + uint16_t flags; + + if (frct_spb_reserve(len, &spb) < 0) + return NULL; + + pci = (struct frct_pci *) ssm_pk_buff_head(spb); + memcpy(pci, pkt, len); + + flags = ntoh16(pci->flags) | FRCT_RXM; + pci->flags = hton16(flags); + pci->ackno = hton32(rcv_lwe); + + frct_hcs_set(pci, stream); + + return spb; +} + +/* Caller must NOT hold frcti->lock. */ +static void rxm_snd(struct frcti * frcti, + uint32_t seqno, + const void * pkt, + size_t len) +{ + struct ssm_pk_buff * spb; + struct timespec now; + struct snd_slot * slot; + uint32_t snd_lwe; + uint32_t rcv_lwe; + size_t pos; + int ret; + + snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe); + rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + pthread_rwlock_wrlock(&frcti->lock); + + pos = RQ_SLOT(seqno); + slot = &frcti->snd_slots[pos]; + + slot->time = TS_TO_UINT64(now); + /* RTO supersedes any pending TLP/fast-rxm on this slot. */ + slot->flags = (slot->flags & ~(SND_FAST_RXM | SND_TLP)) | SND_RTX; + /* §7.3: RTO supersedes TLP probes and ends the probe episode. */ + frcti->tlp_high_seq = 0; + frcti->tlp_count = 0; + + frcti->rtt_lwe = seqno + 1; + + /* Only the HoL retransmit bumps the global RTO backoff. */ + if (seqno == snd_lwe && frcti->rto_mul < MAX_RTO_MUL) + STORE_RELEASE(&frcti->rto_mul, frcti->rto_mul + 1); + + /* RFC 8985 §7.2 step 4: RTO on HoL resets RACK reo scaling. */ + if (seqno == snd_lwe) + frcti->reo_wnd_mult = 1; + + pthread_rwlock_unlock(&frcti->lock); + + STAT_BUMP(frcti, rxm_rto); + + spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream); + if (spb == NULL) + return; + + /* ETIMEDOUT/ENOMEM: let r-timer drive teardown. */ + ret = frct_tx(frcti, spb); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); } -static void send_frct_pkt(struct frcti * frcti) +static void rxm_due(void * arg) +{ + struct rxm_entry * r = arg; + struct frcti * frcti = r->frcti; + struct timespec now; + uint64_t now_ns; + uint32_t snd_lwe; + size_t pos = RQ_SLOT(r->seqno); + + STAT_BUMP(frcti, rxm_due_count); + + snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe); + + /* Already ACK'd: expected for the steady-state majority. */ + if (before(r->seqno, snd_lwe)) { + STAT_BUMP(frcti, rxm_due_acked); + goto cleanup; + } + + /* SACK/RACK-cleared the slot (caller NULL'd snd_slots[pos].rxm). */ + if (!rxm_still_owned(frcti, pos, r)) { + STAT_BUMP(frcti, rxm_due_unowned); + goto cleanup; + } + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + /* R-timer expired: peer unreachable. */ + if (RXM_AGED_OUT(r->t0, now_ns, frcti->t_r)) { + STAT_BUMP(frcti, rxm_due_aged); + frct_mark_flow_down(frcti); + goto cleanup; + } + + /* HoL-only retx; defer at base rto so HoL transitions react. */ + if (r->seqno != snd_lwe) { + STAT_BUMP(frcti, rxm_due_defer); + tw_post(&r->tw, now_ns + LOAD_RELAXED(&frcti->rto), + rxm_due, r); + return; + } + + rxm_snd(frcti, r->seqno, r->pkt, r->len); + + /* Re-check ownership: fire path may have replaced our entry. */ + if (rxm_still_owned(frcti, pos, r)) { + uint64_t anchor; + + /* Per-slot anchor breaks co-fire re-bin. */ + anchor = frcti->snd_slots[pos].time; + tw_post(&r->tw, rxm_next_deadline(frcti, anchor), rxm_due, r); + return; + } + + cleanup: + pthread_rwlock_wrlock(&frcti->lock); + + if (rxm_still_owned(frcti, pos, r)) + STORE_RELEASE(&frcti->snd_slots[pos].rxm, NULL); + + list_del(&r->next); + + pthread_rwlock_unlock(&frcti->lock); + + rxm_entry_destroy(r); +} + +static int rxm_arm(struct frcti * frcti, + uint32_t seqno, + const struct ssm_pk_buff * spb) +{ + struct rxm_entry * r; + time_t rto; + uint8_t rto_mul; + uint64_t deadline; + + r = rxm_entry_create(frcti, seqno, spb); + if (r == NULL) + return -ENOMEM; + + rto = LOAD_RELAXED(&frcti->rto); + rto_mul = LOAD_RELAXED(&frcti->rto_mul); + deadline = r->t0 + ((uint64_t) rto << rto_mul); + + pthread_rwlock_wrlock(&frcti->lock); + + list_add_tail(&r->next, &frcti->rxm_list); + STORE_RELEASE(&frcti->snd_slots[RQ_SLOT(seqno)].rxm, r); + + pthread_rwlock_unlock(&frcti->lock); + + tw_post(&r->tw, deadline, rxm_due, r); + + return 0; +} + +static void rxm_cancel_all(struct frcti * frcti) +{ + struct list_head * p; + struct list_head * t; + + list_for_each_safe(p, t, &frcti->rxm_list) { + struct rxm_entry * r = list_entry(p, struct rxm_entry, next); + list_del(&r->next); + tw_cancel(&r->tw); + rxm_entry_destroy(r); + STAT_BUMP(frcti, rxm_cancel); + } +} + +static __inline__ void sack_block_put(uint8_t * payload, + uint16_t i, + uint32_t s, + uint32_t e) +{ + uint32_t * blk = (uint32_t *) + (payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE); + + blk[0] = hton32(s); + blk[1] = hton32(e); +} + +static __inline__ void sack_block_get(const uint8_t * payload, + uint16_t i, + uint32_t * s, + uint32_t * e) +{ + const uint32_t * blk = (const uint32_t *) + (payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE); + + *s = ntoh32(blk[0]); + *e = ntoh32(blk[1]); +} + +/* + * Build SACK blocks for ranges *above* rcv_cr.lwe. Wire invariant + * (see doc/frct.txt §1.3): every block produced here satisfies + * blocks[i].start > rcv_cr.lwe = ackno, which makes the "first block + * below ackno" convention used to mark a D-SACK (RFC 2883 §4 case 1) + * unambiguous. Caller holds frcti->lock. + */ +static uint16_t sack_blocks_build(struct frcti * frcti, + uint32_t blocks[][2], + uint16_t max_n) +{ + const struct rcv_slot * slots = frcti->rcv_slots; + uint32_t s; + uint32_t end; + uint16_t n = 0; + + s = frcti->rcv_cr.lwe + 1; + end = frcti->rcv_cr.lwe + RQ_SIZE; + if (after(end, frcti->rcv_cr.rwe)) + end = frcti->rcv_cr.rwe; + + while (before(s, end) && n < max_n) { + while (before(s, end) && slots[RQ_SLOT(s)].idx == -1) + ++s; + + if (!before(s, end)) + break; + + blocks[n][0] = s; + while (before(s, end) && slots[RQ_SLOT(s)].idx != -1) + ++s; + blocks[n][1] = s; + ++n; + } + + return n; +} + +/* + * Prepend the pending D-SACK report (if any) as block[0]; clear flag. + * Returns the number of slots consumed at the head (0 or 1). Caller + * holds wrlock. + */ +static __inline__ uint16_t dsack_consume(struct frcti * frcti, + uint32_t blocks[][2]) +{ + if (!frcti->dsack_valid || frcti->sack_n_max == 0) + return 0; + + blocks[0][0] = frcti->dsack_seqno; + blocks[0][1] = frcti->dsack_seqno + 1; + frcti->dsack_valid = false; + return 1; +} + +/* Caller must NOT hold frcti->lock. */ +static void frcti_sack_snd(struct frcti * frcti, + const struct sack_args * sa) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + buffer_t buf; + uint16_t i; + + assert(sa->n <= SACK_MAX_BLOCKS); + + buf.len = SACK_HDR_SIZE + sa->n * SACK_BLOCK_SIZE; + + if (frct_ctrl_alloc(&spb, &pci, buf.len) < 0) + return; + + pci->flags = hton16(FRCT_ACK | FRCT_FC | FRCT_SACK); + pci->window = hton32(sa->rwe); + pci->ackno = hton32(sa->ack); + pci->seqno = hton32(FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1) + 1); + + frct_hcs_set(pci, false); + + buf.data = FRCT_BODY(pci); + memset(buf.data, 0, SACK_HDR_SIZE); + *(uint16_t *) buf.data = hton16(sa->n); + for (i = 0; i < sa->n; ++i) + sack_block_put(buf.data, i, sa->blocks[i][0], sa->blocks[i][1]); + + frct_tx(frcti, spb); +} + +static void ack_snd(struct frcti * frcti, + bool with_sack) { struct timespec now; + uint64_t now_ns; time_t diff; uint32_t ackno; uint32_t rwe; - int fd; + struct sack_args * sa = NULL; + size_t sa_sz; + bool sacking = false; assert(frcti); + STAT_BUMP(frcti, ack_fire); + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + if (with_sack && frcti->sack_n_max > 0) { + sa_sz = sizeof(*sa) + frcti->sack_n_max * sizeof(sa->blocks[0]); + sa = malloc(sa_sz); + /* If alloc fails, fall through and send a bare cum-ACK. */ + } pthread_rwlock_wrlock(&frcti->lock); - if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)) { + /* D-SACK rides through cum-ACK freshness; signal is the duplicate. */ + if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno) + && !frcti->dsack_valid) { pthread_rwlock_unlock(&frcti->lock); - return; + STAT_BUMP(frcti, ack_supp_seqno); + goto out; } - fd = frcti->fd; ackno = frcti->rcv_cr.lwe; - rwe = frcti->rcv_cr.rwe; + rwe = frcti_advert_rwe(frcti); - diff = ts_diff_ns(&now, &frcti->rcv_cr.act); - if (diff > frcti->a) { + if (ACK_AGED_OUT(frcti->rcv_cr.act, now_ns, frcti->t_a)) { pthread_rwlock_unlock(&frcti->lock); - return; + STAT_BUMP(frcti, ack_supp_inact); + goto out; } - diff = ts_diff_ns(&now, &frcti->snd_cr.act); - if (diff < TICTIME) { + diff = (time_t) ts_age_ns(now_ns, frcti->snd_cr.act); + if (diff < TICTIME && !frcti->dsack_valid) { pthread_rwlock_unlock(&frcti->lock); - return; + STAT_BUMP(frcti, ack_supp_rate); + goto out; } + /* RFC 2018: piggyback SACK on timer ACK; dedup unchanged board. */ + if (sa == NULL || (frcti->sack_n == 0 && !frcti->dsack_valid)) + goto no_sack; + + sa->dsack = false; + sa->n = dsack_consume(frcti, sa->blocks); + if (sa->n == 1) + sa->dsack = true; + + sa->n += sack_blocks_build(frcti, sa->blocks + sa->n, + frcti->sack_n_max - sa->n); + if (sa->n == 0) + goto no_sack; + + if (!sa->dsack && ackno == frcti->sack_lwe && sa->n == frcti->sack_n) + goto no_sack; + + sa->ack = ackno; + sa->rwe = rwe; + frcti->sack_lwe = ackno; + frcti->sack_n = sa->n; + frcti->t_snd_sack = now_ns; + sacking = true; + + no_sack: frcti->rcv_cr.seqno = frcti->rcv_cr.lwe; pthread_rwlock_unlock(&frcti->lock); - __send_frct_pkt(fd, FRCT_ACK | FRCT_FC, ackno, rwe); + STAT_BUMP(frcti, ack_snd); + + if (sacking) { + STAT_BUMP(frcti, sack_snd); + if (sa->dsack) + STAT_BUMP(frcti, dsack_snd); + frcti_sack_snd(frcti, sa); + } else { + frcti_pkt_snd(frcti, FRCT_ACK | FRCT_FC, ackno, rwe); + } + + out: + free(sa); } -static void __send_rdv(int fd) +/* Delayed-ACK timer: per-flow, dedup'd via atomic test-and-set. */ +static void ack_due(void * arg) { - __send_frct_pkt(fd, FRCT_RDVS, 0, 0); + struct frcti * frcti = arg; + + __atomic_clear(&frcti->ack_pending, __ATOMIC_RELAXED); + + ack_snd(frcti, true); } -static struct frcti * frcti_create(int fd, - time_t a, - time_t r, - time_t mpl) +static int ack_arm(struct frcti * frcti) { - struct frcti * frcti; - ssize_t idx; - struct timespec now; - pthread_condattr_t cattr; + struct timespec now; + uint64_t deadline; + + if (__atomic_test_and_set(&frcti->ack_pending, __ATOMIC_RELAXED)) + return 0; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + deadline = TS_TO_UINT64(now) + ACK_DELAY_NS; + + tw_post(&frcti->ack_tw, deadline, ack_due, frcti); + + return 0; +} + +/* Forward decl breaks the keepalive cycle: ka_arm <-> ka_due. */ +static void ka_due(void * arg); + +static int ka_arm(struct frcti * frcti) +{ + struct timespec now; + uint64_t now_ns; + uint64_t timeo_ns; + uint64_t snd_ns; + uint64_t rcv_ns; + uint64_t deadline; + + timeo_ns = (uint64_t) frcti->qs_timeout * MILLION; /* IMM */ + snd_ns = LOAD_RELAXED(&frcti->snd_cr.act) + timeo_ns / 4; + rcv_ns = LOAD_RELAXED(&frcti->rcv_cr.act) + timeo_ns; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + deadline = MIN(snd_ns, rcv_ns); + if (deadline <= now_ns) + deadline = now_ns + timeo_ns / 4; + + tw_post(&frcti->ka_tw, deadline, ka_due, frcti); + + return 0; +} + +__attribute__((cold)) +static void ka_snd(struct frcti * frcti) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + struct timespec now; + uint64_t now_ns; + time_t timeo_ns; + uint64_t rcv_act; + uint64_t ka_rcv; + int64_t rcv_idle; + int64_t snd_idle; + uint32_t ackno; + + assert(frcti); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + timeo_ns = (time_t)(frcti->qs_timeout) * MILLION; /* IMM */ + rcv_act = LOAD_RELAXED(&frcti->rcv_cr.act); + ka_rcv = LOAD_RELAXED(&frcti->t_ka_rcv); + rcv_idle = ts_age_ns(now_ns, rcv_act > ka_rcv ? rcv_act : ka_rcv); + snd_idle = ts_age_ns(now_ns, LOAD_RELAXED(&frcti->snd_cr.act)); + + if (rcv_idle > timeo_ns) { + frct_mark_peer_dead(frcti); + return; + } + + if (snd_idle <= timeo_ns / 4) { + ka_arm(frcti); + return; + } + + if (frct_ctrl_alloc(&spb, &pci, 0) < 0) { + ka_arm(frcti); + return; + } + + ackno = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + pci->flags = hton16(FRCT_KA | FRCT_ACK); + pci->ackno = hton32(ackno); + + frct_hcs_set(pci, false); + + STAT_BUMP(frcti, ka_snd); + frct_tx(frcti, spb); + + ka_arm(frcti); +} + +/* Keepalive timer: re-posted by the fire callback itself. */ +static void ka_due(void * arg) +{ + ka_snd((struct frcti *) arg); +} + +static void frcti_rdv_snd(struct frcti * frcti) +{ + frcti_pkt_snd(frcti, FRCT_RDVS, 0, 0); +} + +#define HAS_RESCNTL(cr) ((cr)->cflags & FRCTFRESCNTL) +static bool frcti_is_window_open(struct frcti * frcti) +{ + struct frct_cr * snd_cr = &frcti->snd_cr; + struct timespec now; + time_t diff; + bool ret = false; + + if (!HAS_RESCNTL(snd_cr)) + return true; + + if (before(snd_cr->seqno, LOAD_RELAXED(&snd_cr->rwe))) + return true; + + /* Window may be closed; wrlock for RDV state mutations. */ + pthread_rwlock_wrlock(&frcti->lock); + + if (before(snd_cr->seqno, snd_cr->rwe)) { + ret = true; + goto unlock; + } + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + if (frcti->open) { + frcti->open = false; + frcti->t_wnd = now; + frcti->t_last_rdv = now; + goto unlock; + } + + diff = ts_diff_ns(&now, &frcti->t_wnd); + if (diff > MAX_RDV) + goto unlock; + + diff = ts_diff_ns(&now, &frcti->t_last_rdv); + if (diff > (time_t) frcti->t_rdv) { + frcti->t_last_rdv = now; + frcti_rdv_snd(frcti); + STAT_BUMP(frcti, rdv_snd); + } + unlock: + pthread_rwlock_unlock(&frcti->lock); + + return ret; +} + +/* n contiguous seqnos free? No RDV: the n=1 path drives it. */ +static bool frcti_is_window_open_n(struct frcti * frcti, + size_t n) +{ + struct frct_cr * snd_cr = &frcti->snd_cr; + + if (!HAS_RESCNTL(snd_cr)) + return true; + + if (n <= 1) + return frcti_is_window_open(frcti); + + return before(snd_cr->seqno + (uint32_t)(n - 1), + LOAD_RELAXED(&snd_cr->rwe)); +} + +static void release_rq(struct frcti * frcti) +{ + size_t i; + + for (i = 0; i < RQ_SIZE; ++i) { + if (frcti->rcv_slots[i].idx == -1) + continue; + + /* Stream rq entries are sentinels (no spb owned). */ + if (!frcti->stream) + frct_spb_release_idx(frcti->rcv_slots[i].idx); + + frcti->rcv_slots[i].idx = -1; + STAT_BUMP(frcti, rq_released); + } +} + +static __inline__ bool stream_ring_sz_ok(struct frcti * frcti, + size_t n) +{ + size_t per_pkt; + + if (n > FRCT_STREAM_RING_SZ_MAX) + return false; + + if ((n & (n - 1)) != 0) + return false; + + per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti); + + return n >= FRCT_STREAM_RING_MIN_PKTS * per_pkt; +} + +/* Default ring sized for full RQ_SIZE seqno window; pow2, capped. */ +static size_t default_stream_ring_sz(size_t per_pkt) +{ + size_t need; + size_t sz; + + need = (size_t) RQ_SIZE * per_pkt; + sz = FRCT_STREAM_RING_SZ; + + while (sz < need && sz < FRCT_STREAM_RING_SZ_MAX) + sz <<= 1; + + return sz; +} + +struct frcti * frcti_create(int fd, + uint64_t a, + uint64_t r, + uint64_t mpl, + time_t rtt_hint, + qosspec_t qs, + uint32_t mtu) +{ + struct frcti * frcti; + ssize_t idx; + struct timespec now; + uint64_t now_ns; + size_t bb; + size_t per_pkt; #ifdef PROC_FLOW_STATS - char frctstr[FRCT_NAME_STRLEN + 1]; + char frctstr[FRCT_NAME_STRLEN + 1]; #endif - mpl *= MILLION; - a *= BILLION; - r *= BILLION; + mpl *= MILLION; /* ms -> ns */ + a *= MILLION; /* ms -> ns */ + r *= MILLION; /* ms -> ns */ frcti = malloc(sizeof(*frcti)); if (frcti == NULL) @@ -349,56 +1828,76 @@ static struct frcti * frcti_create(int fd, memset(frcti, 0, sizeof(*frcti)); + list_head_init(&frcti->rxm_list); + if (pthread_rwlock_init(&frcti->lock, NULL)) goto fail_lock; - if (pthread_mutex_init(&frcti->mtx, NULL)) - goto fail_mutex; - - if (pthread_condattr_init(&cattr)) - goto fail_cattr; -#ifndef __APPLE__ - pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK); -#endif - if (pthread_cond_init(&frcti->cond, &cattr)) - goto fail_cond; - #ifdef PROC_FLOW_STATS sprintf(frctstr, "%d", fd); if (rib_reg(frctstr, &r_ops)) goto fail_rib_reg; #endif - pthread_condattr_destroy(&cattr); for (idx = 0; idx < RQ_SIZE; ++idx) - frcti->rq[idx] = -1; + frcti->rcv_slots[idx].idx = -1; clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + frcti->t_mpl = mpl; + frcti->t_a = a; + frcti->t_r = r; + frcti->t_rdv = DELT_RDV; + frcti->fd = fd; + frcti->ber = (time_t) qs.ber; + frcti->lossy = (qs.loss != 0); + frcti->qs_timeout = (time_t) qs.timeout; + + frcti->frag_mtu = (size_t) mtu; + + /* Cap blocks per SACK at what fits in the per-flow frag_mtu. */ + bb = (frcti->frag_mtu - FRCT_PCILEN - SACK_HDR_SIZE) + / SACK_BLOCK_SIZE; + if (bb > SACK_MAX_BLOCKS) + bb = SACK_MAX_BLOCKS; + frcti->sack_n_max = (uint16_t) bb; + + frcti->max_rcv_sdu = FRCT_MAX_SDU; + + frcti->stream = (qs.service == SVC_STREAM); + if (frcti->stream) { + per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti); + frcti->rcv_ring_sz = default_stream_ring_sz(per_pkt); + frcti->ring_seq_cap = + (uint32_t) (frcti->rcv_ring_sz / per_pkt); + } - frcti->mpl = mpl; - frcti->a = a; - frcti->r = r; - frcti->rdv = DELT_RDV; - frcti->fd = fd; - - - frcti->rttseq = 0; - frcti->probe = false; - - frcti->srtt = 0; /* Updated on first ACK */ - frcti->mdev = 10 * MILLION; /* Updated on first ACK */ - frcti->rto = BILLION; /* Initial rxm will be after 1 s */ -#ifdef PROC_FLOW_STATS - frcti->n_rtx = 0; - frcti->n_prb = 0; - frcti->n_rtt = 0; - frcti->n_dup = 0; - frcti->n_dak = 0; - frcti->n_rdv = 0; - frcti->n_out = 0; - frcti->n_rqo = 0; -#endif - if (proc.flows[fd].info.qs.loss == 0) { + frcti->rto_min = (time_t) MAX(RTO_MIN, 1ULL << RXMQ_RES); + rtt_init(frcti, rtt_hint); + frcti->t_min_rtt = now_ns; + frcti->probe_id_next = 1; + frcti->t_rcv_rtt = now_ns; + frcti->t_snd_probe = now_ns; + frcti->t_snd_sack = 0; + frcti->sack_lwe = 0; + frcti->sack_n = 0; + frcti->dsack_seqno = 0; + frcti->dsack_valid = false; + frcti->reo_wnd_mult = 1; + frcti->dsack_lwe_snap = 0; + frcti->t_last_reo_widen = 0; + /* So the first pre-DRF NACK fires without waiting cooldown. */ + frcti->t_nack = now_ns - BILLION; + frcti->in_recovery = false; + frcti->recovery_high = 0; + frcti->rack_fired_lwe = 0; + + tw_init_entry(&frcti->ack_tw); + tw_init_entry(&frcti->ka_tw); + tw_init_entry(&frcti->tlp_tw); + + if (!frcti->lossy) { frcti->snd_cr.cflags |= FRCTFRTX | FRCTFLINGER; frcti->rcv_cr.cflags |= FRCTFRTX; } @@ -406,24 +1905,31 @@ static struct frcti * frcti_create(int fd, frcti->snd_cr.cflags |= FRCTFRESCNTL; frcti->snd_cr.rwe = START_WINDOW; + if (frcti->lossy) + frcti->snd_cr.rwe = RQ_SIZE; - frcti->snd_cr.inact = (3 * mpl + a + r) / BILLION + 1; /* s */ - frcti->snd_cr.act.tv_sec = now.tv_sec - (frcti->snd_cr.inact + 1); + frcti->snd_cr.inact = 3 * mpl + a + r + BILLION; /* ns */ + frcti->snd_cr.act = now_ns - frcti->snd_cr.inact - BILLION; - frcti->rcv_cr.inact = (2 * mpl + a + r) / BILLION + 1; /* s */ - frcti->rcv_cr.act.tv_sec = now.tv_sec - (frcti->rcv_cr.inact + 1); + frcti->rcv_cr.inact = 2 * mpl + a + r + BILLION; /* ns */ + frcti->rcv_cr.act = now_ns - frcti->rcv_cr.inact - BILLION; + + frcti->t_ka_rcv = now_ns; + + /* qs_timeout == 0: no KA, silent peer crash goes undetected. */ + if (frcti->qs_timeout > 0) { + if (ka_arm(frcti) < 0) + goto fail_ka_arm; + } return frcti; + fail_ka_arm: #ifdef PROC_FLOW_STATS + sprintf(frctstr, "%d", fd); + rib_unreg(frctstr); fail_rib_reg: - pthread_cond_destroy(&frcti->cond); #endif - fail_cond: - pthread_condattr_destroy(&cattr); - fail_cattr: - pthread_mutex_destroy(&frcti->mtx); - fail_mutex: pthread_rwlock_destroy(&frcti->lock); fail_lock: free(frcti); @@ -431,21 +1937,55 @@ static struct frcti * frcti_create(int fd, return NULL; } -static void frcti_destroy(struct frcti * frcti) +void frcti_destroy(struct frcti * frcti) { #ifdef PROC_FLOW_STATS char frctstr[FRCT_NAME_STRLEN + 1]; +#endif + /* Drop every wheel entry referencing frcti before freeing it. */ + rxm_cancel_all(frcti); + tw_cancel(&frcti->ack_tw); + tw_cancel(&frcti->ka_tw); + tw_cancel(&frcti->tlp_tw); + +#if defined(PROC_FLOW_STATS) && defined(FRCT_DEBUG_STDOUT) + printf("[FRCT teardown] pid=%d fd=%d " + "sdu_snd=%zu sdu_reasm=%zu sdu_sole=%zu " + "frag_snd=%zu frag_rcv=%zu frag_drop=%zu " + "rxm_rto=%zu rxm_sack=%zu rxm_dup=%zu " + "rxm_due=%zu acked=%zu unowned=%zu aged=%zu defer=%zu " + "cancel=%zu arm_fail=%zu inflight=%u " + "nack_snd=%zu nack_rcv=%zu inact_drop=%zu " + "drf_rebase=%zu rq_released=%zu\n", + (int) getpid(), frcti->fd, + frcti->stat.sdu_snd_frag, frcti->stat.sdu_reasm, + frcti->stat.sdu_sole, + frcti->stat.frag_snd, frcti->stat.frag_rcv, + frcti->stat.frag_drop, + frcti->stat.rxm_rto, frcti->stat.rxm_sack, + frcti->stat.rxm_dupthresh, + frcti->stat.rxm_due_count, frcti->stat.rxm_due_acked, + frcti->stat.rxm_due_unowned, frcti->stat.rxm_due_aged, + frcti->stat.rxm_due_defer, + frcti->stat.rxm_cancel, frcti->stat.rxm_arm_fail, + frcti->snd_cr.seqno - frcti->snd_cr.lwe, + frcti->stat.nack_snd, frcti->stat.nack_rcv, + frcti->stat.inact_drop, + frcti->stat.drf_rebase, frcti->stat.rq_released); +#endif + + release_rq(frcti); + free(frcti->rcv_ring); +#ifdef PROC_FLOW_STATS sprintf(frctstr, "%d", frcti->fd); rib_unreg(frctstr); #endif - pthread_cond_destroy(&frcti->cond); - pthread_mutex_destroy(&frcti->mtx); pthread_rwlock_destroy(&frcti->lock); free(frcti); } -static uint16_t frcti_getflags(struct frcti * frcti) +uint16_t frcti_getflags(struct frcti * frcti) { uint16_t ret; @@ -453,89 +1993,91 @@ static uint16_t frcti_getflags(struct frcti * frcti) pthread_rwlock_rdlock(&frcti->lock); - ret = frcti->snd_cr.cflags; + ret = frcti->snd_cr.cflags & FRCTFMASK; pthread_rwlock_unlock(&frcti->lock); return ret; } -static void frcti_setflags(struct frcti * frcti, - uint16_t flags) +void frcti_setflags(struct frcti * frcti, + uint16_t flags) { - flags |= FRCTFRTX; /* Should not be set by command */ - assert(frcti); - pthread_rwlock_wrlock(&frcti->lock); + flags &= FRCTFSETMASK; - frcti->snd_cr.cflags &= FRCTFRTX; /* Zero other flags */ + pthread_rwlock_wrlock(&frcti->lock); - frcti->snd_cr.cflags &= flags; + frcti->snd_cr.cflags = (frcti->snd_cr.cflags & ~FRCTFSETMASK) | flags; pthread_rwlock_unlock(&frcti->lock); } -#define frcti_queued_pdu(frcti) \ - (frcti == NULL ? idx : __frcti_queued_pdu(frcti)) +size_t frcti_get_max_rcv_sdu(struct frcti * frcti) +{ + size_t ret; -#define frcti_snd(frcti, spb) \ - (frcti == NULL ? 0 : __frcti_snd(frcti, spb)) + assert(frcti); -#define frcti_rcv(frcti, spb) \ - (frcti == NULL ? 0 : __frcti_rcv(frcti, spb)) + pthread_rwlock_rdlock(&frcti->lock); + ret = frcti->max_rcv_sdu; + pthread_rwlock_unlock(&frcti->lock); -#define frcti_dealloc(frcti) \ - (frcti == NULL ? 0 : __frcti_dealloc(frcti)) + return ret; +} -#define frcti_is_window_open(frcti) \ - (frcti == NULL ? true : __frcti_is_window_open(frcti)) +int frcti_set_max_rcv_sdu(struct frcti * frcti, + size_t max) +{ + assert(frcti); -#define frcti_window_wait(frcti, abstime) \ - (frcti == NULL ? 0 : __frcti_window_wait(frcti, abstime)) + if (max == 0) + return -EINVAL; + pthread_rwlock_wrlock(&frcti->lock); + frcti->max_rcv_sdu = max; + pthread_rwlock_unlock(&frcti->lock); -static bool __frcti_is_window_open(struct frcti * frcti) + return 0; +} + +size_t frcti_get_rcv_ring_sz(struct frcti * frcti) { - struct frct_cr * snd_cr = &frcti->snd_cr; - bool ret = true; + size_t ret; + + assert(frcti); pthread_rwlock_rdlock(&frcti->lock); + ret = frcti->rcv_ring_sz; + pthread_rwlock_unlock(&frcti->lock); + + return ret; +} - if (snd_cr->cflags & FRCTFRESCNTL) - ret = before(snd_cr->seqno, snd_cr->rwe); +/* Set before any stream byte has been delivered; -EBUSY otherwise. */ +int frcti_set_rcv_ring_sz(struct frcti * frcti, + size_t n) +{ + int ret = 0; + size_t per_pkt; - if (!ret) { - struct timespec now; + assert(frcti); - clock_gettime(PTHREAD_COND_CLOCK, &now); + if (!frcti->stream) + return -ENOTSUP; + if (!stream_ring_sz_ok(frcti, n)) + return -EINVAL; - pthread_mutex_lock(&frcti->mtx); - if (frcti->open) { - frcti->open = false; - frcti->t_wnd = now; - frcti->t_rdvs = now; - } else { - time_t diff; - diff = ts_diff_ns(&now, &frcti->t_wnd); - if (diff > MAX_RDV) { - pthread_mutex_unlock(&frcti->mtx); - pthread_rwlock_unlock(&frcti->lock); - return false; - } - - diff = ts_diff_ns(&now, &frcti->t_rdvs); - if (diff > frcti->rdv) { - frcti->t_rdvs = now; - __send_rdv(frcti->fd); -#ifdef PROC_FLOW_STATS - frcti->n_rdv++; -#endif + per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti); - } - } + pthread_rwlock_wrlock(&frcti->lock); - pthread_mutex_unlock(&frcti->mtx); + if (frcti->rcv_ring != NULL) { + ret = -EBUSY; + } else { + frcti->rcv_ring_sz = n; + frcti->ring_seq_cap = (uint32_t) (n / per_pkt); } pthread_rwlock_unlock(&frcti->lock); @@ -543,392 +2085,2101 @@ static bool __frcti_is_window_open(struct frcti * frcti) return ret; } -static int __frcti_window_wait(struct frcti * frcti, - struct timespec * abstime) +time_t frcti_get_rto_min(struct frcti * frcti) { - struct frct_cr * snd_cr = &frcti->snd_cr; - int ret = 0; + time_t v; + + assert(frcti); pthread_rwlock_rdlock(&frcti->lock); + v = frcti->rto_min; + pthread_rwlock_unlock(&frcti->lock); - if (!(snd_cr->cflags & FRCTFRESCNTL)) { - pthread_rwlock_unlock(&frcti->lock); + return v; +} + +/* Floor at the timer-wheel resolution; finer granularity is unrepresentable. */ +int frcti_set_rto_min(struct frcti * frcti, + time_t rto_min) +{ + time_t floor = (time_t) (1ULL << RXMQ_RES); + time_t rto_floor; + time_t rto; + + assert(frcti); + + if (rto_min < floor) + return -EINVAL; + + pthread_rwlock_wrlock(&frcti->lock); + + frcti->rto_min = rto_min; + if (frcti->srtt > 0) { + rto_floor = MAX(rto_min, 2 * frcti->srtt); + rto = MAX(rto_floor, + frcti->srtt + (frcti->mdev << MDEV_MUL)); + STORE_RELEASE(&frcti->rto, rto); + } else if (frcti->rto < rto_min) { + STORE_RELEASE(&frcti->rto, rto_min); + } + + pthread_rwlock_unlock(&frcti->lock); + + return 0; +} + +/* Re-arm a fresh rxm so a lost fast-retx still recovers via RTO. */ +static void sack_rxm_snd(struct frcti * frcti, + void * pkt, + size_t len) +{ + struct ssm_pk_buff * spb; + const struct frct_pci * pci; + uint32_t rcv_lwe; + uint32_t seqno; + int ret; + + rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream); + if (spb == NULL) + return; + + pci = (const struct frct_pci *) ssm_pk_buff_head(spb); + seqno = ntoh32(pci->seqno); + + /* Register fresh rxm before send; old entry self-cleans. */ + if (rxm_arm(frcti, seqno, spb) < 0) { + frct_spb_release(spb); + return; + } + + STAT_BUMP(frcti, rxm_sack); + ret = frct_tx(frcti, spb); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); +} + +/* Additive HoL emit; original snd_slots[hp].rxm stays armed (NewReno). */ +static int fast_rxm_send(struct frcti * frcti, + void * pkt, + size_t len) +{ + struct ssm_pk_buff * spb; + uint32_t rcv_lwe; + + rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream); + if (spb == NULL) return 0; + + return frct_tx(frcti, spb); +} + +/* PCI bytes survive head_release at receive; just rewind the pointer. */ +static __inline__ uint16_t frag_role_peek(struct ssm_pk_buff * spb) +{ + const struct frct_pci * pci; + + assert(ssm_pk_buff_head(spb) != NULL); + + pci = (const struct frct_pci *) (ssm_pk_buff_head(spb) - FRCT_PCILEN); + + return ntoh16(pci->flags) & FRCT_FR_MASK; +} + +enum frag_state { + FRAG_NOT_READY, /* head missing / FIRST..LAST run incomplete */ + FRAG_DELIVER, /* *count fragments form a deliverable SDU */ + FRAG_DROP, /* *count fragments at lwe are malformed */ +}; + +/* + * On a gap in the run: FRTX waits (NOT_READY); best-effort scans forward + * for the next FIRST/SOLE and returns DROP for the broken prefix. *count + * gets the offset from the trailing edge. NOT_READY if no later run is + * in window. Caller rdlock. + */ +static enum frag_state frag_inspect_gap(struct frcti * frcti, + size_t start, + size_t * count) +{ + const struct rcv_slot * slots = frcti->rcv_slots; + struct ssm_pk_buff * spb; + uint32_t k; + uint16_t role; + size_t m; + + if (frcti->rcv_cr.cflags & FRCTFRTX) + return FRAG_NOT_READY; + + k = frcti->rcv_cr.rwe - RQ_SIZE; + + for (m = start; m < RQ_SIZE; ++m) { + if (slots[RQ_SLOT(k + m)].idx == -1) + continue; + + spb = rq_frag(frcti, k + m); + role = frag_role_peek(spb); + + if (role == FRCT_FR_SOLE || role == FRCT_FR_FIRST) { + if (m == 0) + return FRAG_NOT_READY; + + *count = m; + return FRAG_DROP; + } } - while (snd_cr->seqno == snd_cr->rwe && ret != -ETIMEDOUT) { - struct timespec now; - pthread_rwlock_unlock(&frcti->lock); - pthread_mutex_lock(&frcti->mtx); + return FRAG_NOT_READY; +} + +/* + * Inspect rq[lwe..]; set *count and return DELIVER/DROP/NOT_READY. DROP + * covers broken prefixes (mid/last at HoL, FIRST..[non-LAST]..new-FIRST). + * Non-FRTX flows skip past gaps to the next FIRST/SOLE. Caller rdlock. + */ +static enum frag_state frag_run_inspect(struct frcti * frcti, + size_t * count) +{ + const struct rcv_slot * slots = frcti->rcv_slots; + struct ssm_pk_buff * spb; + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + uint16_t role; + size_t n = 0; - if (frcti->open) { - clock_gettime(PTHREAD_COND_CLOCK, &now); + if (slots[RQ_SLOT(k)].idx == -1) + return frag_inspect_gap(frcti, 0, count); - frcti->t_wnd = now; - frcti->t_rdvs = now; - frcti->open = false; + spb = rq_frag(frcti, k); + role = frag_role_peek(spb); + + if (role == FRCT_FR_SOLE) { + *count = 1; + return FRAG_DELIVER; + } + + if (role != FRCT_FR_FIRST) { + *count = 1; + return FRAG_DROP; + } + + while (true) { + if (n == RQ_SIZE || slots[RQ_SLOT(k + n)].idx == -1) + return frag_inspect_gap(frcti, n, count); + + spb = rq_frag(frcti, k + n); + role = frag_role_peek(spb); + ++n; + + if (role == FRCT_FR_LAST) { + *count = n; + return FRAG_DELIVER; } - pthread_cleanup_push(__cleanup_mutex_unlock, &frcti->mtx); + if (n > 1 && role != FRCT_FR_MID) { + /* SOLE or new FIRST mid-run: drop the prefix. */ + *count = n - 1; + return FRAG_DROP; + } + } +} - ret = -__timedwait(&frcti->cond, &frcti->mtx, abstime); +/* Caller wrlock. Delivery edge is implicit: rwe - RQ_SIZE. */ +static void frag_drop(struct frcti * frcti, + size_t count) +{ + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + uint32_t edge; + size_t i; - pthread_cleanup_pop(false); + for (i = 0; i < count; ++i) { + size_t pos = RQ_SLOT(k + i); - if (ret == -ETIMEDOUT) { - time_t diff; + if (frcti->rcv_slots[pos].idx == -1) + continue; - clock_gettime(PTHREAD_COND_CLOCK, &now); + frct_spb_release_idx(frcti->rcv_slots[pos].idx); + frcti->rcv_slots[pos].idx = -1; + } - diff = ts_diff_ns(&now, &frcti->t_wnd); - if (diff > MAX_RDV) { - pthread_mutex_unlock(&frcti->mtx); - return -ECONNRESET; /* write fails! */ - } + frcti->rcv_cr.rwe += count; - diff = ts_diff_ns(&now, &frcti->t_rdvs); - if (diff > frcti->rdv) { - frcti->t_rdvs = now; - __send_rdv(frcti->fd); - } + /* Drop may span a gap; pull lwe up to preserve rwe - RQ_SIZE <= lwe. */ + edge = frcti->rcv_cr.rwe - RQ_SIZE; + if (before(frcti->rcv_cr.lwe, edge)) + STORE_RELEASE(&frcti->rcv_cr.lwe, edge); +} + +/* Copy `count` fragments at rq[lwe..] into buf; release + advance lwe. */ +static size_t frag_gather(struct frcti * frcti, + size_t count, + uint8_t * buf) +{ + struct ssm_pk_buff * frag; + size_t off = 0; + size_t i; + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + + for (i = 0; i < count; ++i) { + size_t pos = RQ_SLOT(k + i); + size_t flen; + + frag = rq_frag(frcti, k + i); + flen = ssm_pk_buff_len(frag); + memcpy(buf + off, ssm_pk_buff_head(frag), flen); + off += flen; + frct_spb_release_idx(frcti->rcv_slots[pos].idx); + frcti->rcv_slots[pos].idx = -1; + } + + frcti->rcv_cr.rwe += count; + + return off; +} + +/* Caller holds lock. */ +static size_t frag_total_len(struct frcti * frcti, + size_t count, + bool * overflow) +{ + struct ssm_pk_buff * frag; + size_t total = 0; + size_t i; + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + + *overflow = false; + + for (i = 0; i < count; ++i) { + size_t flen; + + frag = rq_frag(frcti, k + i); + flen = ssm_pk_buff_len(frag); + if (total + flen < total) { + *overflow = true; + return 0; } + total += flen; + } + + return total; +} + +/* + * Process a delivered slot at lwe: latch FIN if acceptable, + * advance byte_high (clamped to byte_fin once latched). + */ +static __inline__ void stream_deliver_slot(struct frcti * frcti, + size_t lp) +{ + uint32_t end; + + end = frcti->rcv_slots[lp].end; + + if (frcti->rcv_slots[lp].fin) { + if (end == frcti->rcv_byte_high && !frcti->rcv_fin_seen) { + frcti->rcv_fin_seen = true; + frcti->rcv_byte_fin = end; + } else { + STAT_BUMP(frcti, strm_fin_drop); + } + } + + if (frcti->rcv_fin_seen && after(end, frcti->rcv_byte_fin)) + end = frcti->rcv_byte_fin; + + frcti->rcv_byte_high = end; +} + +/* Two-segment memcpy from buf into the rx ring at byte offset start. */ +static void stream_ring_write(struct frcti * frcti, + uint32_t start, + buffer_t buf) +{ + size_t mask = frcti->rcv_ring_sz - 1; + size_t off = start & mask; + + if (off + buf.len <= frcti->rcv_ring_sz) { + memcpy(frcti->rcv_ring + off, buf.data, buf.len); + } else { + size_t first = frcti->rcv_ring_sz - off; + memcpy(frcti->rcv_ring + off, buf.data, first); + memcpy(frcti->rcv_ring, buf.data + first, buf.len - first); + } +} - pthread_mutex_unlock(&frcti->mtx); - pthread_rwlock_rdlock(&frcti->lock); +/* Two-segment memcpy from the rx ring at byte offset start into buf. */ +static void stream_ring_read(struct frcti * frcti, + uint32_t start, + buffer_t buf) +{ + size_t mask = frcti->rcv_ring_sz - 1; + size_t off = start & mask; + + if (off + buf.len <= frcti->rcv_ring_sz) { + memcpy(buf.data, frcti->rcv_ring + off, buf.len); + } else { + size_t first = frcti->rcv_ring_sz - off; + memcpy(buf.data, frcti->rcv_ring + off, first); + memcpy(buf.data + first, frcti->rcv_ring, buf.len - first); } +} + +/* Deliver-or-drop one stashed slot at lwe; advance lwe/rwe. Caller wrlock. */ +static void stream_advance_lwe(struct frcti * frcti) +{ + size_t lp; + + lp = RQ_SLOT(frcti->rcv_cr.lwe); + + if (frcti->rcv_slots[lp].start != frcti->rcv_byte_high) + STAT_BUMP(frcti, strm_drop); + else + stream_deliver_slot(frcti, lp); + + frcti->rcv_slots[lp].fin = 0; + frcti->rcv_slots[lp].idx = -1; + STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1); + frcti->rcv_cr.rwe++; +} + +/* + * Validate a stream DATA packet before stashing. Returns 0 if the + * packet may be written into rcv_ring + rq[], -1 otherwise. + */ +static __inline__ int stream_stash_check(struct frcti * frcti, + uint32_t start, + uint32_t end, + size_t plen, + uint16_t flags) +{ + if (end - start != (uint32_t) plen) + return -1; + /* FIN MUST be 0-byte. */ + if ((flags & FRCT_FIN) && plen != 0) + return -1; + + /* Post-EOS: no further FIN once latched. */ + if (frcti->rcv_fin_seen && (flags & FRCT_FIN)) + return -1; + + /* Post-EOS: reject data at or past byte_fin. */ + if (frcti->rcv_fin_seen && !before(start, frcti->rcv_byte_fin)) + return -1; + + /* Stale: peer is behind the delivered edge. */ + if (before(end, frcti->rcv_byte_next)) + return -1; + + /* Exact-edge: only an empty-stream FIN is meaningful. */ + if (end == frcti->rcv_byte_next && !(flags & FRCT_FIN)) + return -1; + + if (end - frcti->rcv_byte_next > frcti->rcv_ring_sz) + return -1; + + return 0; +} + +/* + * Stream-mode DATA receive: validate, stash payload in rcv_ring, mark + * rq[pos], advance lwe through any newly-contiguous run. Returns 0 + * (spb released) or -1 (caller releases). Caller wrlock. + */ +static int frcti_stream_data_rcv(struct frcti * frcti, + struct ssm_pk_buff * spb, + size_t pos, + uint16_t flags) +{ + struct frct_pci_stream * spci; + uint32_t start; + uint32_t end; + buffer_t buf; + size_t skip; + + if (ssm_pk_buff_len(spb) < FRCT_PCI_STREAM_LEN) + return -1; + + if (frcti->rcv_ring == NULL) { + frcti->rcv_ring = calloc(1, frcti->rcv_ring_sz); + if (frcti->rcv_ring == NULL) + return -ENOMEM; + } + + spci = FRCT_HDR_POP(spb, frct_pci_stream); + start = ntoh32(spci->start); + end = ntoh32(spci->end); + + buf.data = ssm_pk_buff_head(spb); + buf.len = ssm_pk_buff_len(spb); + + if (stream_stash_check(frcti, start, end, buf.len, flags) < 0) + return -1; + + /* Trim front-overlap with already-delivered region. */ + if (before(start, frcti->rcv_byte_next)) { + skip = frcti->rcv_byte_next - start; + buf.data += skip; + buf.len -= skip; + start = frcti->rcv_byte_next; + } + + stream_ring_write(frcti, start, buf); + STAT_ADD(frcti, strm_rcv_byte, buf.len); + + frcti->rcv_slots[pos].idx = 1; + frcti->rcv_slots[pos].start = start; + frcti->rcv_slots[pos].end = end; + frcti->rcv_slots[pos].fin = (flags & FRCT_FIN) ? 1 : 0; + + while (frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1) + stream_advance_lwe(frcti); + + frct_spb_release(spb); + + return 0; +} + +/* + * DATA receive: stash idx at rq[pos], advance lwe through any + * contiguous run. Caller wrlock. + */ +static void frcti_data_stash(struct frcti * frcti, + ssize_t idx, + size_t pos, + uint16_t flags) +{ + frcti->rcv_slots[pos].idx = idx; + + if ((flags & FRCT_FR_MASK) != FRCT_FR_SOLE) + STAT_BUMP(frcti, frag_rcv); + + /* lwe = cum-ACK edge; advance per fragment through contiguous run. */ + while (before(frcti->rcv_cr.lwe, frcti->rcv_cr.rwe) + && frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1) + STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1); +} + +/* Stream consume: copy up to `count` contiguous bytes from ring into buf. */ +static ssize_t frcti_consume_stream(struct frcti * frcti, + uint8_t * buf, + size_t count) +{ + size_t avail; + size_t copy; + ssize_t ret; + buffer_t dst; + + assert(frcti); + + pthread_rwlock_wrlock(&frcti->lock); + + avail = (size_t) (frcti->rcv_byte_high - frcti->rcv_byte_next); + if (avail == 0) { + /* EOS drained: signal EOF to the reader. */ + if (frcti->rcv_fin_seen + && frcti->rcv_byte_next == frcti->rcv_byte_fin) + ret = 0; + else + ret = -EAGAIN; + goto unlock; + } + + copy = MIN(avail, count); + + dst.data = buf; + dst.len = copy; + stream_ring_read(frcti, frcti->rcv_byte_next, dst); + + frcti->rcv_byte_next += (uint32_t) copy; + STAT_ADD(frcti, strm_dlv_byte, copy); + + ret = (ssize_t) copy; + + unlock: pthread_rwlock_unlock(&frcti->lock); return ret; } -static ssize_t __frcti_queued_pdu(struct frcti * frcti) +/* + * FRTX consume: copy next ready PDU (full SDU or nothing). Returns bytes, + * -EAGAIN (no PDU), or -EMSGSIZE (oversize: run dropped to unblock flow). + */ +static ssize_t frcti_consume(struct frcti * frcti, + uint8_t * buf, + size_t count) { - ssize_t idx; - size_t pos; + size_t n; + size_t total; + bool overflow; + enum frag_state st; + ssize_t ret; assert(frcti); - /* See if we already have the next PDU. */ pthread_rwlock_wrlock(&frcti->lock); - pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1); - - idx = frcti->rq[pos]; - if (idx != -1) { - ++frcti->rcv_cr.lwe; - ++frcti->rcv_cr.rwe; - frcti->rq[pos] = -1; + while (true) { + st = frag_run_inspect(frcti, &n); + if (st == FRAG_NOT_READY) { + ret = -EAGAIN; + goto unlock; + } + if (st == FRAG_DROP) { + STAT_ADD(frcti, frag_drop, n); + frag_drop(frcti, n); + continue; + } + /* FRAG_DELIVER */ + total = frag_total_len(frcti, n, &overflow); + if (overflow || total > frcti->max_rcv_sdu || total > count) { + STAT_ADD(frcti, frag_drop, n); + frag_drop(frcti, n); + ret = -EMSGSIZE; + goto unlock; + } + ret = (ssize_t) frag_gather(frcti, n, buf); + if (n > 1) + STAT_BUMP(frcti, sdu_reasm); + else + STAT_BUMP(frcti, sdu_sole); + goto unlock; } + unlock: pthread_rwlock_unlock(&frcti->lock); - return idx; + return ret; } -static ssize_t __frcti_pdu_ready(struct frcti * frcti) +static bool frcti_pdu_ready(struct frcti * frcti) { - ssize_t idx; - size_t pos; + size_t pos; + size_t count; + bool ready; assert(frcti); - /* See if we already have the next PDU. */ pthread_rwlock_rdlock(&frcti->lock); - pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1); - idx = frcti->rq[pos]; + if (frcti->stream) { + ready = frcti->rcv_byte_high != frcti->rcv_byte_next; + pthread_rwlock_unlock(&frcti->lock); + return ready; + } + + if (frag_run_inspect(frcti, &count) != FRAG_DELIVER) { + /* Drop case: frcti_consume will handle it; not ready. */ + pthread_rwlock_unlock(&frcti->lock); + return false; + } + + pos = RQ_SLOT(frcti->rcv_cr.rwe - RQ_SIZE); + ready = frcti->rcv_slots[pos].idx != -1; + + pthread_rwlock_unlock(&frcti->lock); + + return ready; +} + +/* No srtt yet: probe at the cold-probe cadence to seed it. */ +#define PROBE_DUE_COLD(frcti, now_ns) \ + ((now_ns) - (frcti)->t_snd_probe > (uint64_t) RTTP_COLD_NS) + +/* Have srtt: probe when peer quiet for > 2*srtt and last probe > srtt. */ +#define PROBE_DUE_WARM(frcti, now_ns) \ + ((now_ns) - (frcti)->t_rcv_rtt > 2u * (uint64_t)(frcti)->srtt \ + && (now_ns) - (frcti)->t_snd_probe > (uint64_t)(frcti)->srtt) + +/* Seeds srtt for receive-only sides so they don't fall back to 1 s RTO. */ +__attribute__((cold)) +static void frcti_rcv_probe(struct frcti * frcti, + uint64_t now_ns) +{ + uint32_t probe_id; + uint8_t nonce[RTTP_NONCE_LEN] = { 0 }; + + pthread_rwlock_wrlock(&frcti->lock); + + if (frcti->srtt == 0 && !PROBE_DUE_COLD(frcti, now_ns)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + if (frcti->srtt != 0 && !PROBE_DUE_WARM(frcti, now_ns)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + probe_id = rttp_alloc_probe(frcti, now_ns, nonce); pthread_rwlock_unlock(&frcti->lock); - return idx; + if (probe_id != 0) + frcti_rttp_snd(frcti, probe_id, 0, nonce); } -#include <timerwheel.c> +/* Echo at slot `pos` matches our probe: id, slot, nonce all intact. */ +static __inline__ bool probe_echo_matches(struct frcti * frcti, + size_t pos, + uint32_t echo_id, + const uint8_t nonce[RTTP_NONCE_LEN]) +{ + if (frcti->probes[pos].id != echo_id) + return false; + + if (frcti->probes[pos].ts == 0) + return false; + + return memcmp(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN) == 0; +} /* - * Send a final ACK for everything that has not been ACK'd. - * If the flow should be kept active for retransmission, - * the returned time will be negative. + * RTT probe (echo_id == 0): bounce the nonce back to peer. + * RTT echo (echo_id != 0): verify nonce + feed sample. */ -static time_t __frcti_dealloc(struct frcti * frcti) +static void frcti_rttp_rcv(struct frcti * frcti, + buffer_t pkt, + uint64_t now_ns) { - struct timespec now; - time_t wait; - int ackno; - int fd = -1; + const struct frct_rttp * rttp; + uint32_t probe_id; + uint32_t echo_id; + uint8_t nonce[RTTP_NONCE_LEN]; + size_t ring_pos; + int64_t elapsed; + uint64_t sample; + + if (pkt.len < RTTP_PAYLOAD) + return; + + rttp = (const struct frct_rttp *) pkt.data; + probe_id = ntoh32(rttp->probe_id); + echo_id = ntoh32(rttp->echo_id); + + /* Forged/malformed: bouncing this would loop on echo_id == 0. */ + if (probe_id == 0 && echo_id == 0) + return; + + memcpy(nonce, rttp->nonce, sizeof(nonce)); + + if (echo_id == 0) { + /* Probe: echo back with same nonce so peer can verify. */ + STAT_BUMP(frcti, rttp_rcv); + frcti_rttp_snd(frcti, 0, probe_id, nonce); + return; + } + + ring_pos = RTTP_POS(echo_id); + + pthread_rwlock_wrlock(&frcti->lock); + + if (!probe_echo_matches(frcti, ring_pos, echo_id, nonce)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + elapsed = ts_age_ns(now_ns, frcti->probes[ring_pos].ts); + frcti->probes[ring_pos].ts = 0; + frcti->t_rcv_rtt = now_ns; + + if (elapsed <= 0) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + sample = (uint64_t) elapsed; + + /* Clamp probe sample to RTT_CLAMP_MUL * srtt to avoid poisoning. */ + if (frcti->srtt > 0) + sample = MIN(sample, (uint64_t) frcti->srtt * RTT_CLAMP_MUL); + + rtt_update(frcti, sample, now_ns); + + pthread_rwlock_unlock(&frcti->lock); +} + +/* Honours piggybacked ACK on the KA. */ +static void frcti_ka_rcv(struct frcti * frcti, + const struct frct_pci * pci, + uint64_t now_ns, + uint16_t flags) +{ + uint32_t ka_ackno; + + STORE_RELEASE(&frcti->t_ka_rcv, now_ns); + STAT_BUMP(frcti, ka_rcv); + + if (!(flags & FRCT_ACK)) + return; + + ka_ackno = ntoh32(pci->ackno); + + pthread_rwlock_wrlock(&frcti->lock); + + if (within(ka_ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno)) + STORE_RELEASE(&frcti->snd_cr.lwe, ka_ackno); + + pthread_rwlock_unlock(&frcti->lock); +} + +/* + * Additive HoL re-emit (carries DRF); runs before rcv_cr->act + * refresh so it doesn't pre-empt peer's first DRF. + */ +__attribute__((cold)) +static void frcti_nack_rcv(struct frcti * frcti) +{ + struct timespec now; + uint64_t now_ns; + size_t hp; + struct rxm_entry * rxm; + void * pkt_copy = NULL; + size_t pkt_len = 0; clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_rwlock_wrlock(&frcti->lock); + + STAT_BUMP(frcti, nack_rcv); + + if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + hp = RQ_SLOT(frcti->snd_cr.lwe); + rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm); + if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + pkt_copy = malloc(rxm->len); + if (pkt_copy != NULL) { + memcpy(pkt_copy, rxm->pkt, rxm->len); + pkt_len = rxm->len; + /* Karn: suppress RTT sample. NACK supersedes pending TLP. */ + frcti->snd_slots[hp].flags = + (frcti->snd_slots[hp].flags & ~SND_TLP) + | SND_RTX | SND_FAST_RXM; + frcti->rtt_lwe = frcti->snd_cr.lwe + 1; + STAT_BUMP(frcti, rxm_nack); + } + + pthread_rwlock_unlock(&frcti->lock); + + if (pkt_copy != NULL) { + int ret = fast_rxm_send(frcti, pkt_copy, pkt_len); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); + free(pkt_copy); + } +} + +__attribute__((cold)) +static void frcti_rdv_rcv(struct frcti * frcti) +{ + uint32_t rwe; pthread_rwlock_rdlock(&frcti->lock); - ackno = frcti->rcv_cr.lwe; - if (frcti->rcv_cr.lwe != frcti->rcv_cr.seqno) - fd = frcti->fd; + rwe = frcti_advert_rwe(frcti); - wait = MAX(frcti->rcv_cr.inact - now.tv_sec + frcti->rcv_cr.act.tv_sec, - frcti->snd_cr.inact - now.tv_sec + frcti->snd_cr.act.tv_sec); - wait = MAX(wait, 0); + pthread_rwlock_unlock(&frcti->lock); + + STAT_BUMP(frcti, rdv_rcv); - if (frcti->snd_cr.cflags & FRCTFLINGER - && before(frcti->snd_cr.lwe, frcti->snd_cr.seqno)) - wait = -wait; + frcti_pkt_snd(frcti, FRCT_FC, 0, rwe); +} + +/* §7.2: PTO = 2*SRTT + max delayed-ACK delay; fallback when unseeded. */ +static __inline__ uint64_t tlp_pto(const struct frcti * frcti) +{ + if (frcti->srtt > 0) + return 2ULL * (uint64_t) frcti->srtt + ACK_DELAY_NS; + return NACK_COOLDOWN_NS; +} + +/* + * RFC 8985 §7: lazy probe. Re-evaluate on fire — if sender was active + * within PTO, re-post; else probe HoL once and hand off to RTO. + */ +__attribute__((cold)) +static void tlp_due(void * arg) +{ + struct frcti * frcti = arg; + struct timespec now; + uint64_t now_ns; + uint64_t pto; + uint64_t rto_at; + size_t hp; + struct rxm_entry * rxm; + void * pkt_copy = NULL; + size_t pkt_len = 0; + bool re_post = false; + uint64_t deadline = 0; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_rwlock_wrlock(&frcti->lock); + + if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) + goto unlock; + + if (!before(frcti->snd_cr.seqno, frcti->snd_cr.rwe)) + goto unlock; /* FC-blocked: RDV handles it. */ + + /* RFC 8985 §7.3: one outstanding probe, MAX_TLP_PER_EP per ep. */ + if (frcti->tlp_high_seq != 0) + goto unlock; + + if (frcti->tlp_count >= MAX_TLP_PER_EP) + goto unlock; + + pto = tlp_pto(frcti); + + /* §7.2: anchor PTO on most recent send; defer if still active. */ + if (now_ns < frcti->snd_cr.act + pto) { + deadline = frcti->snd_cr.act + pto; + re_post = true; + goto unlock; + } + + hp = RQ_SLOT(frcti->snd_cr.lwe); + rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm); + if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) + goto unlock; + + /* Cap: if HoL RTO is due, let rxm_due fire instead. */ + rto_at = rxm->t0 + ((uint64_t) frcti->rto + << LOAD_RELAXED(&frcti->rto_mul)); + if (rto_at <= now_ns) + goto unlock; + + pkt_copy = malloc(rxm->len); + if (pkt_copy != NULL) { + memcpy(pkt_copy, rxm->pkt, rxm->len); + pkt_len = rxm->len; + frcti->snd_slots[hp].time = now_ns; + frcti->snd_slots[hp].flags |= SND_TLP | SND_FAST_RXM; + frcti->rtt_lwe = frcti->snd_cr.lwe + 1; + /* §7.3 outstanding-probe marker; ack_rcv/rxm_snd clear. */ + frcti->tlp_high_seq = frcti->snd_cr.seqno; + frcti->tlp_count++; + STAT_BUMP(frcti, tlp_snd); + } + + unlock: pthread_rwlock_unlock(&frcti->lock); - if (fd != -1) - __send_frct_pkt(fd, FRCT_ACK, ackno, 0); + if (pkt_copy != NULL) { + fast_rxm_send(frcti, pkt_copy, pkt_len); + free(pkt_copy); + } + + if (re_post) + tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti); + else + __atomic_clear(&frcti->tlp_pending, __ATOMIC_RELAXED); +} + +/* §7.2 lazy: post once per quiet period. tlp_due re-evaluates on fire. */ +static int tlp_arm(struct frcti * frcti) +{ + struct timespec now; + uint64_t now_ns; + uint64_t pto; + uint64_t deadline; + + /* §7.3: one outstanding probe, MAX_TLP_PER_EP per recovery ep. */ + if (LOAD_RELAXED(&frcti->tlp_high_seq) != 0) + return 0; + if (LOAD_RELAXED(&frcti->tlp_count) >= MAX_TLP_PER_EP) + return 0; + if (__atomic_test_and_set(&frcti->tlp_pending, __ATOMIC_RELAXED)) + return 0; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pto = tlp_pto(frcti); + + deadline = LOAD_RELAXED(&frcti->snd_cr.act) + pto; + if (deadline <= now_ns) + deadline = now_ns + pto; + + tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti); + + return 0; +} + +/* + * FC window advert from any flag-bearing packet. Caps at lwe + RQ_SIZE, + * rejects backward shrink (forged/stale FC), marks window open. + * Caller wrlock. + */ +static __inline__ void frcti_fc_rcv(struct frcti * frcti, + const struct frct_pci * pci) +{ + struct frct_cr * snd_cr; + uint32_t rwe; + uint32_t rwe_max; + + snd_cr = &frcti->snd_cr; + rwe = ntoh32(pci->window); + rwe_max = snd_cr->lwe + RQ_SIZE; + + if (after(rwe, rwe_max)) + rwe = rwe_max; + + /* Reject backward shrink (forged/stale FC). */ + if (before(rwe, snd_cr->rwe)) + rwe = snd_cr->rwe; + + STORE_RELAXED(&snd_cr->rwe, rwe); + frcti->open = true; +} + +/* Packet copies captured under frcti->lock; emitted after release. */ +struct pending { + buffer_t fast_rxm; + buffer_t sack_rxm[SACK_RXM_MAX]; + size_t sack_rxm_cnt; +}; + +/* RFC 6582 §3.2: seal recovery_high on entry; do not extend on new gaps. */ +static void recovery_enter(struct frcti * frcti) +{ + if (frcti->in_recovery) + return; + + frcti->in_recovery = true; + frcti->recovery_high = frcti->snd_cr.seqno + RTT_QUARANTINE; +} + +/* True when cum-ACK clears recovery_high or all in-flight ACKed. */ +static bool recovery_exit_reached(struct frcti * frcti, + uint32_t ackno) +{ + if (!frcti->in_recovery) + return false; + + if (!before(ackno, frcti->recovery_high)) + return true; + + return ackno == frcti->snd_cr.seqno; +} + +/* RTT sample gate: Karn + SACK-consume + don't-seed. */ +static bool rtt_sample_eligible(struct frcti * frcti, + size_t p, + uint16_t flags, + uint32_t lwe) +{ + if (flags & FRCT_RXM) + return false; + if (frcti->snd_slots[p].flags & (SND_RTX | SND_TLP)) + return false; + if (LOAD_ACQUIRE(&frcti->snd_slots[p].rxm) == NULL) + return false; + if (before(lwe, frcti->rtt_lwe)) + return false; + /* Don't seed srtt from a cum-ACK; let probes seed. */ + if (frcti->srtt == 0) + return false; + return true; +} + +#define RXM_SLOT_EMPTY(rxm) ((rxm) == NULL) +#define FAST_RXM_STAGED(pending) ((pending)->fast_rxm.data != NULL) +#define RXM_FAST_DONE(flags) (((flags) & SND_FAST_RXM) != 0) + +/* RACK fast retransmit on cum-ACK: HoL aged past R, not yet retransmitted. */ +static void fast_rxm_consider(struct frcti * frcti, + uint64_t now_ns, + struct pending * pending) +{ + struct rxm_entry * rxm; + struct snd_slot * slot; + size_t hp; + uint64_t R; + bool rack_ok; + + hp = RQ_SLOT(frcti->snd_cr.lwe); + slot = &frcti->snd_slots[hp]; + rxm = LOAD_ACQUIRE(&slot->rxm); + R = rack_reorder_window(frcti); + + if (RXM_SLOT_EMPTY(rxm)) + return; + + /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */ + rack_ok = (int64_t)(frcti->t_latest_ack - slot->time) > (int64_t) R; + if (!rack_ok && frcti->dup_thresh < DUP_THRESH) + return; + + /* HoL aged past t_r; let rxm_due tear the flow down. */ + if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) + return; + + /* Already on it. */ + if (FAST_RXM_STAGED(pending) || RXM_FAST_DONE(slot->flags)) + return; + + recovery_enter(frcti); + + pending->fast_rxm.data = malloc(rxm->len); + if (pending->fast_rxm.data == NULL) + return; + + pending->fast_rxm.len = rxm->len; + memcpy(pending->fast_rxm.data, rxm->pkt, rxm->len); + slot->flags |= SND_RTX | SND_FAST_RXM; + frcti->rtt_lwe = frcti->snd_cr.lwe + 1; + if (rack_ok) + STAT_BUMP(frcti, rxm_rack); + else + STAT_BUMP(frcti, rxm_dupthresh); +} + +/* Caller holds wrlock; RACK fast retransmit queued in pending. */ +__attribute__((hot)) +static void frcti_ack_rcv(struct frcti * frcti, + const struct frct_pci * pci, + uint16_t flags, + uint64_t now_ns, + struct pending * pending) +{ + uint32_t ackno; + uint32_t lwe; + size_t p; + size_t fresh; + + if (!(flags & FRCT_DATA)) + STAT_BUMP(frcti, ack_rcv); + + ackno = ntoh32(pci->ackno); + if (ackno == frcti->snd_cr.lwe) { + /* RFC 8985 §6.2: only on scoreboard change. */ + if (frcti->snd_cr.lwe != frcti->rack_fired_lwe) { + fast_rxm_consider(frcti, now_ns, pending); + frcti->rack_fired_lwe = frcti->snd_cr.lwe; + } + return; + } + + if (!within(ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno)) + return; + + lwe = frcti->snd_cr.lwe; + p = RQ_SLOT(lwe); + + STORE_RELEASE(&frcti->snd_cr.lwe, ackno); + + /* §7.3: cum-ACK past the probed seqno resolves the TLP. */ + if (frcti->tlp_high_seq != 0 + && !before(ackno, frcti->tlp_high_seq)) + frcti->tlp_high_seq = 0; + + /* §7.3: end the probe episode once inflight drains. */ + if (ackno == frcti->snd_cr.seqno) + frcti->tlp_count = 0; + + /* RFC 8985 §7.2: halve mult per REO_DECAY_PKTS fresh-ACK'd seqnos. */ + fresh = ackno - frcti->dsack_lwe_snap; + if (frcti->reo_wnd_mult > 1 && fresh >= REO_DECAY_PKTS) { + uint8_t half = frcti->reo_wnd_mult >> 1; + frcti->reo_wnd_mult = half < 1 ? 1 : half; + frcti->dsack_lwe_snap = ackno; + } + + /* RFC 8985: latest cum-ACKed send-time (slot of ackno-1). */ + frcti->t_latest_ack = frcti->snd_slots[RQ_SLOT(ackno - 1)].time; + + /* RFC 8985: SACK-above-lwe count is per-recovery-episode. */ + frcti->dup_thresh = 0; + + /* Karn-skip on retx; TLP ACK clears rto_mul (no CC backoff). */ + if ((frcti->snd_slots[p].flags & SND_RTX) == 0 + || (frcti->snd_slots[p].flags & SND_TLP) != 0) + STORE_RELEASE(&frcti->rto_mul, 0); - return wait; + if (recovery_exit_reached(frcti, ackno)) + frcti->in_recovery = false; + + if (rtt_sample_eligible(frcti, p, flags, lwe)) { + int64_t mrtt = ts_age_ns(now_ns, frcti->snd_slots[p].time); + if (mrtt > 0) { + if (!(flags & FRCT_DATA)) + STAT_BUMP(frcti, ack_rtt); + rtt_update(frcti, (time_t) mrtt, now_ns); + frcti->t_rcv_rtt = now_ns; + } + } } -static int __frcti_snd(struct frcti * frcti, - struct ssm_pk_buff * spb) +/* Skip k == lwe under clamp: NULLing HoL from a stale SACK wedges it. */ +static uint32_t sack_mark_blocks(struct frcti * frcti, + const uint8_t * payload, + uint16_t n, + uint32_t * newly_marked) { - struct frct_pci * pci; - struct timespec now; - struct frct_cr * snd_cr; - struct frct_cr * rcv_cr; - uint32_t seqno; - bool rtx; + uint32_t hi_sacked = frcti->snd_cr.lwe; + uint32_t marked = 0; + uint16_t i; + + for (i = 0; i < n; ++i) { + uint32_t s; + uint32_t e; + uint32_t k; + bool clamped; + + sack_block_get(payload, i, &s, &e); + + if (!before(s, e)) + continue; + + clamped = before(s, frcti->snd_cr.lwe); + if (clamped) + s = frcti->snd_cr.lwe; + if (after(e, frcti->snd_cr.seqno)) + e = frcti->snd_cr.seqno; + + for (k = s; before(k, e); ++k) { + size_t kp = RQ_SLOT(k); + uint64_t t_k; + if (clamped && k == frcti->snd_cr.lwe) + continue; + if (LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm) == NULL) + continue; + STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL); + frcti->snd_slots[kp].flags = 0; + marked++; + /* RACK.fack: latest SACK-confirmed send-time. */ + t_k = frcti->snd_slots[kp].time; + if (t_k > frcti->t_latest_ack) + frcti->t_latest_ack = t_k; + } + + if (after(e, hi_sacked)) + hi_sacked = e; + } + + *newly_marked = marked; + return hi_sacked; +} + +/* Queue once per loss event (SND_FAST_RXM gates). Emit after unlock. */ +static void sack_queue_rxm(struct frcti * frcti, + uint32_t hi_sacked, + uint64_t now_ns, + struct pending * pending) +{ + uint64_t R = rack_reorder_window(frcti); + uint32_t k; + bool rack_ok; + + for (k = frcti->snd_cr.lwe; before(k, hi_sacked); ++k) { + struct rxm_entry * rxm; + size_t kp = RQ_SLOT(k); + size_t cnt = pending->sack_rxm_cnt; + size_t rack_age; + + rxm = LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm); + + if (cnt >= SACK_RXM_MAX) + break; + + if (rxm == NULL) + continue; + + if (frcti->snd_slots[kp].flags & SND_FAST_RXM) + continue; + + if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) + continue; + + rack_age = frcti->t_latest_ack - frcti->snd_slots[kp].time; + /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */ + rack_ok = (int64_t) rack_age > (int64_t) R; + if (!rack_ok && frcti->dup_thresh < DUP_THRESH) + continue; + + if (rack_ok) + STAT_BUMP(frcti, rxm_rack); + else + STAT_BUMP(frcti, rxm_dupthresh); + + pending->sack_rxm[cnt].data = malloc(rxm->len); + if (pending->sack_rxm[cnt].data == NULL) + break; + + pending->sack_rxm[cnt].len = rxm->len; + memcpy(pending->sack_rxm[cnt].data, rxm->pkt, rxm->len); + pending->sack_rxm_cnt++; + /* NULL slot so the original timer self-cleans. */ + STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL); + frcti->snd_slots[kp].time = now_ns; + frcti->snd_slots[kp].flags |= SND_RTX | SND_FAST_RXM; + frcti->rtt_lwe = k + 1; + } +} + +/* + * RFC 2883 D-SACK detector. Returns true iff block[0] is a D-SACK + * report: + * case 1: blocks[0].start < pkt_ackno (strictly below cum-ACK). + * case 2: blocks[0] is a strict sub-range of some blocks[i>0]. + * MAX_DSACK_LAG bounds case-1 distance to one rcv window (sanity). + */ +static bool sack_is_dsack(struct frcti * frcti, + const uint8_t * payload, + uint16_t n, + uint32_t pkt_ackno) +{ + uint32_t s0; + uint32_t e0; + uint16_t i; + + if (n == 0) + return false; + + sack_block_get(payload, 0, &s0, &e0); + if (!before(s0, e0)) + return false; + + if (before(s0, pkt_ackno)) { + if ((pkt_ackno - s0) <= (uint32_t) MAX_DSACK_LAG) + return true; + STAT_BUMP(frcti, dsack_drop); + return false; + } + + for (i = 1; i < n; ++i) { + uint32_t si; + uint32_t ei; + + sack_block_get(payload, i, &si, &ei); + if (!before(si, ei)) + continue; + if (!before(s0, si) && !after(e0, ei) + && (s0 != si || e0 != ei)) + return true; + } + + return false; +} + +/* RFC 8985 §7.2: grow reo_wnd_mult on DSACK; at most once per RTT. */ +static __inline__ void reo_wnd_on_dsack(struct frcti * frcti, + uint64_t now_ns) +{ + time_t srtt = frcti->srtt; + + /* Snap is unconditional: feeds the per-D-SACK decay clock. */ + frcti->dsack_lwe_snap = frcti->snd_cr.lwe; + + if (srtt > 0 + && now_ns - frcti->t_last_reo_widen <= (uint64_t) srtt) + return; + + if (frcti->reo_wnd_mult < REO_WND_MULT_MAX) + frcti->reo_wnd_mult++; + + frcti->t_last_reo_widen = now_ns; +} + +/* Caller holds wrlock; retransmits queued for post-unlock emission. */ +static void frcti_sack_rcv(struct frcti * frcti, + buffer_t pkt, + uint32_t pkt_ackno, + uint64_t now_ns, + struct pending * pending) +{ + uint32_t hi_sacked; + uint32_t marked; + uint16_t n; + bool dsack; + uint16_t n_real; + + if (pkt.len < SACK_HDR_SIZE) + return; + + n = ntoh16(*(const uint16_t *) pkt.data); + if (n > SACK_MAX_BLOCKS) + return; + + if (pkt.len < SACK_HDR_SIZE + (size_t) n * SACK_BLOCK_SIZE) + return; + + STAT_BUMP(frcti, sack_rcv); + + dsack = sack_is_dsack(frcti, pkt.data, n, pkt_ackno); + n_real = n - (dsack ? 1 : 0); + + if (dsack) { + STAT_BUMP(frcti, dsack_rcv); + reo_wnd_on_dsack(frcti, now_ns); + } + + /* DSACK-only carries no new gap; don't enter recovery. */ + if (n_real > 0) + recovery_enter(frcti); + + marked = 0; + hi_sacked = sack_mark_blocks(frcti, pkt.data, n, &marked); + frcti->dup_thresh += marked; + + if (after(hi_sacked, frcti->snd_cr.lwe)) + sack_queue_rxm(frcti, hi_sacked, now_ns, pending); +} + +/* Emit and free queued packet copies. */ +static void pending_flush(struct frcti * frcti, + struct pending * pending) +{ + size_t i; + + for (i = 0; i < pending->sack_rxm_cnt; ++i) { + sack_rxm_snd(frcti, pending->sack_rxm[i].data, + pending->sack_rxm[i].len); + free(pending->sack_rxm[i].data); + } + + if (pending->fast_rxm.data != NULL) { + int ret = fast_rxm_send(frcti, pending->fast_rxm.data, + pending->fast_rxm.len); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); + free(pending->fast_rxm.data); + } +} + +/* Pre-DRF NACK: ask peer to retransmit HoL; seqno is informational. */ +static void frcti_nack_snd(struct frcti * frcti, + uint32_t seqno_unseen) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + + if (frct_ctrl_alloc(&spb, &pci, 0) < 0) + return; + + pci->flags = hton16(FRCT_NACK); + pci->seqno = hton32(seqno_unseen); + + frct_hcs_set(pci, false); + + frct_tx(frcti, spb); +} + +enum frct_act { + FRCT_ACTIVE, + FRCT_INACT_NEED_NACK, + FRCT_INACT_DROP, +}; + +/* On rcv inactivity: rebase on DRF, or arm pre-DRF NACK. Caller wrlock. */ +static enum frct_act rcv_inact_check(struct frcti * frcti, + uint16_t flags, + uint32_t seqno, + uint64_t now_ns) +{ + struct frct_cr * rcv_cr = &frcti->rcv_cr; + uint64_t cd; + + if (!ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact)) + return FRCT_ACTIVE; + + if (flags & FRCT_DRF) { + if (same_epoch_drf(seqno, flags, rcv_cr)) + return FRCT_ACTIVE; + + /* Bootstrap or fresh epoch: rebase. */ + STAT_BUMP(frcti, drf_rebase); + release_rq(frcti); + STORE_RELEASE(&rcv_cr->lwe, seqno); + rcv_cr->rwe = seqno + RQ_SIZE; + rcv_cr->seqno = seqno; + return FRCT_ACTIVE; + } + + if (!(flags & FRCT_DATA)) + return FRCT_ACTIVE; + + /* Pre-DRF: nudge sender with NACK (rate-limited). */ + cd = frcti->srtt > 0 ? (uint64_t) frcti->srtt : NACK_COOLDOWN_NS; + if (!ts_aged_ns(now_ns, frcti->t_nack, cd)) + return FRCT_INACT_DROP; + + frcti->t_nack = now_ns; + STAT_BUMP(frcti, nack_snd); + + return FRCT_INACT_NEED_NACK; +} + +/* Both modes: bounded accept into rq[seqno]. Caller wrlock. */ +__attribute__((hot)) +static bool rq_accept(struct frcti * frcti, + uint32_t seqno, + size_t pos, + uint16_t flags) +{ + struct frct_cr * rcv_cr = &frcti->rcv_cr; + + if (!before(seqno, rcv_cr->rwe)) { + STAT_BUMP(frcti, out_rcv); + return false; + } + + if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) { + STAT_BUMP(frcti, rqo_rcv); + return false; + } + + if (frcti->rcv_slots[pos].idx != -1) { + if (flags & FRCT_RXM) + STAT_BUMP(frcti, rxm_dup_rcv); + else + STAT_BUMP(frcti, dup_rcv); + /* RFC 2883 §4 case 2: in-window dup; sub-range marker. */ + frcti->dsack_seqno = seqno; + frcti->dsack_valid = true; + return false; + } + + return true; +} + +/* OOO arrival; throttle by min_gap + scoreboard dedup. */ +static bool sack_check(struct frcti * frcti, + uint32_t seqno, + uint64_t now_ns, + struct sack_args * out) +{ + struct frct_cr * rcv_cr = &frcti->rcv_cr; + uint64_t min_gap; + uint16_t n; + + if (!after(seqno, rcv_cr->lwe)) + return false; + + STAT_BUMP(frcti, ooo_rcv); + + /* SACK carries cum-ACK; bound by t_a like any other ACK. */ + if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a)) + return false; + + /* srtt/8 gate starved recovery under burst loss; floor to save CPU. */ + min_gap = (uint64_t) SACK_MIN_GAP_NS; + + if (!ts_aged_ns(now_ns, frcti->t_snd_sack, min_gap)) + return false; + + out->dsack = false; + n = dsack_consume(frcti, out->blocks); + if (n == 1) + out->dsack = true; + n += sack_blocks_build(frcti, out->blocks + n, + frcti->sack_n_max - n); + + if (!out->dsack + && rcv_cr->lwe == frcti->sack_lwe && n == frcti->sack_n) + return false; + + out->n = n; + out->ack = rcv_cr->lwe; + out->rwe = frcti_advert_rwe(frcti); + frcti->t_snd_sack = now_ns; + frcti->sack_lwe = rcv_cr->lwe; + frcti->sack_n = n; + + return true; +} + +/* Wire-dup of fresh DATA at an already-ACKed seqno. */ +static __inline__ bool is_dup_data(uint16_t flags, + uint32_t seqno, + uint32_t lwe) +{ + if (!(flags & FRCT_DATA)) + return false; + + if (flags & FRCT_RXM) + return false; + + return before(seqno, lwe); +} + +/* + * Wire-dup ACK packet: same seqno as the previous emission. Updates + * the dedup ackno on a fresh ACK; caller drops on true. + */ +static __inline__ bool is_dup_ack(struct frcti * frcti, + uint16_t flags, + uint32_t seqno) +{ + if (flags & FRCT_DATA) + return false; + + if (!(flags & FRCT_ACK)) + return false; + + if (seqno == frcti->rcv_cr.ackno) + return true; + + frcti->rcv_cr.ackno = seqno; + + return false; +} + +/* Caller wrlock. */ +__attribute__((cold)) +static void seqno_rotate(struct frcti * frcti, + uint64_t now_ns) +{ + struct frct_cr * snd_cr = &frcti->snd_cr; + + if (!ts_aged_ns(now_ns, snd_cr->act, snd_cr->inact)) + return; + /* Idle-on-wire ≠idle e2e: don't orphan in-flight rxm. */ + if (snd_cr->seqno != snd_cr->lwe) + return; + + /* Avoid colliding with peer's current rcv window. */ + do { + random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno)); + } while (in_window(snd_cr->seqno, snd_cr)); + STORE_RELEASE(&snd_cr->lwe, snd_cr->seqno); + STORE_RELAXED(&snd_cr->rwe, snd_cr->lwe + START_WINDOW); + frcti->rtt_lwe = snd_cr->seqno; + frcti->in_recovery = false; + frcti->recovery_high = snd_cr->seqno; +} + +__attribute__((hot)) +static int frcti_snd(struct frcti * frcti, + struct ssm_pk_buff * spb, + uint16_t flags) +{ + struct frct_pci * pci; + struct frct_pci_stream * spci = NULL; + struct timespec now; + struct frct_cr * snd_cr; + struct frct_cr * rcv_cr; + uint32_t seqno; + uint16_t pci_flags = 0; + bool rtx; + uint64_t now_ns; + int64_t rcv_idle; + uint32_t probe_id = 0; + uint8_t probe_nonce[RTTP_NONCE_LEN] = { 0 }; + bool probe; + size_t payload_len = 0; assert(frcti); - assert(ssm_pk_buff_len(spb) != 0); + /* Stream mode permits 0-byte sends for the EOS marker. */ + assert(ssm_pk_buff_len(spb) != 0 || frcti->stream); snd_cr = &frcti->snd_cr; rcv_cr = &frcti->rcv_cr; - timerwheel_move(); + tw_move_safe(); + + if (frcti->stream) + payload_len = ssm_pk_buff_len(spb); - pci = (struct frct_pci *) ssm_pk_buff_head_alloc(spb, FRCT_PCILEN); + pci = FRCT_HDR_PUSH(spb, frcti); if (pci == NULL) return -ENOMEM; - memset(pci, 0, sizeof(*pci)); + memset(pci, 0, FRCT_PCILEN); + + if (frcti->stream) + spci = FRCT_SPCI(pci); clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); pthread_rwlock_wrlock(&frcti->lock); rtx = snd_cr->cflags & FRCTFRTX; - pci->flags |= FRCT_DATA; + pci_flags |= FRCT_DATA; + if (!frcti->stream) + pci_flags |= (flags & FRCT_FR_MASK); - /* Set DRF if there are no unacknowledged packets. */ - if (snd_cr->seqno == snd_cr->lwe) - pci->flags |= FRCT_DRF; + if (!frcti->stream && (flags & FRCT_FR_MASK) != FRCT_FR_SOLE) + STAT_BUMP(frcti, frag_snd); - /* Choose a new sequence number if sender inactivity expired. */ - if (now.tv_sec - snd_cr->act.tv_sec > snd_cr->inact) { - /* There are no unacknowledged packets. */ - assert(snd_cr->seqno == snd_cr->lwe); - random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno)); - snd_cr->lwe = snd_cr->seqno; - snd_cr->rwe = snd_cr->lwe + START_WINDOW; + if (frcti->stream) { + if (flags & FRCT_FIN) + pci_flags |= FRCT_FIN; + + spci->start = hton32(frcti->snd_byte_next); + frcti->snd_byte_next += (uint32_t) payload_len; + spci->end = hton32(frcti->snd_byte_next); + STAT_ADD(frcti, strm_snd_byte, payload_len); } + if (snd_cr->seqno == snd_cr->lwe) + pci_flags |= FRCT_DRF; + + seqno_rotate(frcti, now_ns); + seqno = snd_cr->seqno; pci->seqno = hton32(seqno); - if (now.tv_sec - rcv_cr->act.tv_sec < rcv_cr->inact) { - pci->flags |= FRCT_FC; - *((uint32_t *) pci) |= hton32(rcv_cr->rwe & 0x00FFFFFF); + rcv_idle = ts_age_ns(now_ns, rcv_cr->act); + + if (rcv_idle < (int64_t) rcv_cr->inact) { + pci_flags |= FRCT_FC; + pci->window = hton32(frcti_advert_rwe(frcti)); } if (!rtx) { - snd_cr->lwe++; + STORE_RELEASE(&snd_cr->lwe, snd_cr->lwe + 1); + STORE_RELEASE(&snd_cr->rwe, snd_cr->lwe + RQ_SIZE); } else { - if (!frcti->probe) { - frcti->rttseq = snd_cr->seqno; - frcti->t_probe = now; - frcti->probe = true; -#ifdef PROC_FLOW_STATS - frcti->n_prb++; -#endif - } - if ((now.tv_sec - rcv_cr->act.tv_sec) * BILLION <= frcti->a) { - pci->flags |= FRCT_ACK; + size_t p = RQ_SLOT(seqno); + frcti->snd_slots[p].time = now_ns; + /* Fresh send clears RTX bits. */ + frcti->snd_slots[p].flags = 0; + if (rcv_idle <= (int64_t) frcti->t_a) { + pci_flags |= FRCT_ACK; pci->ackno = hton32(rcv_cr->lwe); rcv_cr->seqno = rcv_cr->lwe; } } + pci->flags = hton16(pci_flags); + + frct_hcs_set(pci, frcti->stream); + snd_cr->seqno++; - snd_cr->act = now; + STORE_RELEASE(&snd_cr->act, now_ns); + + probe = rtt_probe_arm(frcti, now_ns, &probe_id, probe_nonce); pthread_rwlock_unlock(&frcti->lock); - if (rtx) - timerwheel_rxm(frcti, seqno, spb); + if (probe) + frcti_rttp_snd(frcti, probe_id, 0, probe_nonce); + + if (rtx) { + rxm_arm(frcti, seqno, spb); + tlp_arm(frcti); + } return 0; } -static void rtt_estimator(struct frcti * frcti, - time_t mrtt) +/* + * Stream: 0-byte FRCT_FIN DATA so peer's flow_read returns 0 at this + * byte. Msg: control packet with FRCT_FIN flag, snd_cr.seqno carried + * in pci->ackno (sender packs via frcti_pkt_snd's ackno parameter). + */ +static void frcti_fin_snd(struct frcti * frcti) { - time_t srtt = frcti->srtt; - time_t rttvar = frcti->mdev; + struct ssm_pk_buff * spb; + bool already; + uint32_t fin_seqno; - if (srtt == 0) { /* first measurement */ - srtt = mrtt; - rttvar = mrtt >> 1; - } else { - time_t delta = mrtt - srtt; - srtt += (delta >> 3); - delta = (ABS(delta) - rttvar) >> 2; -#ifdef FRCT_LINUX_RTT_ESTIMATOR - if (delta < 0) - delta >>= 3; -#endif - rttvar += delta; + if (!(frcti->snd_cr.cflags & FRCTFLINGER)) + return; + + pthread_rwlock_wrlock(&frcti->lock); + + already = frcti->snd_fin_sent; + frcti->snd_fin_sent = true; + fin_seqno = frcti->snd_cr.seqno; + + if (!already && !frcti->stream) + frcti->snd_fin_seqno = fin_seqno; + + pthread_rwlock_unlock(&frcti->lock); + + if (already) + return; + + if (!frcti->stream) { + frcti_pkt_snd(frcti, FRCT_FIN, fin_seqno, 0); + return; } -#ifdef PROC_FLOW_STATS - frcti->n_rtt++; -#endif - frcti->srtt = MAX(1000L, srtt); - frcti->mdev = MAX(100L, rttvar); - frcti->rto = MAX(RTO_MIN, frcti->srtt + (frcti->mdev << MDEV_MUL)); -} - -/* Always queues the next application packet on the RQ. */ -static void __frcti_rcv(struct frcti * frcti, - struct ssm_pk_buff * spb) -{ - ssize_t idx; - size_t pos; - struct frct_pci * pci; - struct timespec now; - struct frct_cr * rcv_cr; - struct frct_cr * snd_cr; - uint32_t seqno; - uint32_t ackno; - uint32_t rwe; - int fd = -1; - assert(frcti); + if (frct_spb_reserve(frcti_data_hdr_len(frcti), &spb) < 0) + return; + /* Reset spb to 0-len so frcti_snd's head_alloc populates PCI. */ + ssm_pk_buff_truncate(spb, 0); + + if (frcti_snd(frcti, spb, FRCT_FIN) < 0) { + frct_spb_release(spb); + return; + } + + if (frct_tx(frcti, spb) < 0) + return; + + pthread_rwlock_wrlock(&frcti->lock); + + frcti->snd_fin_seqno = frcti->snd_cr.seqno - 1; + + pthread_rwlock_unlock(&frcti->lock); +} + +static bool final_ack_due(struct frcti * frcti, + struct frct_cr * rcv_cr, + uint64_t now_ns) +{ + if (rcv_cr->lwe == rcv_cr->seqno) + return false; + + if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a)) + return false; + + return true; +} + +/* Snd-side has FLINGER cflag and unACK'd data below the FIN/seqno. */ +static __inline__ bool snd_drain_pending(struct frct_cr * snd_cr, + uint32_t edge) +{ + if (!(snd_cr->cflags & FRCTFLINGER)) + return false; + + return before(snd_cr->lwe, edge); +} + +/* Peer is still active and we haven't seen their FIN yet. */ +static __inline__ bool rcv_drain_pending(struct frcti * frcti, + struct frct_cr * rcv_cr, + uint64_t now_ns) +{ + if (frcti->rcv_fin_seen) + return false; + + return !ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact); +} + +/* Drain-loop predicate: snd-side unACK'd data OR peer still active. */ +static bool frcti_lingering(struct frcti * frcti) +{ + struct timespec now; + struct frct_cr * snd_cr; + struct frct_cr * rcv_cr; + uint32_t edge; + uint64_t now_ns; + bool snd_linger; + bool rcv_linger; + + /* Idempotent; emits FIN once per side, both stream and msg. */ + frcti_fin_snd(frcti); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_rwlock_rdlock(&frcti->lock); + + snd_cr = &frcti->snd_cr; rcv_cr = &frcti->rcv_cr; + + if (frcti->snd_fin_sent) + edge = frcti->snd_fin_seqno; + else + edge = snd_cr->seqno; + + snd_linger = snd_drain_pending(snd_cr, edge); + rcv_linger = rcv_drain_pending(frcti, rcv_cr, now_ns); + + pthread_rwlock_unlock(&frcti->lock); + + return snd_linger || rcv_linger; +} + +static time_t frcti_dealloc(struct frcti * frcti) +{ + struct timespec now; + struct frct_cr * snd_cr; + struct frct_cr * rcv_cr; + int ackno; + bool due; + int64_t now_ns; + int64_t rcv; + int64_t snd; + snd_cr = &frcti->snd_cr; + rcv_cr = &frcti->rcv_cr; + + /* Idempotent; usually already sent by frcti_lingering. */ + frcti_fin_snd(frcti); clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); - pci = (struct frct_pci *) ssm_pk_buff_head_release(spb, FRCT_PCILEN); + pthread_rwlock_rdlock(&frcti->lock); - idx = ssm_pk_buff_get_idx(spb); - seqno = ntoh32(pci->seqno); - pos = seqno & (RQ_SIZE - 1); + ackno = rcv_cr->lwe; + rcv = (int64_t)(rcv_cr->act + rcv_cr->inact) - now_ns; + snd = (int64_t)(snd_cr->act + snd_cr->inact) - now_ns; + due = final_ack_due(frcti, rcv_cr, now_ns); - pthread_rwlock_wrlock(&frcti->lock); + pthread_rwlock_unlock(&frcti->lock); - if (now.tv_sec - rcv_cr->act.tv_sec > rcv_cr->inact) { - if (pci->flags & FRCT_DRF) { /* New run. */ - rcv_cr->lwe = seqno; - rcv_cr->rwe = seqno + RQ_SIZE; - rcv_cr->seqno = seqno; - } else if (pci->flags & FRCT_DATA) { - goto drop_packet; - } - } + if (due) + frcti_pkt_snd(frcti, FRCT_ACK, ackno, 0); - rcv_cr->act = now; + return (time_t) MAX((MAX(rcv, snd) / BILLION), 0); +} - /* For now, just send an immediate window update. */ - if (pci->flags & FRCT_RDVS) { - fd = frcti->fd; - rwe = rcv_cr->rwe; - pthread_rwlock_unlock(&frcti->lock); +__attribute__((hot)) +static void frcti_rcv(struct frcti * frcti, + struct ssm_pk_buff * spb) +{ + ssize_t idx; + size_t pos; + struct frct_pci * pci; + struct timespec now; + uint64_t now_ns; + struct frct_cr * rcv_cr; + uint32_t seqno; + uint16_t flags; + buffer_t pkt; + struct pending pending = { 0 }; + bool in_order; + struct sack_args * sa = NULL; + bool send_sack = false; - __send_frct_pkt(fd, FRCT_FC, 0, rwe); + assert(frcti); + + rcv_cr = &frcti->rcv_cr; - ssm_pool_remove(proc.pool, idx); + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + if (ssm_pk_buff_len(spb) < FRCT_PCILEN) { + frct_spb_release(spb); return; } - if (pci->flags & FRCT_ACK) { - ackno = ntoh32(pci->ackno); - if (after(ackno, frcti->snd_cr.lwe)) - frcti->snd_cr.lwe = ackno; + pci = FRCT_HDR_POP(spb, frct_pci); - if (frcti->probe && after(ackno, frcti->rttseq)) { -#ifdef PROC_FLOW_STATS - if (!(pci->flags & FRCT_DATA)) - frcti->n_dak++; -#endif - rtt_estimator(frcti, ts_diff_ns(&now, &frcti->t_probe)); - frcti->probe = false; - } + idx = ssm_pk_buff_get_off(spb); + seqno = ntoh32(pci->seqno); + pos = RQ_SLOT(seqno); + + flags = ntoh16(pci->flags); + + pkt.data = ssm_pk_buff_head(spb); + pkt.len = ssm_pk_buff_len(spb); + + if (flags & FRCT_RXM) + STAT_BUMP(frcti, rxm_rcv); + + /* Stateless / lock-free dispatches. spb released via ctrl_done. */ + if (flags & FRCT_KA) { + frcti_ka_rcv(frcti, pci, now_ns, flags); + goto ctrl_done; } - if (pci->flags & FRCT_FC) { - uint32_t rwe; + if (flags & FRCT_RTTP) { + frcti_rttp_rcv(frcti, pkt, now_ns); + goto ctrl_done; + } - rwe = ntoh32(*((uint32_t *)pci) & hton32(0x00FFFFFF)); - rwe |= snd_cr->rwe & 0xFF000000; + if (flags & FRCT_NACK) { + frcti_nack_rcv(frcti); + goto ctrl_done; + } + + if (flags & FRCT_RDVS) { + frcti_rdv_rcv(frcti); + goto ctrl_done; + } - /* Rollover for 24 bit */ - if (before(rwe, snd_cr->rwe) && snd_cr->rwe - rwe > 0x007FFFFF) - rwe += 0x01000000; + /* Msg-mode FIN: control packet, FIN seqno carried in pci->ackno. */ + if ((flags & FRCT_FIN) && !(flags & FRCT_DATA)) { + pthread_rwlock_wrlock(&frcti->lock); + if (!frcti->rcv_fin_seen) { + frcti->rcv_fin_seen = true; + frcti->rcv_byte_fin = ntoh32(pci->ackno); + } + pthread_rwlock_unlock(&frcti->lock); + goto ctrl_done; + } - snd_cr->rwe = rwe; + pthread_rwlock_wrlock(&frcti->lock); - pthread_mutex_lock(&frcti->mtx); - if (!frcti->open) { - frcti->open = true; - pthread_cond_broadcast(&frcti->cond); + /* rcv_inact_check is a no-op for non-DATA non-DRF packets. */ + if (flags & (FRCT_DATA | FRCT_DRF)) { + switch (rcv_inact_check(frcti, flags, seqno, now_ns)) { + case FRCT_INACT_NEED_NACK: + pthread_rwlock_unlock(&frcti->lock); + frcti_nack_snd(frcti, seqno - 1); + frct_spb_release(spb); + return; + case FRCT_INACT_DROP: + STAT_BUMP(frcti, inact_drop); + goto drop_packet; + case FRCT_ACTIVE: + /* FALLTHRU */ + default: + break; } - pthread_mutex_unlock(&frcti->mtx); } - if (!(pci->flags & FRCT_DATA)) + /* DATA-only act refresh: non-DATA would lock out DRF rebase. */ + if (flags & FRCT_DATA) + STORE_RELEASE(&rcv_cr->act, now_ns); + + /* Wire-dup ACK packet: same seqno as the previous emission. */ + if (is_dup_ack(frcti, flags, seqno)) { + STAT_BUMP(frcti, ack_dup_rcv); + goto drop_packet; + } + + /* Wire-dup of DATA: piggybacked ACK info already processed. */ + if (is_dup_data(flags, seqno, rcv_cr->lwe)) { + rcv_cr->seqno = seqno; + STAT_BUMP(frcti, dup_rcv); + /* RFC 2883 §4 case 1: dup below cum-ACK. */ + frcti->dsack_seqno = seqno; + frcti->dsack_valid = true; + goto drop_packet; + } + + if (flags & FRCT_ACK) + frcti_ack_rcv(frcti, pci, flags, now_ns, &pending); + + if (flags & FRCT_SACK) + frcti_sack_rcv(frcti, pkt, ntoh32(pci->ackno), + now_ns, &pending); + + if (flags & FRCT_FC) + frcti_fc_rcv(frcti, pci); + + if (!(flags & FRCT_DATA)) goto drop_packet; if (before(seqno, rcv_cr->lwe)) { - rcv_cr->seqno = seqno; /* Ensures we send a new ACK. */ -#ifdef PROC_FLOW_STATS - frcti->n_dup++; -#endif + /* Bump rcv_cr.seqno to force ack_snd to fire on the dup. */ + rcv_cr->seqno = seqno; + if (flags & FRCT_RXM) + STAT_BUMP(frcti, rxm_dup_rcv); + else + STAT_BUMP(frcti, dup_rcv); + /* RFC 2883 §4 case 1: dup below cum-ACK. */ + frcti->dsack_seqno = seqno; + frcti->dsack_valid = true; goto drop_packet; } - if (rcv_cr->cflags & FRCTFRTX) { + if (!rq_accept(frcti, seqno, pos, flags)) + goto drop_packet; - if (!before(seqno, rcv_cr->rwe)) { /* Out of window. */ -#ifdef PROC_FLOW_STATS - frcti->n_out++; -#endif + if (frcti->stream) { + if (frcti_stream_data_rcv(frcti, spb, pos, flags) < 0) { + STAT_BUMP(frcti, strm_drop); goto drop_packet; } - - if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) { -#ifdef PROC_FLOW_STATS - frcti->n_rqo++; -#endif - goto drop_packet; /* Out of rq. */ - } - if (frcti->rq[pos] != -1) { -#ifdef PROC_FLOW_STATS - frcti->n_dup++; -#endif - goto drop_packet; /* Duplicate in rq. */ - } - fd = frcti->fd; + /* spb consumed by stash; do not release in drop path. */ + spb = NULL; } else { - rcv_cr->lwe = seqno; + frcti_data_stash(frcti, idx, pos, flags); + } + + /* Lazy alloc: only OOO arrivals can trigger a SACK send. */ + if (after(seqno, rcv_cr->lwe) && frcti->sack_n_max > 0) { + size_t sa_sz = sizeof(*sa) + + frcti->sack_n_max * sizeof(sa->blocks[0]); + sa = malloc(sa_sz); + /* If alloc fails, sack_check sees NULL and we skip SACK. */ } - frcti->rq[pos] = idx; + send_sack = sa != NULL && sack_check(frcti, seqno, now_ns, sa); + in_order = !after(seqno, rcv_cr->lwe); pthread_rwlock_unlock(&frcti->lock); - if (fd != -1) - timerwheel_delayed_ack(fd, frcti); + if (send_sack) { + STAT_BUMP(frcti, sack_snd); + if (sa->dsack) + STAT_BUMP(frcti, dsack_snd); + frcti_sack_snd(frcti, sa); + } else if (in_order) { + ack_arm(frcti); + } + + if ((flags & FRCT_ACK) && frcti->snd_cr.seqno != frcti->snd_cr.lwe) + tlp_arm(frcti); + + pending_flush(frcti, &pending); + + frcti_rcv_probe(frcti, now_ns); + free(sa); + return; + + ctrl_done: + frct_spb_release(spb); return; drop_packet: pthread_rwlock_unlock(&frcti->lock); - ssm_pool_remove(proc.pool, idx); - send_frct_pkt(frcti); - return; + frct_spb_release(spb); + /* with_sack=true: ack_snd no-ops if neither dsack nor SACK is due. */ + ack_snd(frcti, true); + + pending_flush(frcti, &pending); + free(sa); } + +/* NULL-shim macros for the no-FRCT case. */ + +#define FRCTI_SND(frcti, spb, flags) \ + ((frcti) == NULL ? 0 : frcti_snd((frcti), (spb), (flags))) + +#define FRCTI_RCV(frcti, spb) \ + do { \ + if ((frcti) != NULL) \ + frcti_rcv((frcti), (spb)); \ + } while (0) + +#define FRCTI_PDU_READY(frcti) \ + ((frcti) != NULL && frcti_pdu_ready(frcti)) + +#define FRCTI_CONSUME(frcti, buf, count) \ + ((frcti) == NULL ? (ssize_t) -EAGAIN \ + : (frcti)->stream \ + ? frcti_consume_stream((frcti), (buf), (count)) \ + : frcti_consume((frcti), (buf), (count))) + +#define FRCTI_IS_FRTX(frcti) \ + ((frcti) != NULL && ((frcti)->rcv_cr.cflags & FRCTFRTX)) + +#define FRCTI_IS_STREAM(frcti) ((frcti) != NULL && (frcti)->stream) + +#define FRCTI_PAYLOAD_CAP(frcti) \ + ((frcti)->frag_mtu - frcti_data_hdr_len(frcti)) + +#define FRCTI_NEEDS_FRAG(frcti, count) \ + ((frcti) != NULL && (count) > FRCTI_PAYLOAD_CAP(frcti)) + +#define FRCTI_IS_WINDOW_OPEN(frcti) \ + ((frcti) == NULL ? true : frcti_is_window_open(frcti)) + +#define FRCTI_IS_WINDOW_OPEN_N(frcti, n) \ + ((frcti) == NULL ? true : frcti_is_window_open_n((frcti), (n))) + +#define FRCTI_LINGERING(frcti) \ + ((frcti) == NULL ? false : frcti_lingering(frcti)) + +#define FRCTI_DEALLOC(frcti) \ + ((frcti) == NULL ? (time_t) 0 : frcti_dealloc(frcti)) + diff --git a/src/lib/hash.c b/src/lib/hash.c index 7adee968..7ffa5bc1 100644 --- a/src/lib/hash.c +++ b/src/lib/hash.c @@ -39,6 +39,9 @@ #include <ouroboros/md5.h> #include <ouroboros/sha3.h> #endif +#include <ouroboros/crc8.h> +#include <ouroboros/crc16.h> +#include <ouroboros/crc64.h> #include <string.h> #include <assert.h> #include <stdbool.h> @@ -69,6 +72,12 @@ int hash_len_tbl [] = { uint16_t hash_len(enum hash_algo algo) { + if (algo == HASH_CRC8) + return CRC8_HASH_LEN; + if (algo == HASH_CRC16) + return CRC16_HASH_LEN; + if (algo == HASH_CRC64) + return CRC64_HASH_LEN; #ifdef HAVE_LIBGCRYPT return (uint16_t) gcry_md_get_algo_dlen(gcry_algo_tbl[algo]); #else @@ -81,6 +90,27 @@ void mem_hash(enum hash_algo algo, const uint8_t * buf, size_t len) { + if (algo == HASH_CRC8) { + uint8_t crc = 0; + + crc8_autosar(&crc, buf, len); + *(uint8_t *) dst = crc; + return; + } + if (algo == HASH_CRC16) { + uint16_t crc = 0; + + crc16_ccitt_false(&crc, buf, len); + *(uint16_t *) dst = htobe16(crc); + return; + } + if (algo == HASH_CRC64) { + uint64_t crc = 0; + + crc64_nvme(&crc, buf, len); + *(uint64_t *) dst = htobe64(crc); + return; + } #ifdef HAVE_LIBGCRYPT gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len); #else diff --git a/src/lib/irm.c b/src/lib/irm.c index 594014f7..c62701aa 100644 --- a/src/lib/irm.c +++ b/src/lib/irm.c @@ -614,7 +614,7 @@ ssize_t irm_list_names(struct name_info ** names) return 0; } - *names = malloc(nr * sizeof(**names)); + *names = calloc(nr, sizeof(**names)); if (*names == NULL) { irm_msg__free_unpacked(recv_msg, NULL); return -ENOMEM; diff --git a/src/lib/pb/ipcp.proto b/src/lib/pb/ipcp.proto index 9dc402f5..406b8d9c 100644 --- a/src/lib/pb/ipcp.proto +++ b/src/lib/pb/ipcp.proto @@ -54,7 +54,7 @@ message ipcp_msg { optional int32 response = 10; optional string comp = 11; optional uint32 timeo_sec = 12; - optional sint32 mpl = 13; + optional sint32 mpl = 13; /* MPL in ms. */ optional int32 result = 14; optional uint32 uid = 15; /* 0 = GSPP, >0 = PUP uid */ } diff --git a/src/lib/pb/irm.proto b/src/lib/pb/irm.proto index 9ed0a29b..5de860a5 100644 --- a/src/lib/pb/irm.proto +++ b/src/lib/pb/irm.proto @@ -88,12 +88,12 @@ message irm_msg { repeated ipcp_list_msg ipcps = 17; repeated name_info_msg names = 18; optional timespec_msg timeo = 19; - optional sint32 mpl = 20; + optional sint32 mpl = 20; /* MPL in ms. */ optional string comp = 21; optional bytes pk = 22; /* piggyback */ optional uint32 timeo_sec = 23; optional uint32 timeo_nsec = 24; optional sint32 result = 25; - optional bytes sym_key = 26; /* symmetric encryption key */ - optional sint32 cipher_nid = 27; /* cipher NID */ + optional bytes sym_key = 26; /* symmetric encryption key */ + optional sint32 cipher_nid = 27; /* cipher NID */ } diff --git a/src/lib/pb/model.proto b/src/lib/pb/model.proto index f1382f3d..4c1564a5 100644 --- a/src/lib/pb/model.proto +++ b/src/lib/pb/model.proto @@ -28,7 +28,7 @@ message qosspec_msg { required uint32 availability = 3; /* Class of 9s. */ required uint32 loss = 4; /* Packet loss. */ required uint32 ber = 5; /* Bit error rate, ppb. */ - required uint32 in_order = 6; /* In-order delivery. */ + required uint32 service = 6; /* enum qos_service. */ required uint32 max_gap = 7; /* In ms. */ required uint32 timeout = 8; /* Timeout in ms. */ } @@ -37,10 +37,11 @@ message flow_info_msg { required uint32 id = 1; required uint32 n_pid = 2; required uint32 n_1_pid = 3; - required uint32 mpl = 4; + required uint32 mpl = 4; /* MPL in ms. */ required uint32 state = 5; required qosspec_msg qos = 6; required uint32 uid = 7; + required uint32 mtu = 8; /* Layer MTU (bytes). */ } message name_info_msg { diff --git a/src/lib/protobuf.c b/src/lib/protobuf.c index d419a9f1..a824d357 100644 --- a/src/lib/protobuf.c +++ b/src/lib/protobuf.c @@ -81,6 +81,7 @@ flow_info_msg_t * flow_info_s_to_msg(const struct flow_info * s) msg->mpl = s->mpl; msg->state = s->state; msg->uid = s->uid; + msg->mtu = s->mtu; msg->qos = qos_spec_s_to_msg(&s->qs); if (msg->qos == NULL) goto fail_msg; @@ -107,6 +108,7 @@ struct flow_info flow_info_msg_to_s(const flow_info_msg_t * msg) s.mpl = msg->mpl; s.state = msg->state; s.uid = msg->uid; + s.mtu = msg->mtu; s.qs = qos_spec_msg_to_s(msg->qos); return s; @@ -137,7 +139,7 @@ name_info_msg_t * name_info_s_to_msg(const struct name_info * info) goto fail_msg; msg->ckey = strdup(info->c.key); - if (msg->skey == NULL) + if (msg->ckey == NULL) goto fail_msg; msg->ccrt = strdup(info->c.crt); @@ -161,6 +163,8 @@ struct name_info name_info_msg_to_s(const name_info_msg_t * msg) assert(msg != NULL); assert(strlen(msg->name) <= NAME_SIZE); + memset(&s, 0, sizeof(s)); + strcpy(s.name, msg->name); strcpy(s.s.key, msg->skey); strcpy(s.s.crt, msg->scrt); @@ -755,7 +759,7 @@ qosspec_msg_t * qos_spec_s_to_msg(const struct qos_spec * s) msg->availability = s->availability; msg->loss = s->loss; msg->ber = s->ber; - msg->in_order = s->in_order; + msg->service = s->service; msg->max_gap = s->max_gap; msg->timeout = s->timeout; @@ -773,7 +777,7 @@ struct qos_spec qos_spec_msg_to_s(const qosspec_msg_t * msg) s.availability = msg->availability; s.loss = msg->loss; s.ber = msg->ber; - s.in_order = msg->in_order; + s.service = msg->service; s.max_gap = msg->max_gap; s.timeout = msg->timeout; diff --git a/src/lib/qoscube.c b/src/lib/qoscube.c index 1eaa0d7c..5d7ae17d 100644 --- a/src/lib/qoscube.c +++ b/src/lib/qoscube.c @@ -29,15 +29,11 @@ qoscube_t qos_spec_to_cube(qosspec_t qs) { - if (qs.delay <= qos_voice.delay && - qs.bandwidth <= qos_voice.bandwidth && - qs.availability >= qos_voice.availability && - qs.max_gap <= qos_voice.max_gap) + if (qs.delay <= 50 && qs.bandwidth <= 100000 + && qs.availability >= 5 && qs.max_gap <= 50) return QOS_CUBE_VOICE; - else if (qs.delay <= qos_video.delay && - qs.bandwidth <= qos_video.bandwidth && - qs.availability >= qos_video.availability && - qs.max_gap <= qos_video.max_gap) + else if (qs.delay <= 100 && qs.availability >= 3 + && qs.max_gap <= 100) return QOS_CUBE_VIDEO; else return QOS_CUBE_BE; diff --git a/src/lib/random.c b/src/lib/random.c index 96315132..2c9a6c0d 100644 --- a/src/lib/random.c +++ b/src/lib/random.c @@ -47,8 +47,9 @@ int random_buffer(void * buf, gcry_randomize(buf, len, GCRY_STRONG_RANDOM); return 0; #elif defined(HAVE_OPENSSL_RNG) - if (len > 0 && len < INT_MAX) - return RAND_bytes((unsigned char *) buf, (int) len); - return -1; + if (len == 0 || len >= INT_MAX) + return -1; + + return RAND_bytes((unsigned char *) buf, (int) len) == 1 ? 0 : -1; #endif } diff --git a/src/lib/rib.c b/src/lib/rib.c index a8d535c9..6e421397 100644 --- a/src/lib/rib.c +++ b/src/lib/rib.c @@ -112,14 +112,14 @@ static int rib_read(const char * path, (void) info; (void) offset; - pthread_rwlock_wrlock(&rib.lock); + pthread_rwlock_rdlock(&rib.lock); list_for_each(p, &rib.reg_comps) { struct reg_comp * r = list_entry(p, struct reg_comp, next); if (strcmp(comp, r->path) == 0) { - int ret = r->ops->read(path + 1, buf, size); + struct rib_ops * ops = r->ops; pthread_rwlock_unlock(&rib.lock); - return ret; + return ops->read(path + 1, buf, size); } } @@ -160,19 +160,25 @@ static int rib_readdir(const char * path, ssize_t len; ssize_t i; struct reg_comp * c; + struct rib_ops * ops; c = list_entry(p, struct reg_comp, next); if (strcmp(path + 1, c->path) != 0) continue; - assert(c->ops->readdir != NULL); + ops = c->ops; + + assert(ops->readdir != NULL); + + pthread_rwlock_unlock(&rib.lock); - len = c->ops->readdir(&dir_entries); + len = ops->readdir(&dir_entries); if (len < 0) - break; + return 0; for (i = 0; i < len; ++i) filler(buf, dir_entries[i], NULL, 0); freepp(char, dir_entries, len); + return 0; } } diff --git a/src/lib/ssm/flow_set.c b/src/lib/ssm/flow_set.c index 73d0db55..cb38e6fd 100644 --- a/src/lib/ssm/flow_set.c +++ b/src/lib/ssm/flow_set.c @@ -58,9 +58,9 @@ #define QUEUESIZE ((SSM_RBUFF_SIZE) * sizeof(struct flowevent)) #define SSM_FSET_FILE_SIZE (SYS_MAX_FLOWS * sizeof(ssize_t) \ - + PROG_MAX_FQUEUES * sizeof(size_t) \ - + PROG_MAX_FQUEUES * sizeof(pthread_cond_t) \ - + PROG_MAX_FQUEUES * QUEUESIZE \ + + PROC_MAX_FQUEUES * sizeof(size_t) \ + + PROC_MAX_FQUEUES * sizeof(pthread_cond_t) \ + + PROC_MAX_FQUEUES * QUEUESIZE \ + sizeof(pthread_mutex_t)) #define fqueue_ptr(fs, idx) (fs->fqueues + (SSM_RBUFF_SIZE) * idx) @@ -104,10 +104,10 @@ static struct ssm_flow_set * flow_set_create(pid_t pid, set->mtable = shm_base; set->heads = (size_t *) (set->mtable + SYS_MAX_FLOWS); - set->conds = (pthread_cond_t *)(set->heads + PROG_MAX_FQUEUES); - set->fqueues = (struct flowevent *) (set->conds + PROG_MAX_FQUEUES); + set->conds = (pthread_cond_t *)(set->heads + PROC_MAX_FQUEUES); + set->fqueues = (struct flowevent *) (set->conds + PROC_MAX_FQUEUES); set->lock = (pthread_mutex_t *) - (set->fqueues + PROG_MAX_FQUEUES * (SSM_RBUFF_SIZE)); + (set->fqueues + PROC_MAX_FQUEUES * (SSM_RBUFF_SIZE)); return set; @@ -164,7 +164,7 @@ struct ssm_flow_set * ssm_flow_set_create(pid_t pid) if (pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK)) goto fail_condattr_set; #endif - for (i = 0; i < PROG_MAX_FQUEUES; ++i) { + for (i = 0; i < PROC_MAX_FQUEUES; ++i) { set->heads[i] = 0; if (pthread_cond_init(&set->conds[i], &cattr)) goto fail_init; @@ -222,7 +222,7 @@ void ssm_flow_set_zero(struct ssm_flow_set * set, ssize_t i = 0; assert(set); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -242,7 +242,7 @@ int ssm_flow_set_add(struct ssm_flow_set * set, { assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -264,7 +264,7 @@ void ssm_flow_set_del(struct ssm_flow_set * set, { assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -282,7 +282,7 @@ int ssm_flow_set_has(struct ssm_flow_set * set, assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -332,7 +332,7 @@ ssize_t ssm_flow_set_wait(const struct ssm_flow_set * set, ssize_t ret = 0; assert(set); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); assert(fqueue); #ifndef HAVE_ROBUST_MUTEX diff --git a/src/lib/ssm/pool.c b/src/lib/ssm/pool.c index 5c98b515..5607a360 100644 --- a/src/lib/ssm/pool.c +++ b/src/lib/ssm/pool.c @@ -24,6 +24,7 @@ #include "config.h" +#include <ouroboros/atomics.h> #include <ouroboros/errno.h> #include <ouroboros/pthread.h> #include <ouroboros/ssm_pool.h> @@ -75,26 +76,6 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = { #define GET_SHARD_FOR_PID(pid) ((int)((pid) % SSM_POOL_SHARDS)) -#define LOAD_RELAXED(ptr) \ - (__atomic_load_n(ptr, __ATOMIC_RELAXED)) - -#define LOAD_ACQUIRE(ptr) \ - (__atomic_load_n(ptr, __ATOMIC_ACQUIRE)) - -#define STORE_RELEASE(ptr, val) \ - (__atomic_store_n(ptr, val, __ATOMIC_RELEASE)) - -#define LOAD(ptr) \ - (__atomic_load_n(ptr, __ATOMIC_SEQ_CST)) - -#define STORE(ptr, val) \ - (__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)) - -#define FETCH_ADD(ptr, val) \ - (__atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST)) - -#define FETCH_SUB(ptr, val) \ - (__atomic_fetch_sub(ptr, val, __ATOMIC_SEQ_CST)) #define SSM_FILE_SIZE (SSM_POOL_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr)) #define SSM_GSPP_FILE_SIZE (SSM_GSPP_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr)) @@ -107,6 +88,8 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = { : SSM_PUP_FILE_SIZE) #define GET_POOL_CFG(uid) (IS_GSPP(uid) ? ssm_gspp_cfg : ssm_pup_cfg) +#define NEEDS_CHOWN(uid, gid) ((uid) != geteuid() || (gid) != getegid()) + struct ssm_pool { uint8_t * shm_base; /* start of blocks */ struct _ssm_pool_hdr * hdr; /* shared memory header */ @@ -163,29 +146,6 @@ static __inline__ void list_add_head(struct _ssm_list_head * head, STORE(&head->count, LOAD(&head->count) + 1); } -static __inline__ int select_size_class(struct ssm_pool * pool, - size_t len) -{ - size_t sz; - int i; - - assert(pool != NULL); - - /* Total space needed: header + headspace + data + tailspace */ - sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_HEADSPACE + len - + SSM_PK_BUFF_TAILSPACE; - - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - struct _ssm_size_class * sc; - - sc = &pool->hdr->size_classes[i]; - if (sc->object_size > 0 && sz <= sc->object_size) - return i; - } - - return -1; -} - static __inline__ int find_size_class_for_offset(struct ssm_pool * pool, size_t offset) { @@ -548,7 +508,7 @@ static struct ssm_pool * __pool_create(const char * name, if (flags & O_CREAT) { if (ftruncate(fd, (off_t) file_size) < 0) goto fail_truncate; - if (uid != geteuid() && fchown(fd, uid, gid) < 0) + if (NEEDS_CHOWN(uid, gid) && fchown(fd, uid, gid) < 0) goto fail_truncate; } @@ -700,7 +660,7 @@ ssize_t ssm_pool_alloc(struct ssm_pool * pool, assert(pool != NULL); assert(spb != NULL); - idx = select_size_class(pool, count); + idx = select_size_class(pool->hdr, count); if (idx >= 0) return alloc_from_sc(pool, idx, count, ptr, spb); @@ -718,7 +678,7 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, assert(pool != NULL); assert(spb != NULL); - idx = select_size_class(pool, count); + idx = select_size_class(pool->hdr, count); if (idx >= 0) return alloc_from_sc_b(pool, idx, count, ptr, spb, abstime); @@ -744,7 +704,7 @@ ssize_t ssm_pool_read(uint8_t ** dst, } struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool, - size_t off) + size_t off) { struct ssm_pk_buff * blk; @@ -823,36 +783,36 @@ int ssm_pool_remove(struct ssm_pool * pool, return 0; } -size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb) +size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb) { assert(spb != NULL); return spb->off; } -uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb) +uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb) { assert(spb != NULL); - return spb->data + spb->pk_head; + return (uint8_t *) spb->data + spb->pk_head; } -uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb) +uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb) { assert(spb != NULL); - return spb->data + spb->pk_tail; + return (uint8_t *) spb->data + spb->pk_tail; } -size_t ssm_pk_buff_len(struct ssm_pk_buff * spb) +size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb) { assert(spb != NULL); return spb->pk_tail - spb->pk_head; } -uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb, + size_t size) { assert(spb != NULL); @@ -864,8 +824,8 @@ uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb, return spb->data + spb->pk_head; } -uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb, + size_t size) { uint8_t * buf; @@ -881,8 +841,8 @@ uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb, return buf; } -uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb, + size_t size) { uint8_t * buf; @@ -896,8 +856,8 @@ uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb, return buf; } -uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb, + size_t size) { assert(spb != NULL); assert(!(size > spb->pk_tail - spb->pk_head)); diff --git a/src/lib/ssm/rbuff.c b/src/lib/ssm/rbuff.c index e4558c31..c149c306 100644 --- a/src/lib/ssm/rbuff.c +++ b/src/lib/ssm/rbuff.c @@ -80,6 +80,7 @@ struct ssm_rbuff { pthread_cond_t * del; /* signal when data removed */ pid_t pid; /* pid of the owner */ int flow_id; /* flow_id of the flow */ + size_t n_users; /* in-flight users */ }; #define MM_FLAGS (PROT_READ | PROT_WRITE) @@ -119,6 +120,7 @@ static struct ssm_rbuff * rbuff_create(pid_t pid, rb->del = rb->add + 1; rb->pid = pid; rb->flow_id = flow_id; + rb->n_users = 0; return rb; @@ -228,11 +230,20 @@ void ssm_rbuff_close(struct ssm_rbuff * rb) { assert(rb); + /* + * Caller must set ACL_FLOWDOWN first; if a user becomes + * cancellable, push a cleanup that decrements n_users. + */ + while (__atomic_load_n(&rb->n_users, __ATOMIC_SEQ_CST) > 0) { + struct timespec tic = { 0, 100000 }; + nanosleep(&tic, NULL); + } + rbuff_destroy(rb); } int ssm_rbuff_write(struct ssm_rbuff * rb, - size_t idx) + size_t off) { size_t acl; bool was_empty; @@ -240,6 +251,8 @@ int ssm_rbuff_write(struct ssm_rbuff * rb, assert(rb != NULL); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); if (acl != ACL_RDWR) { if (acl & ACL_FLOWDOWN) { @@ -261,7 +274,7 @@ int ssm_rbuff_write(struct ssm_rbuff * rb, was_empty = IS_EMPTY(rb); - HEAD(rb) = (ssize_t) idx; + HEAD(rb) = (ssize_t) off; ADVANCE_HEAD(rb); if (was_empty) @@ -269,16 +282,18 @@ int ssm_rbuff_write(struct ssm_rbuff * rb, pthread_mutex_unlock(rb->mtx); + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return 0; fail_mutex: pthread_mutex_unlock(rb->mtx); fail_acl: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return ret; } int ssm_rbuff_write_b(struct ssm_rbuff * rb, - size_t idx, + size_t off, const struct timespec * abstime) { size_t acl; @@ -287,6 +302,8 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb, assert(rb != NULL); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); if (acl != ACL_RDWR) { if (acl & ACL_FLOWDOWN) { @@ -316,7 +333,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb, if (ret != -ETIMEDOUT && ret != -EFLOWDOWN) { was_empty = IS_EMPTY(rb); - HEAD(rb) = (ssize_t) idx; + HEAD(rb) = (ssize_t) off; ADVANCE_HEAD(rb); if (was_empty) pthread_cond_broadcast(rb->add); @@ -325,6 +342,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb, pthread_mutex_unlock(rb->mtx); fail_acl: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return ret; } @@ -351,11 +369,21 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb) assert(rb != NULL); - if (IS_EMPTY(rb)) - return check_rb_acl(rb); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + + if (IS_EMPTY(rb)) { + ret = check_rb_acl(rb); + goto out; + } robust_mutex_lock(rb->mtx); + if (IS_EMPTY(rb)) { + pthread_mutex_unlock(rb->mtx); + ret = check_rb_acl(rb); + goto out; + } + ret = TAIL(rb); ADVANCE_TAIL(rb); @@ -363,6 +391,8 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb) pthread_mutex_unlock(rb->mtx); + out: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return ret; } @@ -374,9 +404,13 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb, assert(rb != NULL); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); - if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN)) - return -EFLOWDOWN; + if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN)) { + idx = -EFLOWDOWN; + goto out; + } robust_mutex_lock(rb->mtx); @@ -402,6 +436,8 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb, assert(idx != -EAGAIN); + out: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return idx; } @@ -410,7 +446,11 @@ void ssm_rbuff_set_acl(struct ssm_rbuff * rb, { assert(rb != NULL); + robust_mutex_lock(rb->mtx); __atomic_store_n(rb->acl, (size_t) flags, __ATOMIC_SEQ_CST); + pthread_cond_broadcast(rb->add); + pthread_cond_broadcast(rb->del); + pthread_mutex_unlock(rb->mtx); } uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb) @@ -424,6 +464,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb) { assert(rb != NULL); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + robust_mutex_lock(rb->mtx); pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx); @@ -432,6 +474,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb) robust_wait(rb->del, rb->mtx, NULL); pthread_cleanup_pop(true); + + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); } size_t ssm_rbuff_queued(struct ssm_rbuff * rb) diff --git a/src/lib/ssm/ssm.h.in b/src/lib/ssm/ssm.h.in index b9246c8b..b86327a1 100644 --- a/src/lib/ssm/ssm.h.in +++ b/src/lib/ssm/ssm.h.in @@ -38,7 +38,6 @@ #define SSM_RBUFF_PREFIX "@SSM_RBUFF_PREFIX@" #define SSM_FLOW_SET_PREFIX "@SSM_FLOW_SET_PREFIX@" #define SSM_POOL_NAME "@SSM_POOL_NAME@" -#define SSM_POOL_BLOCKS @SSM_POOL_BLOCKS@ #define SSM_RBUFF_SIZE @SSM_RBUFF_SIZE@ /* Packet buffer space reservation */ @@ -164,6 +163,24 @@ struct _ssm_pool_hdr { struct _ssm_size_class size_classes[SSM_POOL_MAX_CLASSES]; }; +#define SSM_PK_BUFF_TOTALSPACE (SSM_PK_BUFF_HEADSPACE + SSM_PK_BUFF_TAILSPACE) +static __inline__ int select_size_class(struct _ssm_pool_hdr * hdr, + size_t len) +{ + size_t sz; + int i; + + sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_TOTALSPACE + len; + + for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { + struct _ssm_size_class * sc = &hdr->size_classes[i]; + if (sc->object_size > 0 && sz <= sc->object_size) + return i; + } + + return -1; +} + #ifdef __cplusplus } #endif diff --git a/src/lib/ssm/tests/pool_sharding_test.c b/src/lib/ssm/tests/pool_sharding_test.c index c53105e3..ec464a92 100644 --- a/src/lib/ssm/tests/pool_sharding_test.c +++ b/src/lib/ssm/tests/pool_sharding_test.c @@ -80,19 +80,13 @@ static int test_lazy_distribution(void) goto fail_pool; } - /* Find the first size class with blocks */ - sc_idx = -1; - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - if (hdr->size_classes[i].object_count > 0) { - sc_idx = i; - break; - } - } - + /* Inspect the class that TEST_SIZE allocations will use */ + sc_idx = select_size_class(hdr, TEST_SIZE); if (sc_idx < 0) { - printf("No size classes configured.\n"); + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - printf(" Class %d: count=%zu\n", i, + printf(" Class %d: object_size=%zu count=%zu\n", i, + hdr->size_classes[i].object_size, hdr->size_classes[i].object_count); } goto fail_pool; @@ -137,7 +131,6 @@ static int test_shard_migration(void) ssize_t off; int shard_idx; int sc_idx; - int i; TEST_START(); @@ -149,18 +142,11 @@ static int test_shard_migration(void) hdr = get_pool_hdr(pool); - /* Find the first size class with blocks */ - sc_idx = -1; - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - if (hdr->size_classes[i].object_count > 0) { - sc_idx = i; - break; - } - } - + /* Inspect the class that TEST_SIZE allocations will use */ + sc_idx = select_size_class(hdr, TEST_SIZE); if (sc_idx < 0) { - printf("No size classes configured.\n"); - goto fail; + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); + goto fail_pool; } sc = &hdr->size_classes[sc_idx]; @@ -209,7 +195,6 @@ static int test_fallback_stealing(void) size_t total_free; size_t i; int sc_idx; - int c; TEST_START(); @@ -221,18 +206,11 @@ static int test_fallback_stealing(void) hdr = get_pool_hdr(pool); - /* Find the first size class with blocks */ - sc_idx = -1; - for (c = 0; c < SSM_POOL_MAX_CLASSES; c++) { - if (hdr->size_classes[c].object_count > 0) { - sc_idx = c; - break; - } - } - + /* Inspect the class that TEST_SIZE allocations will use */ + sc_idx = select_size_class(hdr, TEST_SIZE); if (sc_idx < 0) { - printf("No size classes configured.\n"); - goto fail; + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); + goto fail_pool; } sc = &hdr->size_classes[sc_idx]; @@ -261,7 +239,7 @@ static int test_fallback_stealing(void) /* Free them all - they go to local_shard */ for (i = 0; i < total_blocks / 2; i++) { - size_t off = ssm_pk_buff_get_idx(spbs[i]); + size_t off = ssm_pk_buff_get_off(spbs[i]); if (ssm_pool_remove(pool, off) != 0) { printf("Remove %zu failed.\n", i); free(spbs); @@ -299,7 +277,7 @@ static int test_fallback_stealing(void) /* Now all allocated blocks are in use again */ /* Cleanup - free all allocated blocks */ for (i = 0; i < total_blocks / 2; i++) { - size_t off = ssm_pk_buff_get_idx(spbs[i]); + size_t off = ssm_pk_buff_get_off(spbs[i]); ssm_pool_remove(pool, off); } @@ -396,20 +374,15 @@ static int test_multiprocess_sharding(void) /* Verify blocks distributed across shards */ hdr = get_pool_hdr(pool); - /* Find the first size class with blocks */ - sc = NULL; - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - if (hdr->size_classes[i].object_count > 0) { - sc = &hdr->size_classes[i]; - break; - } - } - - if (sc == NULL) { - printf("No size classes configured.\n"); + /* Inspect the class that TEST_SIZE allocations used */ + i = select_size_class(hdr, TEST_SIZE); + if (i < 0) { + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); goto fail_pool; } + sc = &hdr->size_classes[i]; + /* After children allocate and free, blocks should be in shards * (though exact distribution depends on PID values) */ diff --git a/src/lib/ssm/tests/pool_test.c b/src/lib/ssm/tests/pool_test.c index 3fc19cd5..0f9db24d 100644 --- a/src/lib/ssm/tests/pool_test.c +++ b/src/lib/ssm/tests/pool_test.c @@ -741,14 +741,14 @@ static int test_ssm_pk_buff_operations(void) memcpy(head, data, dlen); - tail = ssm_pk_buff_tail_alloc(spb, 32); + tail = ssm_pk_buff_push_tail(spb, 32); if (tail == NULL) { - printf("Tail_alloc failed.\n"); + printf("push_tail failed.\n"); goto fail_ops; } if (ssm_pk_buff_len(spb) != POOL_256 + 32) { - printf("Length after tail_alloc: %zu.\n", + printf("Length after push_tail: %zu.\n", ssm_pk_buff_len(spb)); goto fail_ops; } @@ -758,14 +758,14 @@ static int test_ssm_pk_buff_operations(void) goto fail_ops; } - tail = ssm_pk_buff_tail_release(spb, 32); + tail = ssm_pk_buff_pop_tail(spb, 32); if (tail == NULL) { - printf("Tail_release failed.\n"); + printf("pop_tail failed.\n"); goto fail_ops; } if (ssm_pk_buff_len(spb) != POOL_256) { - printf("Length after tail_release: %zu.\n", + printf("Length after pop_tail: %zu.\n", ssm_pk_buff_len(spb)); goto fail_ops; } diff --git a/src/lib/tests/CMakeLists.txt b/src/lib/tests/CMakeLists.txt index 5a2f2c52..32836589 100644 --- a/src/lib/tests/CMakeLists.txt +++ b/src/lib/tests/CMakeLists.txt @@ -10,7 +10,6 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c auth_test_slh_dsa.c bitmap_test.c btree_test.c - crc32_test.c crypt_test.c hash_test.c kex_test.c @@ -20,6 +19,7 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c sockets_test.c time_test.c tpm_test.c + tw_test.c ) add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests}) diff --git a/src/lib/tests/auth_test.c b/src/lib/tests/auth_test.c index 1a5a87af..0f3ef715 100644 --- a/src/lib/tests/auth_test.c +++ b/src/lib/tests/auth_test.c @@ -347,6 +347,59 @@ static int test_verify_crt(void) return TEST_RC_FAIL; } +static int test_verify_crt_missing_root_ca(void) +{ + struct auth_ctx * auth; + void * _signed_server_crt; + void * _im_ca_crt; + + TEST_START(); + + auth = auth_create_ctx(); + if (auth == NULL) { + printf("Failed to create auth context.\n"); + goto fail_create_ctx; + } + + if (crypt_load_crt_str(signed_server_crt_ec, &_signed_server_crt) < 0) { + printf("Failed to load signed crt from string.\n"); + goto fail_load_signed; + } + + if (crypt_load_crt_str(im_ca_crt_ec, &_im_ca_crt) < 0) { + printf("Failed to load intermediate crt from string.\n"); + goto fail_load_im_ca; + } + + /* Add only the intermediate CA - root CA is missing */ + if (auth_add_crt_to_store(auth, _im_ca_crt) < 0) { + printf("Failed to add intermediate ca crt to auth store.\n"); + goto fail_add; + } + + if (auth_verify_crt(auth, _signed_server_crt) == 0) { + printf("Verification should fail without root CA.\n"); + goto fail_add; + } + + crypt_free_crt(_im_ca_crt); + crypt_free_crt(_signed_server_crt); + auth_destroy_ctx(auth); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_add: + crypt_free_crt(_im_ca_crt); + fail_load_im_ca: + crypt_free_crt(_signed_server_crt); + fail_load_signed: + auth_destroy_ctx(auth); + fail_create_ctx: + TEST_FAIL(); + return TEST_RC_FAIL; +} + int test_auth_sign(void) { uint8_t buf[TEST_MSG_SIZE]; @@ -526,6 +579,7 @@ int auth_test(int argc, ret |= test_crypt_check_pubkey_crt(); ret |= test_store_add(); ret |= test_verify_crt(); + ret |= test_verify_crt_missing_root_ca(); ret |= test_auth_sign(); ret |= test_auth_bad_signature(); ret |= test_crt_str(); @@ -538,6 +592,7 @@ int auth_test(int argc, (void) test_crypt_check_pubkey_crt; (void) test_store_add; (void) test_verify_crt; + (void) test_verify_crt_missing_root_ca; (void) test_auth_sign; (void) test_auth_bad_signature; (void) test_crt_str; diff --git a/src/lib/tests/hash_test.c b/src/lib/tests/hash_test.c index e43847e1..451d3c25 100644 --- a/src/lib/tests/hash_test.c +++ b/src/lib/tests/hash_test.c @@ -39,6 +39,74 @@ struct vec_entry { char * out; }; +static int test_crc8(void) +{ + int ret = 0; + + struct vec_entry vec [] = { + { "", "00" }, + { "123456789", "df" }, + { NULL, NULL } + }; + + struct vec_entry * cur = vec; + + TEST_START(); + + while (cur->in != NULL) { + uint8_t crc; + char res[3]; + + str_hash(HASH_CRC8, &crc, cur->in); + + sprintf(res, "%02x", crc); + if (strcmp(res, cur->out) != 0) { + printf("Hash failed %s != %s.\n", res, cur->out); + ret |= -1; + } + + ++cur; + } + + TEST_END(ret); + + return ret; +} + +static int test_crc16(void) +{ + int ret = 0; + + struct vec_entry vec [] = { + { "", "ffff" }, + { "123456789", "29b1" }, + { NULL, NULL } + }; + + struct vec_entry * cur = vec; + + TEST_START(); + + while (cur->in != NULL) { + uint8_t crc[2]; + char res[5]; + + str_hash(HASH_CRC16, crc, cur->in); + + sprintf(res, "%02x%02x", crc[0], crc[1]); + if (strcmp(res, cur->out) != 0) { + printf("Hash failed %s != %s.\n", res, cur->out); + ret |= -1; + } + + ++cur; + } + + TEST_END(ret); + + return ret; +} + static int test_crc32(void) { int ret = 0; @@ -74,6 +142,42 @@ static int test_crc32(void) return ret; } +static int test_crc64(void) +{ + int ret = 0; + + struct vec_entry vec [] = { + { "", "0000000000000000" }, + { "123456789", "ae8b14860a799888" }, + { "0123456789abcdef", + "091485ca7018730e" }, + { NULL, NULL } + }; + + struct vec_entry * cur = vec; + + TEST_START(); + + while (cur->in != NULL) { + uint8_t crc[8]; + char res[17]; + + str_hash(HASH_CRC64, crc, cur->in); + + sprintf(res, HASH_FMT64, HASH_VAL64(crc)); + if (strcmp(res, cur->out) != 0) { + printf("Hash failed %s != %s.\n", res, cur->out); + ret |= -1; + } + + ++cur; + } + + TEST_END(ret); + + return ret; +} + static int test_md5(void) { int ret = 0; @@ -192,8 +296,14 @@ int hash_test(int argc, (void) argc; (void) argv; + ret |= test_crc8(); + + ret |= test_crc16(); + ret |= test_crc32(); + ret |= test_crc64(); + ret |= test_md5(); ret |= test_sha3(); diff --git a/src/lib/tests/kex_test.c b/src/lib/tests/kex_test.c index ced760fe..6a4f802e 100644 --- a/src/lib/tests/kex_test.c +++ b/src/lib/tests/kex_test.c @@ -106,7 +106,7 @@ static int test_kex_dh_pkp_create_destroy(void) { struct sec_config kex; void * pkp; - uint8_t buf[MSGBUFSZ]; + uint8_t buf[CRYPT_KEY_BUFSZ]; TEST_START(); @@ -134,7 +134,7 @@ static int test_kex_get_algo_from_pk(const char * algo) void * pkp; buffer_t pk; ssize_t len; - uint8_t buf[MSGBUFSZ]; + uint8_t buf[CRYPT_KEY_BUFSZ]; char extracted_algo[256]; TEST_START("(%s)", algo); @@ -204,8 +204,8 @@ static int test_kex_dhe_derive(const char * algo) buffer_t pk1; buffer_t pk2; ssize_t len; - uint8_t buf1[MSGBUFSZ]; - uint8_t buf2[MSGBUFSZ]; + uint8_t buf1[CRYPT_KEY_BUFSZ]; + uint8_t buf2[CRYPT_KEY_BUFSZ]; uint8_t s1[SYMMKEYSZ]; uint8_t s2[SYMMKEYSZ]; @@ -317,7 +317,7 @@ static int test_kex_dhe_corrupted_pubkey(const char * algo) void * pkp; buffer_t pk; ssize_t len; - uint8_t buf[MSGBUFSZ]; + uint8_t buf[CRYPT_KEY_BUFSZ]; uint8_t s[SYMMKEYSZ]; TEST_START("(%s)", algo); @@ -363,8 +363,8 @@ static int test_kex_dhe_wrong_algo(void) void * pkp2; buffer_t pk2; ssize_t len; - uint8_t buf1[MSGBUFSZ]; - uint8_t buf2[MSGBUFSZ]; + uint8_t buf1[CRYPT_KEY_BUFSZ]; + uint8_t buf2[CRYPT_KEY_BUFSZ]; uint8_t s[SYMMKEYSZ]; const char * algo1 = "X25519"; const char * algo2 = "X448"; diff --git a/src/lib/tests/kex_test_ml_kem.c b/src/lib/tests/kex_test_ml_kem.c index 3bb9ae7c..7761c3dc 100644 --- a/src/lib/tests/kex_test_ml_kem.c +++ b/src/lib/tests/kex_test_ml_kem.c @@ -197,8 +197,8 @@ static int test_kex_kem(const char * algo) buffer_t ct; ssize_t len; ssize_t ct_len; - uint8_t buf1[MSGBUFSZ]; - uint8_t buf2[MSGBUFSZ]; + uint8_t buf1[CRYPT_KEY_BUFSZ]; + uint8_t buf2[CRYPT_KEY_BUFSZ]; uint8_t s1[SYMMKEYSZ]; uint8_t s2[SYMMKEYSZ]; int kdf; @@ -262,8 +262,8 @@ static int test_kex_kem_corrupted_ciphertext(const char * algo) buffer_t ct; ssize_t len; ssize_t ct_len; - uint8_t buf1[MSGBUFSZ]; - uint8_t buf2[MSGBUFSZ]; + uint8_t buf1[CRYPT_KEY_BUFSZ]; + uint8_t buf2[CRYPT_KEY_BUFSZ]; uint8_t s1[SYMMKEYSZ]; uint8_t s2[SYMMKEYSZ]; int kdf; @@ -334,9 +334,9 @@ static int test_kex_kem_wrong_keypair(const char * algo) buffer_t ct; ssize_t len; ssize_t ct_len; - uint8_t buf1[MSGBUFSZ]; - uint8_t buf2[MSGBUFSZ]; - uint8_t buf3[MSGBUFSZ]; + uint8_t buf1[CRYPT_KEY_BUFSZ]; + uint8_t buf2[CRYPT_KEY_BUFSZ]; + uint8_t buf3[CRYPT_KEY_BUFSZ]; uint8_t s1[SYMMKEYSZ]; uint8_t s2[SYMMKEYSZ]; @@ -402,8 +402,8 @@ static int test_kex_kem_truncated_ciphertext(const char * algo) buffer_t ct; ssize_t len; ssize_t ct_len; - uint8_t buf1[MSGBUFSZ]; - uint8_t buf2[MSGBUFSZ]; + uint8_t buf1[CRYPT_KEY_BUFSZ]; + uint8_t buf2[CRYPT_KEY_BUFSZ]; uint8_t s1[SYMMKEYSZ]; uint8_t s2[SYMMKEYSZ]; diff --git a/src/lib/tests/tpm_test.c b/src/lib/tests/tpm_test.c index df1d8850..7cc049cd 100644 --- a/src/lib/tests/tpm_test.c +++ b/src/lib/tests/tpm_test.c @@ -21,7 +21,7 @@ */ -#include "tpm.c" +#include <ouroboros/tpm.h> #include <test/test.h> diff --git a/src/lib/tests/tw_test.c b/src/lib/tests/tw_test.c new file mode 100644 index 00000000..32c302c4 --- /dev/null +++ b/src/lib/tests/tw_test.c @@ -0,0 +1,663 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Generic timing-wheel tests + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#if defined(__linux__) || defined(__CYGWIN__) +#define _DEFAULT_SOURCE +#else +#define _POSIX_C_SOURCE 200809L +#endif + +#include "config.h" + +#include <test/test.h> + +#include <ouroboros/time.h> +#include <ouroboros/tw.h> + +#include <stdint.h> +#include <stdio.h> +#include <time.h> + +struct payload { + struct tw_entry tw; + int fired; +}; + +struct cancel_payload { + struct tw_entry tw; + int fired; + struct tw_entry * sibling; +}; + +struct repost_payload { + struct tw_entry tw; + int fired; + struct payload * sibling; + uint64_t repost_at; +}; + +static void cb_count(void * arg) +{ + struct payload * p = arg; + p->fired++; +} + +static void cb_cancel_sibling(void * arg) +{ + struct cancel_payload * p = arg; + p->fired++; + tw_cancel(p->sibling); +} + +static void cb_repost_sibling(void * arg) +{ + struct repost_payload * p = arg; + p->fired++; + tw_post(&p->sibling->tw, p->repost_at, cb_count, p->sibling); +} + +static uint64_t now_ns(void) +{ + struct timespec ts; + clock_gettime(PTHREAD_COND_CLOCK, &ts); + return TS_TO_UINT64(ts); +} + +static void sleep_ns(uint64_t ns) +{ + struct timespec ts; + UINT64_TO_TS(ns, &ts); + nanosleep(&ts, NULL); +} + +static int test_tw_init_fini(void) +{ + TEST_START(); + + if (tw_init() < 0) { + printf("tw_init failed.\n"); + goto fail; + } + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_post_fires_after_deadline(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p); + + sleep_ns(20 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("expected 1 fire, got %d\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_no_fire_before_deadline(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 100 * MILLION, cb_count, &p); + + sleep_ns(2 * MILLION); + tw_move(); + + if (p.fired != 0) { + printf("expected 0 fires, got %d\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_cancel_prevents_fire(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p); + tw_cancel(&p.tw); + + sleep_ns(20 * MILLION); + tw_move(); + + if (p.fired != 0) { + printf("cancelled entry fired %d times\n", p.fired); + goto fail_init; + } + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_init: + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_cancel_unposted_is_noop(void) +{ + struct tw_entry e; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&e); + tw_cancel(&e); + tw_cancel(&e); + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_fire_only_once(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + + sleep_ns(20 * MILLION); + tw_move(); + tw_move(); + tw_move(); + + if (p.fired != 1) { + printf("expected 1 fire, got %d after 3 moves\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Multi-level: post a level-1 (>= 256ms) deadline; should still fire. */ +static int test_tw_post_level1_fires(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 300 * MILLION, cb_count, &p); + + if (p.tw.lvl != 1) { + printf("expected level 1 placement, got %zu\n", p.tw.lvl); + goto fail_post; + } + + sleep_ns(320 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("level-1 entry didn't fire (got %d)\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_many_entries_all_fire(void) +{ + struct payload pl[16]; + size_t i; + size_t total = 0; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + for (i = 0; i < 16; ++i) { + tw_init_entry(&pl[i].tw); + pl[i].fired = 0; + tw_post(&pl[i].tw, now_ns() + (1 + i) * MILLION, + cb_count, &pl[i]); + } + + sleep_ns(40 * MILLION); + tw_move(); + + for (i = 0; i < 16; ++i) + total += pl[i].fired; + + if (total != 16) { + printf("expected 16 fires, got %zu\n", total); + goto fail_post; + } + + for (i = 0; i < 16; ++i) + tw_cancel(&pl[i].tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + for (i = 0; i < 16; ++i) + tw_cancel(&pl[i].tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* tw_next_expiry signals empty wheel via tv_nsec == -1. */ +static int test_tw_next_expiry_empty(void) +{ + struct timespec out; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_next_expiry(&out); + if (out.tv_nsec != -1) { + printf("expected tv_nsec=-1, got %ld\n", (long) out.tv_nsec); + goto fail_init; + } + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_init: + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* tw_next_expiry returns a deadline within the right ballpark. */ +static int test_tw_next_expiry_returns_deadline(void) +{ + struct payload p; + struct timespec out; + uint64_t target; + uint64_t out_ns; + int64_t skew; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + target = now_ns() + 50 * MILLION; + tw_post(&p.tw, target, cb_count, &p); + + tw_next_expiry(&out); + out_ns = TS_TO_UINT64(out); + + /* Level-0 quantization gives ±1 slot of skew. */ + skew = (int64_t)(out_ns) - (int64_t)(target); + if (skew < -2 * MILLION || skew > 4 * MILLION) { + printf("deadline not in -2..+4 ms, skew=%ld ns\n", (long) skew); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Repost: fire, then post again. */ +static int test_tw_repost_after_fire(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + sleep_ns(20 * MILLION); + tw_move(); + if (p.fired != 1) { + printf("first fire missed\n"); + goto fail_post; + } + + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + sleep_ns(20 * MILLION); + tw_move(); + if (p.fired != 2) { + printf("second fire missed (fired=%d)\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Double-post replaces the schedule; only the second fires. */ +static int test_tw_double_post_replaces(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 30 * MILLION, cb_count, &p); + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + + sleep_ns(20 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("expected 1 fire after replace, got %d\n", p.fired); + goto fail_post; + } + + sleep_ns(40 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("first schedule fired after replace (got %d)\n", + p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Fire callback may safely cancel a sibling in the same slot. */ +static int test_tw_fire_cancels_sibling(void) +{ + struct cancel_payload a; + struct payload b; + uint64_t deadline; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&a.tw); + tw_init_entry(&b.tw); + a.fired = 0; + a.sibling = &b.tw; + b.fired = 0; + + deadline = now_ns() + 3 * MILLION; + tw_post(&a.tw, deadline, cb_cancel_sibling, &a); + tw_post(&b.tw, deadline, cb_count, &b); + + sleep_ns(20 * MILLION); + tw_move(); + + if (a.fired != 1) { + printf("a expected 1 fire, got %d\n", a.fired); + goto fail_post; + } + if (b.fired != 0) { + printf("b should not have fired (got %d)\n", b.fired); + goto fail_post; + } + + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Fire callback may safely repost a sibling to a future slot. */ +static int test_tw_fire_posts_sibling(void) +{ + struct repost_payload a; + struct payload b; + uint64_t deadline; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&a.tw); + tw_init_entry(&b.tw); + a.fired = 0; + a.sibling = &b; + a.repost_at = now_ns() + 30 * MILLION; + b.fired = 0; + + deadline = now_ns() + 3 * MILLION; + tw_post(&a.tw, deadline, cb_repost_sibling, &a); + tw_post(&b.tw, deadline, cb_count, &b); + + sleep_ns(20 * MILLION); + tw_move(); + + if (a.fired != 1) { + printf("a expected 1 fire, got %d\n", a.fired); + goto fail_post; + } + if (b.fired != 0) { + printf("b fired before reposted deadline (got %d)\n", + b.fired); + goto fail_post; + } + + sleep_ns(25 * MILLION); + tw_move(); + + if (b.fired != 1) { + printf("b expected 1 fire after repost, got %d\n", + b.fired); + goto fail_post; + } + + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int tw_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_tw_init_fini(); + ret |= test_tw_post_fires_after_deadline(); + ret |= test_tw_no_fire_before_deadline(); + ret |= test_tw_cancel_prevents_fire(); + ret |= test_tw_cancel_unposted_is_noop(); + ret |= test_tw_fire_only_once(); + ret |= test_tw_post_level1_fires(); + ret |= test_tw_many_entries_all_fire(); + ret |= test_tw_next_expiry_empty(); + ret |= test_tw_next_expiry_returns_deadline(); + ret |= test_tw_repost_after_fire(); + ret |= test_tw_double_post_replaces(); + ret |= test_tw_fire_cancels_sibling(); + ret |= test_tw_fire_posts_sibling(); + + return ret; +} diff --git a/src/lib/timerwheel.c b/src/lib/timerwheel.c deleted file mode 100644 index 2c796c96..00000000 --- a/src/lib/timerwheel.c +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Ouroboros - Copyright (C) 2016 - 2026 - * - * Timerwheel - * - * Dimitri Staessens <dimitri@ouroboros.rocks> - * Sander Vrijders <sander@ouroboros.rocks> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * version 2.1 as published by the Free Software Foundation. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., http://www.fsf.org/about/contact/. - */ - -#include <ouroboros/list.h> - -/* Overflow limits range to about 6 hours. */ -#define ts_to_ns(ts) (ts.tv_sec * BILLION + ts.tv_nsec) -#define ts_to_rxm_slot(ts) (ts_to_ns(ts) >> RXMQ_RES) -#define ts_to_ack_slot(ts) (ts_to_ns(ts) >> ACKQ_RES) - -struct rxm { - struct list_head next; - uint32_t seqno; -#ifndef RXM_BUFFER_ON_HEAP - struct ssm_pk_buff * spb; -#endif - struct frct_pci * pkt; - size_t len; - time_t t0; /* Time when original was sent (us). */ - struct frcti * frcti; - int fd; - int flow_id; /* Prevent rtx when fd reused. */ -}; - -struct ack { - struct list_head next; - struct frcti * frcti; - int fd; - int flow_id; -}; - -struct { - /* - * At a 1 ms min resolution, every level bumps the - * resolution by a factor of 16. - */ - struct list_head rxms[RXMQ_LVLS][RXMQ_SLOTS]; - - struct list_head acks[ACKQ_SLOTS]; - bool map[ACKQ_SLOTS][PROG_MAX_FLOWS]; - - size_t prv_rxm[RXMQ_LVLS]; /* Last processed rxm slots. */ - size_t prv_ack; /* Last processed ack slot. */ - pthread_mutex_t lock; -} rw; - -static void timerwheel_fini(void) -{ - size_t i; - size_t j; - struct list_head * p; - struct list_head * h; - - pthread_mutex_lock(&rw.lock); - - for (i = 0; i < RXMQ_LVLS; ++i) { - for (j = 0; j < RXMQ_SLOTS; j++) { - list_for_each_safe(p, h, &rw.rxms[i][j]) { - struct rxm * rxm; - rxm = list_entry(p, struct rxm, next); - list_del(&rxm->next); -#ifdef RXM_BUFFER_ON_HEAP - free(rxm->pkt); -#else - ssm_pk_buff_ack(rxm->spb); - ipcp_spb_release(rxm->spb); -#endif - free(rxm); - } - } - } - - for (i = 0; i < ACKQ_SLOTS; ++i) { - list_for_each_safe(p, h, &rw.acks[i]) { - struct ack * a = list_entry(p, struct ack, next); - list_del(&a->next); - free(a); - } - } - - pthread_mutex_unlock(&rw.lock); - - pthread_mutex_destroy(&rw.lock); -} - -static int timerwheel_init(void) -{ - struct timespec now; - size_t i; - size_t j; - - if (pthread_mutex_init(&rw.lock, NULL)) - return -1; - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - for (i = 0; i < RXMQ_LVLS; ++i) { - rw.prv_rxm[i] = (ts_to_rxm_slot(now) - 1); - rw.prv_rxm[i] >>= (RXMQ_BUMP * i); - rw.prv_rxm[i] &= (RXMQ_SLOTS - 1); - for (j = 0; j < RXMQ_SLOTS; ++j) - list_head_init(&rw.rxms[i][j]); - } - - rw.prv_ack = (ts_to_ack_slot(now) - 1) & (ACKQ_SLOTS - 1); - for (i = 0; i < ACKQ_SLOTS; ++i) - list_head_init(&rw.acks[i]); - - return 0; -} - -static void timerwheel_move(void) -{ - struct timespec now; - struct list_head * p; - struct list_head * h; - size_t rxm_slot; - size_t ack_slot; - size_t i; - size_t j; - - pthread_mutex_lock(&rw.lock); - - pthread_cleanup_push(__cleanup_mutex_unlock, &rw.lock); - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - rxm_slot = ts_to_rxm_slot(now); - - for (i = 0; i < RXMQ_LVLS; ++i) { - size_t j_max_slot = rxm_slot & (RXMQ_SLOTS - 1); - j = rw.prv_rxm[i]; - if (j_max_slot < j) - j_max_slot += RXMQ_SLOTS; - while (j++ < j_max_slot) { - list_for_each_safe(p, h, - &rw.rxms[i][j & (RXMQ_SLOTS - 1)]) { - struct rxm * r; - struct frct_cr * snd_cr; - struct frct_cr * rcv_cr; - size_t slot; - size_t rslot; - ssize_t idx; - struct ssm_pk_buff * spb; - struct frct_pci * pci; - struct flow * f; - uint32_t snd_lwe; - uint32_t rcv_lwe; - size_t lvl = 0; - - r = list_entry(p, struct rxm, next); - - list_del(&r->next); - - snd_cr = &r->frcti->snd_cr; - rcv_cr = &r->frcti->rcv_cr; - f = &proc.flows[r->fd]; -#ifndef RXM_BUFFER_ON_HEAP - ssm_pk_buff_ack(r->spb); -#endif - if (f->frcti == NULL - || f->info.id != r->flow_id) - goto cleanup; - - pthread_rwlock_rdlock(&r->frcti->lock); - - snd_lwe = snd_cr->lwe; - rcv_lwe = rcv_cr->lwe; - - pthread_rwlock_unlock(&r->frcti->lock); - - /* Has been ack'd, remove. */ - if (before(r->seqno, snd_lwe)) - goto cleanup; - - /* Check for r-timer expiry. */ - if (ts_to_ns(now) - r->t0 > r->frcti->r) - goto flow_down; - - pthread_rwlock_wrlock(&r->frcti->lock); - - if (r->seqno == r->frcti->rttseq) { - r->frcti->rto += - r->frcti->rto >> RTO_DIV; - r->frcti->probe = false; - } -#ifdef PROC_FLOW_STATS - r->frcti->n_rtx++; -#endif - rslot = r->frcti->rto >> RXMQ_RES; - - pthread_rwlock_unlock(&r->frcti->lock); - - /* Schedule at least in the next time slot. */ - slot = ts_to_ns(now) >> RXMQ_RES; - - while (rslot >= RXMQ_SLOTS) { - ++lvl; - rslot >>= RXMQ_BUMP; - slot >>= RXMQ_BUMP; - } - - if (lvl >= RXMQ_LVLS) /* Can't reschedule */ - goto flow_down; - - rslot = (rslot + slot + 1) & (RXMQ_SLOTS - 1); -#ifdef RXM_BLOCKING - if (ipcp_spb_reserve(&spb, r->len) < 0) -#else - if (ssm_pool_alloc(proc.pool, r->len, NULL, - &spb) < 0) -#endif - goto reschedule; /* rdrbuff full */ - - pci = (struct frct_pci *) ssm_pk_buff_head(spb); - memcpy(pci, r->pkt, r->len); -#ifndef RXM_BUFFER_ON_HEAP - ipcp_spb_release(r->spb); - r->spb = spb; - r->pkt = pci; - ssm_pk_buff_wait_ack(spb); -#endif - idx = ssm_pk_buff_get_idx(spb); - - /* Retransmit the copy. */ - pci->ackno = hton32(rcv_lwe); -#ifdef RXM_BLOCKING - if (ssm_rbuff_write_b(f->tx_rb, idx, NULL) < 0) -#else - if (ssm_rbuff_write(f->tx_rb, idx) < 0) -#endif - goto flow_down; - ssm_flow_set_notify(f->set, f->info.id, - FLOW_PKT); - reschedule: - list_add(&r->next, &rw.rxms[lvl][rslot]); - continue; - - flow_down: - ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN); - ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN); - cleanup: -#ifdef RXM_BUFFER_ON_HEAP - free(r->pkt); -#else - ipcp_spb_release(r->spb); -#endif - free(r); - } - } - rw.prv_rxm[i] = rxm_slot & (RXMQ_SLOTS - 1); - /* Move up a level in the wheel. */ - rxm_slot >>= RXMQ_BUMP; - } - - ack_slot = ts_to_ack_slot(now) & (ACKQ_SLOTS - 1) ; - - j = rw.prv_ack; - - if (ack_slot < j) - ack_slot += ACKQ_SLOTS; - - while (j++ < ack_slot) { - list_for_each_safe(p, h, &rw.acks[j & (ACKQ_SLOTS - 1)]) { - struct ack * a; - struct flow * f; - - a = list_entry(p, struct ack, next); - - list_del(&a->next); - - f = &proc.flows[a->fd]; - - rw.map[j & (ACKQ_SLOTS - 1)][a->fd] = false; - - if (f->info.id == a->flow_id && f->frcti != NULL) - send_frct_pkt(a->frcti); - - free(a); - } - } - - rw.prv_ack = ack_slot & (ACKQ_SLOTS - 1); - - pthread_cleanup_pop(true); -} - -static int timerwheel_rxm(struct frcti * frcti, - uint32_t seqno, - struct ssm_pk_buff * spb) -{ - struct timespec now; - struct rxm * r; - size_t slot; - size_t lvl = 0; - time_t rto_slot; - - r = malloc(sizeof(*r)); - if (r == NULL) - return -ENOMEM; - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - r->t0 = ts_to_ns(now); - r->seqno = seqno; - r->frcti = frcti; - r->len = ssm_pk_buff_len(spb); -#ifdef RXM_BUFFER_ON_HEAP - r->pkt = malloc(r->len); - if (r->pkt == NULL) { - free(r); - return -ENOMEM; - } - memcpy(r->pkt, ssm_pk_buff_head(spb), r->len); -#else - r->spb = spb; - r->pkt = (struct frct_pci *) ssm_pk_buff_head(spb); -#endif - pthread_rwlock_rdlock(&r->frcti->lock); - - rto_slot = frcti->rto >> RXMQ_RES; - slot = r->t0 >> RXMQ_RES; - - r->fd = frcti->fd; - r->flow_id = proc.flows[r->fd].info.id; - - pthread_rwlock_unlock(&r->frcti->lock); - - while (rto_slot >= RXMQ_SLOTS) { - ++lvl; - rto_slot >>= RXMQ_BUMP; - slot >>= RXMQ_BUMP; - } - - if (lvl >= RXMQ_LVLS) { /* Out of timerwheel range. */ -#ifdef RXM_BUFFER_ON_HEAP - free(r->pkt); -#endif - free(r); - return -EPERM; - } - - slot = (slot + rto_slot + 1) & (RXMQ_SLOTS - 1); - - pthread_mutex_lock(&rw.lock); - - list_add_tail(&r->next, &rw.rxms[lvl][slot]); -#ifndef RXM_BUFFER_ON_HEAP - ssm_pk_buff_wait_ack(spb); -#endif - pthread_mutex_unlock(&rw.lock); - - return 0; -} - -static int timerwheel_delayed_ack(int fd, - struct frcti * frcti) -{ - struct timespec now; - struct ack * a; - size_t slot; - - a = malloc(sizeof(*a)); - if (a == NULL) - return -ENOMEM; - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - pthread_rwlock_rdlock(&frcti->lock); - - slot = (((ts_to_ns(now) + (TICTIME << 1)) >> ACKQ_RES) + 1) - & (ACKQ_SLOTS - 1); - - pthread_rwlock_unlock(&frcti->lock); - - a->fd = fd; - a->frcti = frcti; - a->flow_id = proc.flows[fd].info.id; - - pthread_mutex_lock(&rw.lock); - - if (rw.map[slot][fd]) { - pthread_mutex_unlock(&rw.lock); - free(a); - return 0; - } - - rw.map[slot][fd] = true; - - list_add_tail(&a->next, &rw.acks[slot]); - - pthread_mutex_unlock(&rw.lock); - - return 0; -} diff --git a/src/lib/tw.c b/src/lib/tw.c new file mode 100644 index 00000000..ccde7dd1 --- /dev/null +++ b/src/lib/tw.c @@ -0,0 +1,307 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Generic deadline-ordered callback queue (timing wheel) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#if defined(__linux__) || defined(__CYGWIN__) +#define _DEFAULT_SOURCE +#else +#define _POSIX_C_SOURCE 200809L +#endif + +#include "config.h" + +#include <ouroboros/list.h> +#include <ouroboros/pthread.h> +#include <ouroboros/time.h> +#include <ouroboros/tw.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> + +/* 3 levels × 256 slots, 1 ms / 16 ms / 256 ms per-slot resolution. */ +#define TW_LVLS 3 +#define TW_SLOTS 256 +#define TW_BUMP 4 +#define TW_RES 20 /* 2^20 ns ≈ 1 ms per slot at level 0. */ + +#define TW_SLOT(x) ((x) & (TW_SLOTS - 1)) + +static struct { + struct list_head levels[TW_LVLS][TW_SLOTS]; + size_t prv[TW_LVLS]; + pthread_mutex_t mtx; + pthread_mutex_t move_mtx; + bool initialised; +} tw; + +static size_t tw_lvl_res(size_t lvl) +{ + return TW_RES + TW_BUMP * lvl; +} + +/* Smallest level whose slot range covers the deadline. */ +static size_t tw_pick_lvl(uint64_t now_ns, + uint64_t deadline_ns) +{ + uint64_t delta; + size_t lvl; + + delta = deadline_ns > now_ns ? deadline_ns - now_ns : 0; + lvl = 0; + + while (lvl < TW_LVLS - 1 && (delta >> tw_lvl_res(lvl)) >= TW_SLOTS) + ++lvl; + + return lvl; +} + +static size_t tw_slot(uint64_t ns, + size_t lvl) +{ + return TW_SLOT(ns >> tw_lvl_res(lvl)); +} + +int tw_init(void) +{ + struct timespec now; + size_t i; + size_t j; + + assert(!tw.initialised); + + if (pthread_mutex_init(&tw.mtx, NULL)) + goto fail_mtx; + + if (pthread_mutex_init(&tw.move_mtx, NULL)) + goto fail_move_mtx; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + for (i = 0; i < TW_LVLS; ++i) { + tw.prv[i] = TW_SLOT(tw_slot(TS_TO_UINT64(now), i) - 1); + for (j = 0; j < TW_SLOTS; ++j) + list_head_init(&tw.levels[i][j]); + } + + tw.initialised = true; + + return 0; + + fail_move_mtx: + pthread_mutex_destroy(&tw.mtx); + fail_mtx: + return -1; +} + +void tw_fini(void) +{ + size_t i; + size_t j; + + assert(tw.initialised); + + for (i = 0; i < TW_LVLS; ++i) { + for (j = 0; j < TW_SLOTS; ++j) + assert(list_is_empty(&tw.levels[i][j])); + } + + pthread_mutex_destroy(&tw.move_mtx); + pthread_mutex_destroy(&tw.mtx); + + tw.initialised = false; +} + +void tw_init_entry(struct tw_entry * e) +{ + list_head_init(&e->next); + + e->deadline_ns = 0; + e->fire = NULL; + e->arg = NULL; + e->lvl = 0; +} + +void tw_post(struct tw_entry * e, + uint64_t deadline_ns, + tw_fire_fn_t fire, + void * arg) +{ + struct timespec now; + size_t lvl; + size_t slot; + + assert(tw.initialised); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + lvl = tw_pick_lvl(TS_TO_UINT64(now), deadline_ns); + /* +1 so deadline <= slot_start; lands later in slot. */ + slot = TW_SLOT(tw_slot(deadline_ns, lvl) + 1); + + e->deadline_ns = deadline_ns; + e->fire = fire; + e->arg = arg; + e->lvl = lvl; + + pthread_mutex_lock(&tw.mtx); + + if (!list_is_empty(&e->next)) + list_del(&e->next); + + list_add_tail(&e->next, &tw.levels[lvl][slot]); + + pthread_mutex_unlock(&tw.mtx); +} + +void tw_cancel(struct tw_entry * e) +{ + if (e == NULL) + return; + + assert(tw.initialised); + + pthread_mutex_lock(&tw.mtx); + + if (!list_is_empty(&e->next)) { + list_del(&e->next); + list_head_init(&e->next); + } + + pthread_mutex_unlock(&tw.mtx); +} + +void tw_move(void) +{ + struct timespec now; + struct list_head deferred; + struct list_head * p; + uint64_t now_ns; + size_t i; + size_t j; + size_t cur; + + assert(tw.initialised); + + if (pthread_mutex_trylock(&tw.move_mtx) != 0) + return; + + pthread_cleanup_push(__cleanup_mutex_unlock, &tw.move_mtx); + + pthread_mutex_lock(&tw.mtx); + + pthread_cleanup_push(__cleanup_mutex_unlock, &tw.mtx); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + for (i = 0; i < TW_LVLS; ++i) { + cur = tw_slot(now_ns, i); + + j = tw.prv[i]; + if (cur < j) + cur += TW_SLOTS; + + while (j++ < cur) { + size_t s = TW_SLOT(j); + + /* Pop-front so fire may mutate any entry. */ + list_head_init(&deferred); + + while (!list_is_empty(&tw.levels[i][s])) { + struct tw_entry * e; + p = tw.levels[i][s].nxt; + e = list_entry(p, struct tw_entry, next); + list_del(&e->next); + + if (e->deadline_ns > now_ns) { + list_add_tail(&e->next, &deferred); + continue; + } + + pthread_mutex_unlock(&tw.mtx); + e->fire(e->arg); + pthread_mutex_lock(&tw.mtx); + } + + while (!list_is_empty(&deferred)) { + p = deferred.nxt; + list_del(p); + list_add_tail(p, &tw.levels[i][s]); + } + } + + tw.prv[i] = TW_SLOT(cur); + } + + pthread_cleanup_pop(true); /* tw.mtx */ + pthread_cleanup_pop(true); /* tw.move_mtx */ +} + +/* Earliest pending deadline at level lvl, INT64_MAX if level is empty. */ +static int64_t tw_lvl_earliest(size_t lvl, + uint64_t now_ns) +{ + size_t cur = tw_slot(now_ns, lvl); + size_t j; + + for (j = 1; j <= TW_SLOTS; ++j) { + size_t s = TW_SLOT(cur + j); + + if (list_is_empty(&tw.levels[lvl][s])) + continue; + + return (int64_t)(now_ns + ((uint64_t) j << tw_lvl_res(lvl))); + } + + return INT64_MAX; +} + +void tw_next_expiry(struct timespec * out) +{ + struct timespec now; + uint64_t now_ns; + int64_t earliest = INT64_MAX; + size_t i; + + assert(tw.initialised); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_mutex_lock(&tw.mtx); + + for (i = 0; i < TW_LVLS; ++i) { + int64_t dl = tw_lvl_earliest(i, now_ns); + if (dl < earliest) + earliest = dl; + } + + pthread_mutex_unlock(&tw.mtx); + + if (earliest == INT64_MAX) { + /* Empty wheel: tv_nsec=-1 is an invalid normalised value. */ + out->tv_sec = 0; + out->tv_nsec = -1; + } else { + UINT64_TO_TS((uint64_t) earliest, out); + } +} diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 3cec8172..6b418838 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -63,6 +63,11 @@ target_include_directories(operf PRIVATE ${TOOLS_INCLUDE_DIRS}) target_link_libraries(operf PRIVATE ouroboros-dev) install(TARGETS operf RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +add_executable(oftp oftp/oftp.c) +target_include_directories(oftp PRIVATE ${TOOLS_INCLUDE_DIRS}) +target_link_libraries(oftp PRIVATE ouroboros-dev) +install(TARGETS oftp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") add_executable(ovpn ovpn/ovpn.c) target_include_directories(ovpn PRIVATE ${TOOLS_INCLUDE_DIRS}) diff --git a/src/tools/irm/irm_ipcp_connect.c b/src/tools/irm/irm_ipcp_connect.c index f88c36dc..fb21faec 100644 --- a/src/tools/irm/irm_ipcp_connect.c +++ b/src/tools/irm/irm_ipcp_connect.c @@ -100,16 +100,18 @@ int do_connect_ipcp(int argc, } if (qos != NULL) { - if (strcmp(qos, "best") == 0) - qs = qos_best_effort; - else if (strcmp(qos, "raw") == 0) + if (strcmp(qos, "raw") == 0) qs = qos_raw; - else if (strcmp(qos, "video") == 0) - qs = qos_video; - else if (strcmp(qos, "voice") == 0) - qs = qos_voice; - else if (strcmp(qos, "data") == 0) - qs = qos_data; + else if (strcmp(qos, "safe") == 0) + qs = qos_raw_safe; + else if (strcmp(qos, "rt") == 0) + qs = qos_rt; + else if (strcmp(qos, "rt-safe") == 0) + qs = qos_rt_safe; + else if (strcmp(qos, "msg") == 0) + qs = qos_msg; + else if (strcmp(qos, "stream") == 0) + qs = qos_stream; else printf("Unknown QoS cube, defaulting to raw.\n"); } @@ -126,7 +128,7 @@ int do_connect_ipcp(int argc, if (wildcard_match(comp, MGMT) == 0) { component = MGMT_COMP; - /* FIXME: move to qos_data when stable */ + /* FIXME: move to qos_msg when stable */ if (irm_connect_ipcp(pid, dst, component, qos_raw)) return -1; } diff --git a/src/tools/ocbr/ocbr_client.c b/src/tools/ocbr/ocbr_client.c index 9dd9904c..36c07d43 100644 --- a/src/tools/ocbr/ocbr_client.c +++ b/src/tools/ocbr/ocbr_client.c @@ -37,8 +37,11 @@ */ #include <ouroboros/dev.h> +#include <ouroboros/qos.h> #include <signal.h> +#include <stdlib.h> +#include <string.h> volatile bool stop; @@ -86,6 +89,11 @@ int client_main(char * server, struct timespec end; struct timespec intv = {(gap / BILLION), gap % BILLION}; int ms; + const char * qenv; + qosspec_t qs; + qosspec_t * qsp; + + qsp = NULL; stop = false; @@ -98,16 +106,38 @@ int client_main(char * server, sigaction(SIGHUP, &sig_act, NULL) || sigaction(SIGPIPE, &sig_act, NULL)) { printf("Failed to install sighandler.\n"); - return -1; + return 2; } printf("Client started, duration %d, rate %lu b/s, size %d B.\n", duration, rate, size); - fd = flow_alloc(server, NULL, NULL); + qenv = getenv("OCBR_QOS"); + if (qenv != NULL) { + if (strcmp(qenv, "raw") == 0) + qs = qos_raw; + else if (strcmp(qenv, "safe") == 0) + qs = qos_raw_safe; + else if (strcmp(qenv, "rt") == 0) + qs = qos_rt; + else if (strcmp(qenv, "rt_safe") == 0) + qs = qos_rt_safe; + else if (strcmp(qenv, "msg") == 0) + qs = qos_msg; + else if (strcmp(qenv, "stream") == 0) + qs = qos_stream; + else { + fprintf(stderr, + "Unknown OCBR_QOS='%s', using raw.\n", qenv); + qs = qos_raw; + } + qsp = &qs; + printf("OCBR_QOS=%s\n", qenv); + } + fd = flow_alloc(server, qsp, NULL); if (fd < 0) { printf("Failed to allocate flow.\n"); - return -1; + return 2; } clock_gettime(CLOCK_REALTIME, &start); diff --git a/src/tools/oecho/oecho.c b/src/tools/oecho/oecho.c index 14caab53..ef0a168f 100644 --- a/src/tools/oecho/oecho.c +++ b/src/tools/oecho/oecho.c @@ -101,20 +101,20 @@ static int client_main(void) fd = flow_alloc("oecho", NULL, NULL); if (fd < 0) { printf("Failed to allocate flow.\n"); - return -1; + return 2; } if (flow_write(fd, message, strlen(message) + 1) < 0) { printf("Failed to write packet.\n"); flow_dealloc(fd); - return -1; + return 1; } count = flow_read(fd, buf, BUF_SIZE); if (count < 0) { printf("Failed to read packet.\n"); flow_dealloc(fd); - return -1; + return 1; } printf("Server replied with %.*s\n", (int) count, buf); @@ -126,7 +126,7 @@ static int client_main(void) int main(int argc, char ** argv) { - int ret = -1; + int ret = 0; bool server = false; argc--; diff --git a/src/tools/oftp/oftp.c b/src/tools/oftp/oftp.c new file mode 100644 index 00000000..1ae99403 --- /dev/null +++ b/src/tools/oftp/oftp.c @@ -0,0 +1,441 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * A minimal file-transfer tool over an FRCT stream flow + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define _POSIX_C_SOURCE 200809L + +#include <ouroboros/crc64.h> +#include <ouroboros/dev.h> +#include <ouroboros/errno.h> +#include <ouroboros/fccntl.h> +#include <ouroboros/qos.h> + +#include <fcntl.h> +#include <inttypes.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#define BUF_SIZE 16384 + +static volatile sig_atomic_t stop = 0; + +static void apply_rto_min_env(int fd) +{ + const char * env; + long v; + + env = getenv("OFTP_FRCT_RTO_MIN"); + if (env == NULL) + return; + v = strtol(env, NULL, 10); + if (v <= 0) + return; + if (fccntl(fd, FRCTSRTOMIN, (time_t) v) < 0) + fprintf(stderr, + "oftp: failed to set RTO_MIN=%ld ns\n", v); +} + +static void apply_stream_ring_sz_env(int fd) +{ + const char * env; + long v; + + env = getenv("OFTP_FRCT_STREAM_RING_SZ"); + if (env == NULL) + return; + v = strtol(env, NULL, 10); + if (v <= 0) + return; + if (fccntl(fd, FRCTSRRINGSZ, (size_t) v) < 0) + fprintf(stderr, + "oftp: failed to set STREAM_RING_SZ=%ld\n", v); +} + +static void on_signal(int signo) +{ + (void) signo; + stop = 1; +} + +static void usage(void) +{ + printf("Usage: oftp [OPTION]...\n" + "Stream-mode file transfer over an Ouroboros flow.\n\n" + " -l, --listen Run as the receiver (server)\n" + " -n, --name NAME Destination service name (client)\n" + " -i, --in FILE Read input from FILE (default stdin)\n" + " -o, --out FILE Write output to FILE (default stdout)\n" + " -N, --bytes SIZE Stop after SIZE bytes " + "(K/M/G suffix; client only)\n" + " --help Display this help text and exit\n"); +} + +static int parse_size(const char * s, size_t * out) +{ + char * end; + unsigned long v; + size_t mul; + + v = strtoul(s, &end, 0); + if (end == s) + return -1; + + mul = 1; + if (*end == 'k' || *end == 'K') + mul = 1024UL; + else if (*end == 'm' || *end == 'M') + mul = 1024UL * 1024UL; + else if (*end == 'g' || *end == 'G') + mul = 1024UL * 1024UL * 1024UL; + else if (*end != '\0') + return -1; + + *out = (size_t) v * mul; + return 0; +} + +static void report_xfer(const char * tag, + size_t total, + uint64_t crc, + const struct timespec * t0, + const struct timespec * t1) +{ + double elapsed_s; + double mib_per_s; + + elapsed_s = (t1->tv_sec - t0->tv_sec) + + (t1->tv_nsec - t0->tv_nsec) / 1e9; + if (elapsed_s <= 0.0) + elapsed_s = 1e-9; + + mib_per_s = ((double) total / (1024.0 * 1024.0)) / elapsed_s; + + fprintf(stderr, + "oftp: %s %zu bytes in %.3f s (%.2f MiB/s) " + "crc64=%016" PRIx64 "\n", + tag, total, elapsed_s, mib_per_s, crc); +} + +static int xfer_to_flow(int fd, FILE * in, size_t max_bytes) +{ + char buf[BUF_SIZE]; + size_t n; + size_t total; + size_t want; + size_t off; + ssize_t w; + uint64_t crc; + struct timespec t0; + struct timespec t1; + + total = 0; + crc = 0; + + clock_gettime(CLOCK_MONOTONIC, &t0); + + while (!stop) { + want = sizeof(buf); + if (max_bytes > 0 && max_bytes - total < want) + want = max_bytes - total; + if (want == 0) + break; + + n = fread(buf, 1, want, in); + if (n == 0) + break; + + crc64_nvme(&crc, buf, n); + + off = 0; + while (off < n) { + w = flow_write(fd, buf + off, n - off); + if (w < 0) { + fprintf(stderr, + "flow_write failed: %zd\n", w); + return 1; + } + off += (size_t) w; + total += (size_t) w; + } + } + + clock_gettime(CLOCK_MONOTONIC, &t1); + + if (ferror(in)) { + fprintf(stderr, "Input read error.\n"); + return 1; + } + + report_xfer("sent", total, crc, &t0, &t1); + return 0; +} + +static int xfer_from_flow(int fd, FILE * out) +{ + char buf[BUF_SIZE]; + size_t total; + ssize_t n; + uint64_t crc; + struct timespec timeout; + struct timespec t0; + struct timespec t1; + bool started; + + total = 0; + crc = 0; + started = false; + timeout.tv_sec = 1; + timeout.tv_nsec = 0; + + /* Short timeout so SIGTERM/SIGINT 'stop' is observed promptly. */ + fccntl(fd, FLOWSRCVTIMEO, &timeout); + + while (!stop) { + n = flow_read(fd, buf, sizeof(buf)); + if (n == 0) { + /* Clean EOF: peer sent EOS and we drained it. */ + clock_gettime(CLOCK_MONOTONIC, &t1); + fflush(out); + if (!started) + t0 = t1; + report_xfer("received", total, crc, &t0, &t1); + return 0; + } + if (n == -ETIMEDOUT) + continue; + if (n < 0) { + /* Peer aborted before EOS: partial transfer. */ + if (n == -EFLOWDOWN || n == -EFLOWPEER) { + fprintf(stderr, + "oftp: peer aborted at %zu B\n", + total); + return 2; + } + fprintf(stderr, + "flow_read failed: %zd\n", n); + return 1; + } + if (!started) { + clock_gettime(CLOCK_MONOTONIC, &t0); + started = true; + } + crc64_nvme(&crc, buf, (size_t) n); + if (fwrite(buf, 1, (size_t) n, out) != (size_t) n) { + fprintf(stderr, "Output write error.\n"); + return 1; + } + total += (size_t) n; + } + + /* Receiver was signalled (SIGINT/SIGTERM) before EOF. */ + fflush(out); + fprintf(stderr, "oftp: interrupted at %zu B\n", total); + return 2; +} + +static int server_main(const char * outpath) +{ + FILE * out = stdout; + int fd; + int ofd; + int rc; + qosspec_t qs; + + if (outpath != NULL) { + ofd = open(outpath, + O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW, + 0600); + if (ofd < 0) { + perror("open"); + return 1; + } + out = fdopen(ofd, "wb"); + if (out == NULL) { + perror("fdopen"); + close(ofd); + unlink(outpath); + return 1; + } + } + + fprintf(stderr, "oftp: listening...\n"); + + fd = flow_accept(&qs, NULL); + if (fd < 0) { + fprintf(stderr, "flow_accept failed: %d\n", fd); + if (out != stdout) + fclose(out); + return 1; + } + + if (qs.service != SVC_STREAM) { + fprintf(stderr, + "oftp: rejecting non-stream flow (service=%u)\n", + qs.service); + flow_dealloc(fd); + if (out != stdout) { + fclose(out); + unlink(outpath); + } + return 1; + } + + apply_rto_min_env(fd); + apply_stream_ring_sz_env(fd); + + rc = xfer_from_flow(fd, out); + + flow_dealloc(fd); + + if (out != stdout) { + fclose(out); + /* Drop the half-written file on abort/interrupt. */ + if (rc != 0) + unlink(outpath); + } + + return rc; +} + +static int client_main(const char * name, + const char * inpath, + size_t max_bytes) +{ + FILE * in; + int fd; + int rc; + qosspec_t qs; + + in = stdin; + qs = qos_stream; + + if (inpath != NULL) { + in = fopen(inpath, "rb"); + if (in == NULL) { + perror("fopen"); + return 1; + } + } + + fd = flow_alloc(name, &qs, NULL); + if (fd < 0) { + fprintf(stderr, "flow_alloc failed: %d\n", fd); + if (in != stdin) + fclose(in); + return 2; + } + + apply_rto_min_env(fd); + apply_stream_ring_sz_env(fd); + + rc = xfer_to_flow(fd, in, max_bytes); + + flow_dealloc(fd); + + if (in != stdin) + fclose(in); + + return rc; +} + +int main(int argc, char ** argv) +{ + bool server; + const char * name; + const char * inpath; + const char * outpath; + size_t max_bytes; + struct sigaction sa; + + server = false; + name = NULL; + inpath = NULL; + outpath = NULL; + max_bytes = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = on_signal; + sigaction(SIGINT, &sa, NULL); + sigaction(SIGTERM, &sa, NULL); + signal(SIGPIPE, SIG_IGN); + + argc--; argv++; + while (argc > 0) { + if (strcmp(*argv, "-l") == 0 || + strcmp(*argv, "--listen") == 0) { + server = true; + } else if ((strcmp(*argv, "-n") == 0 || + strcmp(*argv, "--name") == 0) && argc > 1) { + name = *(++argv); argc--; + } else if ((strcmp(*argv, "-i") == 0 || + strcmp(*argv, "--in") == 0) && argc > 1) { + inpath = *(++argv); argc--; + } else if ((strcmp(*argv, "-o") == 0 || + strcmp(*argv, "--out") == 0) && argc > 1) { + outpath = *(++argv); argc--; + } else if ((strcmp(*argv, "-N") == 0 || + strcmp(*argv, "--bytes") == 0) && argc > 1) { + if (parse_size(*(++argv), &max_bytes) < 0) { + fprintf(stderr, + "oftp: bad size '%s'\n", *argv); + return 1; + } + argc--; + } else if (strcmp(*argv, "--help") == 0) { + usage(); + return 0; + } else { + usage(); + return 1; + } + argc--; argv++; + } + + if (server) + return server_main(outpath); + + if (name == NULL) { + usage(); + return 1; + } + + return client_main(name, inpath, max_bytes); +} diff --git a/src/tools/operf/operf.c b/src/tools/operf/operf.c index 1872b351..0198e871 100644 --- a/src/tools/operf/operf.c +++ b/src/tools/operf/operf.c @@ -248,5 +248,5 @@ int main(int argc, char ** argv) if (ret < 0) exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); + exit(ret); } diff --git a/src/tools/operf/operf_client.c b/src/tools/operf/operf_client.c index 7e8f1a9b..e478aeff 100644 --- a/src/tools/operf/operf_client.c +++ b/src/tools/operf/operf_client.c @@ -185,7 +185,7 @@ int client_main(void) sigaction(SIGHUP, &sig_act, NULL) || sigaction(SIGPIPE, &sig_act, NULL)) { printf("Failed to install sighandler.\n"); - return -1; + return 2; } client.sent = 0; @@ -196,7 +196,7 @@ int client_main(void) fd = flow_alloc(client.server_name, NULL, NULL); if (fd < 0) { printf("Failed to allocate flow.\n"); - return -1; + return 2; } if (client.conf.test_type == TEST_TYPE_BI) @@ -207,7 +207,7 @@ int client_main(void) if (flow_write(fd, &client.conf, sizeof(client.conf)) < 0) { printf("Failed to send configuration.\n"); flow_dealloc(fd); - return -1; + return 1; } sleep(1); diff --git a/src/tools/oping/oping.c b/src/tools/oping/oping.c index 763c0d62..10e1e23c 100644 --- a/src/tools/oping/oping.c +++ b/src/tools/oping/oping.c @@ -60,7 +60,7 @@ #include <errno.h> #include <float.h> -#define OPING_BUF_SIZE 1500 +#define OPING_BUF_SIZE 16384 #define ECHO_REQUEST 0 #define ECHO_REPLY 1 #define OPING_MAX_FLOWS 256 @@ -81,8 +81,9 @@ " -F, --flood-busy Flood with busy-polling (lower latency)\n" \ " -i, --interval Interval (default 1000ms)\n" \ " -n, --server-name Name of the oping server\n" \ -" -q, --qos QoS (raw, best, video, voice, data)\n" \ +" -q, --qos QoS (raw, safe, rt, rt-safe, msg)\n" \ " -s, --size Payload size (B, default 64)\n" \ +" -W, --timeout Per-packet recv timeout, ms (default 2000)\n" \ " -Q, --quiet Only print final statistics\n" \ " -D, --timeofday Print time of day before each line\n" \ "\n" \ @@ -93,9 +94,11 @@ struct { int interval; uint32_t count; int size; + int timeout; /* per-packet recv timeout, ms */ bool timestamp; bool flood; bool flood_busy; + long duration; qosspec_t qs; /* stats */ @@ -175,18 +178,20 @@ int main(int argc, argc--; argv++; - client.s_apn = NULL; - client.interval = 1000; - client.size = 64; - client.count = INT_MAX; - client.timestamp = false; - client.flood = false; + client.s_apn = NULL; + client.interval = 1000; + client.size = 64; + client.count = INT_MAX; + client.timeout = 2000; + client.timestamp = false; + client.flood = false; client.flood_busy = false; - client.qs = qos_raw; - client.quiet = false; - server.quiet = false; - server.poll = false; - server.busy = false; + client.duration = 0; + client.qs = qos_raw; + client.quiet = false; + server.quiet = false; + server.poll = false; + server.busy = false; while (argc > 0) { if ((strcmp(*argv, "-i") == 0 || @@ -216,6 +221,12 @@ int main(int argc, argc > 1) { client.size = strtol(*(++argv), &rem, 10); --argc; + } else if ((strcmp(*argv, "-W") == 0 || + strcmp(*argv, "--timeout") == 0) && + argc > 1) { + client.timeout = strtol(*(++argv), &rem, 10); + client.timeout *= time_mul(rem); + --argc; } else if ((strcmp(*argv, "-q") == 0 || strcmp(*argv, "--qos") == 0) && argc > 1) { @@ -249,23 +260,25 @@ int main(int argc, } if (duration > 0) { - if (client.interval == 0) + if (client.flood || client.flood_busy) + client.duration = duration; + else if (client.interval == 0) client.count = duration * 10; else client.count = duration / client.interval; } if (qos != NULL) { - if (strcmp(qos, "best") == 0) - client.qs = qos_best_effort; - else if (strcmp(qos, "raw") == 0) + if (strcmp(qos, "raw") == 0) client.qs = qos_raw; - else if (strcmp(qos, "video") == 0) - client.qs = qos_video; - else if (strcmp(qos, "voice") == 0) - client.qs = qos_voice; - else if (strcmp(qos, "data") == 0) - client.qs = qos_data; + else if (strcmp(qos, "safe") == 0) + client.qs = qos_raw_safe; + else if (strcmp(qos, "rt") == 0) + client.qs = qos_rt; + else if (strcmp(qos, "rt-safe") == 0) + client.qs = qos_rt_safe; + else if (strcmp(qos, "msg") == 0) + client.qs = qos_msg; else printf("Unknown QoS cube, defaulting to raw.\n"); } @@ -298,7 +311,7 @@ int main(int argc, if (ret < 0) exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); + exit(ret); fail: usage(); diff --git a/src/tools/oping/oping_client.c b/src/tools/oping/oping_client.c index 23807f65..4b01315d 100644 --- a/src/tools/oping/oping_client.c +++ b/src/tools/oping/oping_client.c @@ -47,6 +47,7 @@ void shutdown_client(int signo, siginfo_t * info, void * c) case SIGINT: case SIGTERM: case SIGHUP: + case SIGALRM: stop = true; default: return; @@ -89,7 +90,7 @@ static void print_rtt(int len, int seq, void * reader(void * o) { - struct timespec timeout = {client.interval / 1000 + 2, 0}; + struct timespec timeout; struct timespec now = {0, 0}; struct timespec sent; @@ -100,6 +101,9 @@ void * reader(void * o) double ms = 0; uint32_t exp_id = 0; + timeout.tv_sec = client.timeout / 1000; + timeout.tv_nsec = (client.timeout % 1000) * MILLION; + fccntl(fd, FLOWSRCVTIMEO, &timeout); while (!stop && client.rcvd != client.count) { @@ -284,18 +288,15 @@ static int flood_busy_ping(int fd) msg->tv_sec = sent.tv_sec; msg->tv_nsec = sent.tv_nsec; - if (flow_write(fd, buf, - client.size) < 0) { - printf("Failed to send " - "packet.\n"); + if (flow_write(fd, buf, client.size) < 0) { + printf("Failed to send packet.\n"); break; } ++client.sent; do { - n = flow_read(fd, buf, - OPING_BUF_SIZE); + n = flow_read(fd, buf, OPING_BUF_SIZE); } while (n == -EAGAIN && !stop); if (n < 0) @@ -315,9 +316,7 @@ static int flood_busy_ping(int fd) update_rtt_stats(ms); if (!client.quiet) - print_rtt(client.size, - ntohl(msg->id), ms, - NULL); + print_rtt(client.size, ntohl(msg->id), ms, NULL); } return 0; @@ -371,9 +370,7 @@ static int flood_ping(int fd) update_rtt_stats(ms); if (!client.quiet) - print_rtt(client.size, - ntohl(msg->id), ms, - NULL); + print_rtt(client.size, ntohl(msg->id), ms, NULL); } return 0; @@ -404,25 +401,34 @@ static int client_main(void) if (sigaction(SIGINT, &sig_act, NULL) || sigaction(SIGTERM, &sig_act, NULL) || sigaction(SIGHUP, &sig_act, NULL) || - sigaction(SIGPIPE, &sig_act, NULL)) { + sigaction(SIGPIPE, &sig_act, NULL) || + sigaction(SIGALRM, &sig_act, NULL)) { printf("Failed to install sighandler.\n"); - return -1; + return 2; } if (client_init()) { printf("Failed to initialize client.\n"); - return -1; + return 2; } fd = flow_alloc(client.s_apn, &client.qs, NULL); if (fd < 0) { printf("Failed to allocate flow: %d.\n", fd); client_fini(); - return -1; + return 2; } fccntl(fd, FLOWSFLAGS, FLOWFRDWR | FLOWFRNOPART); + if (client.duration > 0) { + struct itimerval it; + memset(&it, 0, sizeof(it)); + it.it_value.tv_sec = client.duration / 1000; + it.it_value.tv_usec = (client.duration % 1000) * 1000; + setitimer(ITIMER_REAL, &it, NULL); + } + clock_gettime(CLOCK_REALTIME, &tic); if (client.flood_busy) @@ -439,5 +445,5 @@ static int client_main(void) flow_dealloc(fd); client_fini(); - return 0; + return client.rcvd == client.sent ? 0 : 1; } diff --git a/src/tools/oping/oping_server.c b/src/tools/oping/oping_server.c index 33af28c4..e98ca040 100644 --- a/src/tools/oping/oping_server.c +++ b/src/tools/oping/oping_server.c @@ -237,6 +237,14 @@ int server_main(void) return -1; } + if (pthread_mutex_init(&server.lock, NULL)) { + fqueue_destroy(server.fq); + fset_destroy(server.flows); + return -1; + } + + memset(server.times, 0, sizeof(server.times)); + pthread_create(&server.cleaner_pt, NULL, cleaner_thread, NULL); if (server.busy) { @@ -255,11 +263,13 @@ int server_main(void) pthread_cancel(server.cleaner_pt); - fset_destroy(server.flows); - fqueue_destroy(server.fq); - + /* Join cancellable threads before tearing down their fset. */ pthread_join(server.server_pt, NULL); pthread_join(server.cleaner_pt, NULL); + pthread_mutex_destroy(&server.lock); + fset_destroy(server.flows); + fqueue_destroy(server.fq); + return 0; } |
