diff options
149 files changed, 17703 insertions, 3332 deletions
diff --git a/.ci/woodpecker/10-build.yaml b/.ci/woodpecker/10-build.yaml index 0a82c469..31b9b9b4 100644 --- a/.ci/woodpecker/10-build.yaml +++ b/.ci/woodpecker/10-build.yaml @@ -88,13 +88,6 @@ steps: done done - for rxm_heap in TRUE FALSE; do - for rxm_block in TRUE FALSE; do - echo "--- HEAP=$rxm_heap BLOCKING=$rxm_block ---" - run_build \ - -DRXM_BUFFER_ON_HEAP=$rxm_heap \ - -DRXM_BLOCKING=$rxm_block - done - done + run_build @@ -1,3 +1,4 @@ *~ *# build/ +/tags diff --git a/CMakeLists.txt b/CMakeLists.txt index c886146d..bfabd711 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,5 +69,6 @@ add_subdirectory(src/ipcpd) add_subdirectory(src/tools) setup_coverage_target() include(doc) +include(tags) include(install) diff --git a/cmake/config/global.cmake b/cmake/config/global.cmake index 0ac256bb..1e172724 100644 --- a/cmake/config/global.cmake +++ b/cmake/config/global.cmake @@ -25,8 +25,16 @@ set(SHM_LOCKFILE_NAME "/${SHM_PREFIX}.lockfile" CACHE INTERNAL # Secure memory configuration set(IRMD_SECMEM_MAX 1048576 CACHE STRING "IRMd secure heap size") -set(PROC_SECMEM_MAX 1048576 CACHE STRING "Process secure heap size") -set(SECMEM_GUARD 32 CACHE STRING "Secure heap min size") +# ~8 KiB secure heap per encrypted flow (cur+prev node slabs); the total +# is rounded up to a power of two for the OpenSSL secure-heap allocator. 
+set(PROC_SECMEM_FLOWS 512 CACHE STRING + "Max concurrent encrypted flows the per-process secure heap is sized for") +math(EXPR PROC_SECMEM_NEED "${PROC_SECMEM_FLOWS} * 8192") +set(PROC_SECMEM_MAX 4096) +while(PROC_SECMEM_MAX LESS PROC_SECMEM_NEED) + math(EXPR PROC_SECMEM_MAX "${PROC_SECMEM_MAX} * 2") +endwhile() +set(SECMEM_MINSIZE 32 CACHE STRING "Secure heap min alloc size") # Container/deployment options set(BUILD_CONTAINER FALSE CACHE BOOL diff --git a/cmake/config/ipcp/broadcast.cmake b/cmake/config/ipcp/broadcast.cmake index 79f41d10..f521ed8e 100644 --- a/cmake/config/ipcp/broadcast.cmake +++ b/cmake/config/ipcp/broadcast.cmake @@ -4,3 +4,6 @@ set(IPCP_BROADCAST_TARGET ipcpd-broadcast) set(IPCP_BROADCAST_MPL 100 CACHE STRING "Default maximum packet lifetime for the Broadcast IPCP, in ms") + +set(IPCP_BROADCAST_MTU 1400 CACHE STRING + "Layer MTU advertised by the Broadcast IPCP, in bytes") diff --git a/cmake/config/ipcp/common.cmake b/cmake/config/ipcp/common.cmake index ffd5dc32..7dbc252b 100644 --- a/cmake/config/ipcp/common.cmake +++ b/cmake/config/ipcp/common.cmake @@ -41,3 +41,18 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux") set(IPCP_LINUX_TIMERSLACK_NS 100 CACHE STRING "Slack value for high resolution timers on Linux systems.") endif() + +# Per-flow statistics exposed via the RIB (requires FUSE). 
+if(HAVE_FUSE) + set(IPCP_FLOW_STATS TRUE CACHE BOOL + "Enable per-flow statistics via the RIB") + if(IPCP_FLOW_STATS) + message(STATUS "IPCP flow statistics enabled") + endif() + + set(IPCP_ETH_FLOW_STATS FALSE CACHE BOOL + "Enable ipcpd-eth flow statistics via RIB") + if(IPCP_ETH_FLOW_STATS) + message(STATUS "ipcpd-eth flow statistics enabled") + endif() +endif() diff --git a/cmake/config/ipcp/eth.cmake b/cmake/config/ipcp/eth.cmake index 4b9007d2..d336d647 100644 --- a/cmake/config/ipcp/eth.cmake +++ b/cmake/config/ipcp/eth.cmake @@ -10,6 +10,10 @@ set(IPCP_ETH_WR_THR 1 CACHE STRING "Number of writer threads in Ethernet IPCP") set(IPCP_ETH_QDISC_BYPASS false CACHE BOOL "Bypass the Qdisc in the kernel when using raw sockets") +set(IPCP_ETH_SNDBUF 0 CACHE STRING + "Raw socket SO_SNDBUF in bytes; 0 = leave kernel default (wmem_default)") +set(IPCP_ETH_RCVBUF 0 CACHE STRING + "Raw socket SO_RCVBUF in bytes; 0 = leave kernel default (rmem_default)") set(IPCP_ETH_LO_MTU 9000 CACHE STRING "Restrict Ethernet MTU over loopback interfaces") set(IPCP_ETH_MGMT_FRAME_SIZE 9000 CACHE STRING diff --git a/cmake/config/ipcp/local.cmake b/cmake/config/ipcp/local.cmake index 88ee8998..70423cd1 100644 --- a/cmake/config/ipcp/local.cmake +++ b/cmake/config/ipcp/local.cmake @@ -2,8 +2,38 @@ set(IPCP_LOCAL_TARGET ipcpd-local) -set(IPCP_LOCAL_MPL 100 CACHE STRING +set(IPCP_LOCAL_MPL 50 CACHE STRING "Default maximum packet lifetime for the Local IPCP, in ms") +set(IPCP_LOCAL_MTU 65000 CACHE STRING + "Layer MTU advertised by the Local IPCP, in bytes") + set(IPCP_LOCAL_POLLING FALSE CACHE BOOL "Enable active polling in the Local IPCP for low-latency mode") + +# IPCP_LOCAL_MTU must fit in the largest enabled GSPP and PUP class +# (sender-side allocation: daemons use GSPP, apps use PUP). Reserve a +# margin for sizeof(struct ssm_pk_buff) + HEADSPACE + TAILSPACE. 
+math(EXPR _ssm_pk_overhead + "${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE} + 64") + +foreach(_pool GSPP PUP) + set(_largest 0) + foreach(_pair "256;256" "512;512" "1K;1024" "2K;2048" "4K;4096" + "16K;16384" "64K;65536" "256K;262144" "1M;1048576") + list(GET _pair 0 _name) + list(GET _pair 1 _bytes) + if(SSM_${_pool}_${_name}_BLOCKS GREATER 0 + AND _bytes GREATER _largest) + set(_largest ${_bytes}) + endif() + endforeach() + math(EXPR _avail "${_largest} - ${_ssm_pk_overhead}") + if(IPCP_LOCAL_MTU GREATER _avail) + message(FATAL_ERROR + "IPCP_LOCAL_MTU (${IPCP_LOCAL_MTU}) exceeds largest enabled " + "SSM_${_pool} class minus per-block overhead " + "(${_largest} - ${_ssm_pk_overhead} = ${_avail} bytes). " + "Lower IPCP_LOCAL_MTU or enable a larger SSM_${_pool}_*_BLOCKS.") + endif() +endforeach() diff --git a/cmake/config/ipcp/udp.cmake b/cmake/config/ipcp/udp.cmake index 0124c261..af84a844 100644 --- a/cmake/config/ipcp/udp.cmake +++ b/cmake/config/ipcp/udp.cmake @@ -10,3 +10,7 @@ set(IPCP_UDP_WR_THR 3 CACHE STRING "Number of writer threads in UDP IPCPs") set(IPCP_UDP_MPL 5000 CACHE STRING "Default maximum packet lifetime for the UDP IPCPs, in ms") +set(IPCP_UDP4_MTU 1472 CACHE STRING + "Fallback UDP4 layer MTU when getsockopt(IP_MTU) is unavailable, in bytes") +set(IPCP_UDP6_MTU 1452 CACHE STRING + "Fallback UDP6 layer MTU when getsockopt(IPV6_MTU) is unavailable, in bytes") diff --git a/cmake/config/ipcp/unicast.cmake b/cmake/config/ipcp/unicast.cmake index 3b5b0ce7..b8d4d516 100644 --- a/cmake/config/ipcp/unicast.cmake +++ b/cmake/config/ipcp/unicast.cmake @@ -4,6 +4,8 @@ set(IPCP_UNICAST_TARGET ipcpd-unicast) set(IPCP_UNICAST_MPL 100 CACHE STRING "Default maximum packet lifetime for the Unicast IPCP, in ms") +set(IPCP_UNICAST_MTU 1400 CACHE STRING + "Layer MTU advertised by the Unicast IPCP, in bytes (TODO: derive per-flow from n-1 path MTU minus DT PCI)") set(PFT_SIZE 256 CACHE STRING "Prefix forwarding table size for the Unicast IPCP") diff --git 
a/cmake/config/irmd.cmake b/cmake/config/irmd.cmake index b86a40c5..79e24bae 100644 --- a/cmake/config/irmd.cmake +++ b/cmake/config/irmd.cmake @@ -10,8 +10,8 @@ set(ENROLL_TIMEOUT 20000 CACHE STRING "Timeout for an IPCP to enroll (ms)") set(REG_TIMEOUT 20000 CACHE STRING "Timeout for registering a name (ms)") -set(QUERY_TIMEOUT 200 CACHE STRING - "Timeout to query a name with an IPCP (ms)") +set(QUERY_TIMEOUT 2000 CACHE STRING + "Timeout to query a name with an IPCP (ms); must exceed shim retry budget") set(CONNECT_TIMEOUT 20000 CACHE STRING "Timeout to connect an IPCP to another IPCP (ms)") set(FLOW_ALLOC_TIMEOUT 20000 CACHE STRING @@ -20,6 +20,12 @@ set(FLOW_ALLOC_TIMEOUT 20000 CACHE STRING # OAP (Ouroboros Authentication Protocol) set(OAP_REPLAY_TIMER 20 CACHE STRING "OAP replay protection window (s)") +set(OAP_REPLAY_MAX 4096 CACHE STRING + "Maximum entries in the OAP replay cache (bounds memory/CPU under flood)") +set(OAP_REKEY_TIMER 120 CACHE STRING + "Tier-2 re-key interval (s); bounds key age / PCS healing, 0 disables") +set(OAP_CLIENT_AUTH_DEFAULT TRUE CACHE BOOL + "Client requires the server to authenticate by default") set(DEBUG_PROTO_OAP FALSE CACHE BOOL "Add Flow allocation protocol message output to IRMd debug logging") diff --git a/cmake/config/lib.cmake b/cmake/config/lib.cmake index 287f30dc..81a7d6ba 100644 --- a/cmake/config/lib.cmake +++ b/cmake/config/lib.cmake @@ -4,11 +4,11 @@ # Flow limits set(SYS_MAX_FLOWS 10240 CACHE STRING "Maximum number of total flows for this system") -set(PROG_MAX_FLOWS 4096 CACHE STRING +set(PROC_MAX_FLOWS 4096 CACHE STRING "Maximum number of flows in an application") -set(PROG_RES_FDS 64 CACHE STRING +set(PROC_RES_FDS 64 CACHE STRING "Number of reserved flow descriptors per application") -set(PROG_MAX_FQUEUES 32 CACHE STRING +set(PROC_MAX_FQUEUES 32 CACHE STRING "Maximum number of flow sets per application") # Threading @@ -28,18 +28,28 @@ set(SOCKET_TIMEOUT 500 CACHE STRING set(QOS_DISABLE_CRC TRUE CACHE BOOL 
"Ignores ber setting on all QoS cubes") -# Delta-t protocol timers -set(DELTA_T_MPL 60 CACHE STRING - "Maximum packet lifetime (s)") -set(DELTA_T_ACK 10 CACHE STRING - "Maximum time to acknowledge a packet (s)") -set(DELTA_T_RTX 120 CACHE STRING - "Maximum time to retransmit a packet (s)") +include(utils/CPUUtils) +detect_pclmul() +detect_pmull() +if(HAVE_PCLMUL) + message(STATUS "CRC-64/NVMe backend: PCLMUL (x86 SSE4.1+PCLMUL)") +elseif(HAVE_PMULL) + message(STATUS "CRC-64/NVMe backend: PMULL (aarch64 crypto)") +else() + message(STATUS "CRC-64/NVMe backend: byte table (no acceleration)") +endif() + +# Delta-t protocol timers (Watson bound: 3*MPL + A + R). +# MPL is reported per IPCP (IPCP_*_MPL); A and R are FRCT-wide. +set(DELTA_T_ACK 1000 CACHE STRING + "Maximum time to acknowledge a packet (ms)") +set(DELTA_T_RTX 30000 CACHE STRING + "Maximum time to retransmit a packet (ms)") # FRCT configuration -set(FRCT_REORDER_QUEUE_SIZE 256 CACHE STRING +set(FRCT_REORDER_QUEUE_SIZE 128 CACHE STRING "Size of the reordering queue, must be a power of 2") -set(FRCT_START_WINDOW 64 CACHE STRING +set(FRCT_START_WINDOW 128 CACHE STRING "Start window, must be a power of 2") set(FRCT_LINUX_RTT_ESTIMATOR TRUE CACHE BOOL "Use Linux RTT estimator formula instead of the TCP RFC formula") @@ -48,15 +58,13 @@ set(FRCT_RTO_MDEV_MULTIPLIER 2 CACHE STRING set(FRCT_RTO_INC_FACTOR 0 CACHE STRING "Divisor for RTO increase after timeout: RTO += RTX >> X, 0: Karn/Partridge") set(FRCT_RTO_MIN 250 CACHE STRING - "Minimum Retransmission Timeout (RTO) for FRCT (us)") + "Hard floor for Retransmission Timeout (RTO) for FRCT (us)") set(FRCT_TICK_TIME 5000 CACHE STRING "Tick time for FRCT activity (retransmission, acknowledgments) (us)") +set(FRCT_DEBUG_STDOUT FALSE CACHE BOOL + "Print FRCT final counters to stdout at flow teardown") # Retransmission (RXM) configuration -set(RXM_BUFFER_ON_HEAP FALSE CACHE BOOL - "Store packets for retransmission on the heap instead of in packet buffer") 
-set(RXM_BLOCKING TRUE CACHE BOOL - "Use blocking writes for retransmission") set(RXM_MIN_RESOLUTION 20 CACHE STRING "Minimum retransmission delay (ns), as a power to 2") set(RXM_WHEEL_MULTIPLIER 4 CACHE STRING @@ -79,8 +87,44 @@ set(TPM_DEBUG_ABORT_TIMEOUT 0 CACHE STRING "TPM abort process after a thread reaches this timeout (s), 0 disables") # Encryption -set(KEY_ROTATION_BIT 20 CACHE STRING - "Bit position in packet counter that triggers key rotation (default 20 = every 2^20 packets)") +set(KEY_LEAF_BITS 20 CACHE STRING + "Packets per leaf key as a power of two (2^20 = AEAD-safe default)") +set(KEY_NODE_BITS 6 CACHE STRING + "Leaf keys per node key, power of two (2^6 = 64; leak compartment)") +set(KEY_NODE_COUNT 128 CACHE STRING + "Node keys per batch (N); <= 4096, the 12-bit on-wire node index") +set(KEY_REKEY_WATERMARK 4 CACHE STRING + "Re-key when this many node keys remain; 0 disables the count trigger") +set(KEY_REPLAY_WINDOW 2048 CACHE STRING + "RX replay window in packets; power of two, >= 128") +set(KEY_REKEY_WM_CHECK_BITS 16 CACHE STRING + "Re-key watermark is consulted once per 2^n flow writes") +if(NOT KEY_REPLAY_WINDOW MATCHES "^[0-9]+$") + message(FATAL_ERROR "KEY_REPLAY_WINDOW must be a positive integer") +endif() +math(EXPR _krw_p2 "${KEY_REPLAY_WINDOW} & (${KEY_REPLAY_WINDOW} - 1)") +if(KEY_REPLAY_WINDOW LESS 128 OR NOT _krw_p2 EQUAL 0) + message(FATAL_ERROR "KEY_REPLAY_WINDOW must be a power of two >= 128") +endif() + +# Re-key must finish within its lead window - KEY_REKEY_WATERMARK node keys +# worth of packets - before the batch exhausts and TX fails closed. dev.c only +# evaluates the watermark once per FLOW_WM_CHECK writes, so a lead below ~2x +# that leaves a high-rate flow no room to complete the exchange. Production +# defaults are vast; this guards under-sized (test) geometries. 
+if(KEY_REKEY_WATERMARK GREATER 0) + math(EXPR _rk_wm_check "1 << ${KEY_REKEY_WM_CHECK_BITS}") + math(EXPR _rk_lead + "${KEY_REKEY_WATERMARK} << (${KEY_LEAF_BITS} + ${KEY_NODE_BITS})") + math(EXPR _rk_min "2 * ${_rk_wm_check}") + if(_rk_lead LESS _rk_min) + message(WARNING + "Re-key lead is ${_rk_lead} packets vs the watermark check interval " + "${_rk_wm_check}; a high-rate flow may exhaust its key batch before the " + "re-key completes (TX fails closed until it does). Raise KEY_LEAF_BITS, " + "KEY_NODE_BITS, or KEY_REKEY_WATERMARK.") + endif() +endif() # Flow statistics (requires FUSE) if(HAVE_FUSE) @@ -92,3 +136,4 @@ if(HAVE_FUSE) message(STATUS "Application flow statistics disabled") endif() endif() + diff --git a/cmake/config/ssm.cmake b/cmake/config/ssm.cmake index c1f34655..589171ea 100644 --- a/cmake/config/ssm.cmake +++ b/cmake/config/ssm.cmake @@ -15,14 +15,12 @@ set(SSM_PUP_NAME_FMT "/${SSM_PREFIX}.pup.%d" CACHE INTERNAL # Packet buffer configuration set(SSM_POOL_NAME "/${SHM_PREFIX}.pool" CACHE INTERNAL "Name for the main POSIX shared memory pool") -set(SSM_POOL_BLOCKS 16384 CACHE STRING - "Number of blocks in SSM packet pool, must be a power of 2") set(SSM_PK_BUFF_HEADSPACE 256 CACHE STRING "Bytes of headspace to reserve for future headers") set(SSM_PK_BUFF_TAILSPACE 32 CACHE STRING "Bytes of tailspace to reserve for future tails") set(SSM_RBUFF_SIZE 1024 CACHE STRING - "Number of blocks in rbuff buffer, must be a power of 2") + "Number of slots in a flow's rbuff ring; must be a power of 2") set(SSM_RBUFF_PREFIX "/${SHM_PREFIX}.rbuff." CACHE INTERNAL "Prefix for rbuff POSIX shared memory filenames") set(SSM_FLOW_SET_PREFIX "/${SHM_PREFIX}.set." CACHE INTERNAL @@ -31,12 +29,14 @@ set(SSM_FLOW_SET_PREFIX "/${SHM_PREFIX}.set." 
CACHE INTERNAL # Number of shards per size class for reducing contention set(SSM_POOL_SHARDS 4 CACHE STRING "Number of allocator shards per size class") +set(SSM_POOL_RECLAIM_AGE_S 60 CACHE STRING + "Minimum age in seconds before a block is presumed stale and reclaimed") # Global Shared Packet Pool (GSPP) - for privileged processes # Shared by all processes in 'ouroboros' group (~60 MB total) set(SSM_GSPP_256_BLOCKS 1024 CACHE STRING "GSPP: Number of 256B blocks") -set(SSM_GSPP_512_BLOCKS 768 CACHE STRING +set(SSM_GSPP_512_BLOCKS 2048 CACHE STRING "GSPP: Number of 512B blocks") set(SSM_GSPP_1K_BLOCKS 512 CACHE STRING "GSPP: Number of 1KB blocks") @@ -55,13 +55,13 @@ set(SSM_GSPP_1M_BLOCKS 16 CACHE STRING # Per-User Pool (PUP) - for unprivileged applications # Each unprivileged app gets its own smaller pool (~7.5 MB total) -set(SSM_PUP_256_BLOCKS 128 CACHE STRING +set(SSM_PUP_256_BLOCKS 512 CACHE STRING "PUP: Number of 256B blocks") -set(SSM_PUP_512_BLOCKS 96 CACHE STRING +set(SSM_PUP_512_BLOCKS 512 CACHE STRING "PUP: Number of 512B blocks") -set(SSM_PUP_1K_BLOCKS 64 CACHE STRING +set(SSM_PUP_1K_BLOCKS 512 CACHE STRING "PUP: Number of 1KB blocks") -set(SSM_PUP_2K_BLOCKS 48 CACHE STRING +set(SSM_PUP_2K_BLOCKS 512 CACHE STRING "PUP: Number of 2KB blocks") set(SSM_PUP_4K_BLOCKS 32 CACHE STRING "PUP: Number of 4KB blocks") @@ -74,6 +74,23 @@ set(SSM_PUP_256K_BLOCKS 2 CACHE STRING set(SSM_PUP_1M_BLOCKS 0 CACHE STRING "PUP: Number of 1MB blocks") +# Zero classes too small for spb header + HEADSPACE + TAILSPACE + 1 B. 
+math(EXPR _SSM_MIN_USEFUL_CLASS + "32 + ${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE} + 1") # header + reserved space + 1 B payload +foreach(_pair "256:256" "512:512" "1K:1024" "2K:2048") + string(REPLACE ":" ";" _p "${_pair}") + list(GET _p 0 _suffix) + list(GET _p 1 _size) + if(_size LESS _SSM_MIN_USEFUL_CLASS) # class cannot hold even one payload byte + set(SSM_GSPP_${_suffix}_BLOCKS 0) + set(SSM_PUP_${_suffix}_BLOCKS 0) + endif() +endforeach() +unset(_SSM_MIN_USEFUL_CLASS) +unset(_p) +unset(_suffix) +unset(_size) + # SSM pool size calculations include(utils/HumanReadable) @@ -129,3 +146,23 @@ message(STATUS " Blocks: ${SSM_PUP_256_BLOCKS}, ${SSM_PUP_512_BLOCKS}, " "${SSM_PUP_1K_BLOCKS}, ${SSM_PUP_2K_BLOCKS}, ${SSM_PUP_4K_BLOCKS}, " "${SSM_PUP_16K_BLOCKS}, ${SSM_PUP_64K_BLOCKS}, ${SSM_PUP_256K_BLOCKS}, " "${SSM_PUP_1M_BLOCKS}") + +# FRCT reorder queue must fit in every enabled size class. If RQ_SIZE +# >= any backing pool, the receiver advertises a window the pool +# cannot back; np1_flow_write fails under load and a single dropped +# fragment wedges the flow. Auto-zeroed classes are skipped. 
+foreach(_class 256 512 1K 2K) + if(SSM_PUP_${_class}_BLOCKS GREATER 0 + AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_PUP_${_class}_BLOCKS) + message(FATAL_ERROR + "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be " + "< SSM_PUP_${_class}_BLOCKS (${SSM_PUP_${_class}_BLOCKS}): " + "the FC window cannot exceed the pool that backs OOO stashing.") + endif() + if(SSM_GSPP_${_class}_BLOCKS GREATER 0 + AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_GSPP_${_class}_BLOCKS) + message(FATAL_ERROR + "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be " + "< SSM_GSPP_${_class}_BLOCKS (${SSM_GSPP_${_class}_BLOCKS}).") + endif() +endforeach() diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 109fe1d6..ff44ad68 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -7,6 +7,7 @@ include(dependencies/system/libraries) include(dependencies/system/explicit_bzero) include(dependencies/system/robustmutex) include(dependencies/system/fuse) +include(dependencies/system/liburcu) include(dependencies/system/sysrandom) # Cryptography diff --git a/cmake/dependencies/system/liburcu.cmake b/cmake/dependencies/system/liburcu.cmake new file mode 100644 index 00000000..89a7ab12 --- /dev/null +++ b/cmake/dependencies/system/liburcu.cmake @@ -0,0 +1,45 @@ +# Userspace RCU (liburcu) - optional. Enables lock-free data-plane key +# rotation; absent => per-flow rwlock fallback. The "bulletproof" flavour +# (urcu-bp) auto-registers reader threads, so application threads need no +# RCU lifecycle plumbing. 
+if(PkgConfig_FOUND) + pkg_check_modules(URCU_PKG QUIET IMPORTED_TARGET liburcu-bp) + if(URCU_PKG_FOUND AND NOT TARGET Urcu::Urcu) + add_library(Urcu::Urcu ALIAS PkgConfig::URCU_PKG) + endif() +endif() + +if(NOT URCU_PKG_FOUND) # manual fallback; find_* take no QUIET option (it would be read as a search path) + find_library(URCU_BP_LIBRARY urcu-bp) + find_library(URCU_COMMON_LIBRARY urcu-common) + find_path(URCU_INCLUDE_DIR urcu-bp.h) + if(URCU_BP_LIBRARY AND URCU_COMMON_LIBRARY AND URCU_INCLUDE_DIR) + set(URCU_PKG_FOUND TRUE) + if(NOT TARGET Urcu::Urcu) + add_library(Urcu::Urcu INTERFACE IMPORTED) + set_target_properties(Urcu::Urcu PROPERTIES + INTERFACE_LINK_LIBRARIES "${URCU_BP_LIBRARY};${URCU_COMMON_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${URCU_INCLUDE_DIR}") + endif() + endif() +endif() + +if(URCU_PKG_FOUND) + set(DISABLE_LIBURCU FALSE CACHE BOOL "Disable liburcu (RCU) support") + if(NOT DISABLE_LIBURCU) + if(URCU_PKG_VERSION) + message(STATUS "liburcu (RCU) support enabled (version ${URCU_PKG_VERSION})") + else() + message(STATUS "liburcu (RCU) support enabled") + endif() + set(HAVE_LIBURCU TRUE CACHE INTERNAL "Userspace RCU (liburcu) available") + else() + message(STATUS "liburcu (RCU) support disabled by user") + unset(HAVE_LIBURCU CACHE) + endif() +else() + message(STATUS "Install liburcu (urcu-bp) for lock-free data-plane re-keying") + unset(HAVE_LIBURCU CACHE) +endif() + +mark_as_advanced(URCU_BP_LIBRARY URCU_COMMON_LIBRARY URCU_INCLUDE_DIR) diff --git a/cmake/tags.cmake b/cmake/tags.cmake new file mode 100644 index 00000000..00e6f0d6 --- /dev/null +++ b/cmake/tags.cmake @@ -0,0 +1,21 @@ +find_program(CTAGS_EXECUTABLE + NAMES ctags-universal universal-ctags ctags + DOC "Generate a ctags index for source navigation: make tags") +mark_as_advanced(CTAGS_EXECUTABLE) + +if(CTAGS_EXECUTABLE) + add_custom_target(tags + COMMAND ${CTAGS_EXECUTABLE} + -R + --languages=C + --c-kinds=+p + --fields=+S + --exclude=build + --exclude=build-claude + --exclude=build_tmp + --exclude=.git + -f ${CMAKE_SOURCE_DIR}/tags 
${CMAKE_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMENT "Generating ctags index at ${CMAKE_SOURCE_DIR}/tags") +endif() diff --git a/cmake/utils/CPUUtils.cmake b/cmake/utils/CPUUtils.cmake new file mode 100644 index 00000000..8ca7683a --- /dev/null +++ b/cmake/utils/CPUUtils.cmake @@ -0,0 +1,82 @@ +include(CheckCSourceRuns) + +# Compile + run a probe so we only enable a feature the host CPU +# actually implements (toolchains accept flags the silicon may lack). +# Cross-compile without an emulator: feature off. +function(detect_cpu_feature _result_var _flags _source) + set(_save_flags "${CMAKE_REQUIRED_FLAGS}") + set(_save_quiet "${CMAKE_REQUIRED_QUIET}") + set(CMAKE_REQUIRED_FLAGS "${_save_flags} ${_flags}") + set(CMAKE_REQUIRED_QUIET TRUE) + if(CMAKE_CROSSCOMPILING AND NOT CMAKE_CROSSCOMPILING_EMULATOR) + set(${_result_var} FALSE CACHE INTERNAL + "${_result_var} (cross-compile without emulator: off)") + else() + check_c_source_runs("${_source}" ${_result_var}) + endif() + set(CMAKE_REQUIRED_FLAGS "${_save_flags}") + set(CMAKE_REQUIRED_QUIET "${_save_quiet}") +endfunction() + +# x86 PCLMULQDQ + SSE4.1. argc-derived input defeats constant folding; +# SIGILL handler exits cleanly so the kernel skips the core dump. 
+function(detect_pclmul) + detect_cpu_feature(_HAVE_PCLMUL "-mpclmul" +"#include <wmmintrin.h> +#include <signal.h> +#include <unistd.h> +static void on_sigill(int sig) { (void) sig; _exit(1); } +int main(int argc, char ** argv) { + __m128i a; + __m128i b; + (void) argv; + signal(SIGILL, on_sigill); + a = _mm_set1_epi32(argc); + b = _mm_clmulepi64_si128(a, a, 0); + return _mm_cvtsi128_si32(b) & 0; +}") + detect_cpu_feature(_HAVE_SSE41 "-msse4.1" +"#include <smmintrin.h> +#include <signal.h> +#include <unistd.h> +static void on_sigill(int sig) { (void) sig; _exit(1); } +int main(int argc, char ** argv) { + __m128i a; + (void) argv; + signal(SIGILL, on_sigill); + a = _mm_set1_epi32(argc); + return _mm_extract_epi32(a, 0) & 0; +}") + if(_HAVE_PCLMUL AND _HAVE_SSE41) + set(HAVE_PCLMUL TRUE CACHE INTERNAL + "x86 PCLMUL + SSE4.1 intrinsics available") + else() + unset(HAVE_PCLMUL CACHE) + endif() +endfunction() + +# aarch64 FEAT_PMULL (vmull_p64). Pi 4's BCM2711 accepts +crypto at +# compile time but lacks the hardware — the runtime probe catches that. +function(detect_pmull) + detect_cpu_feature(_HAVE_PMULL "-march=armv8-a+crypto" +"#include <arm_neon.h> +#include <signal.h> +#include <stdint.h> +#include <unistd.h> +static void on_sigill(int sig) { (void) sig; _exit(1); } +int main(int argc, char ** argv) { + poly64_t a; + poly128_t c; + (void) argv; + signal(SIGILL, on_sigill); + a = (poly64_t) (uint64_t) argc; + c = vmull_p64(a, a); + return (int) (vgetq_lane_u64((uint64x2_t) c, 0) & 0); +}") + if(_HAVE_PMULL) + set(HAVE_PMULL TRUE CACHE INTERNAL + "aarch64 PMULL intrinsics available") + else() + unset(HAVE_PMULL CACHE) + endif() +endfunction() diff --git a/doc/man/flow_alloc.3 b/doc/man/flow_alloc.3 index dbe5323c..8a9b5f5b 100644 --- a/doc/man/flow_alloc.3 +++ b/doc/man/flow_alloc.3 @@ -62,10 +62,60 @@ The \fBflow_dealloc\fR() function will release any resources associated with the flow. 
This call may block and keep reliable flows active until all packets are acknowledged. -A \fBqosspec_t\fR specifies the following QoS characteristics of a -flow: - -TODO: specify a qosspec_t +A \fBqosspec_t\fR specifies the QoS characteristics of a flow. +The fields are: + +.TP +\fBdelay\fR (ms) +Maximum one-way delay. +.TP +\fBbandwidth\fR (bits/s) +Minimum bandwidth. +.TP +\fBavailability\fR +Class of 9s (e.g. 5 = 99.999%). +.TP +\fBloss\fR +Tolerated packet loss; 0 selects reliable delivery. +.TP +\fBber\fR +Tolerated bit error rate (errors per billion bits); 0 enables an +end-to-end integrity check (corrupted packets are dropped). +.TP +\fBservice\fR +Framing / reliability class: \fBSVC_RAW\fR (0) disables FRCT; +\fBSVC_MESSAGE\fR (1) preserves SDU boundaries; \fBSVC_STREAM\fR (2) is +a byte stream with no SDU boundaries. \fBSVC_STREAM\fR requires +\fIloss\fR = 0; otherwise +\fBflow_alloc\fR()/\fBflow_accept\fR() returns \fB-EINVAL\fR. +.TP +\fBmax_gap\fR (ms) +Maximum tolerated inter-packet gap. Packets exceeding the gap +budget are dropped under the real-time cubes. +.TP +\fBtimeout\fR (ms) +Peer-liveness timeout; 0 disables. Only applies when FRCT is +enabled (service > 0). + +.PP +The library provides predefined cubes: + +.TP +\fBqos_raw\fR +No guarantees, no integrity check. +.TP +\fBqos_raw_safe\fR +Best-effort with end-to-end integrity (ber = 0). +.TP +\fBqos_rt\fR / \fBqos_rt_safe\fR +Real-time messages, optimised for latency over reliability; +\fBqos_rt_safe\fR adds an end-to-end integrity check. +.TP +\fBqos_msg\fR +Reliable, SDU-preserving delivery. +.TP +\fBqos_stream\fR +Reliable byte stream; no SDU boundaries are preserved. 
.SH RETURN VALUE @@ -117,13 +167,39 @@ _ \fBflow_dealloc\fR() & Thread safety & MT-Safe .TE +.SH NOTES +The returned file descriptor is subject to a single-reader and +single-writer discipline \(em at most one thread may call +.BR flow_read () +(or monitor the fd via +.BR fevent ()) +and at most one thread may call +.BR flow_write () +concurrently. See +.BR flow_read (3), +.BR flow_write (3), +and +.BR fevent (3) +for details. +.PP +.BR flow_dealloc () +must not be called concurrently with any thread that is inside +.BR flow_read (), +.BR flow_write (), +.BR fevent (), +or any other Ouroboros library call on the same fd; the result is +undefined behaviour. Applications must serialise teardown with +in-flight use, e.g. by signalling worker threads to drop the fd +before calling +.BR flow_dealloc (). + .SH TERMINOLOGY Please see \fBouroboros-glossary\fR(7). .SH SEE ALSO -.BR fccntl "(3), " flow_read "(3), " fqueue "(3), " fset "(3), " \ -ouroboros (8) +.BR fccntl "(3), " fevent "(3), " flow_read "(3), " flow_write "(3), " \ +fqueue "(3), " fset "(3), " ouroboros (8) .SH COLOPHON This page is part of the Ouroboros project, found at diff --git a/doc/man/flow_read.3 b/doc/man/flow_read.3 index acc1f61e..d4a5e883 100644 --- a/doc/man/flow_read.3 +++ b/doc/man/flow_read.3 @@ -39,8 +39,7 @@ end of the datagram. On success, \fBflow_write\fR() returns the number of bytes written. On failure, a negative value indicating the error will be returned. -Partial writes needs to be explicitly enabled. Passing a -NULL pointer for \fIbuf\fR returns 0 with no other effects. +Passing a NULL pointer for \fIbuf\fR returns 0 with no other effects. .SH ERRORS .B -EINVAL @@ -62,7 +61,8 @@ The flow has been reported down. The flow's peer is unresponsive (flow timed out). .B -EMSGSIZE -The buffer was too large to be written. +The received packet does not fit in the caller's buffer and partial +reads are disabled (see \fBfccntl\fR(3), \fBFLOWFRNOPART\fR). 
.SH ATTRIBUTES @@ -74,11 +74,47 @@ LB|LB|LB L|L|L. Interface & Attribute & Value _ -\fBflow_read\fR() & Thread safety & MT-Safe +\fBflow_read\fR() & Thread safety & MT-Safe race:fd _ -\fBflow_write\fR() & Thread safety & MT-Safe +\fBflow_write\fR() & Thread safety & MT-Safe race:fd .TE +.SH THREAD SAFETY +Only one thread may call +.BR flow_read () +on a given file descriptor at any time. Partial-read state kept +across calls assumes a single logical reader; two threads racing +.BR flow_read () +on the same fd is undefined behaviour. Likewise, only one thread +may call +.BR flow_write () +on a given fd at a time; two writer threads on the same fd is +undefined behaviour. +.PP +Combining a writer thread with a reader thread (one thread calling +.BR flow_write (), +another calling +.BR flow_read () +or +.BR fevent ()) +is permitted and safe. The writer does not need a dedicated reader +thread \(em when the FRCT send window fills, +.BR flow_write () +drives its own inbound rx draining internally to process incoming +ACKs and reopen the window, clamped by the caller's +.BR fccntl (3) +send-timeout if any. +.PP +Monitoring the same fd via +.BR fevent () +from a different thread is well-defined but races: events reported +by +.BR fevent () +may already have been consumed by the racing +.BR flow_read (), +so the second reader may then block. See +.BR fevent (3). + .SH TERMINOLOGY Please see \fBouroboros-glossary\fR(7). diff --git a/doc/man/fqueue.3 b/doc/man/fqueue.3 index 72a0bc25..f2fb8c9f 100644 --- a/doc/man/fqueue.3 +++ b/doc/man/fqueue.3 @@ -116,6 +116,27 @@ _ \fBfevent\fR() & Thread safety & MT-Safe .TE +.SH THREAD SAFETY +.BR fevent () +and +.BR flow_read () +on the same fd from distinct threads is well-defined but races: +events reported by +.BR fevent () +may already have been consumed by the racing +.BR flow_read (), +so the reader may then block. Same shape as +.BR select (2) ++ +.BR read (2) +from distinct threads. 
The intended pattern is that the thread +invoking +.BR fevent () +is the same thread that calls +.BR flow_read () +on the fds returned by +.BR fqueue_next (). + .SH TERMINOLOGY Please see \fBouroboros-glossary\fR(7). diff --git a/include/ouroboros/atomics.h b/include/ouroboros/atomics.h new file mode 100644 index 00000000..8e667522 --- /dev/null +++ b/include/ouroboros/atomics.h @@ -0,0 +1,39 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Atomic helpers + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. 
+ */ + +#ifndef OUROBOROS_LIB_ATOMICS_H +#define OUROBOROS_LIB_ATOMICS_H + +#define LOAD_RELAXED(p) (__atomic_load_n(p, __ATOMIC_RELAXED)) +#define LOAD_ACQUIRE(p) (__atomic_load_n(p, __ATOMIC_ACQUIRE)) +#define LOAD(p) (__atomic_load_n(p, __ATOMIC_SEQ_CST)) + +#define STORE_RELAXED(p, v) (__atomic_store_n(p, v, __ATOMIC_RELAXED)) +#define STORE_RELEASE(p, v) (__atomic_store_n(p, v, __ATOMIC_RELEASE)) +#define STORE(p, v) (__atomic_store_n(p, v, __ATOMIC_SEQ_CST)) + +#define FETCH_ADD_RELAXED(p, v) (__atomic_fetch_add(p, v, __ATOMIC_RELAXED)) +#define FETCH_SUB_RELAXED(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_RELAXED)) +#define FETCH_ADD(p, v) (__atomic_fetch_add(p, v, __ATOMIC_SEQ_CST)) +#define FETCH_SUB(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_SEQ_CST)) + +#endif /* OUROBOROS_LIB_ATOMICS_H */ diff --git a/include/ouroboros/crc16.h b/include/ouroboros/crc16.h new file mode 100644 index 00000000..df4d4f57 --- /dev/null +++ b/include/ouroboros/crc16.h @@ -0,0 +1,43 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * Polynomial: ITU-T V.41 / CCITT-FALSE, CRC-16/IBM-3740. 
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue + * + * Intended for medium-size header check sequences (typ. <= 4 KiB). + * Hamming distance HD=4 up to 32751 message bits. + */ + +#ifndef OUROBOROS_LIB_CRC16_H +#define OUROBOROS_LIB_CRC16_H + +#include <stddef.h> +#include <stdint.h> + +#define CRC16_HASH_LEN 2 + +void crc16_ccitt_false(uint16_t * crc, + const void * buf, + size_t len); + +#endif /* OUROBOROS_LIB_CRC16_H */ diff --git a/include/ouroboros/crc64.h b/include/ouroboros/crc64.h new file mode 100644 index 00000000..f6e407a0 --- /dev/null +++ b/include/ouroboros/crc64.h @@ -0,0 +1,44 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 64-bit Cyclic Redundancy Check (NVMe variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * Polynomial: NVM Express Base Spec, CRC-64/NVMe. + * reveng catalog: https://reveng.sourceforge.io/crc-catalogue + * + * Fold-by-N (PCLMUL/PMULL) algorithm: + * V. Gopal et al., "Fast CRC Computation for Generic Polynomials + * Using PCLMULQDQ", Intel white paper, 2009. 
+ */ + +#ifndef OUROBOROS_LIB_CRC64_H +#define OUROBOROS_LIB_CRC64_H + +#include <stddef.h> +#include <stdint.h> + +#define CRC64_HASH_LEN 8 + +void crc64_nvme(uint64_t * crc, + const void * buf, + size_t len); + +#endif /* OUROBOROS_LIB_CRC64_H */ diff --git a/include/ouroboros/crc8.h b/include/ouroboros/crc8.h new file mode 100644 index 00000000..97502a25 --- /dev/null +++ b/include/ouroboros/crc8.h @@ -0,0 +1,43 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 8-bit Cyclic Redundancy Check (AUTOSAR variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * Polynomial: AUTOSAR SWS_CRC, CRC-8/AUTOSAR. + * reveng catalog: https://reveng.sourceforge.io/crc-catalogue + * + * Intended for short header check sequences (typ. <= 32 bytes). + * Hamming distance HD=4 up to 119 message bits, HD=3 up to 247. 
+ */ + +#ifndef OUROBOROS_LIB_CRC8_H +#define OUROBOROS_LIB_CRC8_H + +#include <stddef.h> +#include <stdint.h> + +#define CRC8_HASH_LEN 1 + +void crc8_autosar(uint8_t * crc, + const void * buf, + size_t len); + +#endif /* OUROBOROS_LIB_CRC8_H */ diff --git a/include/ouroboros/crypt.h b/include/ouroboros/crypt.h index 5e082bb9..9feaa610 100644 --- a/include/ouroboros/crypt.h +++ b/include/ouroboros/crypt.h @@ -28,18 +28,19 @@ #include <assert.h> -#define IVSZ 16 +#define NONCESZ 16 #define SYMMKEYSZ 32 #define MAX_HASH_SIZE 64 /* SHA-512/BLAKE2b max */ #define KEX_ALGO_BUFSZ 32 #define KEX_CIPHER_BUFSZ 32 +#define CACERT_PATH_BUFSZ 256 /* * On OSX the OpenSSL NIDs are automatically loaded with evp.h. * Some have a different spelling. This header avoids the double definitions. */ - #define NID_undef 0 +#define NID_undef 0 /* Cipher NIDs (match OpenSSL values) */ #define NID_aes_128_gcm 895 @@ -50,7 +51,7 @@ #define NID_aes_256_ctr 906 #define NID_chacha20_poly1305 1018 - #if !defined (__APPLE__) || !defined ( HAVE_OPENSSL ) +#if !defined (__APPLE__) || !defined ( HAVE_OPENSSL ) /* KEX algorithm NIDs (match OpenSSL values) */ #define NID_X9_62_prime256v1 415 #define NID_secp384r1 715 @@ -101,11 +102,15 @@ #define IS_KEX_ALGO_SET(cfg) ((cfg)->x.nid != NID_undef) #define IS_KEX_CIPHER_SET(cfg) ((cfg)->c.nid != NID_undef) +/* Flow role: forks the per-direction keys so each end's TX = peer's RX. 
*/ +#define CRYPT_ROLE_INIT 0 /* flow allocator / OAP client */ +#define CRYPT_ROLE_RESP 1 /* flow acceptor / OAP server */ struct crypt_sk { int nid; uint8_t * key; - uint8_t rot_bit; /* Rotation bit to control epoch */ + uint8_t epoch; /* installed batch epoch */ + uint8_t role; /* CRYPT_ROLE_INIT / _RESP */ }; struct sec_config { @@ -114,18 +119,26 @@ struct sec_config { int nid; int mode; } x; /* key exchange */ + struct { const char * str; int nid; } k; /* kdf */ + struct { const char * str; int nid; } c; /* cipher */ + struct { const char * str; int nid; } d; /* digest */ + + struct { + bool req; /* require peer auth */ + char cacert[CACERT_PATH_BUFSZ]; /* pinned CA, "" = any */ + } a; /* authentication */ }; /* Helper macros to set sec_config fields consistently */ @@ -211,9 +224,21 @@ void auth_destroy_ctx(struct auth_ctx * ctx); int auth_add_crt_to_store(struct auth_ctx * ctx, void * crt); +/* Untrusted intermediates: used to build a path, never as trust anchors */ +int auth_add_crt_to_chain(struct auth_ctx * ctx, + void * crt); + int auth_verify_crt(struct auth_ctx * ctx, void * crt); +/* As auth_verify_crt, pin must be in the verified chain (NULL: any) */ +int auth_verify_crt_pin(struct auth_ctx * ctx, + void * crt, + void * pin); + +/* False for PQC keys: their signature digest is intrinsic */ +bool crypt_pk_requires_md(const void * pk); + int auth_sign(void * pkp, int md_nid, buffer_t msg, @@ -289,12 +314,16 @@ const char * md_nid_to_str(uint16_t nid); uint16_t md_str_to_nid(const char * kdf); -ssize_t md_digest(int md_nid, - buffer_t in, - uint8_t * out); +ssize_t md_digest(int md_nid, + buffer_t in, + uint8_t * out); ssize_t md_len(int md_nid); +int crypt_hkdf_expand(buffer_t key, + buffer_t info, + buffer_t out); + int crypt_encrypt(struct crypt_ctx * ctx, buffer_t in, buffer_t * out); @@ -303,10 +332,37 @@ int crypt_decrypt(struct crypt_ctx * ctx, buffer_t in, buffer_t * out); -int crypt_get_ivsz(struct crypt_ctx * ctx); +/* One-shot AEAD over an 
explicit key/nonce. out = ciphertext ‖ tag. */ +int crypt_oneshot_seal(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out); + +int crypt_oneshot_open(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out); + +int crypt_get_headsz(struct crypt_ctx * ctx); int crypt_get_tagsz(struct crypt_ctx * ctx); +int crypt_rekey(struct crypt_ctx * ctx, + struct crypt_sk * sk); + +/* Nodes remaining in the TX batch (re-key watermark). */ +int crypt_nodes_left(struct crypt_ctx * ctx); + +/* 1 once the peer has been observed on the current generation. */ +int crypt_peer_synced(struct crypt_ctx * ctx); + +/* Switch TX to the installed (new) batch (after peer synced/grace). */ +void crypt_tx_promote(struct crypt_ctx * ctx); + int crypt_load_crt_file(const char * path, void ** crt); @@ -342,6 +398,10 @@ int crypt_load_pubkey_raw_file(const char * path, int crypt_load_privkey_raw_file(const char * path, void ** key); +int crypt_ct_cmp(const void * a, + const void * b, + size_t len); + int crypt_cmp_key(const void * key1, const void * key2); diff --git a/include/ouroboros/errno.h b/include/ouroboros/errno.h index 9d84df88..eedd978f 100644 --- a/include/ouroboros/errno.h +++ b/include/ouroboros/errno.h @@ -37,5 +37,6 @@ #ifndef EAUTH /* Exists on BSD */ #define EAUTH 1009 /* Authentication error */ #endif +#define EREPLAY 1010 /* OAP replay detected */ #endif /* OUROBOROS_ERRNO_H */ diff --git a/include/ouroboros/fccntl.h b/include/ouroboros/fccntl.h index d3baea8f..e91e91dd 100644 --- a/include/ouroboros/fccntl.h +++ b/include/ouroboros/fccntl.h @@ -50,6 +50,12 @@ #define FRCTFRESCNTL 00000002 /* Feedback from receiver */ #define FRCTFLINGER 00000004 /* Send unsent data */ +/* All user-visible bits (readable via FRCTGFLAGS). */ +#define FRCTFMASK (FRCTFRTX | FRCTFRESCNTL | FRCTFLINGER) + +/* Subset writable via FRCTSFLAGS; FRCTFRTX is fixed at flow_alloc. 
*/ +#define FRCTFSETMASK (FRCTFRESCNTL | FRCTFLINGER) + /* Flow operations */ #define FLOWSRCVTIMEO 00000001 /* Set read timeout */ #define FLOWGRCVTIMEO 00000002 /* Get read timeout */ @@ -60,10 +66,17 @@ #define FLOWGFLAGS 00000007 /* Get flags for flow */ #define FLOWGRXQLEN 00000010 /* Get queue length on rx */ #define FLOWGTXQLEN 00000011 /* Get queue length on tx */ +#define FLOWGMTU 00000012 /* Get per-packet MTU */ /* FRCT operations */ #define FRCTSFLAGS 00001000 /* Set flags for FRCT */ #define FRCTGFLAGS 00002000 /* Get flags for FRCT */ +#define FRCTSMAXSDU 00003000 /* Set max recv SDU size */ +#define FRCTGMAXSDU 00004000 /* Get max recv SDU size */ +#define FRCTSRRINGSZ 00005000 /* Set stream rcv ring sz */ +#define FRCTGRRINGSZ 00006000 /* Get stream rcv ring sz */ +#define FRCTSRTOMIN 00007000 /* Set RTO floor (ns) */ +#define FRCTGRTOMIN 00010000 /* Get RTO floor (ns) */ __BEGIN_DECLS diff --git a/include/ouroboros/flow.h b/include/ouroboros/flow.h index fe4582e7..8b096410 100644 --- a/include/ouroboros/flow.h +++ b/include/ouroboros/flow.h @@ -25,6 +25,7 @@ #include <ouroboros/qos.h> +#include <stdint.h> #include <sys/types.h> #define SYMMKEYSZ 32 @@ -50,6 +51,8 @@ struct flow_info { time_t mpl; + uint32_t mtu; /* n-1 layer MTU in bytes, 0 = unknown */ + struct qos_spec qs; enum flow_state state; diff --git a/include/ouroboros/fqueue.h b/include/ouroboros/fqueue.h index 2546c79d..322da3ea 100644 --- a/include/ouroboros/fqueue.h +++ b/include/ouroboros/fqueue.h @@ -34,7 +34,8 @@ enum fqtype { FLOW_UP = (1 << 2), FLOW_ALLOC = (1 << 3), FLOW_DEALLOC = (1 << 4), - FLOW_PEER = (1 << 5) + FLOW_PEER = (1 << 5), + FLOW_UPD = (1 << 6) }; struct flow_set; diff --git a/include/ouroboros/hash.h b/include/ouroboros/hash.h index 0838df97..c6609ffc 100644 --- a/include/ouroboros/hash.h +++ b/include/ouroboros/hash.h @@ -38,6 +38,9 @@ enum hash_algo { HASH_SHA3_512 = DIR_HASH_SHA3_512, HASH_CRC32, HASH_MD5, + HASH_CRC64, + HASH_CRC8, + HASH_CRC16, }; #define 
HASH_FMT32 "%02x%02x%02x%02x" @@ -86,4 +89,7 @@ void str_hash(enum hash_algo algo, void * dst, const char * str); +/* Non-cryptographic finalizer for hashing an integer key to a table index. */ +uint64_t hash_mix64(uint64_t key); + #endif /* OUROBOROS_LIB_HASH_H */ diff --git a/include/ouroboros/ipcp-dev.h b/include/ouroboros/ipcp-dev.h index 93236271..d00d6f08 100644 --- a/include/ouroboros/ipcp-dev.h +++ b/include/ouroboros/ipcp-dev.h @@ -28,16 +28,23 @@ #include <ouroboros/ssm_pool.h> #include <ouroboros/utils.h> +#include <stdint.h> + int ipcp_create_r(const struct ipcp_info * info); int ipcp_flow_req_arr(const buffer_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data); +int ipcp_flow_update_arr(int flow_id, + const buffer_t * data); + int ipcp_flow_alloc_reply(int fd, int response, time_t mpl, + uint32_t mtu, const buffer_t * data); int ipcp_flow_read(int fd, diff --git a/include/ouroboros/logs.h b/include/ouroboros/logs.h index 58494531..1ae77673 100644 --- a/include/ouroboros/logs.h +++ b/include/ouroboros/logs.h @@ -29,6 +29,7 @@ #include <ouroboros/hash.h> +#include <pthread.h> #include <unistd.h> #include <stdio.h> #include <stdbool.h> @@ -55,6 +56,8 @@ void log_fini(void); #define __olog(CLR, LVL, SYSLVL, ...) \ do { \ + int __cs; \ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &__cs); \ if (log_syslog) { \ syslog(SYSLVL, __VA_ARGS__); \ } else { \ @@ -64,10 +67,13 @@ void log_fini(void); printf(CLR_RESET "\n"); \ fflush(stdout); \ } \ + pthread_setcancelstate(__cs, NULL); \ } while (0) #define __olog_id(CLR, LVL, SYSLVL, id, fmt, ...) 
\ do { \ + int __cs; \ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &__cs); \ if (log_syslog) { \ syslog(SYSLVL, "[" HASH_FMT64 "] " fmt, \ HASH_VAL64(id), ## __VA_ARGS__); \ @@ -79,6 +85,7 @@ void log_fini(void); printf(CLR_RESET "\n"); \ fflush(stdout); \ } \ + pthread_setcancelstate(__cs, NULL); \ } while (0) #ifndef OUROBOROS_DISABLE_LOGGING diff --git a/include/ouroboros/name.h b/include/ouroboros/name.h index a9393820..a3aac8c4 100644 --- a/include/ouroboros/name.h +++ b/include/ouroboros/name.h @@ -34,9 +34,9 @@ enum pol_balance { }; struct name_sec_paths { - char enc[NAME_PATH_SIZE + 1]; /* path to crypt for this name */ - char key[NAME_PATH_SIZE + 1]; /* path to key for this name */ - char crt[NAME_PATH_SIZE + 1]; /* path to crt for this name */ + char sec[NAME_PATH_SIZE + 1]; /* path to sec.conf for this name */ + char key[NAME_PATH_SIZE + 1]; /* path to key for this name */ + char crt[NAME_PATH_SIZE + 1]; /* path to crt for this name */ }; struct name_info { diff --git a/include/ouroboros/np1_flow.h b/include/ouroboros/np1_flow.h index 6f341cfc..758b6db8 100644 --- a/include/ouroboros/np1_flow.h +++ b/include/ouroboros/np1_flow.h @@ -36,13 +36,17 @@ int np1_flow_resp(int flow_id, int np1_flow_dealloc(int flow_id, time_t timeo); +int np1_flow_fd(int flow_id); + +int np1_flow_id(int fd); + static const qosspec_t qos_np1 = { + .service = SVC_RAW, .delay = UINT32_MAX, .bandwidth = 0, .availability = 0, .loss = UINT32_MAX, .ber = UINT32_MAX, - .in_order = 0, .max_gap = UINT32_MAX, .timeout = 0 }; diff --git a/include/ouroboros/pthread.h b/include/ouroboros/pthread.h index cd500795..3ca79d10 100644 --- a/include/ouroboros/pthread.h +++ b/include/ouroboros/pthread.h @@ -24,6 +24,7 @@ #define OUROBOROS_LIB_PTHREAD_H #include <pthread.h> +#include <stdio.h> static int __attribute__((unused)) __timedwait(pthread_cond_t * cond, pthread_mutex_t * mtx, @@ -48,4 +49,9 @@ static void __attribute__((unused)) __cleanup_mutex_unlock(void * mutex) 
pthread_mutex_unlock((pthread_mutex_t *) mutex); } +static void __attribute__((unused)) __cleanup_fclose(void * fp) +{ + fclose((FILE *) fp); +} + #endif /* OUROBOROS_LIB_PTHREAD_H */ diff --git a/include/ouroboros/qos.h b/include/ouroboros/qos.h index 6b0bbc17..7980ad00 100644 --- a/include/ouroboros/qos.h +++ b/include/ouroboros/qos.h @@ -28,79 +28,88 @@ #define DEFAULT_PEER_TIMEOUT 120000 +/* qos_spec.service: framing / reliability class. */ +enum qos_service { + SVC_RAW = 0, /* No FRCT; best-effort raw messages */ + SVC_MESSAGE = 1, /* FRCT, reliable ordered messages */ + SVC_STREAM = 2, /* FRCT, reliable ordered byte stream */ +}; + typedef struct qos_spec { + uint8_t service; /* enum qos_service; gates FRCT (>0). */ uint32_t delay; /* In ms. */ uint64_t bandwidth; /* In bits/s. */ uint8_t availability; /* Class of 9s. */ uint32_t loss; /* Packet loss. */ uint32_t ber; /* Bit error rate, errors per billion bits. */ - uint8_t in_order; /* In-order delivery, enables FRCT. */ uint32_t max_gap; /* In ms. */ uint32_t timeout; /* Peer timeout time, in ms, 0 = no timeout. */ } qosspec_t; +/* "_safe" = integrity check (ber=0). "rt" = latency over reliability. 
*/ + static const qosspec_t qos_raw = { + .service = SVC_RAW, .delay = UINT32_MAX, .bandwidth = 0, .availability = 0, .loss = 1, .ber = 1, - .in_order = 0, .max_gap = UINT32_MAX, - .timeout = DEFAULT_PEER_TIMEOUT + .timeout = 0 }; -static const qosspec_t qos_raw_no_errors = { +static const qosspec_t qos_raw_safe = { + .service = SVC_RAW, .delay = UINT32_MAX, .bandwidth = 0, .availability = 0, .loss = 1, .ber = 0, - .in_order = 0, .max_gap = UINT32_MAX, - .timeout = DEFAULT_PEER_TIMEOUT + .timeout = 0 }; -static const qosspec_t qos_best_effort = { - .delay = UINT32_MAX, - .bandwidth = 0, - .availability = 0, +static const qosspec_t qos_rt = { + .service = SVC_MESSAGE, + .delay = 100, + .bandwidth = UINT64_MAX, + .availability = 3, .loss = 1, - .ber = 0, - .in_order = 1, - .max_gap = UINT32_MAX, + .ber = 1, + .max_gap = 100, .timeout = DEFAULT_PEER_TIMEOUT }; -static const qosspec_t qos_video = { +static const qosspec_t qos_rt_safe = { + .service = SVC_MESSAGE, .delay = 100, .bandwidth = UINT64_MAX, .availability = 3, .loss = 1, .ber = 0, - .in_order = 1, .max_gap = 100, .timeout = DEFAULT_PEER_TIMEOUT }; -static const qosspec_t qos_voice = { - .delay = 50, - .bandwidth = 100000, - .availability = 5, - .loss = 1, +static const qosspec_t qos_msg = { + .service = SVC_MESSAGE, + .delay = 1000, + .bandwidth = 0, + .availability = 0, + .loss = 0, .ber = 0, - .in_order = 1, - .max_gap = 50, + .max_gap = 2000, .timeout = DEFAULT_PEER_TIMEOUT }; -static const qosspec_t qos_data = { +static const qosspec_t qos_stream = { + .service = SVC_STREAM, .delay = 1000, .bandwidth = 0, .availability = 0, .loss = 0, .ber = 0, - .in_order = 1, .max_gap = 2000, .timeout = DEFAULT_PEER_TIMEOUT }; diff --git a/include/ouroboros/rcu.h b/include/ouroboros/rcu.h new file mode 100644 index 00000000..b4e7d27c --- /dev/null +++ b/include/ouroboros/rcu.h @@ -0,0 +1,110 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Read-mostly pointer publication (RCU, with a locked fallback) + * + * 
Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#ifndef OUROBOROS_LIB_RCU_H +#define OUROBOROS_LIB_RCU_H + +/* + * Lock-free reads of published pointers via liburcu (urcu-bp) when + * available; a per-object rwlock fallback otherwise. + * Include config.h before this header so HAVE_LIBURCU is defined. + * + * Embed a struct rcu_guard in the object. A reader brackets its access + * with rcu_rdlock/rcu_rdunlock and reads published pointers via rcu_deref. + * A writer serialises with rcu_wrlock/rcu_wrunlock and publishes via + * rcu_assign; after unlock it reclaims a now-unreachable object with + * rcu_reclaim (waits out live readers) before freeing it. rcu_drain waits + * out all readers at teardown. 
+ */ + +#include <ouroboros/pthread.h> + +#ifdef HAVE_LIBURCU + +#include <urcu-bp.h> + +struct rcu_guard { + pthread_mutex_t w; /* serialises writers; readers use RCU */ +}; + +#define rcu_guard_init(g) pthread_mutex_init(&(g)->w, NULL) +#define rcu_guard_fini(g) pthread_mutex_destroy(&(g)->w) +#define rcu_rdlock(g) ((void) (g), rcu_read_lock()) +#define rcu_rdunlock(g) ((void) (g), rcu_read_unlock()) +#define rcu_wrlock(g) pthread_mutex_lock(&(g)->w) +#define rcu_wrunlock(g) pthread_mutex_unlock(&(g)->w) +#define rcu_deref(p) rcu_dereference(p) +#define rcu_assign(p, v) rcu_assign_pointer(p, v) +#define rcu_reclaim(g) ((void) (g), synchronize_rcu()) +#define rcu_drain(g) ((void) (g), synchronize_rcu()) + +/* TSan can miss the publish/consume barrier under urcu. */ +#if defined(__SANITIZE_THREAD__) +#define RCU_TSAN_ANNOTATE +#endif +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define RCU_TSAN_ANNOTATE +#endif +#endif + +/* + * Publish/consume annotations re-expose liburcu's rcu_assign/rcu_deref edge to + * TSan, which cannot see liburcu's barriers. Call rcu_publish(p) before + * publishing p with rcu_assign, and rcu_consume(p) after reading it with + * rcu_deref. No-op without liburcu (the rwlock fallback already gives TSan the + * edge) or without TSan. 
+ */ +#ifdef RCU_TSAN_ANNOTATE +#include <sanitizer/tsan_interface.h> +#define rcu_publish(p) __tsan_release(p) +#define rcu_consume(p) __tsan_acquire(p) +#else +#define rcu_publish(p) ((void) (p)) +#define rcu_consume(p) ((void) (p)) +#endif + +#else /* !HAVE_LIBURCU : per-object rwlock fallback */ + +struct rcu_guard { + pthread_rwlock_t rw; /* readers rd, writers wr */ +}; + +#define rcu_guard_init(g) pthread_rwlock_init(&(g)->rw, NULL) +#define rcu_guard_fini(g) pthread_rwlock_destroy(&(g)->rw) +#define rcu_rdlock(g) pthread_rwlock_rdlock(&(g)->rw) +#define rcu_rdunlock(g) pthread_rwlock_unlock(&(g)->rw) +#define rcu_wrlock(g) pthread_rwlock_wrlock(&(g)->rw) +#define rcu_wrunlock(g) pthread_rwlock_unlock(&(g)->rw) +#define rcu_deref(p) (p) +#define rcu_assign(p, v) ((p) = (v)) +#define rcu_reclaim(g) ((void) (g)) /* wrlock already excluded readers */ +#define rcu_drain(g) (pthread_rwlock_wrlock(&(g)->rw), \ + pthread_rwlock_unlock(&(g)->rw)) + +/* rwlock already gives TSan the publish/consume edge; no annotation. 
*/ +#define rcu_publish(p) ((void) (p)) +#define rcu_consume(p) ((void) (p)) + +#endif /* HAVE_LIBURCU */ + +#endif /* OUROBOROS_LIB_RCU_H */ diff --git a/include/ouroboros/serdes-irm.h b/include/ouroboros/serdes-irm.h index 1dfff4d9..a5854d5b 100644 --- a/include/ouroboros/serdes-irm.h +++ b/include/ouroboros/serdes-irm.h @@ -31,6 +31,7 @@ #include <ouroboros/utils.h> #include <inttypes.h> +#include <stdbool.h> int flow_alloc__irm_req_ser(buffer_t * buf, const struct flow_info * flow, @@ -51,6 +52,10 @@ int ipcp_flow_req_arr__irm_req_ser(buffer_t * buf, const struct flow_info * flow, const buffer_t * data); +int ipcp_flow_update_arr__irm_req_ser(buffer_t * buf, + const struct flow_info * flow, + const buffer_t * data); + int ipcp_flow_alloc_reply__irm_msg_ser(buffer_t * buf, const struct flow_info * flow, int response, @@ -64,6 +69,15 @@ int flow_dealloc__irm_req_ser(buffer_t * buf, const struct flow_info * flow, const struct timespec * timeo); +int flow_update__irm_req_ser(buffer_t * buf, + const struct flow_info * flow, + bool rekey); + +int flow_rekey__irm_result_des(buffer_t * buf, + struct crypt_sk * sk, + bool * has_key, + bool * initiator); + int ipcp_flow_dealloc__irm_req_ser(buffer_t * buf, const struct flow_info * info); diff --git a/include/ouroboros/ssm_pk_buff.h b/include/ouroboros/ssm_pk_buff.h index 1b779ad1..1d5597c7 100644 --- a/include/ouroboros/ssm_pk_buff.h +++ b/include/ouroboros/ssm_pk_buff.h @@ -28,25 +28,25 @@ struct ssm_pk_buff; -size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb); +size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb); -uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb); +uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb); -uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb); +uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb); -size_t ssm_pk_buff_len(struct ssm_pk_buff * spb); +size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb); -uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb, - 
size_t size); +uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb, + size_t size); -uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb, + size_t size); -uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb, + size_t size); -uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb, - size_t size); +uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb, + size_t size); void ssm_pk_buff_truncate(struct ssm_pk_buff * spb, size_t len); diff --git a/include/ouroboros/ssm_pool.h b/include/ouroboros/ssm_pool.h index 89eff8eb..bba76798 100644 --- a/include/ouroboros/ssm_pool.h +++ b/include/ouroboros/ssm_pool.h @@ -32,7 +32,7 @@ struct ssm_pool; -/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user) */ +/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user). */ struct ssm_pool * ssm_pool_create(uid_t uid, gid_t gid); @@ -46,13 +46,13 @@ int ssm_pool_mlock(struct ssm_pool * pool); void ssm_pool_gspp_purge(void); -/* Alloc count bytes, returns block index, a ptr and pk_buff. */ +/* Alloc count bytes, returns block offset, a ptr and pk_buff. 
*/ ssize_t ssm_pool_alloc(struct ssm_pool * pool, size_t count, uint8_t ** ptr, struct ssm_pk_buff ** spb); -ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, +ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, size_t count, uint8_t ** ptr, struct ssm_pk_buff ** spb, @@ -60,13 +60,13 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, ssize_t ssm_pool_read(uint8_t ** dst, struct ssm_pool * pool, - size_t idx); + size_t off); struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool, - size_t idx); + size_t off); int ssm_pool_remove(struct ssm_pool * pool, - size_t idx); + size_t off); void ssm_pool_reclaim_orphans(struct ssm_pool * pool, pid_t pid); diff --git a/include/ouroboros/ssm_rbuff.h b/include/ouroboros/ssm_rbuff.h index ffa10b8e..e77eec09 100644 --- a/include/ouroboros/ssm_rbuff.h +++ b/include/ouroboros/ssm_rbuff.h @@ -28,10 +28,12 @@ #include <stdint.h> -#define ACL_RDWR 0000 -#define ACL_RDONLY 0001 -#define ACL_FLOWDOWN 0002 -#define ACL_FLOWPEER 0004 +#define RB_RD 0001 /* read permitted (0 = no access) */ +#define RB_WR 0002 /* write permitted (0 = no access) */ +#define RB_RDWR (RB_RD | RB_WR) +#define RB_FLOWDOWN 0004 +#define RB_FLOWPEER 0010 +#define RB_REKEY 0020 /* re-key seed parked (out-of-band signal) */ struct ssm_rbuff; @@ -45,20 +47,23 @@ struct ssm_rbuff * ssm_rbuff_open(pid_t pid, void ssm_rbuff_close(struct ssm_rbuff * rb); -void ssm_rbuff_set_acl(struct ssm_rbuff * rb, - uint32_t flags); +void ssm_rbuff_set_bits(struct ssm_rbuff * rb, + uint32_t bits); -uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb); +void ssm_rbuff_clr_bits(struct ssm_rbuff * rb, + uint32_t bits); + +uint32_t ssm_rbuff_get_flags(struct ssm_rbuff * rb); void ssm_rbuff_fini(struct ssm_rbuff * rb); int ssm_rbuff_mlock(struct ssm_rbuff * rb); int ssm_rbuff_write(struct ssm_rbuff * rb, - size_t idx); + size_t off); int ssm_rbuff_write_b(struct ssm_rbuff * rb, - size_t idx, + size_t off, const struct timespec * abstime); ssize_t ssm_rbuff_read(struct ssm_rbuff * rb); diff 
--git a/include/ouroboros/time.h b/include/ouroboros/time.h index 3d037a3c..a4136e8e 100644 --- a/include/ouroboros/time.h +++ b/include/ouroboros/time.h @@ -46,6 +46,12 @@ #define TS_TO_UINT64(ts) \ ((uint64_t)(ts).tv_sec * BILLION + (uint64_t)(ts).tv_nsec) +#define UINT64_TO_TS(ns, ts) \ + do { \ + (ts)->tv_sec = (time_t)((ns) / BILLION); \ + (ts)->tv_nsec = (long)((ns) % BILLION); \ + } while (0) + #define TIMEVAL_INIT_S(s) {(s), 0} #define TIMEVAL_INIT_MS(ms) {(ms) / 1000, ((ms) % 1000) * 1000} #define TIMEVAL_INIT_US(us) {(us) / MILLION, ((us) % MILLION)} diff --git a/include/ouroboros/tpm.h b/include/ouroboros/tpm.h index c01a235c..56c04701 100644 --- a/include/ouroboros/tpm.h +++ b/include/ouroboros/tpm.h @@ -24,6 +24,7 @@ #define OUROBOROS_LIB_TPM_H #include <stdbool.h> +#include <sys/types.h> struct tpm; diff --git a/include/ouroboros/tw.h b/include/ouroboros/tw.h new file mode 100644 index 00000000..156f99db --- /dev/null +++ b/include/ouroboros/tw.h @@ -0,0 +1,77 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Generic deadline-ordered callback queue (timing wheel) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. 
+ */ + +#ifndef OUROBOROS_TW_H +#define OUROBOROS_TW_H + +#include <ouroboros/cdefs.h> +#include <ouroboros/list.h> + +#include <stddef.h> +#include <stdint.h> +#include <time.h> + +typedef void (*tw_fire_fn_t)(void * arg); + +struct tw_entry { + struct list_head next; + uint64_t deadline_ns; + tw_fire_fn_t fire; + void * arg; + size_t lvl; +}; + +__BEGIN_DECLS + +int tw_init(void); + +void tw_fini(void); + +void tw_init_entry(struct tw_entry * e); + +/* + * Schedule e to fire at deadline_ns. If e is already posted, + * the previous schedule is cancelled and replaced. + */ +void tw_post(struct tw_entry * e, + uint64_t deadline_ns, + tw_fire_fn_t fire, + void * arg); + +void tw_cancel(struct tw_entry * e); + +/* + * Advance the wheel and fire due callbacks. Callbacks run with the wheel + * unlocked and may call tw_post / tw_cancel on any entry, including the one + * currently firing. Concurrent tw_move from a second thread is a no-op. + */ +void tw_move(void); + +/* + * Write the absolute deadline of the earliest pending entry to *out. + * Empty wheel is signalled by out->tv_nsec == -1. 
+ */ +void tw_next_expiry(struct timespec * out); + +__END_DECLS + +#endif /* OUROBOROS_TW_H */ diff --git a/include/test/certs/ecdsa.h b/include/test/certs/ecdsa.h index 1d61a3f8..cbc4ed06 100644 --- a/include/test/certs/ecdsa.h +++ b/include/test/certs/ecdsa.h @@ -107,6 +107,23 @@ static const char * signed_server_crt_ec = \ "ktkxoHAFbjQEPQIhAMInHI7lvRmS0IMw1wBF/WlUZWKvhyU/TeMIZfk/JGCS\n" "-----END CERTIFICATE-----\n"; +/* Valid CA outside the test chain, for cacert= pin mismatch */ +static __attribute__((unused)) const char * other_ca_crt_ec = \ +"-----BEGIN CERTIFICATE-----\n" +"MIICNjCCAdugAwIBAgIUTZcZ9hKXyCT/VgTw8TD1TB2mzrgwCgYIKoZIzj0EAwIw\n" +"cDELMAkGA1UEBhMCQkUxDDAKBgNVBAgMA09WTDEOMAwGA1UEBwwFR2hlbnQxDDAK\n" +"BgNVBAoMA283czEVMBMGA1UECwwMdW5pdHRlc3QubzdzMR4wHAYDVQQDDBVvdGhl\n" +"ci1jYS51bml0dGVzdC5vN3MwHhcNMjYwNjEyMTU1MjAzWhcNNDYwNjA3MTU1MjAz\n" +"WjBwMQswCQYDVQQGEwJCRTEMMAoGA1UECAwDT1ZMMQ4wDAYDVQQHDAVHaGVudDEM\n" +"MAoGA1UECgwDbzdzMRUwEwYDVQQLDAx1bml0dGVzdC5vN3MxHjAcBgNVBAMMFW90\n" +"aGVyLWNhLnVuaXR0ZXN0Lm83czBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABNtu\n" +"FghMww2kQ6a+Coe6VPzfBRUZlm7y6/RfbRFPvErowOqKLQP+wCs8Rq46VmHCYTbB\n" +"OlRwzJKcNoSeJ4MNWUqjUzBRMB0GA1UdDgQWBBTmEP8W6fgViKIjw8CpTuQwyuOi\n" +"kTAfBgNVHSMEGDAWgBTmEP8W6fgViKIjw8CpTuQwyuOikTAPBgNVHRMBAf8EBTAD\n" +"AQH/MAoGCCqGSM49BAMCA0kAMEYCIQDQOCfFcOJm49R975RBPfVMy0pXGx/YeQcy\n" +"6WKAeLuTowIhAISdVZ6KxsgkwuswMtDWAkCBujep0XSBGXtXmi4959DH\n" +"-----END CERTIFICATE-----\n"; + /* Self-signed by server test-1.unittest.o7s using its key */ static __attribute__((unused)) const char * server_crt_ec = \ "-----BEGIN CERTIFICATE-----\n" @@ -121,5 +138,25 @@ static __attribute__((unused)) const char * server_crt_ec = \ "gRo=\n" "-----END CERTIFICATE-----\n"; +/* + * Name-confusion fixture: real CN is "attacker.unittest.o7s", but the + * O field value is "CN=victim.unittest.o7s" so the oneline subject is + * "/O=CN=victim.unittest.o7s/CN=attacker.unittest.o7s". A strstr("CN=") + * scan latches onto the decoy. 
The real CN must win. + */ +static __attribute__((unused)) const char * confused_crt_ec = \ +"-----BEGIN CERTIFICATE-----\n" +"MIIB1jCCAX2gAwIBAgIUCfXJzDQ3Sx5qcyVB9Rb4/FdZ+QowCgYIKoZIzj0EAwIw\n" +"QTEfMB0GA1UECgwWQ049dmljdGltLnVuaXR0ZXN0Lm83czEeMBwGA1UEAwwVYXR0\n" +"YWNrZXIudW5pdHRlc3QubzdzMB4XDTI2MDYxNDE5MDcwMVoXDTQ2MDYwOTE5MDcw\n" +"MVowQTEfMB0GA1UECgwWQ049dmljdGltLnVuaXR0ZXN0Lm83czEeMBwGA1UEAwwV\n" +"YXR0YWNrZXIudW5pdHRlc3QubzdzMFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE\n" +"oLwrbLs3diGcjyY2ErvO/U6CoyyKfl/8e1nxBKXHSOkO5xVmFu+EobEQVFvabxE/\n" +"x4RttKcGJqUe8vlyQexQq6NTMFEwHQYDVR0OBBYEFGBaOBzTsCakjBN61x0ZnHSk\n" +"04T3MB8GA1UdIwQYMBaAFGBaOBzTsCakjBN61x0ZnHSk04T3MA8GA1UdEwEB/wQF\n" +"MAMBAf8wCgYIKoZIzj0EAwIDRwAwRAIgFtBeVxlRuI7y9Bo/Dh97ajTbHJXYMkc6\n" +"ZqflSN3Q/uACIHWoCVn6u6+JjF+Kj9zubFJ49RIQJthSeP8xj7yTeV17\n" +"-----END CERTIFICATE-----\n"; + #endif /* TEST_CERTS_H */ diff --git a/irmd.conf.in b/irmd.conf.in index dee88392..b9b79782 100644 --- a/irmd.conf.in +++ b/irmd.conf.in @@ -56,10 +56,10 @@ prog=["@INSTALL_DIR@/ovpn"] # Defaults to []. prog=["@INSTALL_DIR@/oping"] # Defaults to []. args=["--listen"] # Defaults to disabled. Autostart server with these args. lb="round-robin" # Defaults to spill (load-balancing options: spill, round-robin). 
-# server_enc_file=/path/to/enc.conf Default: @OUROBOROS_SRV_CRT_DIR@/<name>/enc.conf +# server_sec_file=/path/to/sec.conf Default: @OUROBOROS_SRV_CRT_DIR@/<name>/sec.conf # server_crt_file=/path/to/crt.pem Default: @OUROBOROS_SRV_CRT_DIR@/<name>/crt.pem # server_key_file=/path/to/key.pem Default: @OUROBOROS_SRV_CRT_DIR@/<name>/key.pem -# client_enc_file=/path/to/enc.conf Default: @OUROBOROS_CLI_CRT_DIR@/<name>/enc.conf +# client_sec_file=/path/to/sec.conf Default: @OUROBOROS_CLI_CRT_DIR@/<name>/sec.conf # client_crt_file=/path/to/crt.pem Default: @OUROBOROS_CLI_CRT_DIR@/<name>/crt.pem # client_key_file=/path/to/key.pem Default: @OUROBOROS_CLI_CRT_DIR@/<name>/key.pem diff --git a/enc.conf.in b/sec.conf.in index 8f91d717..4796b72d 100644 --- a/enc.conf.in +++ b/sec.conf.in @@ -1,19 +1,19 @@ -### Example Ouroboros encryption configuration file +### Example Ouroboros security configuration file # -# This file specifies the key exchange (KEX) algorithm and cipher to use -# for encrypted flows. +# This file specifies the security parameters for a service: the key +# exchange (KEX) algorithm, cipher, key derivation, and peer authentication. # # File Locations: # --------------- # # This file should be placed at one of: -# @OUROBOROS_CONFIG_DIR@/security/server/<name>/enc.conf (server-side config) -# @OUROBOROS_CONFIG_DIR@/security/client/<name>/enc.conf (client-side config) +# @OUROBOROS_CONFIG_DIR@/security/server/<name>/sec.conf (server-side config) +# @OUROBOROS_CONFIG_DIR@/security/client/<name>/sec.conf (client-side config) # # Where <name> is the service name registered with 'irm name create'. 
# # You can override the default paths using: -# irm name create <name> sencpath <server-enc-path> cencpath <client-enc-path> +# irm name create <name> ssecpath <server-sec-path> csecpath <client-sec-path> # # Configuration Options: # ---------------------- @@ -22,7 +22,8 @@ # cipher=<cipher> Symmetric cipher algorithm # kdf=<hash> Key derivation function hash algorithm # kem_mode=<mode> KEM encapsulation mode (server or client) -# none Explicitly disable encryption +# auth=<policy> Peer authentication policy (required or optional) +# encryption=none Explicitly disable encryption # # Supported KEX algorithms (kex=): # -------------------------------- @@ -57,11 +58,6 @@ # aes-256-gcm AES-256 in GCM mode (default) # chacha20-poly1305 ChaCha20-Poly1305 # -# Stream ciphers (not recommended): -# aes-128-ctr AES-128 in CTR mode -# aes-192-ctr AES-192 in CTR mode -# aes-256-ctr AES-256 in CTR mode -# # Key Derivation Functions (kdf=): # --------------------------------- # @@ -76,6 +72,33 @@ # blake2b512 BLAKE2b-512 # blake2s256 BLAKE2s-256 # +# Peer Authentication (auth=): +# ---------------------------- +# +# optional Accept unauthenticated peers +# required Reject peers that do not present a valid certificate +# +# This setting applies to the *peer*: in a client config it requires +# the server to authenticate; in a server config it requires the +# client. The defaults mirror the web: a client config defaults to +# required (the server must authenticate), a server config defaults +# to optional (client authentication is opt-in). Set auth=required on +# the server too for mutual authentication. Combine encryption=none +# with auth=required for authenticated but unencrypted flows. 
+# +# Issuer Pinning (cacert=): +# ------------------------- +# +# cacert=<path> Path to a CA certificate that must be part of the +# peer certificate's verified chain +# +# The peer certificate is always validated against the trusted CA +# store; cacert= further restricts which CA must have issued it: a +# certificate, if presented, must chain through the pinned CA. Whether +# a certificate is mandatory is controlled by auth= alone: under +# auth=optional a peer may still connect without one. The pinned CA +# must load when the config is read, otherwise flow allocation fails. +# # KEM Mode (kem_mode=): # --------------------- # @@ -147,4 +170,8 @@ kdf=sha256 # kdf=sha512 # # Disable encryption: -# none +# encryption=none +# +# Authentication required, no encryption: +# encryption=none +# auth=required diff --git a/src/ipcpd/broadcast/dt.c b/src/ipcpd/broadcast/dt.c index 30e89a4f..95483e33 100644 --- a/src/ipcpd/broadcast/dt.c +++ b/src/ipcpd/broadcast/dt.c @@ -28,7 +28,7 @@ #include "config.h" -#define BROADCAST_MTU 1400 /* FIXME: avoid packet copy. */ +#define BROADCAST_MTU IPCP_BROADCAST_MTU /* FIXME: avoid packet copy. 
*/ #define DT "dt" #define OUROBOROS_PREFIX DT diff --git a/src/ipcpd/broadcast/main.c b/src/ipcpd/broadcast/main.c index b3cbdc79..d18cac82 100644 --- a/src/ipcpd/broadcast/main.c +++ b/src/ipcpd/broadcast/main.c @@ -242,7 +242,7 @@ static int broadcast_ipcp_join(int fd, notifier_event(NOTIFY_DT_CONN_ADD, &conn); - ipcp_flow_alloc_reply(fd, 0, mpl, &data); + ipcp_flow_alloc_reply(fd, 0, mpl, IPCP_BROADCAST_MTU, &data); return 0; } @@ -307,12 +307,13 @@ int main(int argc, ipcp_sigwait(); if (ipcp_get_state() == IPCP_SHUTDOWN) { + ipcp_stop(); stop_components(); finalize_components(); + } else { + ipcp_stop(); } - ipcp_stop(); - enroll_fini(); connmgr_fini(); diff --git a/src/ipcpd/config.h.in b/src/ipcpd/config.h.in index 0b4252e5..7edec526 100644 --- a/src/ipcpd/config.h.in +++ b/src/ipcpd/config.h.in @@ -23,8 +23,8 @@ #define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@ #define SYS_MAX_FLOWS @SYS_MAX_FLOWS@ -#define PROG_RES_FDS @PROG_RES_FDS@ -#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@ +#define PROC_RES_FDS @PROC_RES_FDS@ +#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@ #define SOCKET_TIMEOUT @SOCKET_TIMEOUT@ #define CONNECT_TIMEOUT @CONNECT_TIMEOUT@ @@ -46,11 +46,13 @@ #define IPCP_SCHED_THR_MUL @IPCP_SCHED_THR_MUL@ #define PFT_SIZE @PFT_SIZE@ #define IPCP_UNICAST_MPL @IPCP_UNICAST_MPL@ +#define IPCP_UNICAST_MTU @IPCP_UNICAST_MTU@ #define CONNMGR_RCV_TIMEOUT @CONNMGR_RCV_TIMEOUT@ #cmakedefine DISABLE_CORE_LOCK #cmakedefine BUILD_CONTAINER #cmakedefine IPCP_FLOW_STATS +#cmakedefine IPCP_ETH_FLOW_STATS #cmakedefine IPCP_DEBUG_LOCAL #ifdef CONFIG_OUROBOROS_DEBUG #cmakedefine DEBUG_PROTO_DHT @@ -65,6 +67,8 @@ #define IPCP_UDP_RD_THR @IPCP_UDP_RD_THR@ #define IPCP_UDP_WR_THR @IPCP_UDP_WR_THR@ #define IPCP_UDP_MPL @IPCP_UDP_MPL@ +#define IPCP_UDP4_MTU @IPCP_UDP4_MTU@ +#define IPCP_UDP6_MTU @IPCP_UDP6_MTU@ /* eth */ #cmakedefine HAVE_NETMAP @@ -76,10 +80,13 @@ #define IPCP_ETH_LO_MTU @IPCP_ETH_LO_MTU@ #define IPCP_ETH_MGMT_FRAME_SIZE @IPCP_ETH_MGMT_FRAME_SIZE@ #define IPCP_ETH_MPL 
@IPCP_ETH_MPL@ +#define IPCP_ETH_SNDBUF @IPCP_ETH_SNDBUF@ +#define IPCP_ETH_RCVBUF @IPCP_ETH_RCVBUF@ /* local */ #define IPCP_LOCAL_MPL @IPCP_LOCAL_MPL@ +#define IPCP_LOCAL_MTU @IPCP_LOCAL_MTU@ /* broadcast */ -/* local */ #define IPCP_BROADCAST_MPL @IPCP_BROADCAST_MPL@ +#define IPCP_BROADCAST_MTU @IPCP_BROADCAST_MTU@ diff --git a/src/ipcpd/eth/eth.c b/src/ipcpd/eth/eth.c index 4be7775e..7e038a03 100644 --- a/src/ipcpd/eth/eth.c +++ b/src/ipcpd/eth/eth.c @@ -37,19 +37,30 @@ #include "config.h" +#include <ouroboros/atomics.h> #include <ouroboros/endian.h> #include <ouroboros/hash.h> #include <ouroboros/errno.h> #include <ouroboros/list.h> #include <ouroboros/utils.h> #include <ouroboros/bitmap.h> +#include <ouroboros/crc8.h> #include <ouroboros/dev.h> #include <ouroboros/ipcp-dev.h> #include <ouroboros/fqueue.h> #include <ouroboros/logs.h> +#include <ouroboros/np1_flow.h> #include <ouroboros/time.h> #include <ouroboros/fccntl.h> #include <ouroboros/pthread.h> +#include <ouroboros/rib.h> + +#ifndef IPCP_ETH_FLOW_STATS +#undef FETCH_ADD_RELAXED +#define FETCH_ADD_RELAXED(p, v) ((void) 0) +#undef FETCH_SUB_RELAXED +#define FETCH_SUB_RELAXED(p, v) ((void) 0) +#endif #include "ipcp.h" #include "np1.h" @@ -122,7 +133,8 @@ #define MGMT_EID 0 #define DIX_EID_SIZE sizeof(uint16_t) #define DIX_LENGTH_SIZE sizeof(uint16_t) -#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE) +#define DIX_HCS_SIZE CRC8_HASH_LEN +#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE + DIX_HCS_SIZE) #define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + DIX_HEADER_SIZE) #define MAX_EIDS (1 << (8 * DIX_EID_SIZE)) #define ETH_MAX_PACKET_SIZE (ETH_MTU - DIX_HEADER_SIZE) @@ -130,21 +142,26 @@ #elif defined(BUILD_ETH_LLC) #define THIS_TYPE IPCP_ETH_LLC #define MGMT_SAP 0x01 -#define LLC_HEADER_SIZE 3 +#define LLC_FIELDS_SIZE 3 +#define LLC_HCS_SIZE CRC8_HASH_LEN +#define LLC_HEADER_SIZE (LLC_FIELDS_SIZE + LLC_HCS_SIZE) #define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + LLC_HEADER_SIZE) #define MAX_SAPS 64 
#define ETH_MAX_PACKET_SIZE (ETH_MTU - LLC_HEADER_SIZE) #define ETH_FRAME_SIZE (ETH_HEADER_SIZE + ETH_MTU_MAX) #endif -#define NAME_QUERY_TIMEO 2000 /* ms */ -#define MGMT_TIMEO 100 /* ms */ +#define NAME_QUERY_TIMEO 1900 /* ms total budget */ +#define NAME_QUERY_RETRIES 3 /* retransmits, 4 attempts total */ +#define MGMT_TIMEO 100 /* ms */ #define MGMT_FRAME_SIZE IPCP_ETH_MGMT_FRAME_SIZE +#define ETH_RIB_PATH "eth" #define FLOW_REQ 0 #define FLOW_REPLY 1 #define NAME_QUERY_REQ 2 #define NAME_QUERY_REPLY 3 +#define FLOW_IRM_UPDATE 4 struct mgmt_msg { #if defined(BUILD_ETH_DIX) @@ -165,7 +182,7 @@ struct mgmt_msg { uint32_t delay; uint32_t timeout; int32_t response; - uint8_t in_order; + uint8_t service; #if defined (BUILD_ETH_DIX) uint8_t code; uint8_t availability; @@ -185,6 +202,7 @@ struct eth_frame { uint8_t ssap; uint8_t cf; #endif + uint8_t hcs; uint8_t payload; } __attribute__((packed)); @@ -196,6 +214,17 @@ struct ef { int8_t r_sap; #endif uint8_t r_addr[MAC_SIZE]; +#ifdef IPCP_ETH_FLOW_STATS + struct { + time_t stamp; + size_t p_rcv; + size_t b_rcv; + size_t p_dlv_f; + size_t p_snd; + size_t b_snd; + size_t p_snd_f; + } stat; +#endif }; struct mgmt_frame { @@ -233,6 +262,22 @@ struct { struct ef * fd_to_ef; fset_t * np1_flows; pthread_rwlock_t flows_lock; +#ifdef IPCP_ETH_FLOW_STATS + struct { + size_t n_flows; + size_t n_rcv; + size_t n_snd; + size_t n_mgmt_rcv; + size_t n_mgmt_snd; + size_t n_bad_id; + size_t n_dlv_f; + size_t n_buf_f; + size_t n_rcv_f; + size_t n_snd_f; + size_t kern_rcv; + size_t kern_drp; + } stat; +#endif pthread_t packet_writer[IPCP_ETH_WR_THR]; pthread_t packet_reader[IPCP_ETH_RD_THR]; @@ -284,7 +329,14 @@ static int eth_data_init(void) eth_data.fd_to_ef[i].r_sap = -1; #endif memset(&eth_data.fd_to_ef[i].r_addr, 0, MAC_SIZE); +#ifdef IPCP_ETH_FLOW_STATS + memset(&eth_data.fd_to_ef[i].stat, 0, + sizeof(eth_data.fd_to_ef[i].stat)); +#endif } +#ifdef IPCP_ETH_FLOW_STATS + memset(&eth_data.stat, 0, sizeof(eth_data.stat)); +#endif 
eth_data.shim_data = shim_data_create(); if (eth_data.shim_data == NULL) @@ -357,6 +409,227 @@ static void eth_data_fini(void) free(eth_data.fd_to_ef); } +#ifdef IPCP_ETH_FLOW_STATS +static int eth_rib_read(const char * path, + char * buf, + size_t len) +{ + struct ef * flow; + int fd; + char tmstr[RIB_TM_STRLEN]; + struct tm * tm; + time_t stamp; + char * entry; + + entry = strstr(path, RIB_SEPARATOR) + 1; + assert(entry); + + if (len < 2048) + return 0; + + buf[0] = '\0'; + + if (strcmp(entry, "summary") == 0) { + int n; +#if defined(HAVE_RAW_SOCKETS) + int rcvbuf = 0; + int sndbuf = 0; + int queued = 0; + socklen_t optlen = sizeof(rcvbuf); +# if defined(__linux__) + struct tpacket_stats tp_stats; + socklen_t tp_len = sizeof(tp_stats); +# endif + + getsockopt(eth_data.s_fd, SOL_SOCKET, + SO_RCVBUF, &rcvbuf, &optlen); + optlen = sizeof(sndbuf); + getsockopt(eth_data.s_fd, SOL_SOCKET, + SO_SNDBUF, &sndbuf, &optlen); + ioctl(eth_data.s_fd, FIONREAD, &queued); +# if defined(__linux__) + if (getsockopt(eth_data.s_fd, SOL_PACKET, + PACKET_STATISTICS, + &tp_stats, &tp_len) == 0) { + FETCH_ADD_RELAXED(&eth_data.stat.kern_rcv, + tp_stats.tp_packets); + FETCH_ADD_RELAXED(&eth_data.stat.kern_drp, + tp_stats.tp_drops); + } +# endif +#endif + n = sprintf(buf, + "Active flows: %20zu\n" + "Total frames received: %20zu\n" + "Total frames sent: %20zu\n" + "Management frames received: %20zu\n" + "Management frames sent: %20zu\n" + "Bad EID/SAP frames: %20zu\n" + "Delivery (N+1) failures: %20zu\n" + "Buffer alloc failures: %20zu\n" + "Frame read failures: %20zu\n" + "Frame send failures: %20zu\n", + LOAD_RELAXED(&eth_data.stat.n_flows), + LOAD_RELAXED(&eth_data.stat.n_rcv), + LOAD_RELAXED(&eth_data.stat.n_snd), + LOAD_RELAXED(&eth_data.stat.n_mgmt_rcv), + LOAD_RELAXED(&eth_data.stat.n_mgmt_snd), + LOAD_RELAXED(&eth_data.stat.n_bad_id), + LOAD_RELAXED(&eth_data.stat.n_dlv_f), + LOAD_RELAXED(&eth_data.stat.n_buf_f), + LOAD_RELAXED(&eth_data.stat.n_rcv_f), + LOAD_RELAXED(&eth_data.stat.n_snd_f)); +#if 
defined(HAVE_RAW_SOCKETS) + n += sprintf(buf + n, + "Socket rcvbuf (bytes): %20d\n" + "Socket sndbuf (bytes): %20d\n" + "Socket queued (bytes): %20d\n", + rcvbuf, sndbuf, queued); +# if defined(__linux__) + n += sprintf(buf + n, + "Kernel frames received: %20zu\n" + "Kernel frames dropped: %20zu\n", + LOAD_RELAXED(&eth_data.stat.kern_rcv), + LOAD_RELAXED(&eth_data.stat.kern_drp)); +# endif +#endif + return n; + } + + fd = atoi(entry); + + if (fd < 0 || fd >= SYS_MAX_FLOWS) + return -1; + + flow = &eth_data.fd_to_ef[fd]; + + pthread_rwlock_rdlock(&eth_data.flows_lock); + + stamp = flow->stat.stamp; + if (stamp == 0) { + pthread_rwlock_unlock(&eth_data.flows_lock); + return 0; + } + + pthread_rwlock_unlock(&eth_data.flows_lock); + + tm = gmtime(&stamp); + strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm); + + sprintf(buf, + "Flow established at: %20s\n" + "Sent (packets): %20zu\n" + "Sent (bytes): %20zu\n" + "Send failed (packets): %20zu\n" + "Received (packets): %20zu\n" + "Received (bytes): %20zu\n" + "Delivery (N+1) failures: %20zu\n", + tmstr, + LOAD_RELAXED(&flow->stat.p_snd), + LOAD_RELAXED(&flow->stat.b_snd), + LOAD_RELAXED(&flow->stat.p_snd_f), + LOAD_RELAXED(&flow->stat.p_rcv), + LOAD_RELAXED(&flow->stat.b_rcv), + LOAD_RELAXED(&flow->stat.p_dlv_f)); + + return strlen(buf); +} + +static int eth_rib_readdir(char *** buf) +{ + char entry[RIB_PATH_LEN + 1]; + size_t i; + int idx = 0; + int n_entries; + + pthread_rwlock_rdlock(&eth_data.flows_lock); + + n_entries = (int) LOAD_RELAXED(&eth_data.stat.n_flows) + 1; + + *buf = malloc(sizeof(**buf) * n_entries); + if (*buf == NULL) + goto fail_entries; + + (*buf)[idx] = malloc(strlen("summary") + 1); + if ((*buf)[idx] == NULL) + goto fail_entry; + + strcpy((*buf)[idx++], "summary"); + + for (i = 0; i < SYS_MAX_FLOWS && idx < n_entries; ++i) { + if (eth_data.fd_to_ef[i].stat.stamp == 0) + continue; + + sprintf(entry, "%zu", i); + + (*buf)[idx] = malloc(strlen(entry) + 1); + if ((*buf)[idx] == NULL) + goto fail_entry; + + strcpy((*buf)[idx++], 
entry); + } + + pthread_rwlock_unlock(&eth_data.flows_lock); + + return idx; + + fail_entry: + while (idx-- > 0) + free((*buf)[idx]); + free(*buf); + fail_entries: + pthread_rwlock_unlock(&eth_data.flows_lock); + return -ENOMEM; +} + +static int eth_rib_getattr(const char * path, + struct rib_attr * attr) +{ + int fd; + char * entry; + struct ef * flow; + + entry = strstr(path, RIB_SEPARATOR) + 1; + assert(entry); + + if (strcmp(entry, "summary") == 0) { + attr->size = 2048; + attr->mtime = 0; + return 0; + } + + fd = atoi(entry); + + if (fd < 0 || fd >= SYS_MAX_FLOWS) { + attr->size = 0; + attr->mtime = 0; + return 0; + } + + flow = &eth_data.fd_to_ef[fd]; + + pthread_rwlock_rdlock(&eth_data.flows_lock); + + if (flow->stat.stamp != 0) { + attr->size = 2048; + attr->mtime = flow->stat.stamp; + } else { + attr->size = 0; + attr->mtime = 0; + } + + pthread_rwlock_unlock(&eth_data.flows_lock); + + return 0; +} + +static struct rib_ops eth_r_ops = { + .read = eth_rib_read, + .readdir = eth_rib_readdir, + .getattr = eth_rib_getattr +}; +#endif /* IPCP_ETH_FLOW_STATS */ + #ifdef BUILD_ETH_LLC static uint8_t reverse_bits(uint8_t b) { @@ -409,12 +682,18 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr, e_frame->ethertype = eth_data.ethertype; e_frame->eid = htons(deid); e_frame->length = htons(len); + mem_hash(HASH_CRC8, &e_frame->hcs, + (uint8_t *) &e_frame->eid, + DIX_EID_SIZE + DIX_LENGTH_SIZE); frame_len = ETH_HEADER_TOT_SIZE + len; #elif defined(BUILD_ETH_LLC) e_frame->length = htons(LLC_HEADER_SIZE + len); e_frame->dsap = dsap; e_frame->ssap = ssap; e_frame->cf = cf; + mem_hash(HASH_CRC8, &e_frame->hcs, + (uint8_t *) &e_frame->dsap, + LLC_FIELDS_SIZE); frame_len = ETH_HEADER_TOT_SIZE + len; #endif @@ -440,10 +719,7 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr, } assert(FD_ISSET(eth_data.s_fd, &fds)); - if (sendto(eth_data.s_fd, - frame, - frame_len, - 0, + if (sendto(eth_data.s_fd, frame, frame_len, 0, (struct sockaddr *) &eth_data.device, 
sizeof(eth_data.device)) <= 0) { log_dbg("Failed to send message: %s.", strerror(errno)); @@ -451,6 +727,8 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr, } #endif /* HAVE_NETMAP */ + FETCH_ADD_RELAXED(&eth_data.stat.n_snd, 1); + return 0; } @@ -490,7 +768,7 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr, msg->availability = qs.availability; msg->loss = hton32(qs.loss); msg->ber = hton32(qs.ber); - msg->in_order = qs.in_order; + msg->service = qs.service; msg->max_gap = hton32(qs.max_gap); msg->timeout = hton32(qs.timeout); @@ -508,6 +786,9 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr, buf, len + data->len); free(buf); + if (ret == 0) + FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1); + return ret; } @@ -558,11 +839,65 @@ static int eth_ipcp_alloc_resp(uint8_t * dst_addr, return -1; } + FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1); + free(buf); return 0; } +static int eth_ipcp_flow_update(int fd, + const buffer_t * data) +{ + struct mgmt_msg * msg; + struct ef * flow; + uint8_t * buf; + uint8_t r_addr[MAC_SIZE]; + int ret; + + buf = malloc(sizeof(*msg) + ETH_HEADER_TOT_SIZE + data->len); + if (buf == NULL) + return -1; + + memset(buf, 0, sizeof(*msg) + ETH_HEADER_TOT_SIZE + data->len); + + msg = (struct mgmt_msg *) (buf + ETH_HEADER_TOT_SIZE); + + msg->code = FLOW_IRM_UPDATE; + + pthread_rwlock_rdlock(&eth_data.flows_lock); + + flow = &eth_data.fd_to_ef[fd]; +#if defined(BUILD_ETH_DIX) + msg->seid = htons((uint16_t) fd); + msg->deid = htons((uint16_t) flow->r_eid); +#elif defined(BUILD_ETH_LLC) + msg->ssap = flow->sap; + msg->dsap = (uint8_t) flow->r_sap; +#endif + memcpy(r_addr, flow->r_addr, MAC_SIZE); + + pthread_rwlock_unlock(&eth_data.flows_lock); + + if (data->len > 0) + memcpy(msg + 1, data->data, data->len); + + ret = eth_ipcp_send_frame(r_addr, +#if defined(BUILD_ETH_DIX) + MGMT_EID, +#elif defined(BUILD_ETH_LLC) + reverse_bits(MGMT_SAP), + reverse_bits(MGMT_SAP), +#endif + buf, sizeof(*msg) + data->len); + free(buf); + + if (ret == 0) + 
FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1); + + return ret; +} + static int eth_ipcp_req(uint8_t * r_addr, #if defined(BUILD_ETH_DIX) uint16_t r_eid, @@ -575,7 +910,8 @@ static int eth_ipcp_req(uint8_t * r_addr, { int fd; - fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL, data); + fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL, + ETH_MAX_PACKET_SIZE, data); if (fd < 0) { log_err("Could not get new flow from IRMd."); return -1; @@ -622,7 +958,7 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr, fd = eth_data.ef_to_fd[dsap]; #endif if (fd < 0) { - pthread_rwlock_unlock(& eth_data.flows_lock); + pthread_rwlock_unlock(&eth_data.flows_lock); log_err("No flow found with that SAP."); return -1; /* -EFLOWNOTFOUND */ } @@ -647,7 +983,8 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr, #elif defined(BUILD_ETH_LLC) log_dbg("Flow reply, fd %d, SSAP %d, DSAP %d.", fd, ssap, dsap); #endif - if ((ret = ipcp_flow_alloc_reply(fd, response, mpl, data)) < 0) { + if ((ret = ipcp_flow_alloc_reply(fd, response, mpl, + ETH_MAX_PACKET_SIZE, data)) < 0) { log_err("Failed to reply to flow allocation."); return -1; } @@ -689,6 +1026,8 @@ static int eth_ipcp_name_query_req(const uint8_t * hash, return -1; } + FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1); + free(buf); } @@ -709,6 +1048,44 @@ static int eth_ipcp_name_query_reply(const uint8_t * hash, return 0; } +static int eth_ipcp_flow_update_arr(const uint8_t * buf, + size_t len) +{ + struct mgmt_msg * msg; + buffer_t data; + int fd; + int flow_id; + + msg = (struct mgmt_msg *) buf; + + data.data = (uint8_t *) buf + sizeof(*msg); + data.len = len - sizeof(*msg); + + pthread_rwlock_rdlock(&eth_data.flows_lock); +#if defined(BUILD_ETH_DIX) + fd = ntohs(msg->deid); +#elif defined(BUILD_ETH_LLC) + fd = eth_data.ef_to_fd[msg->dsap]; +#endif + pthread_rwlock_unlock(&eth_data.flows_lock); + + if (fd < 0 || fd >= SYS_MAX_FLOWS) { + log_err("Flow update for unknown endpoint."); + return -1; + } + + flow_id = np1_flow_id(fd); + if (flow_id < 0) + 
return -1; + + if (ipcp_flow_update_arr(flow_id, &data) < 0) { + log_err("Failed to relay flow update on fd %d.", fd); + return -1; + } + + return 0; +} + static int eth_ipcp_mgmt_frame(const uint8_t * buf, size_t len, uint8_t * r_addr) @@ -718,20 +1095,24 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf, qosspec_t qs; buffer_t data; + if (len < sizeof(*msg)) + return -1; + msg = (struct mgmt_msg *) buf; switch (msg->code) { case FLOW_REQ: msg_len = sizeof(*msg) + ipcp_dir_hash_len(); - assert(len >= msg_len); + if (len < msg_len) + return -1; qs.delay = ntoh32(msg->delay); qs.bandwidth = ntoh64(msg->bandwidth); qs.availability = msg->availability; qs.loss = ntoh32(msg->loss); qs.ber = ntoh32(msg->ber); - qs.in_order = msg->in_order; + qs.service = msg->service; qs.max_gap = ntoh32(msg->max_gap); qs.timeout = ntoh32(msg->timeout); @@ -752,8 +1133,6 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf, } break; case FLOW_REPLY: - assert(len >= sizeof(*msg)); - data.data = (uint8_t *) buf + sizeof(*msg); data.len = len - sizeof(*msg); @@ -768,10 +1147,17 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf, ntoh32(msg->response), &data); break; + case FLOW_IRM_UPDATE: + eth_ipcp_flow_update_arr(buf, len); + break; case NAME_QUERY_REQ: + if (len < sizeof(*msg) + ipcp_dir_hash_len()) + return -1; eth_ipcp_name_query_req(buf + sizeof(*msg), r_addr); break; case NAME_QUERY_REPLY: + if (len < sizeof(*msg) + ipcp_dir_hash_len()) + return -1; eth_ipcp_name_query_reply(buf + sizeof(*msg), r_addr); break; default: @@ -844,6 +1230,12 @@ static void * eth_ipcp_packet_reader(void * o) fd_set fds; int frame_len; #endif +#if defined(HAVE_RAW_SOCKETS) + struct sockaddr_ll src; + socklen_t slen; +#endif + size_t eth_len; + uint8_t hcs; struct eth_frame * e_frame; struct mgmt_frame * frame; @@ -881,24 +1273,58 @@ static void * eth_ipcp_packet_reader(void * o) if (select(eth_data.s_fd + 1, &fds, NULL, NULL, NULL) < 0) continue; assert(FD_ISSET(eth_data.s_fd, &fds)); - if 
(ipcp_spb_reserve(&spb, ETH_MTU)) + if (ipcp_spb_reserve(&spb, ETH_MTU)) { + FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1); continue; - buf = ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE); + } + buf = ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE); if (buf == NULL) { log_dbg("Failed to allocate header."); ipcp_spb_release(spb); + FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1); continue; } - frame_len = recv(eth_data.s_fd, buf, - ETH_MTU + ETH_HEADER_TOT_SIZE, 0); + slen = sizeof(src); + /* MSG_DONTWAIT: RD_THR>1 race-loser bails with EAGAIN. */ + frame_len = recvfrom(eth_data.s_fd, buf, + ETH_MTU + ETH_HEADER_TOT_SIZE, + MSG_DONTWAIT, + (struct sockaddr *) &src, &slen); #endif - if (frame_len <= 0) { - log_dbg("Failed to receive frame."); + if (frame_len == 0) { ipcp_spb_release(spb); + continue; /* Spurious */ + } + + if (frame_len < 0) { + ipcp_spb_release(spb); + + if (errno == EAGAIN || errno == EWOULDBLOCK) + continue; + + log_dbg("Failed to rcv frame: %s.", strerror(errno)); + FETCH_ADD_RELAXED(&eth_data.stat.n_rcv_f, 1); continue; } #endif +#if defined(HAVE_NETMAP) + eth_len = hdr.len; +#elif defined(HAVE_BPF) + eth_len = ((struct bpf_hdr *) buf)->bh_caplen; +#else + eth_len = (size_t) frame_len; +#endif + /* Defense in depth: reject before parsing dereferences. */ + if (eth_len < ETH_HEADER_TOT_SIZE) + goto fail_frame; + +#if defined(HAVE_RAW_SOCKETS) + /* Drop our own egress. 
*/ + if (src.sll_pkttype == PACKET_OUTGOING) + goto fail_frame; +#endif + #if defined(HAVE_BPF) && !defined(HAVE_NETMAP) e_frame = (struct eth_frame *) (buf + ((struct bpf_hdr *) buf)->bh_hdrlen); @@ -916,6 +1342,8 @@ static void * eth_ipcp_packet_reader(void * o) e_frame->dst_hwaddr, MAC_SIZE) && memcmp(br_addr, e_frame->dst_hwaddr, MAC_SIZE)) { + FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1); + goto fail_frame; } #endif length = ntohs(e_frame->length); @@ -923,17 +1351,41 @@ static void * eth_ipcp_packet_reader(void * o) if (e_frame->ethertype != eth_data.ethertype) goto fail_frame; + if (length > ETH_MTU) + goto fail_frame; + deid = ntohs(e_frame->eid); - if (deid == MGMT_EID) { #elif defined (BUILD_ETH_LLC) if (length > 0x05FF) /* DIX */ goto fail_frame; + if (length < LLC_HEADER_SIZE || length > ETH_MTU) + goto fail_frame; + length -= LLC_HEADER_SIZE; dsap = reverse_bits(e_frame->dsap); ssap = reverse_bits(e_frame->ssap); +#endif + + if (eth_len < ETH_HEADER_TOT_SIZE + (size_t) length) + goto fail_frame; + +#if defined(BUILD_ETH_DIX) + mem_hash(HASH_CRC8, &hcs, + (uint8_t *) &e_frame->eid, + DIX_EID_SIZE + DIX_LENGTH_SIZE); +#elif defined(BUILD_ETH_LLC) + mem_hash(HASH_CRC8, &hcs, + (uint8_t *) &e_frame->dsap, + LLC_FIELDS_SIZE); +#endif + if (hcs != e_frame->hcs) + goto fail_frame; +#if defined(BUILD_ETH_DIX) + if (deid == MGMT_EID) { +#elif defined (BUILD_ETH_LLC) if (ssap == MGMT_SAP && dsap == MGMT_SAP) { #endif ipcp_spb_release(spb); /* No need for the N+1 buffer. 
*/ @@ -941,13 +1393,13 @@ static void * eth_ipcp_packet_reader(void * o) if (length > MGMT_FRAME_SIZE) { log_warn("Management frame size %u exceeds %u.", length, MGMT_FRAME_SIZE); - goto fail_frame; + continue; } frame = malloc(sizeof(*frame)); if (frame == NULL) { log_err("Failed to allocate frame."); - goto fail_frame; + continue; } memcpy(frame->buf, &e_frame->payload, length); @@ -958,6 +1410,8 @@ static void * eth_ipcp_packet_reader(void * o) list_add(&frame->next, &eth_data.mgmt_frames); pthread_cond_signal(&eth_data.mgmt_cond); pthread_mutex_unlock(&eth_data.mgmt_lock); + FETCH_ADD_RELAXED(&eth_data.stat.n_rcv, 1); + FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_rcv, 1); } else { pthread_rwlock_rdlock(&eth_data.flows_lock); @@ -968,6 +1422,7 @@ static void * eth_ipcp_packet_reader(void * o) #endif if (fd < 0) { pthread_rwlock_unlock(&eth_data.flows_lock); + FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1); goto fail_frame; } @@ -976,13 +1431,18 @@ static void * eth_ipcp_packet_reader(void * o) || memcmp(eth_data.fd_to_ef[fd].r_addr, e_frame->src_hwaddr, MAC_SIZE)) { pthread_rwlock_unlock(&eth_data.flows_lock); + FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1); goto fail_frame; } #endif + FETCH_ADD_RELAXED(&eth_data.fd_to_ef[fd].stat.p_rcv, 1); + FETCH_ADD_RELAXED(&eth_data.fd_to_ef[fd].stat.b_rcv, + length); + FETCH_ADD_RELAXED(&eth_data.stat.n_rcv, 1); pthread_rwlock_unlock(&eth_data.flows_lock); #ifndef HAVE_NETMAP - ssm_pk_buff_head_release(spb, ETH_HEADER_TOT_SIZE); + ssm_pk_buff_pop(spb, ETH_HEADER_TOT_SIZE); ssm_pk_buff_truncate(spb, length); #else if (ipcp_spb_reserve(&spb, length)) @@ -991,8 +1451,13 @@ static void * eth_ipcp_packet_reader(void * o) buf = ssm_pk_buff_head(spb); memcpy(buf, &e_frame->payload, length); #endif - if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0) + if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0) { ipcp_spb_release(spb); + FETCH_ADD_RELAXED( + &eth_data.fd_to_ef[fd].stat.p_dlv_f, + 1); + FETCH_ADD_RELAXED(&eth_data.stat.n_dlv_f, 1); + } continue; fail_frame: @@ -1048,10 
+1513,11 @@ static void * eth_ipcp_packet_writer(void * o) len = ssm_pk_buff_len(spb); - if (ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE) + if (ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE) == NULL) { log_dbg("Failed to allocate header."); ipcp_spb_release(spb); + FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1); continue; } @@ -1075,8 +1541,20 @@ static void * eth_ipcp_packet_writer(void * o) dsap, ssap, #endif ssm_pk_buff_head(spb), - len)) + len)) { log_dbg("Failed to send frame."); + FETCH_ADD_RELAXED( + &eth_data.fd_to_ef[fd].stat.p_snd_f, + 1); + FETCH_ADD_RELAXED(&eth_data.stat.n_snd_f, 1); + } else { + FETCH_ADD_RELAXED( + &eth_data.fd_to_ef[fd].stat.p_snd, + 1); + FETCH_ADD_RELAXED( + &eth_data.fd_to_ef[fd].stat.b_snd, + len); + } ipcp_spb_release(spb); } } @@ -1424,12 +1902,14 @@ static int eth_init_bpf(struct ifreq * ifr) return -1; } #elif defined(HAVE_RAW_SOCKETS) +#define SOCKOPT() static int eth_init_raw_socket(struct ifreq * ifr) { int idx; - int flags; + int sndbuf; + int rcvbuf; #if defined(IPCP_ETH_QDISC_BYPASS) - int qdisc_bypass = 1; + int qdisc_bypass = 1; #endif /* ENABLE_QDISC_BYPASS */ idx = if_nametoindex(ifr->ifr_name); @@ -1437,6 +1917,7 @@ static int eth_init_raw_socket(struct ifreq * ifr) log_err("Failed to retrieve interface index."); return -1; } + memset(&(eth_data.device), 0, sizeof(eth_data.device)); eth_data.device.sll_ifindex = idx; eth_data.device.sll_family = AF_PACKET; @@ -1453,17 +1934,6 @@ static int eth_init_raw_socket(struct ifreq * ifr) goto fail_socket; } - flags = fcntl(eth_data.s_fd, F_GETFL, 0); - if (flags < 0) { - log_err("Failed to get flags."); - goto fail_device; - } - - if (fcntl(eth_data.s_fd, F_SETFL, flags | O_NONBLOCK)) { - log_err("Failed to set socket non-blocking."); - goto fail_device; - } - #if defined(IPCP_ETH_QDISC_BYPASS) if (setsockopt(eth_data.s_fd, SOL_PACKET, PACKET_QDISC_BYPASS, &qdisc_bypass, sizeof(qdisc_bypass))) { @@ -1471,6 +1941,18 @@ static int eth_init_raw_socket(struct ifreq * ifr) } #endif + sndbuf = 
IPCP_ETH_SNDBUF; + if (sndbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_SNDBUF, + &sndbuf, sizeof(sndbuf))) { + log_info("Failed to set SO_SNDBUF to %d.", sndbuf); + } + + rcvbuf = IPCP_ETH_RCVBUF; + if (rcvbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_RCVBUF, + &rcvbuf, sizeof(rcvbuf))) { + log_info("Failed to set SO_RCVBUF to %d.", rcvbuf); + } + if (bind(eth_data.s_fd, (struct sockaddr *) &eth_data.device, sizeof(eth_data.device)) < 0) { log_err("Failed to bind socket to interface."); @@ -1543,6 +2025,12 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf) return -1; } #endif /* HAVE_NETMAP */ +#ifdef IPCP_ETH_FLOW_STATS + if (rib_reg(ETH_RIB_PATH, &eth_r_ops)) { + log_err("Failed to register RIB."); + goto fail_rib_reg; + } +#endif #if defined(__linux__) if (pthread_create(&eth_data.if_monitor, NULL, eth_ipcp_if_monitor, NULL)) { @@ -1606,6 +2094,10 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf) #if defined(__linux__) fail_monitor: #endif +#ifdef IPCP_ETH_FLOW_STATS + rib_unreg(ETH_RIB_PATH); + fail_rib_reg: +#endif #if defined(HAVE_NETMAP) nm_close(eth_data.nmd); #elif defined(HAVE_BPF) @@ -1637,12 +2129,14 @@ static int eth_ipcp_unreg(const uint8_t * hash) static int eth_ipcp_query(const uint8_t * hash) { uint8_t r_addr[MAC_SIZE]; - struct timespec timeout = TIMESPEC_INIT_MS(NAME_QUERY_TIMEO); + struct timespec timeout; struct dir_query * query; int ret; + int attempt; uint8_t * buf; struct mgmt_msg * msg; size_t len; + long per_ms; if (shim_data_dir_has(eth_data.shim_data, hash)) return 0; @@ -1662,32 +2156,46 @@ static int eth_ipcp_query(const uint8_t * hash) memset(r_addr, 0xff, MAC_SIZE); - query = shim_data_dir_query_create(eth_data.shim_data, hash); - if (query == NULL) { - free(buf); - return -1; - } + per_ms = NAME_QUERY_TIMEO / (NAME_QUERY_RETRIES + 1); - if (eth_ipcp_send_frame(r_addr, + ret = -1; + for (attempt = 0; attempt <= NAME_QUERY_RETRIES; ++attempt) { + query = shim_data_dir_query_create(eth_data.shim_data, hash); + 
if (query == NULL) { + ret = -1; + break; + } + + if (eth_ipcp_send_frame(r_addr, #if defined(BUILD_ETH_DIX) - MGMT_EID, + MGMT_EID, #elif defined(BUILD_ETH_LLC) - reverse_bits(MGMT_SAP), - reverse_bits(MGMT_SAP), + reverse_bits(MGMT_SAP), + reverse_bits(MGMT_SAP), #endif - buf, len)) { - log_err("Failed to send management frame."); + buf, len)) { + log_err("Failed to send management frame."); + shim_data_dir_query_destroy(eth_data.shim_data, + query); + ret = -1; + break; + } + + FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1); + + timeout.tv_sec = per_ms / 1000; + timeout.tv_nsec = (per_ms % 1000) * 1000000L; + + ret = shim_data_dir_query_wait(query, &timeout); + shim_data_dir_query_destroy(eth_data.shim_data, query); - free(buf); - return -1; + + if (ret != -ETIMEDOUT) + break; } free(buf); - ret = shim_data_dir_query_wait(query, &timeout); - - shim_data_dir_query_destroy(eth_data.shim_data, query); - return ret; } @@ -1748,6 +2256,14 @@ static int eth_ipcp_flow_alloc(int fd, } fset_add(eth_data.np1_flows, fd); +#ifdef IPCP_ETH_FLOW_STATS + pthread_rwlock_wrlock(&eth_data.flows_lock); + memset(&eth_data.fd_to_ef[fd].stat, 0, + sizeof(eth_data.fd_to_ef[fd].stat)); + eth_data.fd_to_ef[fd].stat.stamp = time(NULL); + FETCH_ADD_RELAXED(&eth_data.stat.n_flows, 1); + pthread_rwlock_unlock(&eth_data.flows_lock); +#endif #if defined(BUILD_ETH_LLC) log_dbg("Assigned SAP %d for fd %d.", ssap, fd); #endif @@ -1808,6 +2324,14 @@ static int eth_ipcp_flow_alloc_resp(int fd, } fset_add(eth_data.np1_flows, fd); +#ifdef IPCP_ETH_FLOW_STATS + pthread_rwlock_wrlock(&eth_data.flows_lock); + memset(&eth_data.fd_to_ef[fd].stat, 0, + sizeof(eth_data.fd_to_ef[fd].stat)); + eth_data.fd_to_ef[fd].stat.stamp = time(NULL); + FETCH_ADD_RELAXED(&eth_data.stat.n_flows, 1); + pthread_rwlock_unlock(&eth_data.flows_lock); +#endif #if defined(BUILD_ETH_LLC) log_dbg("Assigned SAP %d for fd %d.", ssap, fd); #endif @@ -1836,6 +2360,12 @@ static int eth_ipcp_flow_dealloc(int fd) #endif memset(&eth_data.fd_to_ef[fd].r_addr, 0, 
MAC_SIZE); +#ifdef IPCP_ETH_FLOW_STATS + memset(&eth_data.fd_to_ef[fd].stat, 0, + sizeof(eth_data.fd_to_ef[fd].stat)); + FETCH_SUB_RELAXED(&eth_data.stat.n_flows, 1); +#endif + pthread_rwlock_unlock(&eth_data.flows_lock); ipcp_flow_dealloc(fd); @@ -1854,7 +2384,8 @@ static struct ipcp_ops eth_ops = { .ipcp_flow_alloc = eth_ipcp_flow_alloc, .ipcp_flow_join = NULL, .ipcp_flow_alloc_resp = eth_ipcp_flow_alloc_resp, - .ipcp_flow_dealloc = eth_ipcp_flow_dealloc + .ipcp_flow_dealloc = eth_ipcp_flow_dealloc, + .ipcp_flow_update = eth_ipcp_flow_update }; int main(int argc, @@ -1902,6 +2433,9 @@ int main(int argc, #ifdef __linux__ pthread_join(eth_data.if_monitor, NULL); #endif +#ifdef IPCP_ETH_FLOW_STATS + rib_unreg(ETH_RIB_PATH); +#endif } ipcp_stop(); diff --git a/src/ipcpd/ipcp.c b/src/ipcpd/ipcp.c index 5ad2401f..dcee4b9c 100644 --- a/src/ipcpd/ipcp.c +++ b/src/ipcpd/ipcp.c @@ -363,6 +363,7 @@ static void * acceptloop(void * o) int ipcp_wait_flow_req_arr(const uint8_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data) { struct timespec ts = TIMESPEC_INIT_MS(ALLOC_TIMEOUT); @@ -392,7 +393,7 @@ int ipcp_wait_flow_req_arr(const uint8_t * dst, assert(ipcpd.alloc_id == -1); - fd = ipcp_flow_req_arr(&hash, qs, mpl, data); + fd = ipcp_flow_req_arr(&hash, qs, mpl, mtu, data); if (fd < 0) { pthread_mutex_unlock(&ipcpd.alloc_lock); log_err("Failed to get fd for flow."); @@ -819,6 +820,33 @@ static void do_flow_dealloc(int flow_id, log_info("Finished deallocating flow %d.", flow_id); } +static void do_flow_update(int flow_id, + const buffer_t * data, + ipcp_msg_t * ret_msg) +{ + int fd; + + if (ipcpd.ops->ipcp_flow_update == NULL) { + log_err("Failed to update flow: operation unsupported."); + ret_msg->result = -ENOTSUP; + return; + } + + if (ipcp_get_state() != IPCP_OPERATIONAL) { + ret_msg->result = -EIPCPSTATE; + return; + } + + fd = np1_flow_fd(flow_id); + if (fd < 0) { + log_warn("Flow update for unknown flow_id %d.", flow_id); + ret_msg->result = -1; + return; +
} + + ret_msg->result = ipcpd.ops->ipcp_flow_update(fd, data); +} + static void * mainloop(void * o) { int sfd; @@ -917,6 +945,13 @@ static void * mainloop(void * o) case IPCP_MSG_CODE__IPCP_FLOW_DEALLOC: do_flow_dealloc(msg->flow_id, msg->timeo_sec, &ret_msg); break; + case IPCP_MSG_CODE__IPCP_FLOW_UPDATE: + assert(msg->pk.len > 0 ? msg->pk.data != NULL + : msg->pk.data == NULL); + data.len = msg->pk.len; + data.data = msg->pk.data; + do_flow_update(msg->flow_id, &data, &ret_msg); + break; default: ret_msg.result = -1; log_err("Unknown message code: %d.", msg->code); diff --git a/src/ipcpd/ipcp.h b/src/ipcpd/ipcp.h index 26a780a3..210157ec 100644 --- a/src/ipcpd/ipcp.h +++ b/src/ipcpd/ipcp.h @@ -68,6 +68,9 @@ struct ipcp_ops { const buffer_t * data); int (* ipcp_flow_dealloc)(int fd); + + int (* ipcp_flow_update)(int fd, + const buffer_t * data); }; int ipcp_init(int argc, @@ -98,6 +101,7 @@ enum ipcp_state ipcp_get_state(void); int ipcp_wait_flow_req_arr(const uint8_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data); int ipcp_wait_flow_resp(const int fd); diff --git a/src/ipcpd/local/main.c b/src/ipcpd/local/main.c index 2c867317..c0aeb51e 100644 --- a/src/ipcpd/local/main.c +++ b/src/ipcpd/local/main.c @@ -38,6 +38,7 @@ #include <ouroboros/ipcp.h> #include <ouroboros/ipcp-dev.h> #include <ouroboros/local-dev.h> +#include <ouroboros/np1_flow.h> #include "ipcp.h" #include "np1.h" @@ -203,7 +204,8 @@ static int local_ipcp_flow_alloc(int fd, HASH_VAL32(dst), fd); assert(dst); - out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL, data); + out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL, + IPCP_LOCAL_MTU, data); if (out_fd < 0) { log_dbg("Flow allocation failed: %d", out_fd); return -1; @@ -255,14 +257,16 @@ static int local_ipcp_flow_alloc_resp(int fd, } if (response < 0) { - ipcp_flow_alloc_reply(out_fd, response, mpl, data); + ipcp_flow_alloc_reply(out_fd, response, mpl, + IPCP_LOCAL_MTU, data); log_info("Flow allocation 
rejected, fds (%d, %d).", out_fd, fd); return 0; } fset_add(local_data.flows, fd); - if (ipcp_flow_alloc_reply(out_fd, response, mpl, data) < 0) { + if (ipcp_flow_alloc_reply(out_fd, response, mpl, + IPCP_LOCAL_MTU, data) < 0) { log_err("Failed to reply to allocation"); fset_del(local_data.flows, fd); return -1; @@ -294,6 +298,38 @@ static int local_ipcp_flow_dealloc(int fd) return 0; } +/* Loopback relay: deliver the update back to the peer end (same IRMd). */ +static int local_ipcp_flow_update(int fd, + const buffer_t * data) +{ + int out_fd; + int out_flow_id; + + pthread_rwlock_rdlock(&local_data.lock); + + out_fd = local_data.in_out[fd]; + + pthread_rwlock_unlock(&local_data.lock); + + if (out_fd == -1) { + log_err("Flow update on fd %d with no peer.", fd); + return -1; + } + + out_flow_id = np1_flow_id(out_fd); + if (out_flow_id < 0) { + log_err("No flow_id for peer fd %d.", out_fd); + return -1; + } + + if (ipcp_flow_update_arr(out_flow_id, data) < 0) { + log_err("Failed to relay flow update to fd %d.", out_fd); + return -1; + } + + return 0; +} + static struct ipcp_ops local_ops = { .ipcp_bootstrap = local_ipcp_bootstrap, .ipcp_enroll = NULL, @@ -305,7 +341,8 @@ static struct ipcp_ops local_ops = { .ipcp_flow_alloc = local_ipcp_flow_alloc, .ipcp_flow_join = NULL, .ipcp_flow_alloc_resp = local_ipcp_flow_alloc_resp, - .ipcp_flow_dealloc = local_ipcp_flow_dealloc + .ipcp_flow_dealloc = local_ipcp_flow_dealloc, + .ipcp_flow_update = local_ipcp_flow_update }; int main(int argc, diff --git a/src/ipcpd/udp/udp.c b/src/ipcpd/udp/udp.c index 452bbc1a..db57e2f4 100644 --- a/src/ipcpd/udp/udp.c +++ b/src/ipcpd/udp/udp.c @@ -28,6 +28,8 @@ #include <ouroboros/list.h> #include <ouroboros/utils.h> #include <ouroboros/dev.h> +#include <ouroboros/ipcp-dev.h> +#include <ouroboros/np1_flow.h> #include <ouroboros/fqueue.h> #include <ouroboros/errno.h> #include <ouroboros/logs.h> @@ -47,9 +49,14 @@ #include <stdlib.h> #include <sys/wait.h> #include <fcntl.h> +#include 
<unistd.h> +#if defined(__linux__) +#include <netinet/ip.h> +#endif #define FLOW_REQ 1 #define FLOW_REPLY 2 +#define FLOW_IRM_UPDATE 3 #define OUR_HEADER_LEN sizeof(uint32_t) /* adds eid */ @@ -87,7 +94,7 @@ struct mgmt_msg { uint8_t code; /* QoS parameters from spec */ uint8_t availability; - uint8_t in_order; + uint8_t service; } __attribute__((packed)); struct mgmt_frame { @@ -130,6 +137,53 @@ static const char * __inet_ntop(const struct __ADDR * addr, return inet_ntop(__AF, addr, buf, __ADDRSTRLEN); } +#if defined(BUILD_IPCP_UDP4) +#define UDP_MTU_FALLBACK IPCP_UDP4_MTU +#define UDP_IP_OVERHEAD 28U /* IPv4 + UDP */ +#else +#define UDP_MTU_FALLBACK IPCP_UDP6_MTU +#define UDP_IP_OVERHEAD 48U /* IPv6 + UDP */ +#endif + +static uint32_t udp_query_mtu(const struct __SOCKADDR * saddr) +{ +#if defined(__linux__) && (defined(IP_MTU) || defined(IPV6_MTU)) + int sock; + int mtu = 0; + socklen_t len = sizeof(mtu); + + sock = socket(__AF, SOCK_DGRAM, IPPROTO_UDP); + if (sock < 0) + return UDP_MTU_FALLBACK; + + if (connect(sock, (const struct sockaddr *) saddr, + sizeof(*saddr)) < 0) + goto fallback; + +#if defined(BUILD_IPCP_UDP4) && defined(IP_MTU) + if (getsockopt(sock, IPPROTO_IP, IP_MTU, &mtu, &len) < 0) + goto fallback; +#elif defined(BUILD_IPCP_UDP6) && defined(IPV6_MTU) + if (getsockopt(sock, IPPROTO_IPV6, IPV6_MTU, &mtu, &len) < 0) + goto fallback; +#else + goto fallback; +#endif + close(sock); + + if (mtu <= (int) UDP_IP_OVERHEAD) + return UDP_MTU_FALLBACK; + + return (uint32_t) mtu - UDP_IP_OVERHEAD; + + fallback: + close(sock); +#else + (void) saddr; +#endif + return UDP_MTU_FALLBACK; +} + static int udp_data_init(void) { int i; @@ -220,7 +274,7 @@ static int udp_ipcp_port_alloc(const struct __SOCKADDR * r_saddr, msg->availability = qs.availability; msg->loss = hton32(qs.loss); msg->ber = hton32(qs.ber); - msg->in_order = qs.in_order; + msg->service = qs.service; msg->max_gap = hton32(qs.max_gap); msg->timeout = hton32(qs.timeout); @@ -277,6 +331,48 @@ static 
int udp_ipcp_port_alloc_resp(const struct __SOCKADDR * r_saddr, return 0; } +static int udp_ipcp_flow_update(int fd, + const buffer_t * data) +{ + struct mgmt_msg * msg; + struct __SOCKADDR r_saddr; + uint32_t d_eid; + + msg = malloc(sizeof(*msg) + data->len); + if (msg == NULL) + return -1; + + memset(msg, 0, sizeof(*msg) + data->len); + + pthread_rwlock_rdlock(&udp_data.flows_lock); + + r_saddr = udp_data.fd_to_uf[fd].r_saddr; + d_eid = (uint32_t) udp_data.fd_to_uf[fd].d_eid; + + pthread_rwlock_unlock(&udp_data.flows_lock); + + msg->eid = hton32(MGMT_EID); + msg->code = FLOW_IRM_UPDATE; + msg->s_eid = hton32(d_eid); + msg->d_eid = hton32((uint32_t) fd); + + if (data->len > 0) + memcpy(msg + 1, data->data, data->len); + + if (sendto(udp_data.s_fd, msg, sizeof(*msg) + data->len, + SENDTO_FLAGS, + (const struct sockaddr *) &r_saddr, + sizeof(r_saddr)) < 0) { + log_err("Failed to send flow update: %s.", strerror(errno)); + free(msg); + return -1; + } + + free(msg); + + return 0; +} + static int udp_ipcp_port_req(struct __SOCKADDR * c_saddr, int d_eid, const uint8_t * dst, @@ -285,7 +381,8 @@ static int udp_ipcp_port_req(struct __SOCKADDR * c_saddr, { int fd; - fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL, data); + fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL, + udp_query_mtu(c_saddr), data); if (fd < 0) { log_err("Could not get new flow from IRMd."); return -1; @@ -332,7 +429,8 @@ static int udp_ipcp_port_alloc_reply(const struct __SOCKADDR * saddr, pthread_rwlock_unlock(&udp_data.flows_lock); - if (ipcp_flow_alloc_reply(s_eid, response, mpl, data) < 0) { + if (ipcp_flow_alloc_reply(s_eid, response, mpl, + udp_query_mtu(saddr), data) < 0) { log_err("Failed to reply to flow allocation."); return -1; } @@ -343,6 +441,37 @@ static int udp_ipcp_port_alloc_reply(const struct __SOCKADDR * saddr, return 0; } +static int udp_ipcp_flow_update_arr(const uint8_t * buf, + size_t len) +{ + struct mgmt_msg * msg; + buffer_t data; + int fd; + int flow_id; + + msg = 
(struct mgmt_msg *) buf; + + fd = (int) ntoh32(msg->s_eid); + if (fd < 0 || fd >= SYS_MAX_FLOWS) { + log_err("Flow update for invalid eid %d.", fd); + return -1; + } + + data.len = len - sizeof(*msg); + data.data = (uint8_t *) buf + sizeof(*msg); + + flow_id = np1_flow_id(fd); + if (flow_id < 0) + return -1; + + if (ipcp_flow_update_arr(flow_id, &data) < 0) { + log_err("Failed to relay flow update on fd %d.", fd); + return -1; + } + + return 0; +} + static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, const uint8_t * buf, size_t len) @@ -352,13 +481,18 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, qosspec_t qs; buffer_t data; + /* Defence against malformed/corrupted wire input. */ + if (len < sizeof(*msg)) + return -1; + msg = (struct mgmt_msg *) buf; switch (msg->code) { case FLOW_REQ: msg_len = sizeof(*msg) + ipcp_dir_hash_len(); - assert(len >= msg_len); + if (len < msg_len) + return -1; data.len = len - msg_len; data.data = (uint8_t *) buf + msg_len; @@ -369,7 +503,7 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, qs.availability = msg->availability; qs.loss = ntoh32(msg->loss); qs.ber = ntoh32(msg->ber); - qs.in_order = msg->in_order; + qs.service = msg->service; qs.max_gap = ntoh32(msg->max_gap); qs.timeout = ntoh32(msg->timeout); @@ -377,8 +511,6 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, (uint8_t *) (msg + 1), qs, &data); case FLOW_REPLY: - assert(len >= sizeof(*msg)); - data.len = len - sizeof(*msg); data.data = (uint8_t *) buf + sizeof(*msg); @@ -387,6 +519,8 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr, ntoh32(msg->d_eid), ntoh32(msg->response), &data); + case FLOW_IRM_UPDATE: + return udp_ipcp_flow_update_arr(buf, len); default: log_err("Unknown message received %d.", msg->code); return -1; @@ -549,7 +683,7 @@ static void * udp_ipcp_packet_writer(void * o) continue; } - buf = ssm_pk_buff_head_alloc(spb, OUR_HEADER_LEN); + buf = ssm_pk_buff_push(spb, OUR_HEADER_LEN); if (buf == NULL) { 
log_dbg("Failed to allocate header."); ipcp_spb_release(spb); @@ -1140,7 +1274,8 @@ static struct ipcp_ops udp_ops = { .ipcp_flow_alloc = udp_ipcp_flow_alloc, .ipcp_flow_join = NULL, .ipcp_flow_alloc_resp = udp_ipcp_flow_alloc_resp, - .ipcp_flow_dealloc = udp_ipcp_flow_dealloc + .ipcp_flow_dealloc = udp_ipcp_flow_dealloc, + .ipcp_flow_update = udp_ipcp_flow_update }; int main(int argc, diff --git a/src/ipcpd/unicast/dt.c b/src/ipcpd/unicast/dt.c index 252477f4..e89cb17e 100644 --- a/src/ipcpd/unicast/dt.c +++ b/src/ipcpd/unicast/dt.c @@ -31,6 +31,7 @@ #define DT "dt" #define OUROBOROS_PREFIX DT +#include <ouroboros/atomics.h> #include <ouroboros/bitmap.h> #include <ouroboros/errno.h> #include <ouroboros/logs.h> @@ -139,7 +140,7 @@ static void dt_pci_shrink(struct ssm_pk_buff * spb) { assert(spb); - ssm_pk_buff_head_release(spb, dt_pci_info.head_size); + ssm_pk_buff_pop(spb, dt_pci_info.head_size); } struct { @@ -168,22 +169,33 @@ struct { size_t f_nhp_pkt[QOS_CUBE_MAX]; size_t f_nhp_bytes[QOS_CUBE_MAX]; pthread_mutex_t lock; - } stat[PROG_MAX_FLOWS]; + } stat[PROC_MAX_FLOWS]; size_t n_flows; #endif struct bmp * res_fds; - struct comp_info comps[PROG_RES_FDS]; + struct comp_info comps[PROC_RES_FDS]; pthread_rwlock_t lock; pthread_t listener; } dt; +/* + * Flow stats are lock-free relaxed atomics on the data path; the per-flow + * lock still guards the stamp/addr/n_flows lifecycle (see stat_used). 
+ */ +#ifdef IPCP_FLOW_STATS +#define dt_stat_inc(idx, name, qc, len) \ + do { \ + FETCH_ADD_RELAXED(&dt.stat[idx].name ## _pkt[qc], 1); \ + FETCH_ADD_RELAXED(&dt.stat[idx].name ## _bytes[qc], (len)); \ + } while (0) +#define dt_stat_load(idx, field, qc) LOAD_RELAXED(&dt.stat[idx].field[qc]) + static int dt_rib_read(const char * path, char * buf, size_t len) { -#ifdef IPCP_FLOW_STATS int fd; int i; char str[QOS_BLOCK_LEN + 1]; @@ -220,7 +232,7 @@ static int dt_rib_read(const char * path, tm = gmtime(&dt.stat[fd].stamp); strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm); - if (fd >= PROG_RES_FDS) { + if (fd >= PROC_RES_FDS) { fccntl(fd, FLOWGRXQLEN, &rxqlen); fccntl(fd, FLOWGTXQLEN, &txqlen); } @@ -249,20 +261,20 @@ static int dt_rib_read(const char * path, " failed nhop (packets): %20zu\n" " failed nhop (bytes): %20zu\n", i, - dt.stat[fd].snd_pkt[i], - dt.stat[fd].snd_bytes[i], - dt.stat[fd].rcv_pkt[i], - dt.stat[fd].rcv_bytes[i], - dt.stat[fd].lcl_w_pkt[i], - dt.stat[fd].lcl_w_bytes[i], - dt.stat[fd].lcl_r_pkt[i], - dt.stat[fd].lcl_r_bytes[i], - dt.stat[fd].r_drp_pkt[i], - dt.stat[fd].r_drp_bytes[i], - dt.stat[fd].w_drp_pkt[i], - dt.stat[fd].w_drp_bytes[i], - dt.stat[fd].f_nhp_pkt[i], - dt.stat[fd].f_nhp_bytes[i] + dt_stat_load(fd, snd_pkt, i), + dt_stat_load(fd, snd_bytes, i), + dt_stat_load(fd, rcv_pkt, i), + dt_stat_load(fd, rcv_bytes, i), + dt_stat_load(fd, lcl_w_pkt, i), + dt_stat_load(fd, lcl_w_bytes, i), + dt_stat_load(fd, lcl_r_pkt, i), + dt_stat_load(fd, lcl_r_bytes, i), + dt_stat_load(fd, r_drp_pkt, i), + dt_stat_load(fd, r_drp_bytes, i), + dt_stat_load(fd, w_drp_pkt, i), + dt_stat_load(fd, w_drp_bytes, i), + dt_stat_load(fd, f_nhp_pkt, i), + dt_stat_load(fd, f_nhp_bytes, i) ); strcat(buf, str); } @@ -270,17 +282,10 @@ static int dt_rib_read(const char * path, pthread_mutex_unlock(&dt.stat[fd].lock); return RIB_FILE_STRLEN; -#else - (void) path; - (void) buf; - (void) len; - return 0; -#endif } static int dt_rib_readdir(char *** buf) { -#ifdef 
IPCP_FLOW_STATS char entry[RIB_PATH_LEN + 1]; size_t i; int idx = 0; @@ -296,7 +301,7 @@ static int dt_rib_readdir(char *** buf) if (*buf == NULL) goto fail_entries; - for (i = 0; i < PROG_MAX_FLOWS; ++i) { + for (i = 0; i < PROC_MAX_FLOWS; ++i) { pthread_mutex_lock(&dt.stat[i].lock); if (dt.stat[i].stamp == 0) { @@ -327,16 +332,11 @@ static int dt_rib_readdir(char *** buf) fail_entries: pthread_rwlock_unlock(&dt.lock); return -ENOMEM; -#else - (void) buf; - return 0; -#endif } static int dt_rib_getattr(const char * path, struct rib_attr * attr) { -#ifdef IPCP_FLOW_STATS int fd; char * entry; @@ -356,10 +356,7 @@ static int dt_rib_getattr(const char * path, } pthread_mutex_unlock(&dt.stat[fd].lock); -#else - (void) path; - (void) attr; -#endif + return 0; } @@ -369,7 +366,12 @@ static struct rib_ops r_ops = { .getattr = dt_rib_getattr }; -#ifdef IPCP_FLOW_STATS +/* + * Hold dt.lock + per-stat together: dt_rib_readdir samples n_flows + * under rdlock and walks stamps under per-stat; updates must be + * atomic w.r.t. that snapshot or the malloc(n_flows) buffer can + * overflow. + */ static void stat_used(int fd, uint64_t addr) { @@ -377,6 +379,7 @@ static void stat_used(int fd, clock_gettime(CLOCK_REALTIME_COARSE, &now); + pthread_rwlock_wrlock(&dt.lock); pthread_mutex_lock(&dt.stat[fd].lock); memset(&dt.stat[fd], 0, sizeof(dt.stat[fd])); @@ -384,14 +387,13 @@ static void stat_used(int fd, dt.stat[fd].stamp = (addr != INVALID_ADDR) ? now.tv_sec : 0; dt.stat[fd].addr = addr; - pthread_mutex_unlock(&dt.stat[fd].lock); - - pthread_rwlock_wrlock(&dt.lock); - (addr != INVALID_ADDR) ? 
++dt.n_flows : --dt.n_flows; + pthread_mutex_unlock(&dt.stat[fd].lock); pthread_rwlock_unlock(&dt.lock); } +#else +#define dt_stat_inc(idx, name, qc, len) ((void) 0) #endif static void handle_event(void * self, @@ -440,15 +442,10 @@ static void packet_handler(int fd, len = ssm_pk_buff_len(spb); #ifndef IPCP_FLOW_STATS - (void) fd; -#else - pthread_mutex_lock(&dt.stat[fd].lock); - - ++dt.stat[fd].rcv_pkt[qc]; - dt.stat[fd].rcv_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[fd].lock); + (void) fd; #endif + dt_stat_inc(fd, rcv, qc, len); + memset(&dt_pci, 0, sizeof(dt_pci)); head = ssm_pk_buff_head(spb); @@ -458,14 +455,7 @@ static void packet_handler(int fd, if (dt_pci.ttl == 0) { log_dbg("TTL was zero."); ipcp_spb_release(spb); -#ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[fd].lock); - - ++dt.stat[fd].r_drp_pkt[qc]; - dt.stat[fd].r_drp_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[fd].lock); -#endif + dt_stat_inc(fd, r_drp, qc, len); return; } @@ -475,14 +465,7 @@ static void packet_handler(int fd, log_dbg("No next hop for %" PRIu64 ".", dt_pci.dst_addr); ipcp_spb_release(spb); -#ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[fd].lock); - - ++dt.stat[fd].f_nhp_pkt[qc]; - dt.stat[fd].f_nhp_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[fd].lock); -#endif + dt_stat_inc(fd, f_nhp, qc, len); return; } @@ -494,27 +477,14 @@ static void packet_handler(int fd, if (ret == -EFLOWDOWN) notifier_event(NOTIFY_DT_FLOW_DOWN, &ofd); ipcp_spb_release(spb); -#ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[ofd].lock); - - ++dt.stat[ofd].w_drp_pkt[qc]; - dt.stat[ofd].w_drp_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[ofd].lock); -#endif + dt_stat_inc(ofd, w_drp, qc, len); return; } -#ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[ofd].lock); - ++dt.stat[ofd].snd_pkt[qc]; - dt.stat[ofd].snd_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[ofd].lock); -#endif + dt_stat_inc(ofd, snd, qc, len); } else { dt_pci_shrink(spb); - if (dt_pci.eid 
>= PROG_RES_FDS) { + if (dt_pci.eid >= PROC_RES_FDS) { uint8_t ecn = *(head + dt_pci_info.ecn_o); fa_np1_rcv(dt_pci.eid, ecn, spb); return; @@ -526,20 +496,9 @@ static void packet_handler(int fd, ipcp_spb_release(spb); return; } -#ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[fd].lock); - - ++dt.stat[fd].lcl_r_pkt[qc]; - dt.stat[fd].lcl_r_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[fd].lock); - pthread_mutex_lock(&dt.stat[dt_pci.eid].lock); - - ++dt.stat[dt_pci.eid].snd_pkt[qc]; - dt.stat[dt_pci.eid].snd_bytes[qc] += len; + dt_stat_inc(fd, lcl_r, qc, len); + dt_stat_inc(dt_pci.eid, snd, qc, len); - pthread_mutex_unlock(&dt.stat[dt_pci.eid].lock); -#endif dt.comps[dt_pci.eid].post_packet(dt.comps[dt_pci.eid].comp, spb); } @@ -569,7 +528,9 @@ int dt_init(struct dt_config cfg) { int i; int j; +#ifdef IPCP_FLOW_STATS char dtstr[RIB_NAME_STRLEN + 1]; +#endif enum pol_pff pp; struct conn_info info; @@ -636,13 +597,13 @@ int dt_init(struct dt_config cfg) goto fail_rwlock_init; } - dt.res_fds = bmp_create(PROG_RES_FDS, 0); + dt.res_fds = bmp_create(PROC_RES_FDS, 0); if (dt.res_fds == NULL) goto fail_res_fds; #ifdef IPCP_FLOW_STATS memset(dt.stat, 0, sizeof(dt.stat)); - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) if (pthread_mutex_init(&dt.stat[i].lock, NULL)) { log_err("Failed to init mutex for flow %d.", i); for (j = 0; j < i; ++j) @@ -651,18 +612,19 @@ int dt_init(struct dt_config cfg) } dt.n_flows = 0; -#endif + sprintf(dtstr, "%s." 
ADDR_FMT32, DT, ADDR_VAL32(&dt.addr)); if (rib_reg(dtstr, &r_ops)) { log_err("Failed to register RIB."); goto fail_rib_reg; } +#endif return 0; - fail_rib_reg: #ifdef IPCP_FLOW_STATS - for (i = 0; i < PROG_MAX_FLOWS; ++i) + fail_rib_reg: + for (i = 0; i < PROC_MAX_FLOWS; ++i) pthread_mutex_destroy(&dt.stat[i].lock); fail_stat_lock: #endif @@ -685,13 +647,15 @@ int dt_init(struct dt_config cfg) void dt_fini(void) { +#ifdef IPCP_FLOW_STATS char dtstr[RIB_NAME_STRLEN + 1]; +#endif int i; +#ifdef IPCP_FLOW_STATS sprintf(dtstr, "%s.%" PRIu64, DT, dt.addr); rib_unreg(dtstr); -#ifdef IPCP_FLOW_STATS - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) pthread_mutex_destroy(&dt.stat[i].lock); #endif bmp_destroy(dt.res_fds); @@ -791,7 +755,7 @@ int dt_reg_comp(void * comp, void dt_unreg_comp(int eid) { - assert(eid >= 0 && eid < PROG_RES_FDS); + assert(eid >= 0 && eid < PROC_RES_FDS); pthread_rwlock_wrlock(&dt.lock); @@ -823,33 +787,21 @@ int dt_write_packet(uint64_t dst_addr, #ifdef IPCP_FLOW_STATS len = ssm_pk_buff_len(spb); - if (eid < PROG_RES_FDS) { - pthread_mutex_lock(&dt.stat[eid].lock); - - ++dt.stat[eid].lcl_r_pkt[qc]; - dt.stat[eid].lcl_r_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[eid].lock); - } + if (eid < PROC_RES_FDS) + dt_stat_inc(eid, lcl_r, qc, len); #endif fd = pff_nhop(dt.pff[qc], dst_addr); if (fd < 0) { log_dbg("Could not get nhop for " ADDR_FMT32 ".", ADDR_VAL32(&dst_addr)); #ifdef IPCP_FLOW_STATS - if (eid < PROG_RES_FDS) { - pthread_mutex_lock(&dt.stat[eid].lock); - - ++dt.stat[eid].lcl_r_pkt[qc]; - dt.stat[eid].lcl_r_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[eid].lock); - } + if (eid < PROC_RES_FDS) + dt_stat_inc(eid, lcl_r, qc, len); #endif return -EPERM; } - head = ssm_pk_buff_head_alloc(spb, dt_pci_info.head_size); + head = ssm_pk_buff_push(spb, dt_pci_info.head_size); if (head == NULL) { log_dbg("Failed to allocate DT header."); goto fail_write; @@ -874,31 +826,17 @@ int dt_write_packet(uint64_t 
dst_addr, goto fail_write; } #ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[fd].lock); - - if (dt_pci.eid < PROG_RES_FDS) { - ++dt.stat[fd].lcl_w_pkt[qc]; - dt.stat[fd].lcl_w_bytes[qc] += len; - } - ++dt.stat[fd].snd_pkt[qc]; - dt.stat[fd].snd_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[fd].lock); + if (dt_pci.eid < PROC_RES_FDS) + dt_stat_inc(fd, lcl_w, qc, len); + dt_stat_inc(fd, snd, qc, len); #endif return 0; fail_write: #ifdef IPCP_FLOW_STATS - pthread_mutex_lock(&dt.stat[fd].lock); - - if (eid < PROG_RES_FDS) { - ++dt.stat[fd].lcl_w_pkt[qc]; - dt.stat[fd].lcl_w_bytes[qc] += len; - } - ++dt.stat[fd].w_drp_pkt[qc]; - dt.stat[fd].w_drp_bytes[qc] += len; - - pthread_mutex_unlock(&dt.stat[fd].lock); + if (eid < PROC_RES_FDS) + dt_stat_inc(fd, lcl_w, qc, len); + dt_stat_inc(fd, w_drp, qc, len); #endif return -1; } diff --git a/src/ipcpd/unicast/fa.c b/src/ipcpd/unicast/fa.c index c157d71c..c6eca175 100644 --- a/src/ipcpd/unicast/fa.c +++ b/src/ipcpd/unicast/fa.c @@ -37,6 +37,7 @@ #include <ouroboros/errno.h> #include <ouroboros/dev.h> #include <ouroboros/ipcp-dev.h> +#include <ouroboros/np1_flow.h> #include <ouroboros/rib.h> #include <ouroboros/random.h> #include <ouroboros/pthread.h> @@ -61,9 +62,10 @@ #define TIMEOUT 10 * MILLION /* nanoseconds */ #define MSGBUFSZ 32768 -#define FLOW_REQ 0 -#define FLOW_REPLY 1 -#define FLOW_UPDATE 2 +#define FLOW_REQ 0 +#define FLOW_REPLY 1 +#define FLOW_UPDATE 2 +#define FLOW_IRM_UPDATE 3 #define STAT_FILE_LEN 0 @@ -81,7 +83,7 @@ struct fa_msg { uint16_t ece; uint8_t code; uint8_t availability; - uint8_t in_order; + uint8_t service; } __attribute__((packed)); struct cmd { @@ -111,7 +113,7 @@ struct fa_flow { struct { pthread_rwlock_t flows_lock; - struct fa_flow flows[PROG_MAX_FLOWS]; + struct fa_flow flows[PROC_MAX_FLOWS]; #ifdef IPCP_FLOW_STATS size_t n_flows; #endif @@ -125,11 +127,11 @@ struct { struct psched * psched; } fa; +#ifdef IPCP_FLOW_STATS static int fa_rib_read(const char * path, char * buf, size_t 
len) { -#ifdef IPCP_FLOW_STATS struct fa_flow * flow; int fd; char r_addrstr[21]; @@ -145,7 +147,7 @@ static int fa_rib_read(const char * path, fd = atoi(entry); - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -1; if (len < 1536) @@ -199,17 +201,10 @@ static int fa_rib_read(const char * path, pthread_rwlock_unlock(&fa.flows_lock); return strlen(buf); -#else - (void) path; - (void) buf; - (void) len; - return 0; -#endif } static int fa_rib_readdir(char *** buf) { -#ifdef IPCP_FLOW_STATS char entry[RIB_PATH_LEN + 1]; size_t i; int idx = 0; @@ -225,7 +220,7 @@ static int fa_rib_readdir(char *** buf) if (*buf == NULL) goto fail_entries; - for (i = 0; i < PROG_MAX_FLOWS; ++i) { + for (i = 0; i < PROC_MAX_FLOWS; ++i) { struct fa_flow * flow; flow = &fa.flows[i]; @@ -254,16 +249,11 @@ static int fa_rib_readdir(char *** buf) fail_entries: pthread_rwlock_unlock(&fa.flows_lock); return -ENOMEM; -#else - (void) buf; - return 0; -#endif } static int fa_rib_getattr(const char * path, struct rib_attr * attr) { -#ifdef IPCP_FLOW_STATS int fd; char * entry; struct fa_flow * flow; @@ -286,10 +276,7 @@ static int fa_rib_getattr(const char * path, } pthread_rwlock_unlock(&fa.flows_lock); -#else - (void) path; - (void) attr; -#endif + return 0; } @@ -298,6 +285,7 @@ static struct rib_ops r_ops = { .readdir = fa_rib_readdir, .getattr = fa_rib_getattr }; +#endif /* IPCP_FLOW_STATS */ static int eid_to_fd(uint64_t eid) { @@ -306,7 +294,7 @@ static int eid_to_fd(uint64_t eid) fd = eid & 0xFFFFFFFF; - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -1; flow = &fa.flows[fd]; @@ -496,11 +484,12 @@ static int fa_handle_flow_req(struct fa_msg * msg, qs.availability = msg->availability; qs.loss = ntoh32(msg->loss); qs.ber = ntoh32(msg->ber); - qs.in_order = msg->in_order; + qs.service = msg->service; qs.max_gap = ntoh32(msg->max_gap); qs.timeout = ntoh32(msg->timeout); - fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL, 
&data); + fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL, + IPCP_UNICAST_MTU, &data); if (fd < 0) return fd; @@ -528,7 +517,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg, time_t mpl = IPCP_UNICAST_MPL; int response; - assert(len >= sizeof(*msg)); + if (len < sizeof(*msg)) + return -EINVAL; data.data = (uint8_t *) msg + sizeof(*msg); data.len = len - sizeof(*msg); @@ -558,7 +548,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg, pthread_rwlock_unlock(&fa.flows_lock); - if (ipcp_flow_alloc_reply(fd, response, mpl, &data) < 0) { + if (ipcp_flow_alloc_reply(fd, response, mpl, + IPCP_UNICAST_MTU, &data) < 0) { log_err("Failed to reply for flow allocation on fd %d.", fd); return -EIRMD; } @@ -572,8 +563,8 @@ static int fa_handle_flow_update(struct fa_msg * msg, struct fa_flow * flow; int fd; - (void) len; - assert(len >= sizeof(*msg)); + if (len < sizeof(*msg)) + return -EINVAL; pthread_rwlock_wrlock(&fa.flows_lock); @@ -596,6 +587,43 @@ static int fa_handle_flow_update(struct fa_msg * msg, return 0; } +static int fa_handle_flow_irm_update(struct fa_msg * msg, + size_t len) +{ + buffer_t data; + int fd; + int flow_id; + + if (len < sizeof(*msg)) + return -EINVAL; + + data.data = (uint8_t *) msg + sizeof(*msg); + data.len = len - sizeof(*msg); + + pthread_rwlock_rdlock(&fa.flows_lock); + + fd = eid_to_fd(ntoh64(msg->r_eid)); + + pthread_rwlock_unlock(&fa.flows_lock); + + if (fd < 0) { + log_err("Flow update for unknown EID %" PRIu64 ".", + ntoh64(msg->r_eid)); + return -ENOTALLOC; + } + + flow_id = np1_flow_id(fd); + if (flow_id < 0) + return -ENOTALLOC; + + if (ipcp_flow_update_arr(flow_id, &data) < 0) { + log_err("Failed to relay flow update on fd %d.", fd); + return -EIRMD; + } + + return 0; +} + static void * fa_handle_packet(void * o) { (void) o; @@ -624,6 +652,10 @@ static void * fa_handle_packet(void * o) if (fa_handle_flow_update(msg, len) < 0) log_err("Error handling flow update."); break; + case FLOW_IRM_UPDATE: + if 
(fa_handle_flow_irm_update(msg, len) < 0) + log_err("Error handling flow update."); + break; default: log_warn("Recieved unknown flow allocation message."); break; @@ -652,8 +684,10 @@ int fa_init(void) if (pthread_cond_init(&fa.cond, &cattr)) goto fail_cond; +#ifdef IPCP_FLOW_STATS if (rib_reg(FA, &r_ops)) goto fail_rib_reg; +#endif fa.eid = dt_reg_comp(&fa, &fa_post_packet, FA); if ((int) fa.eid < 0) @@ -666,8 +700,10 @@ int fa_init(void) return 0; fail_dt_reg: +#ifdef IPCP_FLOW_STATS rib_unreg(FA); fail_rib_reg: +#endif pthread_cond_destroy(&fa.cond); fail_cond: pthread_condattr_destroy(&cattr); @@ -681,8 +717,9 @@ int fa_init(void) void fa_fini(void) { +#ifdef IPCP_FLOW_STATS rib_unreg(FA); - +#endif pthread_cond_destroy(&fa.cond);; pthread_mutex_destroy(&fa.mtx); pthread_rwlock_destroy(&fa.flows_lock); @@ -789,7 +826,7 @@ int fa_alloc(int fd, msg->availability = qs.availability; msg->loss = hton32(qs.loss); msg->ber = hton32(qs.ber); - msg->in_order = qs.in_order; + msg->service = qs.service; msg->max_gap = hton32(qs.max_gap); msg->timeout = hton32(qs.timeout); @@ -878,6 +915,44 @@ int fa_alloc_resp(int fd, return -1; } +int fa_irm_update(int fd, + const buffer_t * data) +{ + struct fa_msg * msg; + struct ssm_pk_buff * spb; + struct fa_flow * flow; + qoscube_t qc = QOS_CUBE_BE; + uint64_t r_addr; + + flow = &fa.flows[fd]; + + if (ipcp_spb_reserve(&spb, sizeof(*msg) + data->len)) + return -1; + + msg = (struct fa_msg *) ssm_pk_buff_head(spb); + memset(msg, 0, sizeof(*msg)); + + msg->code = FLOW_IRM_UPDATE; + if (data->len > 0) + memcpy(msg + 1, data->data, data->len); + + pthread_rwlock_rdlock(&fa.flows_lock); + + msg->r_eid = hton64(flow->r_eid); + msg->s_eid = hton64(flow->s_eid); + r_addr = flow->r_addr; + + pthread_rwlock_unlock(&fa.flows_lock); + + if (dt_write_packet(r_addr, qc, fa.eid, spb)) { + log_err("Failed to send flow update packet."); + ipcp_spb_release(spb); + return -1; + } + + return 0; +} + int fa_dealloc(int fd) { if (ipcp_flow_fini(fd) < 0) 
diff --git a/src/ipcpd/unicast/fa.h b/src/ipcpd/unicast/fa.h index 0c19dc25..f31b40e9 100644 --- a/src/ipcpd/unicast/fa.h +++ b/src/ipcpd/unicast/fa.h @@ -45,6 +45,9 @@ int fa_alloc_resp(int fd, int fa_dealloc(int fd); +int fa_irm_update(int fd, + const buffer_t * data); + void fa_np1_rcv(uint64_t eid, uint8_t ecn, struct ssm_pk_buff * spb); diff --git a/src/ipcpd/unicast/main.c b/src/ipcpd/unicast/main.c index 583a04ff..1155b88b 100644 --- a/src/ipcpd/unicast/main.c +++ b/src/ipcpd/unicast/main.c @@ -273,7 +273,8 @@ static struct ipcp_ops unicast_ops = { .ipcp_flow_alloc = fa_alloc, .ipcp_flow_join = NULL, .ipcp_flow_alloc_resp = fa_alloc_resp, - .ipcp_flow_dealloc = fa_dealloc + .ipcp_flow_dealloc = fa_dealloc, + .ipcp_flow_update = fa_irm_update }; int main(int argc, @@ -307,8 +308,8 @@ int main(int argc, ipcp_sigwait(); if (ipcp_get_state() == IPCP_SHUTDOWN) { - stop_components(); ipcp_stop(); + stop_components(); finalize_components(); } else { ipcp_stop(); diff --git a/src/ipcpd/unicast/pff/alternate.c b/src/ipcpd/unicast/pff/alternate.c index be1c35c0..1c508c1b 100644 --- a/src/ipcpd/unicast/pff/alternate.c +++ b/src/ipcpd/unicast/pff/alternate.c @@ -211,7 +211,7 @@ struct pff_i * alternate_pff_create(void) if (pthread_rwlock_init(&tmp->lock, NULL)) goto fail_lock; - tmp->pft = pft_create(PFT_SIZE, false); + tmp->pft = pft_create(PFT_SIZE); if (tmp->pft == NULL) goto fail_pft; diff --git a/src/ipcpd/unicast/pff/multipath.c b/src/ipcpd/unicast/pff/multipath.c index c636e789..9ba59592 100644 --- a/src/ipcpd/unicast/pff/multipath.c +++ b/src/ipcpd/unicast/pff/multipath.c @@ -63,7 +63,7 @@ struct pff_i * multipath_pff_create(void) if (pthread_rwlock_init(&tmp->lock, NULL)) goto fail_rwlock; - tmp->pft = pft_create(PFT_SIZE, false); + tmp->pft = pft_create(PFT_SIZE); if (tmp->pft == NULL) goto fail_pft; diff --git a/src/ipcpd/unicast/pff/pft.c b/src/ipcpd/unicast/pff/pft.c index a0d70799..d0e562d6 100644 --- a/src/ipcpd/unicast/pff/pft.c +++ 
b/src/ipcpd/unicast/pff/pft.c @@ -43,12 +43,10 @@ struct pft_entry { struct pft { struct list_head * buckets; - bool hash_key; uint64_t buckets_size; }; -struct pft * pft_create(uint64_t buckets, - bool hash_key) +struct pft * pft_create(uint64_t buckets) { struct pft * tmp; unsigned int i; @@ -69,7 +67,6 @@ struct pft * pft_create(uint64_t buckets, if (tmp == NULL) return NULL; - tmp->hash_key = hash_key; tmp->buckets_size = buckets; tmp->buckets = malloc(buckets * sizeof(*tmp->buckets)); @@ -113,22 +110,10 @@ void pft_flush(struct pft * pft) } } -static uint64_t hash(uint64_t key) -{ - uint64_t res[2]; - - mem_hash(HASH_MD5, res, (uint8_t *) &key, sizeof(key)); - - return res[0]; -} - static uint64_t calc_key(struct pft * pft, uint64_t dst) { - if (pft->hash_key) - dst = hash(dst); - - return (dst & (pft->buckets_size - 1)); + return hash_mix64(dst) & (pft->buckets_size - 1); } int pft_insert(struct pft * pft, diff --git a/src/ipcpd/unicast/pff/pft.h b/src/ipcpd/unicast/pff/pft.h index 3bb9cff7..15bbe451 100644 --- a/src/ipcpd/unicast/pff/pft.h +++ b/src/ipcpd/unicast/pff/pft.h @@ -24,14 +24,12 @@ #define OUROBOROS_PFT_H #include <stdint.h> -#include <stdbool.h> #include <stdlib.h> struct pft; /* Buckets is rounded up to the nearest power of 2 */ -struct pft * pft_create(uint64_t buckets, - bool hash_key); +struct pft * pft_create(uint64_t buckets); void pft_destroy(struct pft * table); diff --git a/src/ipcpd/unicast/pff/simple.c b/src/ipcpd/unicast/pff/simple.c index be542bdb..7befa42f 100644 --- a/src/ipcpd/unicast/pff/simple.c +++ b/src/ipcpd/unicast/pff/simple.c @@ -63,7 +63,7 @@ struct pff_i * simple_pff_create(void) return NULL; } - tmp->pft = pft_create(PFT_SIZE, false); + tmp->pft = pft_create(PFT_SIZE); if (tmp->pft == NULL) { pthread_rwlock_destroy(&tmp->lock); free(tmp); diff --git a/src/ipcpd/unicast/pff/tests/pft_test.c b/src/ipcpd/unicast/pff/tests/pft_test.c index 4962c241..20e73a94 100644 --- a/src/ipcpd/unicast/pff/tests/pft_test.c +++ 
b/src/ipcpd/unicast/pff/tests/pft_test.c @@ -38,15 +38,7 @@ int pft_test(int argc, (void) argc; (void) argv; - pft = pft_create(TBL_SIZE, true); - if (pft == NULL) { - printf("Failed to create.\n"); - return -1; - } - - pft_destroy(pft); - - pft = pft_create(TBL_SIZE, false); + pft = pft_create(TBL_SIZE); if (pft == NULL) { printf("Failed to create.\n"); return -1; diff --git a/src/ipcpd/unicast/routing/graph.c b/src/ipcpd/unicast/routing/graph.c index 0226c762..c168eb7d 100644 --- a/src/ipcpd/unicast/routing/graph.c +++ b/src/ipcpd/unicast/routing/graph.c @@ -603,9 +603,9 @@ static int graph_routing_table_lfa(struct graph * graph, struct list_head * table, int ** dist) { - int * n_dist[PROG_MAX_FLOWS]; - uint64_t addrs[PROG_MAX_FLOWS]; - int n_index[PROG_MAX_FLOWS]; + int * n_dist[PROC_MAX_FLOWS]; + uint64_t addrs[PROC_MAX_FLOWS]; + int n_index[PROC_MAX_FLOWS]; struct list_head * p; struct list_head * q; struct vertex * v; @@ -618,7 +618,7 @@ static int graph_routing_table_lfa(struct graph * graph, if (graph_routing_table_simple(graph, s_addr, table, dist)) goto fail_table; - for (j = 0; j < PROG_MAX_FLOWS; j++) { + for (j = 0; j < PROC_MAX_FLOWS; j++) { n_dist[j] = NULL; n_index[j] = -1; addrs[j] = -1; diff --git a/src/ipcpd/unicast/routing/link-state.c b/src/ipcpd/unicast/routing/link-state.c index 051dd98d..c4ea9e1c 100644 --- a/src/ipcpd/unicast/routing/link-state.c +++ b/src/ipcpd/unicast/routing/link-state.c @@ -415,7 +415,7 @@ static void calculate_pff(struct routing_i * instance) struct list_head table; struct list_head * p; struct list_head * q; - int fds[PROG_MAX_FLOWS]; + int fds[PROC_MAX_FLOWS]; assert(instance); diff --git a/src/irmd/CMakeLists.txt b/src/irmd/CMakeLists.txt index 9aa747ca..5aa457ff 100644 --- a/src/irmd/CMakeLists.txt +++ b/src/irmd/CMakeLists.txt @@ -7,11 +7,11 @@ if(HAVE_TOML) set(INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") configure_file("${CMAKE_SOURCE_DIR}/irmd.conf.in" 
"${CMAKE_BINARY_DIR}/${OUROBOROS_CONFIG_FILE}.example" @ONLY) - configure_file("${CMAKE_SOURCE_DIR}/enc.conf.in" - "${CMAKE_BINARY_DIR}/enc.conf.example" @ONLY) + configure_file("${CMAKE_SOURCE_DIR}/sec.conf.in" + "${CMAKE_BINARY_DIR}/sec.conf.example" @ONLY) install(FILES "${CMAKE_BINARY_DIR}/${OUROBOROS_CONFIG_FILE}.example" DESTINATION "${OUROBOROS_CONFIG_DIR}") - install(FILES "${CMAKE_BINARY_DIR}/enc.conf.example" + install(FILES "${CMAKE_BINARY_DIR}/sec.conf.example" DESTINATION "${OUROBOROS_CONFIG_DIR}") install(CODE " if(NOT EXISTS \"${OUROBOROS_CONFIG_DIR}/${OUROBOROS_CONFIG_FILE}\") diff --git a/src/irmd/config.h.in b/src/irmd/config.h.in index df0cd718..e14cff75 100644 --- a/src/irmd/config.h.in +++ b/src/irmd/config.h.in @@ -42,6 +42,9 @@ #define FLOW_DEALLOC_TIMEOUT @FLOW_DEALLOC_TIMEOUT@ #define OAP_REPLAY_TIMER @OAP_REPLAY_TIMER@ +#define OAP_REPLAY_MAX @OAP_REPLAY_MAX@ +#define OAP_REKEY_TIMER @OAP_REKEY_TIMER@ +#cmakedefine01 OAP_CLIENT_AUTH_DEFAULT #define BOOTSTRAP_TIMEOUT @BOOTSTRAP_TIMEOUT@ #define ENROLL_TIMEOUT @ENROLL_TIMEOUT@ diff --git a/src/irmd/configfile.c b/src/irmd/configfile.c index 53608eee..35cf4292 100644 --- a/src/irmd/configfile.c +++ b/src/irmd/configfile.c @@ -922,10 +922,10 @@ static int toml_name(toml_table_t * table, toml_array_t * progs; toml_array_t * args; toml_datum_t lb; - toml_datum_t senc; + toml_datum_t ssec; toml_datum_t scrt; toml_datum_t skey; - toml_datum_t cenc; + toml_datum_t csec; toml_datum_t ccrt; toml_datum_t ckey; @@ -957,8 +957,8 @@ static int toml_name(toml_table_t * table, log_err("Invalid load-balancing policy for %s.", name); return -1; } - senc = toml_string_in(table, "server_enc_file"); - if (senc.ok && cp_chk_path(info.s.enc, senc.u.s) < 0) + ssec = toml_string_in(table, "server_sec_file"); + if (ssec.ok && cp_chk_path(info.s.sec, ssec.u.s) < 0) return -1; scrt = toml_string_in(table, "server_crt_file"); @@ -969,8 +969,8 @@ static int toml_name(toml_table_t * table, if (skey.ok && 
cp_chk_path(info.s.key, skey.u.s) < 0) return -1; - cenc = toml_string_in(table, "client_enc_file"); - if (cenc.ok && cp_chk_path(info.c.enc, cenc.u.s) < 0) + csec = toml_string_in(table, "client_sec_file"); + if (csec.ok && cp_chk_path(info.c.sec, csec.u.s) < 0) return -1; ccrt = toml_string_in(table, "client_crt_file"); diff --git a/src/irmd/ipcp.c b/src/irmd/ipcp.c index a7da186c..7eccfc80 100644 --- a/src/irmd/ipcp.c +++ b/src/irmd/ipcp.c @@ -444,6 +444,38 @@ int ipcp_flow_join(const struct flow_info * flow, return ret; } +int ipcp_flow_update(const struct flow_info * flow, + const buffer_t data) +{ + ipcp_msg_t msg = IPCP_MSG__INIT; + ipcp_msg_t * recv_msg; + int ret; + + msg.code = IPCP_MSG_CODE__IPCP_FLOW_UPDATE; + msg.has_flow_id = true; + msg.flow_id = flow->id; + msg.has_pk = true; + msg.pk.data = data.data; + msg.pk.len = data.len; + + recv_msg = send_recv_ipcp_msg(flow->n_1_pid, &msg); + if (recv_msg == NULL) { + log_err("Did not receive message."); + return -EIPCP; + } + + if (!recv_msg->has_result) { + log_err("Message has no result"); + ipcp_msg__free_unpacked(recv_msg, NULL); + return -EIPCP; + } + + ret = recv_msg->result; + ipcp_msg__free_unpacked(recv_msg, NULL); + + return ret; +} + int ipcp_flow_alloc(const struct flow_info * flow, const buffer_t dst, const buffer_t data) diff --git a/src/irmd/ipcp.h b/src/irmd/ipcp.h index f1025096..8d06623c 100644 --- a/src/irmd/ipcp.h +++ b/src/irmd/ipcp.h @@ -68,4 +68,7 @@ int ipcp_flow_dealloc(pid_t pid, int flow_id, time_t timeo); +int ipcp_flow_update(const struct flow_info * flow, + const buffer_t data); + #endif /* OUROBOROS_IRMD_IPCP_H */ diff --git a/src/irmd/main.c b/src/irmd/main.c index a85a9bf0..19be4ab9 100644 --- a/src/irmd/main.c +++ b/src/irmd/main.c @@ -36,6 +36,7 @@ #include <ouroboros/crypt.h> #include <ouroboros/errno.h> #include <ouroboros/flow.h> +#include <ouroboros/fqueue.h> #include <ouroboros/hash.h> #include <ouroboros/irm.h> #include <ouroboros/list.h> @@ -86,7 +87,11 @@ #define 
TIMESYNC_SLACK 100 /* ms */ #define OAP_SEEN_TIMER 20 /* s */ #define DEALLOC_TIME 300 /* s */ -#define DIRECT_MPL 1 /* s */ +#define REKEY_BATCH 64 /* flows re-keyed per timer pass */ +#define REKEY_RESP_TIMEO 20 /* s; give-up on a re-key RESPONSE */ +#define DIRECT_MPL 20 /* ms */ +/* bytes; in-process, bounded only by PUP/GSPP. */ +#define DIRECT_MTU 65000 enum irm_state { IRMD_NULL = 0, @@ -103,13 +108,38 @@ struct cmd { int fd; }; +/* In-flight Tier-2 re-key, owned solely by the re-key worker thread. */ +struct rekey_ctx { + struct list_head next; + + int flow_id; + void * ctx; /* OAP client ctx (opaque) */ + struct timespec deadline; /* reap if no RESPONSE by then */ +}; + +enum rekey_evt_type { + REKEY_INIT = 0, /* start an exchange for flow_id */ + REKEY_REQ, /* a REQUEST arrived for flow_id */ + REKEY_RESP, /* a RESPONSE arrived for flow_id */ + REKEY_DIRECT /* in-process re-key, direct flow */ +}; + +struct rekey_evt { + struct list_head next; + + enum rekey_evt_type type; + int flow_id; + pid_t n_1_pid; /* INIT: flow's lower IPCP */ + buffer_t buf; /* RESP: owned RESPONSE payload */ +}; + struct { bool log_stdout; /* log to stdout */ #ifdef HAVE_TOML char * cfg_file; /* configuration file path */ #endif struct lockfile * lf; /* single irmd per system */ - struct ssm_pool * gspp; /* pool for packets */ + struct ssm_pool * gspp; /* pool for packets */ int sockfd; /* UNIX socket */ @@ -124,6 +154,13 @@ struct { pthread_t irm_sanitize; /* clean up irmd resources */ pthread_t acceptor; /* accept new commands */ + + struct { + pthread_t worker; /* Tier-2 re-key orchestrator */ + struct list_head inbox; /* re-key events for worker */ + pthread_cond_t cond; /* inbox signal condvar */ + pthread_mutex_t mtx; /* inbox lock */ + } rk; } irmd; static enum irm_state irmd_get_state(void) @@ -452,8 +489,8 @@ static void name_update_sec_paths(struct name_info * info) assert(info != NULL); - if (strlen(info->s.enc) == 0) - sprintf(info->s.enc, "%s/%s/enc.conf", srv_dir, 
info->name); + if (strlen(info->s.sec) == 0) + sprintf(info->s.sec, "%s/%s/sec.conf", srv_dir, info->name); if (strlen(info->s.crt) == 0) sprintf(info->s.crt, "%s/%s/crt.pem", srv_dir, info->name); @@ -461,8 +498,8 @@ static void name_update_sec_paths(struct name_info * info) if (strlen(info->s.key) == 0) sprintf(info->s.key, "%s/%s/key.pem", srv_dir, info->name); - if (strlen(info->c.enc) == 0) - sprintf(info->c.enc, "%s/%s/enc.conf", cli_dir, info->name); + if (strlen(info->c.sec) == 0) + sprintf(info->c.sec, "%s/%s/sec.conf", cli_dir, info->name); if (strlen(info->c.crt) == 0) sprintf(info->c.crt, "%s/%s/crt.pem", cli_dir, info->name); @@ -782,7 +819,8 @@ static int name_unreg(const char * name, static int get_peer_ids(int fd, uid_t * uid, - gid_t * gid) + gid_t * gid, + pid_t * pid) { #if defined(__linux__) struct ucred ucred; @@ -795,9 +833,14 @@ static int get_peer_ids(int fd, *uid = ucred.uid; *gid = ucred.gid; + if (pid != NULL) + *pid = ucred.pid; #else if (getpeereid(fd, uid, gid) < 0) goto fail; + + if (pid != NULL) + *pid = -1; /* no portable SO_PEERCRED.pid equivalent */ #endif return 0; fail: @@ -846,6 +889,7 @@ static int flow_accept(struct flow_info * flow, { buffer_t req_hdr; buffer_t resp_hdr; + buffer_t peer_crt = BUF_INIT; char name[NAME_SIZE + 1]; struct name_info info; int err; @@ -909,7 +953,12 @@ static int flow_accept(struct flow_info * flow, flow->uid = reg_get_proc_uid(flow->n_pid); - err = oap_srv_process(&info, req_hdr, &resp_hdr, data, sk); + err = oap_srv_process(&info, req_hdr, &resp_hdr, data, sk, + false, NULL, &peer_crt); + if (err == -EREPLAY) { + log_warn("Dropping replayed alloc request for %s.", name); + goto fail_replay; + } if (err < 0) { log_err("OAP processing failed for %s.", name); goto fail_oap; @@ -920,16 +969,21 @@ static int flow_accept(struct flow_info * flow, log_err("Failed to respond to direct flow."); goto fail_resp; } + if (sk->nid != NID_undef) + reg_flow_set_rekey(flow->id, false, peer_crt); log_info("Flow %d 
accepted (direct) by %d for %s.", flow->id, flow->n_pid, name); } else if (ipcp_flow_alloc_resp(flow, 0, resp_hdr) < 0) { log_err("Failed to respond to flow allocation."); goto fail_resp; } else { + if (sk->nid != NID_undef) + reg_flow_set_rekey(flow->id, false, peer_crt); log_info("Flow %d accepted by %d for %s (uid %d).", flow->id, flow->n_pid, name, flow->uid); } + freebuf(peer_crt); freebuf(req_hdr); freebuf(resp_hdr); @@ -938,6 +992,10 @@ static int flow_accept(struct flow_info * flow, fail_oap: if (!reg_flow_is_direct(flow->id)) ipcp_flow_alloc_resp(flow, err, resp_hdr); + fail_replay: + freebuf(peer_crt); + freebuf(req_hdr); + freebuf(resp_hdr); fail_wait: reg_destroy_flow(flow->id); fail_flow: @@ -945,6 +1003,7 @@ static int flow_accept(struct flow_info * flow, fail_resp: flow->state = FLOW_NULL; + freebuf(peer_crt); freebuf(req_hdr); freebuf(resp_hdr); reg_destroy_flow(flow->id); @@ -1193,6 +1252,7 @@ static int flow_alloc_direct(const char * dst, struct flow_info acc; /* server side flow */ buffer_t req_hdr = BUF_INIT; buffer_t resp_hdr = BUF_INIT; + buffer_t no_crt = BUF_INIT; void * ctx; int err; @@ -1202,13 +1262,14 @@ static int flow_alloc_direct(const char * dst, return -EAGAIN; } - if (oap_cli_prepare(&ctx, info, &req_hdr, *data) < 0) { + if (oap_cli_prepare(&ctx, info, &req_hdr, *data, false) < 0) { log_err("Failed to prepare OAP for %s.", dst); return -EBADF; } acc.n_1_pid = flow->n_pid; acc.mpl = DIRECT_MPL; + acc.mtu = DIRECT_MTU; acc.qs = flow->qs; acc.state = FLOW_ALLOCATED; @@ -1234,7 +1295,7 @@ static int flow_alloc_direct(const char * dst, return -ETIMEDOUT; } - err = oap_cli_complete(ctx, info, resp_hdr, data, sk); + err = oap_cli_complete(ctx, info, resp_hdr, data, sk, NULL, NULL); if (err < 0) { log_err("OAP completion failed for %s.", dst); freebuf(resp_hdr); @@ -1244,8 +1305,13 @@ static int flow_alloc_direct(const char * dst, flow->id = acc.id; flow->n_1_pid = acc.n_pid; flow->mpl = DIRECT_MPL; + flow->mtu = DIRECT_MTU; flow->state = 
FLOW_ALLOCATED; + /* Mark encrypted for re-key; the acceptor caches the cert. */ + if (sk->nid != NID_undef) + reg_flow_set_rekey(acc.id, true, no_crt); + log_info("Flow %d allocated (direct) for %d to %s.", flow->id, flow->n_pid, dst); @@ -1264,6 +1330,7 @@ static int flow_alloc(const char * dst, buffer_t req_hdr = BUF_INIT; buffer_t resp_hdr = BUF_INIT; buffer_t hash = BUF_INIT; + buffer_t peer_crt = BUF_INIT; struct name_info info; void * ctx; int err; @@ -1297,6 +1364,8 @@ static int flow_alloc(const char * dst, goto fail_flow; } + reg_set_name_for_flow_id(dst, flow->id); + if (get_ipcp_by_dst(dst, &flow->n_1_pid, &hash) < 0) { log_err("Failed to find IPCP for %s.", dst); err = -EIPCP; @@ -1309,7 +1378,7 @@ static int flow_alloc(const char * dst, goto fail_prepare; } - if (oap_cli_prepare(&ctx, &info, &req_hdr, *data) < 0) { + if (oap_cli_prepare(&ctx, &info, &req_hdr, *data, false) < 0) { log_err("Failed to prepare OAP request for %s.", dst); err = -EBADF; goto fail_prepare; @@ -1341,12 +1410,16 @@ static int flow_alloc(const char * dst, goto fail_peer; } - err = oap_cli_complete(ctx, &info, resp_hdr, data, sk); + err = oap_cli_complete(ctx, &info, resp_hdr, data, sk, NULL, &peer_crt); if (err < 0) { log_err("OAP completion failed for %s.", dst); goto fail_complete; } + if (sk->nid != NID_undef) + reg_flow_set_rekey(flow->id, true, peer_crt); + + freebuf(peer_crt); freebuf(req_hdr); freebuf(resp_hdr); freebuf(hash); @@ -1354,7 +1427,8 @@ static int flow_alloc(const char * dst, return 0; fail_complete: - ctx = NULL; /* freee'd on complete */ + freebuf(peer_crt); + ctx = NULL; /* free'd on complete */ fail_peer: flow->state = FLOW_DEALLOCATED; fail_wait: @@ -1421,6 +1495,741 @@ static int flow_dealloc_resp(struct flow_info * flow) return 0; } +/* + * Inbox producers. Any thread may post; the worker drains. INIT carries + * the flow's lower IPCP pid; RESP transfers ownership of buf. 
+ */ +static void rekey_post(enum rekey_evt_type type, + int flow_id, + pid_t n_1_pid, + buffer_t * buf) +{ + struct rekey_evt * evt; + + evt = malloc(sizeof(*evt)); + if (evt == NULL) { + log_err("Failed to malloc re-key event for flow %d.", flow_id); + if (type == REKEY_INIT || type == REKEY_DIRECT) + reg_flow_clear_in_flight(flow_id); + else + reg_flow_rekey_arr_done(flow_id, type == REKEY_REQ); + + if (buf != NULL) + freebuf(*buf); + + return; + } + + list_head_init(&evt->next); + evt->type = type; + evt->flow_id = flow_id; + evt->n_1_pid = n_1_pid; + clrbuf(evt->buf); + if (buf != NULL) { + evt->buf = *buf; + clrbuf(*buf); + } + + pthread_mutex_lock(&irmd.rk.mtx); + + list_add_tail(&evt->next, &irmd.rk.inbox); + pthread_cond_signal(&irmd.rk.cond); + + pthread_mutex_unlock(&irmd.rk.mtx); +} + +static void rekey_post_init(int flow_id, + pid_t n_1_pid) +{ + rekey_post(REKEY_INIT, flow_id, n_1_pid, NULL); +} + +static void rekey_post_resp(int flow_id, + buffer_t * buf) +{ + rekey_post(REKEY_RESP, flow_id, 0, buf); +} + +static void rekey_post_req(int flow_id, + pid_t n_1_pid, + buffer_t * buf) +{ + rekey_post(REKEY_REQ, flow_id, n_1_pid, buf); +} + +static void rekey_post_direct(int flow_id) +{ + rekey_post(REKEY_DIRECT, flow_id, 0, NULL); +} + +/* Worker-only: find an in-flight entry by flow_id. */ +static struct rekey_ctx * rekey_find(struct list_head * tbl, + int flow_id) +{ + struct list_head * p; + + list_for_each(p, tbl) { + struct rekey_ctx * e = list_entry(p, struct rekey_ctx, next); + if (e->flow_id == flow_id) + return e; + } + + return NULL; +} + +/* Worker-only: drop an entry, freeing its OAP ctx. */ +static void rekey_drop(struct rekey_ctx * e) +{ + if (e->ctx != NULL) + oap_ctx_free(e->ctx); + + list_del(&e->next); + free(e); +} + +/* Resolve a flow's registered name info; < 0 if the flow or name is gone. 
*/ +static int rekey_name_info(int flow_id, + struct name_info * info) +{ + char name[NAME_SIZE + 1]; + + if (reg_get_name_for_flow_id(name, flow_id) < 0) + return -1; + + return reg_get_name_info(name, info); +} + +/* Flow-update relay payload: a 1-byte type prefix on an opaque body. */ +enum flow_upd_type { + FLOW_UPD_REKEY_REQ = 0, + FLOW_UPD_REKEY_RESP = 1, +}; + +/* Prepend the update type to body; caller frees out on success. */ +static int flow_upd_wrap(buffer_t * out, + uint8_t type, + const buffer_t * body) +{ + out->len = body->len + 1; + out->data = malloc(out->len); + if (out->data == NULL) + return -ENOMEM; + + out->data[0] = type; + memcpy(out->data + 1, body->data, body->len); + + return 0; +} + +/* Cleanup handlers — the re-key worker is cancelled at shutdown. */ +static void rk_free_evt(void * o) +{ + struct rekey_evt * evt = o; + + freebuf(evt->buf); + free(evt); +} + +static void rk_freebuf(void * o) +{ + freebuf(*(buffer_t *) o); +} + +static void rk_clear_in_flight(void * o) +{ + reg_flow_clear_in_flight(*(int *) o); +} + +static void rk_clear_key(void * o) +{ + crypt_secure_clear(o, SYMMKEYSZ); +} + +static void rekey_do_initiate(struct list_head * tbl, + int flow_id, + pid_t n_1_pid) +{ + struct rekey_ctx * e; + struct flow_info info; + struct name_info name; + buffer_t req = BUF_INIT; + buffer_t upd = BUF_INIT; + buffer_t data = BUF_INIT; + void * ctx = NULL; + int ret; + + e = rekey_find(tbl, flow_id); + if (e != NULL) + rekey_drop(e); /* Replace in-flight entries */ + + if (rekey_name_info(flow_id, &name) < 0) { + log_err("Failed to get name info to re-key flow %d.", flow_id); + goto fail; + } + + if (oap_cli_prepare(&ctx, &name, &req, data, true) < 0) { + log_err("Failed to prepare re-key for flow %d.", flow_id); + goto fail; + } + + memset(&info, 0, sizeof(info)); + info.id = flow_id; + info.n_1_pid = n_1_pid; + + if (flow_upd_wrap(&upd, FLOW_UPD_REKEY_REQ, &req) < 0) { + log_err("Failed to wrap re-key request for flow %d.", flow_id); + 
goto fail_ctx; + } + + pthread_cleanup_push(rk_clear_in_flight, &flow_id); + pthread_cleanup_push(oap_ctx_free, ctx); + pthread_cleanup_push(rk_freebuf, &req); + pthread_cleanup_push(rk_freebuf, &upd); + ret = ipcp_flow_update(&info, upd); + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + freebuf(upd); + if (ret < 0) { + log_err("Failed to send re-key request for flow %d.", flow_id); + goto fail_ctx; + } + + e = malloc(sizeof(*e)); + if (e == NULL) { + log_err("Failed to malloc re-key ctx for flow %d.", flow_id); + goto fail_ctx; + } + + list_head_init(&e->next); + e->flow_id = flow_id; + e->ctx = ctx; + clock_gettime(PTHREAD_COND_CLOCK, &e->deadline); + e->deadline.tv_sec += REKEY_RESP_TIMEO; + + list_add(&e->next, tbl); + + log_dbg("Re-key request sent for flow %d.", flow_id); + + freebuf(req); + + return; + + fail_ctx: + oap_ctx_free(ctx); + freebuf(req); + fail: + reg_flow_clear_in_flight(flow_id); +} + +/* Worker-only: complete the exchange, install the pending seed. */ +static void rekey_do_complete(struct list_head * tbl, + int flow_id, + buffer_t buf) +{ + struct rekey_ctx * e; + struct name_info info; + struct crypt_sk sk; + uint8_t kbuf[SYMMKEYSZ]; + buffer_t data = BUF_INIT; + buffer_t crt = BUF_INIT; + uint8_t newgen; + + e = rekey_find(tbl, flow_id); + if (e == NULL) { + log_dbg("Stale re-key RESPONSE for flow %d.", flow_id); + return; + } + + /* A concurrent responder already parked a seed; don't overwrite. */ + if (reg_flow_rekey_pending(flow_id)) { + log_dbg("Re-key already pending for flow %d.", flow_id); + goto finish; + } + + if (rekey_name_info(flow_id, &info) < 0) { + log_err("Failed to get name info to re-key flow %d.", flow_id); + goto finish; + } + + sk.key = kbuf; + + reg_flow_get_peer_crt(flow_id, &crt); + + /* oap_cli_complete frees the ctx on every path. 
*/ + if (oap_cli_complete(e->ctx, &info, buf, &data, &sk, &crt, NULL) < 0) { + log_warn("Failed to complete re-key for flow %d.", flow_id); + e->ctx = NULL; + goto finish_clear; + } + + e->ctx = NULL; + + if (data.len != 1) { + log_warn("Re-key reply malformed for flow %d.", flow_id); + goto finish_clear; + } + + newgen = *(uint8_t *) data.data; + + if (newgen >= 16) { + log_warn("Re-key gen %u out of range for flow %d.", + newgen, flow_id); + goto finish_clear; + } + + if (reg_flow_store_pending(flow_id, kbuf, newgen, true) < 0) + log_warn("Flow %d gone during re-key.", flow_id); + else + reg_notify_flow(flow_id, FLOW_UPD); + + log_dbg("Re-key completed for flow %d (gen %u).", flow_id, newgen); + + finish_clear: + crypt_secure_clear(kbuf, SYMMKEYSZ); + freebuf(data); + finish: + freebuf(crt); + rekey_drop(e); + reg_flow_clear_in_flight(flow_id); +} + +/* Worker-only: reap entries whose RESPONSE never arrived. */ +static void rekey_reap_expired(struct list_head * tbl) +{ + struct list_head * p; + struct list_head * h; + struct timespec now; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + list_for_each_safe(p, h, tbl) { + struct rekey_ctx * e = list_entry(p, struct rekey_ctx, next); + if (ts_diff_ns(&e->deadline, &now) > 0) + continue; + + log_warn("Re-key timed out for flow %d.", e->flow_id); + reg_flow_clear_in_flight(e->flow_id); + rekey_drop(e); + } +} + +/* Responder side: process request, install pending seed, send response. */ +static int rekey_respond(struct flow_info * flow, + buffer_t * pk) +{ + struct name_info info; + struct crypt_sk sk; + uint8_t kbuf[SYMMKEYSZ]; + buffer_t rsp = BUF_INIT; + buffer_t upd = BUF_INIT; + buffer_t data = BUF_INIT; + buffer_t crt = BUF_INIT; + uint8_t newgen; + int epoch; + int err; + + epoch = reg_flow_get_epoch(flow->id); + if (epoch < 0) { + log_warn("Re-key for unknown flow %d.", flow->id); + return -EBADF; + } + + /* Collision: we are driving our own exchange; let it win. 
*/ + if (reg_flow_rekey_should_yield(flow->id)) { + log_dbg("Yielding to own re-key for flow %d.", flow->id); + return 0; + } + + if (rekey_name_info(flow->id, &info) < 0) { + log_err("Failed to get name info to re-key flow %d.", flow->id); + return -ENAME; + } + + if (reg_flow_rekey_pending(flow->id)) { + log_dbg("Duplicate re-key request for flow %d.", flow->id); + return 0; + } + + newgen = (uint8_t) ((epoch + 1) & 0x0F); + data.data = &newgen; + data.len = 1; + + sk.key = kbuf; + + reg_flow_get_peer_crt(flow->id, &crt); + + err = oap_srv_process(&info, *pk, &rsp, &data, &sk, true, &crt, NULL); + if (err < 0) { + /* data still points to stack newgen; don't free it. */ + log_err("Failed to process re-key OAP for flow %d.", flow->id); + goto finish; + } + + /* On success oap_srv_process repointed data to client output. */ + freebuf(data); + + if (reg_flow_store_pending(flow->id, kbuf, newgen, false) < 0) { + log_warn("Flow %d gone during re-key.", flow->id); + err = -EBADF; + goto finish; + } + + reg_notify_flow(flow->id, FLOW_UPD); + + if (flow_upd_wrap(&upd, FLOW_UPD_REKEY_RESP, &rsp) == 0) { + pthread_cleanup_push(rk_clear_key, kbuf); + pthread_cleanup_push(rk_freebuf, &rsp); + pthread_cleanup_push(rk_freebuf, &crt); + pthread_cleanup_push(rk_freebuf, &upd); + if (ipcp_flow_update(flow, upd) < 0) + log_err("Failed to send re-key response for flow %d.", + flow->id); + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + freebuf(upd); + } + + err = 0; + finish: + crypt_secure_clear(kbuf, SYMMKEYSZ); + freebuf(rsp); + freebuf(crt); + + return err; +} + +/* + * Worker-only: re-key a direct (loopback) flow, the exchange runs in-process: + * build a client request, then derive the shared seed, and hand the one seed + * to both apps with RB_REKEY. 
+ */ +static void rekey_do_direct(int flow_id) +{ + struct name_info info; + struct crypt_sk sk; + uint8_t kbuf[SYMMKEYSZ]; + buffer_t req = BUF_INIT; + buffer_t rsp = BUF_INIT; + buffer_t data = BUF_INIT; + buffer_t crt = BUF_INIT; + void * ctx = NULL; + uint8_t newgen; + int epoch; + + epoch = reg_flow_get_epoch(flow_id); + if (epoch < 0) { + log_warn("Re-key for unknown flow %d.", flow_id); + reg_flow_clear_in_flight(flow_id); + return; + } + + if (rekey_name_info(flow_id, &info) < 0) { + log_err("Failed to get name info to re-key flow %d.", flow_id); + reg_flow_clear_in_flight(flow_id); + return; + } + + if (oap_cli_prepare(&ctx, &info, &req, data, true) < 0) { + log_err("Failed to prepare re-key for flow %d.", flow_id); + reg_flow_clear_in_flight(flow_id); + return; + } + + newgen = (uint8_t) ((epoch + 1) & 0x0F); + data.data = &newgen; + data.len = 1; + + sk.key = kbuf; + + reg_flow_get_peer_crt(flow_id, &crt); + + if (oap_srv_process(&info, req, &rsp, &data, &sk, true, + &crt, NULL) < 0) { + /* data still points to stack newgen; don't free it. */ + log_err("Failed to process re-key OAP for flow %d.", flow_id); + reg_flow_clear_in_flight(flow_id); + goto out; + } + + /* On success oap_srv_process repointed data to its output. */ + freebuf(data); + + if (reg_flow_store_pending_direct(flow_id, kbuf, newgen) < 0) { + log_warn("Flow %d gone during re-key.", flow_id); + reg_flow_clear_in_flight(flow_id); + goto out; + } + + reg_notify_flow_peers(flow_id, FLOW_UPD); + + log_dbg("Re-key completed (direct) for flow %d (gen %u).", + flow_id, newgen); + out: + crypt_secure_clear(kbuf, SYMMKEYSZ); + oap_ctx_free(ctx); + freebuf(req); + freebuf(rsp); + freebuf(crt); +} + +/* Route one snapshot entry to the wire or in-process re-key path. 
*/ +static void rekey_dispatch(struct list_head * tbl, + const struct rekey_info * ri) +{ + if (ri->direct) + rekey_do_direct(ri->flow_id); + else + rekey_do_initiate(tbl, ri->flow_id, ri->n_1_pid); +} + +static int flow_update_arr(struct flow_info * flow, + buffer_t * pk) +{ + uint8_t type; + bool is_req; + + if (pk->len < 1) + return -EINVAL; + + type = pk->data[0]; + + switch (type) { + case FLOW_UPD_REKEY_REQ: + is_req = true; + break; + case FLOW_UPD_REKEY_RESP: + is_req = false; + break; + default: + log_warn("Unknown flow update type %u.", type); + return -EINVAL; + } + + /* Drop floods/spoofs before allocating a worker event. */ + if (!reg_flow_rekey_arr_admit(flow->id, flow->n_1_pid, is_req)) + return 0; + + /* Strip the type byte, keeping the malloc base for hand-off. */ + memmove(pk->data, pk->data + 1, pk->len - 1); + pk->len -= 1; + + /* Defer to worker; an inline RESP send deadlocks loopback. */ + if (is_req) + rekey_post_req(flow->id, flow->n_1_pid, pk); + else + rekey_post_resp(flow->id, pk); + + return 0; +} + +static int flow_update(struct flow_info * flow, + uid_t uid, + pid_t cpid, + bool rekey, + struct crypt_sk * sk, + bool * has_key, + bool * initiator) +{ + uint8_t seed[SYMMKEYSZ]; + uint8_t epoch; + int rc; + + *has_key = false; + *initiator = false; + + if (rekey) { + pid_t n_1_pid; + + if (!reg_flow_owned_by(flow->id, uid)) + return -EPERM; + + /* Direct flows re-key in-process; no lower IPCP carrier. */ + if (reg_flow_is_direct(flow->id)) { + if (reg_flow_rekey_begin(flow->id)) + rekey_post_direct(flow->id); + + return 0; + } + + /* Watermark re-key: the app can't know its lower IPCP. */ + n_1_pid = reg_flow_get_n_1_pid(flow->id); + if (n_1_pid <= 0) + return 0; + + /* One exchange per flow; the latch arbitrates collisions. 
*/ + if (reg_flow_rekey_begin(flow->id)) + rekey_post_init(flow->id, n_1_pid); + + return 0; + } + + rc = reg_flow_take_pending(flow->id, uid, cpid, seed, &epoch, + initiator); + if (rc == -EPERM) + return -EPERM; + + if (rc != 0) + return 0; + + memcpy(sk->key, seed, SYMMKEYSZ); + sk->epoch = epoch; + *has_key = true; + + crypt_secure_clear(seed, SYMMKEYSZ); + + log_dbg("Delivered re-key seed for flow %d (gen %u).", + flow->id, epoch); + + return 0; +} + +static void rekey_table_cleanup(void * o) +{ + struct list_head * tbl = o; + struct list_head * p; + struct list_head * h; + + list_for_each_safe(p, h, tbl) { + struct rekey_ctx * e = list_entry(p, struct rekey_ctx, next); + rekey_drop(e); + } +} + +static struct rekey_evt * rekey_event_wait(const struct timespec * dl) +{ + struct rekey_evt * evt = NULL; + int ret = 0; + + pthread_mutex_lock(&irmd.rk.mtx); + pthread_cleanup_push(__cleanup_mutex_unlock, &irmd.rk.mtx); + + while (list_is_empty(&irmd.rk.inbox) && ret != -ETIMEDOUT) + ret = -pthread_cond_timedwait(&irmd.rk.cond, &irmd.rk.mtx, dl); + + if (!list_is_empty(&irmd.rk.inbox)) { + evt = list_first_entry(&irmd.rk.inbox, struct rekey_evt, next); + list_del(&evt->next); + } + + pthread_cleanup_pop(true); + + return evt; +} + +static struct timespec rekey_deadline(struct list_head * tbl, + struct timespec next) +{ + struct timespec deadline = next; + struct list_head * p; + + list_for_each(p, tbl) { + struct rekey_ctx * e; + e = list_entry(p, struct rekey_ctx, next); + if (ts_diff_ns(&e->deadline, &deadline) < 0) + deadline = e->deadline; + } + + return deadline; +} + +static void rekey_handle_evt(struct list_head * tbl, + struct rekey_evt * evt) +{ + struct flow_info rinfo; + + pthread_cleanup_push(rk_free_evt, evt); + + switch (evt->type) { + case REKEY_INIT: + rekey_do_initiate(tbl, evt->flow_id, evt->n_1_pid); + break; + case REKEY_REQ: + memset(&rinfo, 0, sizeof(rinfo)); + rinfo.id = evt->flow_id; + rinfo.n_1_pid = evt->n_1_pid; + rekey_respond(&rinfo, 
&evt->buf); + reg_flow_rekey_arr_done(evt->flow_id, true); + break; + case REKEY_RESP: + rekey_do_complete(tbl, evt->flow_id, evt->buf); + reg_flow_rekey_arr_done(evt->flow_id, false); + break; + case REKEY_DIRECT: + rekey_do_direct(evt->flow_id); + break; + default: + break; + } + + pthread_cleanup_pop(true); +} + +/* On the periodic tick, dispatch all flows due for re-keying. */ +static void rekey_run_periodic(struct list_head * tbl, + struct timespec * next) +{ + struct rekey_info snap[REKEY_BATCH]; + struct timespec now; + int n; + int i; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + if (ts_diff_ns(next, &now) > 0) + return; + + n = reg_flow_snapshot_rekey_due(snap, REKEY_BATCH); + for (i = 0; i < n; ++i) + rekey_dispatch(tbl, &snap[i]); + + clock_gettime(PTHREAD_COND_CLOCK, next); + next->tv_sec += OAP_REKEY_TIMER; +} + +/* + * Single worker owning all in-flight Tier-2 re-keys. It drains the + * inbox, runs the periodic snapshot, and reaps timed-out exchanges. + * The table is touched only here, so it needs no lock. 
+ */ +static void * rekey_worker(void * o) +{ + struct list_head table; + struct timespec next; + + (void) o; + + list_head_init(&table); + + clock_gettime(PTHREAD_COND_CLOCK, &next); + next.tv_sec += OAP_REKEY_TIMER; + + pthread_cleanup_push(rekey_table_cleanup, &table); + + while (true) { + struct rekey_evt * evt; + struct timespec deadline; + + deadline = rekey_deadline(&table, next); + + evt = rekey_event_wait(&deadline); + + if (evt != NULL) + rekey_handle_evt(&table, evt); + + rekey_run_periodic(&table, &next); + + rekey_reap_expired(&table); + } + + pthread_cleanup_pop(true); + + return (void *) 0; +} + static void * acceptloop(void * o) { int csockfd; @@ -1491,6 +2300,11 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg, struct timespec now; struct timespec ts = TIMESPEC_INIT_S(0); /* static analysis */ int res; + bool has_key = false; + bool initiator = false; + uid_t uid; + gid_t gid; + pid_t cpid; irm_msg_t * ret_msg; buffer_t data; @@ -1557,7 +2371,7 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg, case IRM_MSG_CODE__IRM_PROC_ANNOUNCE: proc.pid = msg->pid; strcpy(proc.prog, msg->prog); - res = get_peer_ids(fd, &proc.uid, &proc.gid); + res = get_peer_ids(fd, &proc.uid, &proc.gid, NULL); if (res < 0) log_err("Failed to get UID/GID for pid %d.", msg->pid); else @@ -1600,26 +2414,29 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg, flow = flow_info_msg_to_s(msg->flow_info); sk.key = kbuf; res = flow_accept(&flow, &data, abstime, &sk); - if (res == 0) { - ret_msg->flow_info = flow_info_s_to_msg(&flow); - ret_msg->has_pk = data.len != 0; - ret_msg->pk.data = data.data; - ret_msg->pk.len = data.len; - ret_msg->has_cipher_nid = true; - ret_msg->cipher_nid = sk.nid; - if (sk.nid != NID_undef) { - hbuf = malloc(SYMMKEYSZ); - if (hbuf == NULL) { - log_err("Failed to malloc key buf"); - return NULL; - } - - memcpy(hbuf, kbuf, SYMMKEYSZ); - ret_msg->sym_key.data = hbuf; - ret_msg->sym_key.len = SYMMKEYSZ; - ret_msg->has_sym_key = true; - } + if (res != 
0) + break; + + ret_msg->flow_info = flow_info_s_to_msg(&flow); + ret_msg->has_pk = data.len != 0; + ret_msg->pk.data = data.data; + ret_msg->pk.len = data.len; + ret_msg->has_cipher_nid = true; + ret_msg->cipher_nid = sk.nid; + if (sk.nid == NID_undef) + break; + + hbuf = malloc(SYMMKEYSZ); + if (hbuf == NULL) { + log_err("Failed to malloc key buf"); + res = -ENOMEM; + break; } + + memcpy(hbuf, kbuf, SYMMKEYSZ); + ret_msg->sym_key.data = hbuf; + ret_msg->sym_key.len = SYMMKEYSZ; + ret_msg->has_sym_key = true; break; case IRM_MSG_CODE__IRM_FLOW_ALLOC: data.len = msg->pk.len; @@ -1630,25 +2447,29 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg, abstime = abstime == NULL ? &max : abstime; sk.key = kbuf; res = flow_alloc(msg->dst, &flow, &data, abstime, &sk); - if (res == 0) { - ret_msg->flow_info = flow_info_s_to_msg(&flow); - ret_msg->has_pk = data.len != 0; - ret_msg->pk.data = data.data; - ret_msg->pk.len = data.len; - ret_msg->has_cipher_nid = true; - ret_msg->cipher_nid = sk.nid; - if (sk.nid != NID_undef) { - hbuf = malloc(SYMMKEYSZ); - if (hbuf == NULL) { - log_err("Failed to malloc key buf"); - return NULL; - } - memcpy(hbuf, kbuf, SYMMKEYSZ); - ret_msg->sym_key.data = hbuf; - ret_msg->sym_key.len = SYMMKEYSZ; - ret_msg->has_sym_key = true; - } + if (res != 0) + break; + + ret_msg->flow_info = flow_info_s_to_msg(&flow); + ret_msg->has_pk = data.len != 0; + ret_msg->pk.data = data.data; + ret_msg->pk.len = data.len; + ret_msg->has_cipher_nid = true; + ret_msg->cipher_nid = sk.nid; + if (sk.nid == NID_undef) + break; + + hbuf = malloc(SYMMKEYSZ); + if (hbuf == NULL) { + log_err("Failed to malloc key buf"); + res = -ENOMEM; + break; } + + memcpy(hbuf, kbuf, SYMMKEYSZ); + ret_msg->sym_key.data = hbuf; + ret_msg->sym_key.len = SYMMKEYSZ; + ret_msg->has_sym_key = true; break; case IRM_MSG_CODE__IRM_FLOW_JOIN: assert(msg->pk.len == 0 && msg->pk.data == NULL); @@ -1687,6 +2508,51 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg, flow = 
flow_info_msg_to_s(msg->flow_info); res = flow_alloc_reply(&flow, msg->response, &data); break; + case IRM_MSG_CODE__IPCP_FLOW_UPDATE_ARR: + data.len = msg->pk.len; + data.data = msg->pk.data; + msg->pk.data = NULL; /* pass data */ + msg->pk.len = 0; + flow = flow_info_msg_to_s(msg->flow_info); + res = flow_update_arr(&flow, &data); + freebuf(data); + break; + case IRM_MSG_CODE__IRM_FLOW_UPDATE: + flow = flow_info_msg_to_s(msg->flow_info); + if (get_peer_ids(fd, &uid, &gid, &cpid) < 0) { + res = -EPERM; + break; + } + + if (cpid <= 0) /* non-Linux: fall back to asserted pid */ + cpid = flow.n_pid; + + sk.key = kbuf; + res = flow_update(&flow, uid, cpid, msg->rekey, &sk, &has_key, + &initiator); + if (res != 0) + break; + + ret_msg->flow_info = flow_info_s_to_msg(&flow); + if (!has_key) + break; + + hbuf = malloc(SYMMKEYSZ); + if (hbuf == NULL) { + log_err("Failed to malloc key buf"); + res = -ENOMEM; + break; + } + + memcpy(hbuf, kbuf, SYMMKEYSZ); + ret_msg->sym_key.data = hbuf; + ret_msg->sym_key.len = SYMMKEYSZ; + ret_msg->has_sym_key = true; + ret_msg->has_generation = true; + ret_msg->generation = sk.epoch; + ret_msg->has_rk_initiator = true; + ret_msg->rk_initiator = initiator; + break; default: log_err("Don't know that message code."); res = -1; @@ -1706,6 +2572,13 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg, return ret_msg; } +/* Wipe the session key from a reply before its buffers are freed. 
*/ +static void clear_msg_key(irm_msg_t * msg) +{ + if (msg != NULL && msg->has_sym_key) + crypt_secure_clear(msg->sym_key.data, msg->sym_key.len); +} + static void * mainloop(void * o) { int sfd; @@ -1717,6 +2590,7 @@ static void * mainloop(void * o) while (true) { irm_msg_t * ret_msg; struct cmd * cmd; + bool had_key; pthread_mutex_lock(&irmd.cmd_lock); @@ -1780,6 +2654,9 @@ static void * mainloop(void * o) irm_msg__pack(ret_msg, buffer.data); + had_key = ret_msg->has_sym_key; + clear_msg_key(ret_msg); + irm_msg__free_unpacked(ret_msg, NULL); pthread_cleanup_push(__cleanup_close_ptr, &sfd); @@ -1794,6 +2671,9 @@ static void * mainloop(void * o) strerror(errno)); } + if (had_key) + crypt_secure_clear(buffer.data, buffer.len); + pthread_cleanup_pop(true); pthread_cleanup_pop(true); @@ -1801,6 +2681,7 @@ static void * mainloop(void * o) continue; fail: + clear_msg_key(ret_msg); irm_msg__free_unpacked(ret_msg, NULL); fail_msg: close(sfd); @@ -1884,12 +2765,14 @@ void * irm_sanitize(void * o) return (void *) 0; } -static int irm_load_store(char * dpath) +static int irm_load_store(char * dpath, + bool anchor) { struct stat st; struct dirent * dent; DIR * dir; void * crt; + int ret; if (stat(dpath, &st) == -1) { log_dbg("Store directory %s not found.", dpath); @@ -1933,7 +2816,9 @@ static int irm_load_store(char * dpath) goto fail_file; } - if (oap_auth_add_ca_crt(crt) < 0) { + ret = anchor ? 
oap_auth_add_ca_crt(crt) + : oap_auth_add_chain_crt(crt); + if (ret < 0) { log_err("Failed to add certificate from %s to store.", path); goto fail_crt_add; @@ -2030,6 +2915,29 @@ static int irm_init(void) list_head_init(&irmd.cmds); + if (pthread_mutex_init(&irmd.rk.mtx, NULL)) { + log_err("Failed to initialize mutex."); + goto fail_rk_mtx; + } + + if (pthread_condattr_init(&cattr)) { + log_err("Failed to initialize condattr."); + goto fail_rk_cond; /* rk.mtx is live here: unwind through its destroy */ + } + +#ifndef __APPLE__ + pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK); +#endif + if (pthread_cond_init(&irmd.rk.cond, &cattr)) { + log_err("Failed to initialize condvar."); + pthread_condattr_destroy(&cattr); + goto fail_rk_cond; + } + + pthread_condattr_destroy(&cattr); + + list_head_init(&irmd.rk.inbox); + if (stat(SOCK_PATH, &st) == -1) { if (mkdir(SOCK_PATH, 0777)) { log_err("Failed to create sockets directory."); @@ -2077,12 +2985,12 @@ static int irm_init(void) goto fail_oap; } - if (irm_load_store(OUROBOROS_CA_CRT_DIR) < 0) { + if (irm_load_store(OUROBOROS_CA_CRT_DIR, true) < 0) { log_err("Failed to load CA certificates."); goto fail_load_store; } - if (irm_load_store(OUROBOROS_CHAIN_DIR) < 0) { + if (irm_load_store(OUROBOROS_CHAIN_DIR, false) < 0) { log_err("Failed to load intermediate certificates."); goto fail_load_store; } @@ -2133,6 +3041,10 @@ static int irm_init(void) fail_sock_path: unlink(IRM_SOCK_PATH); fail_stat: + pthread_cond_destroy(&irmd.rk.cond); + fail_rk_cond: + pthread_mutex_destroy(&irmd.rk.mtx); + fail_rk_mtx: pthread_cond_destroy(&irmd.cmd_cond); fail_cmd_cond: pthread_mutex_destroy(&irmd.cmd_lock); @@ -2181,13 +3093,28 @@ static void irm_fini(void) pthread_mutex_unlock(&irmd.cmd_lock); + pthread_mutex_lock(&irmd.rk.mtx); + + list_for_each_safe(p, h, &irmd.rk.inbox) { + struct rekey_evt * evt; + evt = list_entry(p, struct rekey_evt, next); + list_del(&evt->next); + freebuf(evt->buf); + free(evt); + } + + pthread_mutex_unlock(&irmd.rk.mtx); + pthread_mutex_destroy(&irmd.cmd_lock); 
pthread_cond_destroy(&irmd.cmd_cond); + pthread_mutex_destroy(&irmd.rk.mtx); + pthread_cond_destroy(&irmd.rk.cond); pthread_rwlock_destroy(&irmd.state_lock); #ifdef HAVE_FUSE while (rmdir(FUSE_PREFIX) < 0 && retries-- > 0) nanosleep(&wait, NULL); + if (retries < 0) log_err("Failed to remove " FUSE_PREFIX); #endif @@ -2220,10 +3147,18 @@ static int irm_start(void) if (pthread_create(&irmd.acceptor, NULL, acceptloop, NULL)) goto fail_acceptor; + if (OAP_REKEY_TIMER > 0) { + if (pthread_create(&irmd.rk.worker, NULL, rekey_worker, NULL)) + goto fail_rekey_worker; + } + log_info("Ouroboros IPC Resource Manager daemon started..."); return 0; + fail_rekey_worker: + pthread_cancel(irmd.acceptor); + pthread_join(irmd.acceptor, NULL); fail_acceptor: pthread_cancel(irmd.irm_sanitize); pthread_join(irmd.irm_sanitize, NULL); @@ -2263,6 +3198,11 @@ static void irm_sigwait(sigset_t sigset) static void irm_stop(void) { + if (OAP_REKEY_TIMER > 0) { + pthread_cancel(irmd.rk.worker); + pthread_join(irmd.rk.worker, NULL); + } + pthread_cancel(irmd.acceptor); pthread_cancel(irmd.irm_sanitize); @@ -2383,26 +3323,31 @@ int main(int argc, goto fail_irm_init; } - if (irm_init() < 0) + if (crypt_secure_malloc_init(IRMD_SECMEM_MAX) < 0) { + log_err("Failed to initialize secure memory allocation."); + goto fail_secmem; + } + + if (irm_init() < 0) { + log_err("Failed to initialize IRMd."); goto fail_irm_init; + } if (reg_init() < 0) { log_err("Failed to initialize registry."); goto fail_reg; } - if (crypt_secure_malloc_init(IRMD_SECMEM_MAX) < 0) { - log_err("Failed to initialize secure memory allocation."); - goto fail_reg; - } - pthread_sigmask(SIG_BLOCK, &sigset, NULL); - if (irm_start() < 0) + if (irm_start() < 0) { + log_err("Failed to start IRMd."); goto fail_irm_start; + } #ifdef HAVE_TOML if (irm_configure(irmd.cfg_file) < 0) { + log_err("Failed to load IRMd configuration."); irmd_set_state(IRMD_SHUTDOWN); ret = EXIT_FAILURE; } @@ -2415,15 +3360,16 @@ int main(int argc, 
pthread_sigmask(SIG_UNBLOCK, &sigset, NULL); - crypt_secure_malloc_fini(); - crypt_cleanup(); - reg_clear(); reg_fini(); irm_fini(); + crypt_secure_malloc_fini(); + + crypt_cleanup(); + log_info("Ouroboros IPC Resource Manager daemon exited. Bye."); log_fini(); @@ -2435,5 +3381,8 @@ int main(int argc, fail_reg: irm_fini(); fail_irm_init: + crypt_secure_malloc_fini(); + crypt_cleanup(); + fail_secmem: exit(EXIT_FAILURE); } diff --git a/src/irmd/oap.c b/src/irmd/oap.c deleted file mode 100644 index 1831f533..00000000 --- a/src/irmd/oap.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Ouroboros - Copyright (C) 2016 - 2026 - * - * OAP - Shared credential and configuration loading - * - * Dimitri Staessens <dimitri@ouroboros.rocks> - * Sander Vrijders <sander@ouroboros.rocks> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., http://www.fsf.org/about/contact/. 
- */ - -#if defined(__linux__) || defined(__CYGWIN__) - #define _DEFAULT_SOURCE -#else - #define _POSIX_C_SOURCE 200809L -#endif - -#define OUROBOROS_PREFIX "irmd/oap" - -#include <ouroboros/crypt.h> -#include <ouroboros/errno.h> -#include <ouroboros/logs.h> - -#include "config.h" - -#include <assert.h> -#include <string.h> -#include <sys/stat.h> - -/* - * Shared credential and configuration loading helpers - */ - -#ifndef OAP_TEST_MODE - -static bool file_exists(const char * path) -{ - struct stat s; - - if (stat(path, &s) < 0 && errno == ENOENT) { - log_dbg("File %s does not exist.", path); - return false; - } - - return true; -} - -int load_credentials(const char * name, - const struct name_sec_paths * paths, - void ** pkp, - void ** crt) -{ - assert(paths != NULL); - assert(pkp != NULL); - assert(crt != NULL); - - *pkp = NULL; - *crt = NULL; - - if (!file_exists(paths->crt) || !file_exists(paths->key)) { - log_info("No authentication certificates for %s.", name); - return 0; - } - - if (crypt_load_crt_file(paths->crt, crt) < 0) { - log_err("Failed to load %s for %s.", paths->crt, name); - goto fail_crt; - } - - if (crypt_load_privkey_file(paths->key, pkp) < 0) { - log_err("Failed to load %s for %s.", paths->key, name); - goto fail_key; - } - - log_info("Loaded authentication certificates for %s.", name); - - return 0; - - fail_key: - crypt_free_crt(*crt); - *crt = NULL; - fail_crt: - return -EAUTH; -} - -int load_kex_config(const char * name, - const char * path, - struct sec_config * cfg) -{ - assert(name != NULL); - assert(cfg != NULL); - - memset(cfg, 0, sizeof(*cfg)); - - /* Load encryption config */ - if (!file_exists(path)) - log_dbg("No encryption %s for %s.", path, name); - - if (load_sec_config_file(cfg, path) < 0) { - log_warn("Failed to load %s for %s.", path, name); - return -1; - } - - if (!IS_KEX_ALGO_SET(cfg)) { - log_info("Key exchange not configured for %s.", name); - return 0; - } - - if (cfg->c.nid == NID_undef || crypt_nid_to_str(cfg->c.nid) 
== NULL) { - log_err("Invalid cipher NID %d for %s.", cfg->c.nid, name); - return -ECRYPT; - } - - log_info("Encryption enabled for %s.", name); - - return 0; -} - -#endif /* OAP_TEST_MODE */ diff --git a/src/irmd/oap.h b/src/irmd/oap.h index d6d8dfe2..86f11e21 100644 --- a/src/irmd/oap.h +++ b/src/irmd/oap.h @@ -28,6 +28,8 @@ #include <ouroboros/name.h> #include <ouroboros/utils.h> +#include <stdbool.h> + /* OAP authentication state (in oap/auth.c) */ int oap_auth_init(void); @@ -35,31 +37,46 @@ void oap_auth_fini(void); int oap_auth_add_ca_crt(void * crt); +int oap_auth_add_chain_crt(void * crt); + /* * Prepare OAP request header for server, returns context * Passes client data for srv, returns srv data for client + * rekey forces ephemeral server-encap KEX (no client-encap; preserves FS/PCS) */ int oap_cli_prepare(void ** ctx, const struct name_info * info, buffer_t * req_buf, - buffer_t data); + buffer_t data, + bool rekey); /* * Server processes header, creates response header, returns secret key. * data is in/out: input=srv data to send, output=cli data received. + * rekey drops the cert and verifies against cached_crt; peer_crt (or NULL) + * receives a copy of the peer cert to cache at the initial handshake. */ int oap_srv_process(const struct name_info * info, buffer_t req_buf, buffer_t * rsp_buf, buffer_t * data, - struct crypt_sk * sk); + struct crypt_sk * sk, + bool rekey, + const buffer_t * cached_crt, + buffer_t * peer_crt); -/* Complete OAP, returns secret key and server data, frees ctx */ +/* + * Complete OAP, returns secret key and server data, frees ctx. + * cached_crt verifies a cert-less re-key; peer_crt (or NULL) receives a + * copy of the peer cert to cache at the initial handshake. 
+ */ int oap_cli_complete(void * ctx, const struct name_info * info, buffer_t rsp_buf, buffer_t * data, - struct crypt_sk * sk); + struct crypt_sk * sk, + const buffer_t * cached_crt, + buffer_t * peer_crt); /* Free OAP state (on failure before complete) */ void oap_ctx_free(void * ctx); diff --git a/src/irmd/oap/auth.c b/src/irmd/oap/auth.c index 4b86f055..f70f9df1 100644 --- a/src/irmd/oap/auth.c +++ b/src/irmd/oap/auth.c @@ -29,8 +29,8 @@ #define OUROBOROS_PREFIX "irmd/oap" #include <ouroboros/crypt.h> +#include <ouroboros/endian.h> #include <ouroboros/errno.h> -#include <ouroboros/list.h> #include <ouroboros/logs.h> #include <ouroboros/pthread.h> #include <ouroboros/time.h> @@ -44,38 +44,99 @@ #include <stdlib.h> #include <string.h> -struct oap_replay_entry { - struct list_head next; - uint64_t timestamp; - uint8_t id[OAP_ID_SIZE]; +/* + * Replay cache: three timestamp-generation hash buckets. A header's bucket + * is gen(T) = T / OAP_REPLAY_TIMER, taken mod 3. Staleness bounds a valid T + * to generations {G-1, G, G+1} (G is now's generation; a within-slack future + * stamp can reach G+1), which are distinct mod 3; the aliasing generation + * G-3 is always rejected as too old first. Each bucket is an open-addressed + * hash set whose slots are live iff slot.gen == bucket.gen, so a stale bucket + * clears in O(1) by bumping its gen. Overflow fails closed (reject), never + * evicts, so a flood cannot displace a genuine entry into a replayable state. 
+ */ +#define OAP_REPLAY_GENS 3 + +struct oap_replay_slot { + uint64_t gen; /* live iff == bucket gen; 0 = never used */ + uint64_t ts; + uint8_t id[OAP_ID_SIZE]; +}; + +struct oap_replay_bucket { + uint64_t gen; + size_t count; + struct oap_replay_slot * slots; }; static struct { struct auth_ctx * ca_ctx; struct { - struct list_head list; - pthread_mutex_t mtx; + size_t mask; /* slots per bucket - 1 */ + size_t cap; /* fail-closed threshold */ + struct oap_replay_bucket bucket[OAP_REPLAY_GENS]; + pthread_mutex_t mtx; } replay; } oap_auth; +/* FNV-1a over id || ts; the table mask reduces it to a slot index. */ +static size_t replay_hash(const uint8_t * id, + uint64_t ts) +{ + uint64_t hh = 14695981039346656037ULL; + size_t i; + + for (i = 0; i < OAP_ID_SIZE; i++) { + hh ^= id[i]; + hh *= 1099511628211ULL; + } + + for (i = 0; i < sizeof(ts); i++) { + hh ^= (uint8_t) (ts >> (i * 8)); + hh *= 1099511628211ULL; + } + + return (size_t) hh; +} + int oap_auth_init(void) { + size_t m = 1; + int i; + oap_auth.ca_ctx = auth_create_ctx(); if (oap_auth.ca_ctx == NULL) { log_err("Failed to create OAP auth context."); goto fail_ctx; } - list_head_init(&oap_auth.replay.list); + while (m < (size_t) OAP_REPLAY_MAX * 2) + m <<= 1; + + oap_auth.replay.mask = m - 1; + oap_auth.replay.cap = OAP_REPLAY_MAX; + + for (i = 0; i < OAP_REPLAY_GENS; i++) { + struct oap_replay_bucket * b = &oap_auth.replay.bucket[i]; + b->gen = 0; + b->count = 0; + b->slots = calloc(m, sizeof(*b->slots)); + if (b->slots == NULL) { + log_err("Failed to alloc OAP replay bucket."); + goto fail_bucket; + } + } if (pthread_mutex_init(&oap_auth.replay.mtx, NULL)) { log_err("Failed to init OAP replay mutex."); - goto fail_mtx; + goto fail_bucket; } return 0; - fail_mtx: + fail_bucket: + for (i = 0; i < OAP_REPLAY_GENS; i++) + free(oap_auth.replay.bucket[i].slots); + auth_destroy_ctx(oap_auth.ca_ctx); fail_ctx: return -1; @@ -83,16 +144,13 @@ int oap_auth_init(void) void oap_auth_fini(void) { - struct list_head * p; - 
struct list_head * h; + int i; pthread_mutex_lock(&oap_auth.replay.mtx); - list_for_each_safe(p, h, &oap_auth.replay.list) { - struct oap_replay_entry * e; - e = list_entry(p, struct oap_replay_entry, next); - list_del(&e->next); - free(e); + for (i = 0; i < OAP_REPLAY_GENS; i++) { + free(oap_auth.replay.bucket[i].slots); + oap_auth.replay.bucket[i].slots = NULL; } pthread_mutex_unlock(&oap_auth.replay.mtx); @@ -106,18 +164,214 @@ int oap_auth_add_ca_crt(void * crt) return auth_add_crt_to_store(oap_auth.ca_ctx, crt); } +int oap_auth_add_chain_crt(void * crt) +{ + return auth_add_crt_to_chain(oap_auth.ca_ctx, crt); +} + +/* HKDF info = LABEL (incl. NUL separator) || request-hash [|| response-hash] */ +#define OAP_BIND_LABEL "o7s-oap-bind" +#define OAP_KC_LABEL "o7s-oap-kc" +#define OAP_HS_LABEL "o7s-oap-hs" + +int oap_resp_hash(int md_nid, + buffer_t kex, + buffer_t data, + buffer_t crt, + buffer_t * out) +{ + buffer_t cat = BUF_INIT; + uint8_t * p; + ssize_t len; + + assert(out != NULL); + assert(out->data != NULL); + + cat.len = kex.len + data.len + crt.len; + if (cat.len == 0) + return -EINVAL; + + cat.data = malloc(cat.len); + if (cat.data == NULL) + return -ENOMEM; + + p = cat.data; + if (kex.len > 0) { + memcpy(p, kex.data, kex.len); + p += kex.len; + } + + if (data.len > 0) { + memcpy(p, data.data, data.len); + p += data.len; + } + + if (crt.len > 0) + memcpy(p, crt.data, crt.len); + + len = md_digest(md_nid, cat, out->data); + + freebuf(cat); + + if (len < 0) + return -ECRYPT; + + out->len = (size_t) len; + + return 0; +} + +/* HKDF-expand sk->key with info into out; -ECRYPT on failure. 
*/ +static int oap_hkdf_expand(const struct crypt_sk * sk, + buffer_t info, + uint8_t * out, + size_t outlen) +{ + buffer_t prk; + buffer_t okm; + + prk.len = SYMMKEYSZ; + prk.data = sk->key; + okm.len = outlen; + okm.data = out; + + if (crypt_hkdf_expand(prk, info, okm) < 0) + return -ECRYPT; + + return 0; +} + +/* info = label || H(req) */ +#define OAP_HS_INFO_SZ (sizeof(OAP_HS_LABEL) + MAX_HASH_SIZE) +int oap_derive_hs_key(const struct crypt_sk * sk, + buffer_t req_hash, + uint8_t * out) +{ + uint8_t info_buf[OAP_HS_INFO_SZ]; + buffer_t info; + size_t len; + + assert(sk != NULL); + assert(req_hash.data != NULL); + assert(out != NULL); + + if (req_hash.len == 0 || req_hash.len > MAX_HASH_SIZE) + return -EINVAL; + + len = sizeof(OAP_HS_LABEL); + memcpy(info_buf, OAP_HS_LABEL, len); + memcpy(info_buf + len, req_hash.data, req_hash.len); + len += req_hash.len; + + info.len = len; + info.data = info_buf; + + return oap_hkdf_expand(sk, info, out, SYMMKEYSZ); +} + +/* info = label || H(req) || H(resp) || cipher_nid || kdf_nid */ +#define OAP_BIND_INFO_SZ \ + (sizeof(OAP_BIND_LABEL) + 2 * MAX_HASH_SIZE + 2 * sizeof(uint16_t)) +int oap_bind_session_key(struct crypt_sk * sk, + buffer_t req_hash, + buffer_t resp_hash, + int kdf_nid) +{ + uint8_t info_buf[OAP_BIND_INFO_SZ]; + uint8_t tmp[SYMMKEYSZ]; + uint16_t suite[2]; + buffer_t info; + size_t len; + + assert(sk != NULL); + assert(req_hash.data != NULL); + assert(resp_hash.data != NULL); + + if (req_hash.len == 0 || req_hash.len > MAX_HASH_SIZE) + return -EINVAL; + + if (resp_hash.len == 0 || resp_hash.len > MAX_HASH_SIZE) + return -EINVAL; + + len = sizeof(OAP_BIND_LABEL); + memcpy(info_buf, OAP_BIND_LABEL, len); + memcpy(info_buf + len, req_hash.data, req_hash.len); + len += req_hash.len; + + memcpy(info_buf + len, resp_hash.data, resp_hash.len); + len += resp_hash.len; + + suite[0] = hton16((uint16_t) sk->nid); + suite[1] = hton16((uint16_t) kdf_nid); + memcpy(info_buf + len, suite, sizeof(suite)); + len += 
sizeof(suite); + + info.len = len; + info.data = info_buf; + + if (oap_hkdf_expand(sk, info, tmp, SYMMKEYSZ) < 0) + return -ECRYPT; + + memcpy(sk->key, tmp, SYMMKEYSZ); + crypt_secure_clear(tmp, SYMMKEYSZ); + + return 0; +} + +/* info = label || H(req) || H(resp) */ +#define OAP_KC_INFO_SZ (sizeof(OAP_KC_LABEL) + 2 * MAX_HASH_SIZE) +int oap_key_confirm_tag(const struct crypt_sk * sk, + buffer_t req_hash, + buffer_t resp_hash, + uint8_t * out, + size_t outlen) +{ + uint8_t info_buf[OAP_KC_INFO_SZ]; + buffer_t info; + size_t len; + + assert(sk != NULL); + assert(req_hash.data != NULL); + assert(resp_hash.data != NULL); + assert(out != NULL); + + if (req_hash.len == 0 || req_hash.len > MAX_HASH_SIZE) + return -EINVAL; + + if (resp_hash.len == 0 || resp_hash.len > MAX_HASH_SIZE) + return -EINVAL; + + if (outlen > MAX_HASH_SIZE) + return -EINVAL; + + len = sizeof(OAP_KC_LABEL); + memcpy(info_buf, OAP_KC_LABEL, len); + memcpy(info_buf + len, req_hash.data, req_hash.len); + len += req_hash.len; + + memcpy(info_buf + len, resp_hash.data, resp_hash.len); + len += resp_hash.len; + + info.len = len; + info.data = info_buf; + + return oap_hkdf_expand(sk, info, out, outlen); +} + #define TIMESYNC_SLACK 100 /* ms */ #define ID_IS_EQUAL(id1, id2) (memcmp(id1, id2, OAP_ID_SIZE) == 0) int oap_check_hdr(const struct oap_hdr * hdr) { - struct list_head * p; - struct list_head * h; - struct timespec now; - struct oap_replay_entry * new; - uint64_t stamp; - uint64_t cur; - uint8_t * id; - ssize_t delta; + struct oap_replay_bucket * b; + struct oap_replay_slot * slots; + struct timespec now; + uint64_t stamp; + uint64_t cur; + uint64_t gen; + uint8_t * id; + size_t h; + ssize_t delta; + int ret = 0; assert(hdr != NULL); @@ -131,63 +385,72 @@ int oap_check_hdr(const struct oap_hdr * hdr) delta = (ssize_t)(cur - stamp) / MILLION; if (delta < -TIMESYNC_SLACK) { log_err_id(id, "OAP header from %zd ms into future.", -delta); - goto fail_stamp; + return -EAUTH; } if (delta > OAP_REPLAY_TIMER 
* 1000) { log_err_id(id, "OAP header too old (%zd ms).", delta); - goto fail_stamp; + return -EAUTH; } - new = malloc(sizeof(*new)); - if (new == NULL) { - log_err_id(id, "Failed to allocate memory for OAP element."); - goto fail_stamp; - } + gen = stamp / ((uint64_t) OAP_REPLAY_TIMER * BILLION); pthread_mutex_lock(&oap_auth.replay.mtx); - list_for_each_safe(p, h, &oap_auth.replay.list) { - struct oap_replay_entry * e; - e = list_entry(p, struct oap_replay_entry, next); - if (cur > e->timestamp + OAP_REPLAY_TIMER * BILLION) { - list_del(&e->next); - free(e); - continue; - } + b = &oap_auth.replay.bucket[gen % OAP_REPLAY_GENS]; - if (e->timestamp == stamp && ID_IS_EQUAL(e->id, id)) { - log_warn_id(id, "OAP header already known."); - goto fail_replay; - } + /* Rotate a stale bucket in O(1): its old-gen slots become free. */ + if (b->gen != gen) { + b->gen = gen; + b->count = 0; } - memcpy(new->id, id, OAP_ID_SIZE); - new->timestamp = stamp; + slots = b->slots; - list_add_tail(&new->next, &oap_auth.replay.list); + h = replay_hash(id, stamp) & oap_auth.replay.mask; + while (slots[h].gen == gen) { + if (slots[h].ts == stamp && ID_IS_EQUAL(slots[h].id, id)) { + log_warn_id(id, "OAP header already known."); + ret = -EREPLAY; + goto out; + } - pthread_mutex_unlock(&oap_auth.replay.mtx); + h = (h + 1) & oap_auth.replay.mask; + } - return 0; + /* Empty slot found; fail closed when the window is at capacity. 
*/ + if (b->count >= oap_auth.replay.cap) { + log_warn_id(id, "OAP replay cache full; rejecting."); + ret = -EAUTH; + goto out; + } - fail_replay: + slots[h].gen = gen; + slots[h].ts = stamp; + memcpy(slots[h].id, id, OAP_ID_SIZE); + b->count++; + out: pthread_mutex_unlock(&oap_auth.replay.mtx); - free(new); - fail_stamp: - return -EAUTH; + + return ret; } -int oap_auth_peer(char * name, - const struct oap_hdr * local_hdr, - const struct oap_hdr * peer_hdr) +int oap_auth_peer(char * name, + const struct sec_config * cfg, + const struct oap_hdr * local_hdr, + const struct oap_hdr * peer_hdr, + const buffer_t * cached_crt) { void * crt; void * pk = NULL; - buffer_t sign; /* Signed region */ + void * pin = NULL; + buffer_t crt_der; /* cert source: wire, else cached (re-key) */ + buffer_t sign; /* Signed region */ uint8_t * id = peer_hdr->id.data; + int ret; assert(name != NULL); + assert(cfg != NULL); assert(local_hdr != NULL); assert(peer_hdr != NULL); @@ -196,13 +459,22 @@ int oap_auth_peer(char * name, goto fail_check; } - if (peer_hdr->crt.len == 0) { + /* Re-key drops the wire cert; fall back to the cached peer cert. 
*/ + crt_der = peer_hdr->crt; + if (crt_der.len == 0 && cached_crt != NULL) + crt_der = *cached_crt; + + if (crt_der.len == 0) { + if (cfg->a.req) { + log_err_id(id, "Peer did not provide a certificate."); + goto fail_check; + } log_dbg_id(id, "No crt provided."); name[0] = '\0'; return 0; } - if (crypt_load_crt_der(peer_hdr->crt, &crt) < 0) { + if (crypt_load_crt_der(crt_der, &crt) < 0) { log_err_id(id, "Failed to load crt."); goto fail_check; } @@ -216,26 +488,58 @@ int oap_auth_peer(char * name, log_dbg_id(id, "Got public key from crt."); - if (auth_verify_crt(oap_auth.ca_ctx, crt) < 0) { + if (cfg->a.cacert[0] != '\0') { + if (crypt_load_crt_file(cfg->a.cacert, &pin) < 0) { + log_err_id(id, "Failed to load pinned CA %s.", + cfg->a.cacert); + goto fail_crt; + } + } + + ret = auth_verify_crt_pin(oap_auth.ca_ctx, crt, pin); + if (ret == -ENOENT) { + log_err_id(id, "Peer crt not issued by pinned CA %s.", + cfg->a.cacert); + goto fail_pin; + } + + if (ret < 0) { log_err_id(id, "Failed to verify peer with CA store."); - goto fail_crt; + goto fail_pin; } log_dbg_id(id, "Successfully verified peer crt."); - sign = peer_hdr->hdr; + /* Digest pin: peer must sign with the configured digest */ + if (crypt_pk_requires_md(pk) && + cfg->d.nid != NID_undef && peer_hdr->md_nid != cfg->d.nid) { + log_err_id(id, "Peer did not sign with %s.", + md_nid_to_str(cfg->d.nid)); + goto fail_pin; + } + + /* Sealed responses verify over the reconstructed plaintext. */ + sign = peer_hdr->sealed_pt.data != NULL ? 
+ peer_hdr->sealed_pt : peer_hdr->hdr; sign.len -= peer_hdr->sig.len; if (auth_verify_sig(pk, peer_hdr->md_nid, sign, peer_hdr->sig) < 0) { log_err_id(id, "Failed to verify signature."); - goto fail_check_sig; + goto fail_pin; } - if (crypt_get_crt_name(crt, name) < 0) { - log_warn_id(id, "Failed to extract name from certificate."); - name[0] = '\0'; + ret = crypt_get_crt_name(crt, name); + if (ret < 0) { + if (ret == -ENAME) + log_err_id(id, "Certificate CN too long."); + else + log_err_id(id, "No name in certificate."); + goto fail_pin; } + if (pin != NULL) + crypt_free_crt(pin); + crypt_free_key(pk); crypt_free_crt(crt); @@ -243,7 +547,9 @@ int oap_auth_peer(char * name, return 0; - fail_check_sig: + fail_pin: + if (pin != NULL) + crypt_free_crt(pin); fail_crt: crypt_free_key(pk); crypt_free_crt(crt); diff --git a/src/irmd/oap/auth.h b/src/irmd/oap/auth.h index 4f748750..72938b53 100644 --- a/src/irmd/oap/auth.h +++ b/src/irmd/oap/auth.h @@ -23,13 +23,46 @@ #ifndef OUROBOROS_IRMD_OAP_AUTH_H #define OUROBOROS_IRMD_OAP_AUTH_H +#include <ouroboros/crypt.h> + #include "hdr.h" int oap_check_hdr(const struct oap_hdr * hdr); -/* name is updated with the peer's certificate name if available */ -int oap_auth_peer(char * name, - const struct oap_hdr * local_hdr, - const struct oap_hdr * peer_hdr); +/* + * name is set to the peer crt CN, "" if no crt was presented. + * cached_crt (or NULL) is the peer cert from the initial handshake, used + * to verify a cert-less re-key. + */ +int oap_auth_peer(char * name, + const struct sec_config * cfg, + const struct oap_hdr * local_hdr, + const struct oap_hdr * peer_hdr, + const buffer_t * cached_crt); + +/* Derive the handshake key that seals the response identity block. */ +int oap_derive_hs_key(const struct crypt_sk * sk, + buffer_t req_hash, + uint8_t * out); + +/* resp_hash = H(kex || data || crt): binds the server response transcript. 
*/ +int oap_resp_hash(int md_nid, + buffer_t kex, + buffer_t data, + buffer_t crt, + buffer_t * out); + +/* Fold request + response transcript + negotiated suite into the key. */ +int oap_bind_session_key(struct crypt_sk * sk, + buffer_t req_hash, + buffer_t resp_hash, + int kdf_nid); + +/* Server->client key-confirmation tag derived from the bound key. */ +int oap_key_confirm_tag(const struct crypt_sk * sk, + buffer_t req_hash, + buffer_t resp_hash, + uint8_t * out, + size_t outlen); #endif /* OUROBOROS_IRMD_OAP_AUTH_H */ diff --git a/src/irmd/oap/cli.c b/src/irmd/oap/cli.c index 7a202da7..689d67ca 100644 --- a/src/irmd/oap/cli.c +++ b/src/irmd/oap/cli.c @@ -54,7 +54,7 @@ struct oap_cli_ctx { uint8_t req_hash[MAX_HASH_SIZE]; size_t req_hash_len; int req_md_nid; - struct sec_config kcfg; + struct sec_config scfg; struct oap_hdr local_hdr; void * pkp; /* Ephemeral keypair */ uint8_t * key; /* For client-encap KEM */ @@ -69,7 +69,7 @@ struct oap_cli_ctx { extern int load_cli_credentials(const struct name_info * info, void ** pkp, void ** crt); -extern int load_cli_kex_config(const struct name_info * info, +extern int load_cli_sec_config(const struct name_info * info, struct sec_config * cfg); extern int load_server_kem_pk(const char * name, struct sec_config * cfg, @@ -87,13 +87,18 @@ int load_cli_credentials(const struct name_info * info, return load_credentials(info->name, &info->c, pkp, crt); } -int load_cli_kex_config(const struct name_info * info, +int load_cli_sec_config(const struct name_info * info, struct sec_config * cfg) { assert(info != NULL); assert(cfg != NULL); - return load_kex_config(info->name, info->c.enc, cfg); + memset(cfg, 0, sizeof(*cfg)); + + /* A client authenticates the server by default, like an https client */ + cfg->a.req = OAP_CLIENT_AUTH_DEFAULT; + + return load_sec_config(info->name, info->c.sec, cfg); } int load_server_kem_pk(const char * name, @@ -133,13 +138,13 @@ int load_server_kem_pk(const char * name, static int 
do_client_kex_prepare_dhe(struct oap_cli_ctx * s) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; buffer_t * kex = &s->local_hdr.kex; uint8_t * id = s->id.data; ssize_t len; /* Generate ephemeral keypair, send PK */ - len = kex_pkp_create(kcfg, &s->pkp, kex->data); + len = kex_pkp_create(scfg, &s->pkp, kex->data); if (len < 0) { log_err_id(id, "Failed to generate DHE keypair."); return -ECRYPT; @@ -147,7 +152,7 @@ static int do_client_kex_prepare_dhe(struct oap_cli_ctx * s) kex->len = (size_t) len; log_dbg_id(id, "Generated ephemeral %s keys (%zd bytes).", - kcfg->x.str, len); + scfg->x.str, len); return 0; } @@ -155,24 +160,24 @@ static int do_client_kex_prepare_dhe(struct oap_cli_ctx * s) static int do_client_kex_prepare_kem_encap(const char * server_name, struct oap_cli_ctx * s) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; buffer_t * kex = &s->local_hdr.kex; uint8_t * id = s->id.data; buffer_t server_pk = BUF_INIT; uint8_t key_buf[SYMMKEYSZ]; ssize_t len; - if (load_server_kem_pk(server_name, kcfg, &server_pk) < 0) { + if (load_server_kem_pk(server_name, scfg, &server_pk) < 0) { log_err_id(id, "Failed to load server KEM pk."); return -ECRYPT; } - if (IS_HYBRID_KEM(kcfg->x.str)) + if (IS_HYBRID_KEM(scfg->x.str)) len = kex_kem_encap_raw(server_pk, kex->data, - kcfg->k.nid, key_buf); + scfg->k.nid, key_buf); else len = kex_kem_encap(server_pk, kex->data, - kcfg->k.nid, key_buf); + scfg->k.nid, key_buf); freebuf(server_pk); @@ -198,13 +203,13 @@ static int do_client_kex_prepare_kem_encap(const char * server_name, static int do_client_kex_prepare_kem_decap(struct oap_cli_ctx * s) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; buffer_t * kex = &s->local_hdr.kex; uint8_t * id = s->id.data; ssize_t len; /* Server encaps: generate keypair, send PK */ - len = kex_pkp_create(kcfg, &s->pkp, kex->data); + len = kex_pkp_create(scfg, &s->pkp, kex->data); if (len < 0) { 
log_err_id(id, "Failed to generate KEM keypair."); return -ECRYPT; @@ -219,13 +224,13 @@ static int do_client_kex_prepare_kem_decap(struct oap_cli_ctx * s) static int do_client_kex_prepare(const char * server_name, struct oap_cli_ctx * s) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; - if (!IS_KEX_ALGO_SET(kcfg)) + if (!IS_KEX_ALGO_SET(scfg)) return 0; - if (IS_KEM_ALGORITHM(kcfg->x.str)) { - if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP) + if (IS_KEM_ALGORITHM(scfg->x.str)) { + if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP) return do_client_kex_prepare_kem_encap(server_name, s); else return do_client_kex_prepare_kem_decap(s); @@ -237,11 +242,13 @@ static int do_client_kex_prepare(const char * server_name, int oap_cli_prepare(void ** ctx, const struct name_info * info, buffer_t * req_buf, - buffer_t data) + buffer_t data, + bool rekey) { struct oap_cli_ctx * s; void * pkp = NULL; void * crt = NULL; + buffer_t no_tag = BUF_INIT; ssize_t ret; assert(ctx != NULL); @@ -276,22 +283,34 @@ int oap_cli_prepare(void ** ctx, goto fail_id; } - /* Load KEX config */ - if (load_cli_kex_config(info, &s->kcfg) < 0) { - log_err_id(s->id.data, "Failed to load KEX config for %s.", + /* Load security config */ + if (load_cli_sec_config(info, &s->scfg) < 0) { + log_err_id(s->id.data, "Failed to load security config for %s.", info->name); goto fail_kex; } - oap_hdr_init(&s->local_hdr, s->id, s->kex_buf, data, s->kcfg.c.nid); + /* Re-key forces server-encap: client-encap forfeits FS/PCS. */ + if (rekey && s->scfg.x.mode == KEM_MODE_CLIENT_ENCAP) { + s->scfg.x.mode = KEM_MODE_SERVER_ENCAP; + log_dbg_id(s->id.data, "Re-key forcing ephemeral server KEX."); + } + + /* Re-key omits the cert; the server verifies against its cache. 
*/ + if (rekey && crt != NULL) { + crypt_free_crt(crt); + crt = NULL; + } + + oap_hdr_init(&s->local_hdr, s->id, s->kex_buf, data, s->scfg.c.nid); if (do_client_kex_prepare(info->name, s) < 0) { log_err_id(s->id.data, "Failed to prepare client KEX."); goto fail_kex; } - if (oap_hdr_encode(&s->local_hdr, pkp, crt, &s->kcfg, - (buffer_t) BUF_INIT, NID_undef)) { + if (oap_hdr_encode(&s->local_hdr, pkp, crt, &s->scfg, + no_tag, NID_undef, NULL)) { log_err_id(s->id.data, "Failed to create OAP request header."); goto fail_hdr; } @@ -299,7 +318,7 @@ int oap_cli_prepare(void ** ctx, debug_oap_hdr_snd(&s->local_hdr); /* Compute and store hash of request for verification in complete */ - s->req_md_nid = s->kcfg.d.nid != NID_undef ? s->kcfg.d.nid : NID_sha384; + s->req_md_nid = s->scfg.d.nid != NID_undef ? s->scfg.d.nid : NID_sha384; ret = md_digest(s->req_md_nid, s->local_hdr.hdr, s->req_hash); if (ret < 0) { log_err_id(s->id.data, "Failed to hash request."); @@ -324,6 +343,7 @@ int oap_cli_prepare(void ** ctx, return 0; fail_hash: + oap_hdr_fini(&s->local_hdr); fail_hdr: crypt_secure_free(s->key, SYMMKEYSZ); crypt_free_key(s->pkp); @@ -358,11 +378,11 @@ static int do_client_kex_complete_kem(struct oap_cli_ctx * s, const struct oap_hdr * peer_hdr, struct crypt_sk * sk) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; uint8_t * id = s->id.data; uint8_t key_buf[SYMMKEYSZ]; - if (kcfg->x.mode == KEM_MODE_SERVER_ENCAP) { + if (scfg->x.mode == KEM_MODE_SERVER_ENCAP) { buffer_t ct; if (peer_hdr->kex.len == 0) { @@ -373,27 +393,27 @@ static int do_client_kex_complete_kem(struct oap_cli_ctx * s, ct.data = peer_hdr->kex.data; ct.len = peer_hdr->kex.len; - if (kex_kem_decap(s->pkp, ct, kcfg->k.nid, key_buf) < 0) { + if (kex_kem_decap(s->pkp, ct, scfg->k.nid, key_buf) < 0) { log_err_id(id, "Failed to decapsulate KEM."); return -ECRYPT; } log_dbg_id(id, "Client decapsulated server CT."); - } else if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP) { + } else if 
(scfg->x.mode == KEM_MODE_CLIENT_ENCAP) { /* Key already derived during prepare */ memcpy(sk->key, s->key, SYMMKEYSZ); - sk->nid = kcfg->c.nid; - log_info_id(id, "Negotiated %s + %s.", kcfg->x.str, - kcfg->c.str); + sk->nid = scfg->c.nid; + log_info_id(id, "Negotiated %s + %s.", scfg->x.str, + scfg->c.str); return 0; } memcpy(sk->key, key_buf, SYMMKEYSZ); - sk->nid = kcfg->c.nid; + sk->nid = scfg->c.nid; crypt_secure_clear(key_buf, SYMMKEYSZ); - log_info_id(id, "Negotiated %s + %s.", kcfg->x.str, kcfg->c.str); + log_info_id(id, "Negotiated %s + %s.", scfg->x.str, scfg->c.str); return 0; } @@ -402,7 +422,7 @@ static int do_client_kex_complete_dhe(struct oap_cli_ctx * s, const struct oap_hdr * peer_hdr, struct crypt_sk * sk) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; uint8_t * id = s->id.data; uint8_t key_buf[SYMMKEYSZ]; @@ -412,7 +432,7 @@ static int do_client_kex_complete_dhe(struct oap_cli_ctx * s, return -ECRYPT; } - if (kex_dhe_derive(kcfg, s->pkp, peer_hdr->kex, key_buf) < 0) { + if (kex_dhe_derive(scfg, s->pkp, peer_hdr->kex, key_buf) < 0) { log_err_id(id, "Failed to derive DHE secret."); return -ECRYPT; } @@ -420,10 +440,10 @@ static int do_client_kex_complete_dhe(struct oap_cli_ctx * s, log_dbg_id(id, "DHE: derived shared secret."); memcpy(sk->key, key_buf, SYMMKEYSZ); - sk->nid = kcfg->c.nid; + sk->nid = scfg->c.nid; crypt_secure_clear(key_buf, SYMMKEYSZ); - log_info_id(id, "Negotiated %s + %s.", kcfg->x.str, kcfg->c.str); + log_info_id(id, "Negotiated %s + %s.", scfg->x.str, scfg->c.str); return 0; } @@ -433,17 +453,17 @@ static int do_client_kex_complete(struct oap_cli_ctx * s, const struct oap_hdr * peer_hdr, struct crypt_sk * sk) { - struct sec_config * kcfg = &s->kcfg; + struct sec_config * scfg = &s->scfg; uint8_t * id = s->id.data; int cipher_nid; int kdf_nid; - if (!IS_KEX_ALGO_SET(kcfg)) + if (!IS_KEX_ALGO_SET(scfg)) return 0; /* Save client's configured minimums */ - cipher_nid = kcfg->c.nid; - kdf_nid = 
kcfg->k.nid; + cipher_nid = scfg->c.nid; + kdf_nid = scfg->k.nid; /* Accept server's cipher choice */ if (peer_hdr->cipher_str == NULL) { @@ -451,15 +471,15 @@ static int do_client_kex_complete(struct oap_cli_ctx * s, return -ECRYPT; } - SET_KEX_CIPHER(kcfg, peer_hdr->cipher_str); - if (crypt_validate_nid(kcfg->c.nid) < 0) { + SET_KEX_CIPHER(scfg, peer_hdr->cipher_str); + if (crypt_validate_nid(scfg->c.nid) < 0) { log_err_id(id, "Server cipher '%s' not supported.", peer_hdr->cipher_str); return -ENOTSUP; } /* Verify server cipher >= client's minimum */ - if (crypt_cipher_rank(kcfg->c.nid) < crypt_cipher_rank(cipher_nid)) { + if (crypt_cipher_rank(scfg->c.nid) < crypt_cipher_rank(cipher_nid)) { log_err_id(id, "Server cipher %s too weak.", peer_hdr->cipher_str); return -ECRYPT; @@ -469,20 +489,20 @@ static int do_client_kex_complete(struct oap_cli_ctx * s, peer_hdr->cipher_str); /* Accept server's KDF for non-client-encap modes */ - if (kcfg->x.mode != KEM_MODE_CLIENT_ENCAP + if (scfg->x.mode != KEM_MODE_CLIENT_ENCAP && peer_hdr->kdf_nid != NID_undef) { if (crypt_kdf_rank(peer_hdr->kdf_nid) < crypt_kdf_rank(kdf_nid)) { log_err_id(id, "Server KDF too weak."); return -ECRYPT; } - SET_KEX_KDF_NID(kcfg, peer_hdr->kdf_nid); + SET_KEX_KDF_NID(scfg, peer_hdr->kdf_nid); log_dbg_id(id, "Accepted server KDF %s.", - md_nid_to_str(kcfg->k.nid)); + md_nid_to_str(scfg->k.nid)); } /* Derive shared secret */ - if (IS_KEM_ALGORITHM(kcfg->x.str)) + if (IS_KEM_ALGORITHM(scfg->x.str)) return do_client_kex_complete_kem(s, peer_hdr, sk); return do_client_kex_complete_dhe(s, peer_hdr, sk); @@ -492,12 +512,20 @@ int oap_cli_complete(void * ctx, const struct name_info * info, buffer_t rsp_buf, buffer_t * data, - struct crypt_sk * sk) + struct crypt_sk * sk, + const buffer_t * cached_crt, + buffer_t * peer_crt) { struct oap_cli_ctx * s = ctx; struct oap_hdr peer_hdr; char peer[NAME_SIZE + 1]; + uint8_t kc_buf[MAX_HASH_SIZE]; + uint8_t resp_hash_buf[MAX_HASH_SIZE]; + uint8_t hs_key[SYMMKEYSZ]; 
+ buffer_t req_hash = BUF_INIT; + buffer_t resp_hash = BUF_INIT; uint8_t * id; + int rc; assert(ctx != NULL); assert(info != NULL); @@ -515,7 +543,7 @@ int oap_cli_complete(void * ctx, log_dbg_id(id, "Completing OAP for %s.", info->name); /* Decode response header using client's md_nid for hash length */ - if (oap_hdr_decode(&peer_hdr, rsp_buf, s->req_md_nid) < 0) { + if (oap_hdr_decode(&peer_hdr, rsp_buf, s->req_md_nid, false) < 0) { log_err_id(id, "Failed to decode OAP response header."); goto fail_oap; } @@ -528,20 +556,52 @@ int oap_cli_complete(void * ctx, goto fail_oap; } - /* Authenticate server */ - if (oap_auth_peer(peer, &s->local_hdr, &peer_hdr) < 0) { - log_err_id(id, "Failed to authenticate server."); + /* Complete key exchange first; the sealed identity needs the secret */ + if (do_client_kex_complete(s, &peer_hdr, sk) < 0) { + log_err_id(id, "Failed to complete key exchange."); goto fail_oap; } - /* Verify request hash in authenticated response */ - if (peer_hdr.req_hash.len == 0) { - log_err_id(id, "Response missing req_hash."); + req_hash.data = s->req_hash; + req_hash.len = s->req_hash_len; + + /* Decrypt the sealed server identity (data+cert+sig) before auth */ + if (sk->nid != NID_undef && peer_hdr.sealed.data != NULL) { + if (oap_derive_hs_key(sk, req_hash, hs_key) < 0) { + log_err_id(id, "Failed to derive handshake key."); + goto fail_oap; + } + + rc = oap_hdr_unseal(&peer_hdr, hs_key); + + crypt_secure_clear(hs_key, SYMMKEYSZ); + + if (rc < 0) { + log_err_id(id, "Failed to unseal server identity."); + goto fail_oap; + } + } + + /* Authenticate server (cert + signature now in cleartext) */ + if (oap_auth_peer(peer, &s->scfg, &s->local_hdr, &peer_hdr, + cached_crt) < 0) { + log_err_id(id, "Failed to authenticate server."); goto fail_oap; } - if (memcmp(peer_hdr.req_hash.data, s->req_hash, s->req_hash_len) != 0) { - log_err_id(id, "Response req_hash mismatch."); + /* Surface the peer cert so the caller can cache it for re-key. 
*/ + if (peer_crt != NULL && peer_hdr.crt.len > 0) { + peer_crt->data = malloc(peer_hdr.crt.len); + if (peer_crt->data == NULL) + goto fail_oap; + + memcpy(peer_crt->data, peer_hdr.crt.data, peer_hdr.crt.len); + peer_crt->len = peer_hdr.crt.len; + } + + /* Response must carry a transcript tag of the expected length */ + if (peer_hdr.rsp_tag.len != s->req_hash_len) { + log_err_id(id, "Response transcript tag mismatch."); goto fail_oap; } @@ -552,10 +612,43 @@ int oap_cli_complete(void * ctx, goto fail_oap; } - /* Complete key exchange */ - if (do_client_kex_complete(s, &peer_hdr, sk) < 0) { - log_err_id(id, "Failed to complete key exchange."); - goto fail_oap; + if (sk->nid != NID_undef) { + /* Encrypted: bind the key and verify key confirmation */ + resp_hash.data = resp_hash_buf; + + if (oap_resp_hash(s->req_md_nid, peer_hdr.kex, + peer_hdr.data, peer_hdr.crt, + &resp_hash) < 0) { + log_err_id(id, "Failed to hash response."); + goto fail_oap; + } + + if (oap_bind_session_key(sk, req_hash, resp_hash, + s->scfg.k.nid) < 0) { + log_err_id(id, "Failed to bind session key."); + goto fail_oap; + } + + if (oap_key_confirm_tag(sk, req_hash, resp_hash, kc_buf, + s->req_hash_len) < 0) { + log_err_id(id, "Failed to confirm session key."); + goto fail_oap; + } + + if (crypt_ct_cmp(peer_hdr.rsp_tag.data, kc_buf, + s->req_hash_len) != 0) { + log_err_id(id, "Key confirmation mismatch."); + goto fail_oap; + } + } else { + /* Cleartext path is config-driven, never a wire downgrade */ + assert(!IS_KEX_ALGO_SET(&s->scfg)); + /* Unencrypted: verify request-echo integrity */ + if (crypt_ct_cmp(peer_hdr.rsp_tag.data, s->req_hash, + s->req_hash_len) != 0) { + log_err_id(id, "Response tag mismatch."); + goto fail_oap; + } } /* Copy piggybacked data from server response */ @@ -566,11 +659,14 @@ int oap_cli_complete(void * ctx, log_info_id(id, "OAP completed for %s.", info->name); + freebuf(peer_hdr.sealed_pt); + oap_ctx_free(s); return 0; fail_oap: + freebuf(peer_hdr.sealed_pt); 
oap_ctx_free(s); return -ECRYPT; } diff --git a/src/irmd/oap/hdr.c b/src/irmd/oap/hdr.c index 5465dd2a..f8400b46 100644 --- a/src/irmd/oap/hdr.c +++ b/src/irmd/oap/hdr.c @@ -30,6 +30,7 @@ #include <ouroboros/crypt.h> #include <ouroboros/endian.h> +#include <ouroboros/errno.h> #include <ouroboros/hash.h> #include <ouroboros/logs.h> #include <ouroboros/rib.h> @@ -45,9 +46,17 @@ #include <string.h> #include <time.h> +#define OAP_SEAL_TAGSZ 16 /* AEAD tag on the sealed identity block */ +/* Sealed length prefix: data_len ‖ crt_len. */ +#define OAP_SEAL_LENSZ (sizeof(uint16_t) + sizeof(uint16_t)) + +/* hs_key is single-use per handshake, so a fixed nonce is reuse-safe. */ +static const uint8_t oap_seal_nonce[12]; + int oap_hdr_decode(struct oap_hdr * oap_hdr, buffer_t hdr, - int req_md_nid) + int req_md_nid, + bool rekey) { off_t offset; uint16_t kex_len; @@ -88,11 +97,13 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr, oap_hdr->md_str = md_nid_to_str(oap_hdr->md_nid); offset += sizeof(uint16_t); - /* Validate NIDs: NID_undef is valid at parse time, else must be known. + /* + * Validate NIDs: NID_undef is valid at parse time, else must be known. * Note: md_nid=NID_undef only valid for PQC; enforced at sign/verify. */ if (ciph_nid != NID_undef && crypt_validate_nid(ciph_nid) < 0) goto fail_decode; + if (oap_hdr->kdf_nid != NID_undef && md_validate_nid(oap_hdr->kdf_nid) < 0) goto fail_decode; @@ -115,10 +126,37 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr, data_len = (size_t) ntoh16(*(uint16_t *)(hdr.data + offset)); offset += sizeof(uint16_t); - /* Response includes req_hash when md_nid is set */ + assert((size_t) offset == OAP_HDR_MIN_SIZE); + + /* Response includes rsp_tag when md_nid is set */ hash_len = (req_md_nid != NID_undef) ? (size_t) md_len(req_md_nid) : 0; + /* Encrypted response: sealed block is data_len‖crt_len‖data‖crt‖sig. 
*/ + if (req_md_nid != NID_undef && ciph_nid != NID_undef) { + if (hdr.len < (size_t) offset + oap_hdr->kex.len + hash_len + + OAP_SEAL_TAGSZ + OAP_SEAL_LENSZ) + goto fail_decode; + + oap_hdr->kex.data = hdr.data + offset; + offset += oap_hdr->kex.len; + + oap_hdr->rsp_tag.data = hdr.data + offset; + oap_hdr->rsp_tag.len = hash_len; + offset += hash_len; + + oap_hdr->sealed.data = hdr.data + offset; + oap_hdr->sealed.len = hdr.len - offset; + + /* crt/data/sig lengths are sealed; set by oap_hdr_unseal. */ + oap_hdr->crt.len = crt_len; + oap_hdr->data.len = data_len; + + oap_hdr->hdr = hdr; + + return 0; + } + /* Validate total length */ if (hdr.len < (size_t) offset + crt_len + oap_hdr->kex.len + data_len + hash_len) @@ -128,8 +166,12 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr, sig_len = hdr.len - offset - crt_len - oap_hdr->kex.len - data_len - hash_len; - /* Unsigned packets must not have trailing bytes */ - if (crt_len == 0 && sig_len != 0) + /* + * Unsigned packets must not have trailing bytes. A re-key request + * is signed but cert-less (verified against the cached peer cert), + * so the rekey caller permits crt_len==0 with a signature. + */ + if (crt_len == 0 && sig_len != 0 && !rekey) goto fail_decode; /* Parse variable fields */ @@ -144,8 +186,8 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr, oap_hdr->data.len = data_len; offset += data_len; - oap_hdr->req_hash.data = hdr.data + offset; - oap_hdr->req_hash.len = hash_len; + oap_hdr->rsp_tag.data = hdr.data + offset; + oap_hdr->rsp_tag.len = hash_len; offset += hash_len; oap_hdr->sig.data = hdr.data + offset; @@ -164,6 +206,7 @@ void oap_hdr_fini(struct oap_hdr * oap_hdr) { assert(oap_hdr != NULL); + freebuf(oap_hdr->sealed_pt); freebuf(oap_hdr->hdr); memset(oap_hdr, 0, sizeof(*oap_hdr)); } @@ -207,12 +250,229 @@ void oap_hdr_init(struct oap_hdr * hdr, hdr->nid = nid; } +/* Write the 36-byte fixed header; stamp is already in network order. 
*/ +static void write_oap_fixed(uint8_t * buf, + const struct oap_hdr * hdr, + const struct sec_config * scfg, + size_t crt_len, + size_t data_len, + uint64_t stamp) +{ + uint16_t v; + uint16_t kex_len; + off_t offset = 0; + + memcpy(buf + offset, hdr->id.data, hdr->id.len); + offset += hdr->id.len; + + memcpy(buf + offset, &stamp, sizeof(stamp)); + offset += sizeof(stamp); + + v = hton16(hdr->nid); + memcpy(buf + offset, &v, sizeof(v)); + offset += sizeof(v); + + v = hton16(scfg->k.nid); + memcpy(buf + offset, &v, sizeof(v)); + offset += sizeof(v); + + v = hton16(scfg->d.nid); + memcpy(buf + offset, &v, sizeof(v)); + offset += sizeof(v); + + v = hton16((uint16_t) crt_len); + memcpy(buf + offset, &v, sizeof(v)); + offset += sizeof(v); + + kex_len = (uint16_t) hdr->kex.len; + if (hdr->kex.len > 0 && IS_KEM_ALGORITHM(scfg->x.str)) { + if (IS_HYBRID_KEM(scfg->x.str)) + kex_len |= OAP_KEX_FMT_BIT; + if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP) + kex_len |= OAP_KEX_ROLE_BIT; + } + + kex_len = hton16(kex_len); + memcpy(buf + offset, &kex_len, sizeof(kex_len)); + offset += sizeof(kex_len); + + v = hton16((uint16_t) data_len); + memcpy(buf + offset, &v, sizeof(v)); +} + +/* + * Pack lens ‖ data ‖ crt, sign prefix ‖ body, append the signature, then + * AEAD-seal lens ‖ data ‖ crt ‖ sig under prefix as AAD. The cert, app data + * and their sizes stay confidential; *out is the opaque sealed block. The + * signature rides inside the seal so it can't deanonymise the server. 
+ */ +static int oap_seal_body(int nid, + const uint8_t * seal_key, + void * pkp, + int md_nid, + buffer_t prefix, + buffer_t data, + buffer_t crt, + buffer_t * out) +{ + buffer_t sig = BUF_INIT; + buffer_t sign; + buffer_t aad; + buffer_t plain; + uint8_t * buf; + uint8_t * tmp; + uint16_t datalen; + uint16_t crtlen; + size_t body_len; + off_t offset; + + datalen = hton16((uint16_t) data.len); + crtlen = hton16((uint16_t) crt.len); + + body_len = OAP_SEAL_LENSZ + data.len + crt.len; + + buf = malloc(prefix.len + body_len); + if (buf == NULL) + return -1; + + memcpy(buf, prefix.data, prefix.len); + offset = (off_t) prefix.len; + + memcpy(buf + offset, &datalen, sizeof(datalen)); + offset += sizeof(datalen); + + memcpy(buf + offset, &crtlen, sizeof(crtlen)); + offset += sizeof(crtlen); + + if (data.len != 0) + memcpy(buf + offset, data.data, data.len); + + offset += data.len; + + if (crt.len != 0) + memcpy(buf + offset, crt.data, crt.len); + + /* Sign prefix ‖ lens ‖ data ‖ crt (plaintext, before sealing). */ + sign.data = buf; + sign.len = prefix.len + body_len; + + if (pkp != NULL && auth_sign(pkp, md_nid, sign, &sig) < 0) + goto fail_buf; + + /* Append the signature so the seal covers lens ‖ data ‖ crt ‖ sig. */ + if (sig.len != 0) { + tmp = realloc(buf, prefix.len + body_len + sig.len); + if (tmp == NULL) + goto fail_sig; + + buf = tmp; + memcpy(buf + prefix.len + body_len, sig.data, sig.len); + } + + aad.data = buf; + aad.len = prefix.len; + plain.data = buf + prefix.len; + plain.len = body_len + sig.len; + + if (crypt_oneshot_seal(nid, seal_key, oap_seal_nonce, + aad, plain, out) < 0) + goto fail_sig; + + free(buf); + freebuf(sig); + + return 0; + + fail_sig: + freebuf(sig); + fail_buf: + free(buf); + return -1; +} + +/* Encode an identity-hidden response: wire = prefix ‖ oap_seal_body(...). 
*/ +static int oap_hdr_encode_sealed(struct oap_hdr * hdr, + void * pkp, + void * crt, + struct sec_config * scfg, + buffer_t rsp_tag, + int req_md_nid, + const uint8_t * seal_key) +{ + struct timespec now; + uint64_t stamp; + buffer_t der = BUF_INIT; + buffer_t sealed = BUF_INIT; + buffer_t prefix; + off_t offset; + + clock_gettime(CLOCK_REALTIME, &now); + stamp = hton64(TS_TO_UINT64(now)); + + if (crt != NULL && crypt_crt_der(crt, &der) < 0) + goto fail_der; + + prefix.len = OAP_HDR_MIN_SIZE + hdr->kex.len + rsp_tag.len; + prefix.data = malloc(prefix.len); + if (prefix.data == NULL) + goto fail_der; + + /* Cleartext crt_len/data_len are 0; real lengths prefix the seal. */ + write_oap_fixed(prefix.data, hdr, scfg, 0, 0, stamp); + offset = OAP_HDR_MIN_SIZE; + + if (hdr->kex.len != 0) + memcpy(prefix.data + offset, hdr->kex.data, hdr->kex.len); + + offset += hdr->kex.len; + + if (rsp_tag.len != 0) + memcpy(prefix.data + offset, rsp_tag.data, rsp_tag.len); + + offset += rsp_tag.len; + + assert((size_t) offset == prefix.len); + + if (oap_seal_body(hdr->nid, seal_key, pkp, scfg->d.nid, + prefix, hdr->data, der, &sealed) < 0) + goto fail_prefix; + + hdr->hdr.len = prefix.len + sealed.len; + hdr->hdr.data = malloc(hdr->hdr.len); + if (hdr->hdr.data == NULL) + goto fail_sealed; + + memcpy(hdr->hdr.data, prefix.data, prefix.len); + memcpy(hdr->hdr.data + prefix.len, sealed.data, sealed.len); + + freebuf(sealed); + free(prefix.data); + freebuf(der); + + if (oap_hdr_decode(hdr, hdr->hdr, req_md_nid, false) < 0) + goto fail_decode; + + return 0; + + fail_decode: + oap_hdr_fini(hdr); + return -1; + fail_sealed: + freebuf(sealed); + fail_prefix: + free(prefix.data); + fail_der: + freebuf(der); + return -1; +} + int oap_hdr_encode(struct oap_hdr * hdr, void * pkp, void * crt, - struct sec_config * kcfg, - buffer_t req_hash, - int req_md_nid) + struct sec_config * scfg, + buffer_t rsp_tag, + int req_md_nid, + const uint8_t * seal_key) { struct timespec now; uint64_t stamp; @@ 
-220,16 +480,15 @@ int oap_hdr_encode(struct oap_hdr * hdr, buffer_t der = BUF_INIT; buffer_t sig = BUF_INIT; buffer_t sign; - uint16_t len; - uint16_t ciph_nid; - uint16_t kdf_nid; - uint16_t md_nid; - uint16_t kex_len; off_t offset; assert(hdr != NULL); assert(hdr->id.data != NULL && hdr->id.len == OAP_ID_SIZE); - assert(kcfg != NULL); + assert(scfg != NULL); + + if (seal_key != NULL) + return oap_hdr_encode_sealed(hdr, pkp, crt, scfg, rsp_tag, + req_md_nid, seal_key); clock_gettime(CLOCK_REALTIME, &now); stamp = hton64(TS_TO_UINT64(now)); @@ -237,86 +496,40 @@ int oap_hdr_encode(struct oap_hdr * hdr, if (crt != NULL && crypt_crt_der(crt, &der) < 0) goto fail_der; - ciph_nid = hton16(hdr->nid); - kdf_nid = hton16(kcfg->k.nid); - md_nid = hton16(kcfg->d.nid); - - /* Build kex_len with flags */ - kex_len = (uint16_t) hdr->kex.len; - if (hdr->kex.len > 0 && IS_KEM_ALGORITHM(kcfg->x.str)) { - if (IS_HYBRID_KEM(kcfg->x.str)) - kex_len |= OAP_KEX_FMT_BIT; - if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP) - kex_len |= OAP_KEX_ROLE_BIT; - } - kex_len = hton16(kex_len); - - /* Fixed header (36 bytes) + variable fields + req_hash (if auth) */ + /* Fixed header (36 bytes) + variable fields + rsp_tag (rsp only) */ out.len = OAP_HDR_MIN_SIZE + der.len + hdr->kex.len + hdr->data.len + - req_hash.len; + rsp_tag.len; out.data = malloc(out.len); if (out.data == NULL) goto fail_out; - offset = 0; - - /* id (16 bytes) */ - memcpy(out.data + offset, hdr->id.data, hdr->id.len); - offset += hdr->id.len; - - /* timestamp (8 bytes) */ - memcpy(out.data + offset, &stamp, sizeof(stamp)); - offset += sizeof(stamp); - - /* cipher_nid (2 bytes) */ - memcpy(out.data + offset, &ciph_nid, sizeof(ciph_nid)); - offset += sizeof(ciph_nid); - - /* kdf_nid (2 bytes) */ - memcpy(out.data + offset, &kdf_nid, sizeof(kdf_nid)); - offset += sizeof(kdf_nid); - - /* md_nid (2 bytes) */ - memcpy(out.data + offset, &md_nid, sizeof(md_nid)); - offset += sizeof(md_nid); - - /* crt_len (2 bytes) */ - len = 
hton16((uint16_t) der.len); - memcpy(out.data + offset, &len, sizeof(len)); - offset += sizeof(len); - - /* kex_len + flags (2 bytes) */ - memcpy(out.data + offset, &kex_len, sizeof(kex_len)); - offset += sizeof(kex_len); - - /* data_len (2 bytes) */ - len = hton16((uint16_t) hdr->data.len); - memcpy(out.data + offset, &len, sizeof(len)); - offset += sizeof(len); - - /* Fixed header complete (36 bytes) */ - assert((size_t) offset == OAP_HDR_MIN_SIZE); + write_oap_fixed(out.data, hdr, scfg, der.len, hdr->data.len, stamp); + offset = OAP_HDR_MIN_SIZE; /* certificate (variable) */ if (der.len != 0) memcpy(out.data + offset, der.data, der.len); + offset += der.len; /* kex data (variable) */ if (hdr->kex.len != 0) memcpy(out.data + offset, hdr->kex.data, hdr->kex.len); + offset += hdr->kex.len; /* data (variable) */ if (hdr->data.len != 0) memcpy(out.data + offset, hdr->data.data, hdr->data.len); + offset += hdr->data.len; - /* req_hash (variable, only for authenticated responses) */ - if (req_hash.len != 0) - memcpy(out.data + offset, req_hash.data, req_hash.len); - offset += req_hash.len; + /* rsp_tag (variable, response only) */ + if (rsp_tag.len != 0) + memcpy(out.data + offset, rsp_tag.data, rsp_tag.len); + + offset += rsp_tag.len; assert((size_t) offset == out.len); @@ -324,7 +537,7 @@ int oap_hdr_encode(struct oap_hdr * hdr, sign.data = out.data; sign.len = out.len; - if (pkp != NULL && auth_sign(pkp, kcfg->d.nid, sign, &sig) < 0) + if (pkp != NULL && auth_sign(pkp, scfg->d.nid, sign, &sig) < 0) goto fail_sig; hdr->hdr = out; @@ -340,7 +553,7 @@ int oap_hdr_encode(struct oap_hdr * hdr, clrbuf(out); } - if (oap_hdr_decode(hdr, hdr->hdr, req_md_nid) < 0) + if (oap_hdr_decode(hdr, hdr->hdr, req_md_nid, false) < 0) goto fail_decode; freebuf(der); @@ -360,28 +573,99 @@ int oap_hdr_encode(struct oap_hdr * hdr, return -1; } +int oap_hdr_unseal(struct oap_hdr * hdr, + const uint8_t * key) +{ + buffer_t pt = BUF_INIT; + buffer_t prefix; + uint8_t * recon; + size_t 
body_len; + size_t pt_len; + size_t data_len; + size_t crt_len; + + assert(hdr != NULL); + assert(key != NULL); + + if (hdr->sealed.data == NULL || hdr->sealed.len == 0) + return -EINVAL; + + /* AAD prefix is fixed‖kex‖rsp_tag; sealed starts right after. */ + prefix.data = hdr->hdr.data; + prefix.len = (size_t) (hdr->sealed.data - hdr->hdr.data); + + if (crypt_oneshot_open(hdr->nid, key, oap_seal_nonce, prefix, + hdr->sealed, &pt) < 0) + return -ECRYPT; + + pt_len = pt.len; + + /* Plaintext = data_len ‖ crt_len ‖ data ‖ crt ‖ sig. */ + if (pt_len < OAP_SEAL_LENSZ) + goto fail_auth; + + data_len = (size_t) ntoh16(*(uint16_t *) pt.data); + crt_len = (size_t) ntoh16(*(uint16_t *)(pt.data + sizeof(uint16_t))); + + body_len = OAP_SEAL_LENSZ + data_len + crt_len; + if (pt_len < body_len) + goto fail_auth; + + /* Rebuild prefix ‖ lens ‖ data ‖ crt ‖ sig (whole signed region). */ + recon = malloc(prefix.len + pt_len); + if (recon == NULL) + goto fail_mem; + + memcpy(recon, prefix.data, prefix.len); + memcpy(recon + prefix.len, pt.data, pt_len); + + freebuf(pt); + + hdr->sealed_pt.data = recon; + hdr->sealed_pt.len = prefix.len + pt_len; + + hdr->data.data = recon + prefix.len + OAP_SEAL_LENSZ; + hdr->data.len = data_len; + hdr->crt.data = recon + prefix.len + OAP_SEAL_LENSZ + data_len; + hdr->crt.len = crt_len; + hdr->sig.data = recon + prefix.len + body_len; + hdr->sig.len = pt_len - body_len; + + return 0; + + fail_mem: + freebuf(pt); + return -ENOMEM; + fail_auth: + freebuf(pt); + return -EAUTH; +} + #ifdef DEBUG_PROTO_OAP #define OAP_KEX_IS_KEM(hdr) ((hdr)->kex_flags.role | (hdr)->kex_flags.fmt) static void debug_oap_hdr(const struct oap_hdr * hdr) { assert(hdr); + if (hdr->sealed.len > 0) + log_proto(" Sealed block: [%zu bytes] on wire", + hdr->sealed.len); + if (hdr->crt.len > 0) log_proto(" crt: [%zu bytes]", hdr->crt.len); + else if (hdr->sealed.len > 0) + log_proto(" crt: <sealed>"); else log_proto(" crt: <none>"); if (hdr->kex.len > 0) { if (OAP_KEX_IS_KEM(hdr)) 
- log_proto(" Key Exchange Data:" - " [%zu bytes] [%s]", + log_proto(" Key Exchange Data: [%zu bytes] [%s]", hdr->kex.len, hdr->kex_flags.role ? - "Client encaps" : - "Server encaps"); + "Client encaps" : "Server encaps"); else - log_proto(" Key Exchange Data:" - " [%zu bytes]", + log_proto(" Key Exchange Data: [%zu bytes]", hdr->kex.len); } else log_proto(" Key Exchange Data: <none>"); @@ -403,16 +687,20 @@ static void debug_oap_hdr(const struct oap_hdr * hdr) if (hdr->data.len > 0) log_proto(" Data: [%zu bytes]", hdr->data.len); + else if (hdr->sealed.len > 0) + log_proto(" Data: <sealed>"); else log_proto(" Data: <none>"); - if (hdr->req_hash.len > 0) - log_proto(" Req Hash: [%zu bytes]", hdr->req_hash.len); + if (hdr->rsp_tag.len > 0) + log_proto(" Rsp Tag: [%zu bytes]", hdr->rsp_tag.len); else - log_proto(" Req Hash: <none>"); + log_proto(" Rsp Tag: <none>"); if (hdr->sig.len > 0) log_proto(" Signature: [%zu bytes]", hdr->sig.len); + else if (hdr->sealed.len > 0) + log_proto(" Signature: <sealed>"); else log_proto(" Signature: <none>"); } @@ -432,8 +720,9 @@ void debug_oap_hdr_rcv(const struct oap_hdr * hdr) tm = gmtime(&stamp); strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm); - log_proto("OAP_HDR [" HASH_FMT64 " @ %s ] <--", - HASH_VAL64(hdr->id.data), tmstr); + log_proto("OAP_HDR [" HASH_FMT64 " @ %s ]%s <--", + HASH_VAL64(hdr->id.data), tmstr, + hdr->sealed.len > 0 ? " [sealed]" : ""); debug_oap_hdr(hdr); #else @@ -455,8 +744,9 @@ void debug_oap_hdr_snd(const struct oap_hdr * hdr) tm = gmtime(&stamp); strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm); - log_proto("OAP_HDR [" HASH_FMT64 " @ %s ] -->", - HASH_VAL64(hdr->id.data), tmstr); + log_proto("OAP_HDR [" HASH_FMT64 " @ %s ]%s -->", + HASH_VAL64(hdr->id.data), tmstr, + hdr->sealed.len > 0 ? 
" [sealed]" : ""); debug_oap_hdr(hdr); #else diff --git a/src/irmd/oap/hdr.h b/src/irmd/oap/hdr.h index 6016452c..1a599727 100644 --- a/src/irmd/oap/hdr.h +++ b/src/irmd/oap/hdr.h @@ -43,6 +43,9 @@ #define OAP_KEX_IS_RAW_FMT(hdr) (((hdr)->kex_flags.fmt) == 1) /* + * Plaintext layout (request, and unencrypted/signed response). The + * signature covers the whole packet except itself. + * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+ @@ -83,8 +86,8 @@ * | | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | * | | | - * + req_hash (variable, response only) + | - * | H(request) using req md_nid / sha384 | | + * + rsp_tag (variable, response only) + | + * | key-confirm tag (enc), else H(request) | | * | | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+ * | | @@ -92,6 +95,25 @@ * | DSA signature over signed region | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * + * Encrypted response - wire layout. The certificate, application data and + * signature are AEAD-sealed - hiding the server identity and the cert/data + * sizes; kex and rsp_tag move ahead of the sealed block as cleartext AAD. 
+ * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+ + * | fixed header (36 bytes, see above) | | + * + id, timestamp, NIDs, crt_len=0, kex_len, data_len=0 + | AAD + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + * | kex_data (variable) | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + * | rsp_tag (variable, response only) | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+ + * | SEAL( data_len ‖ crt_len ‖ data ‖ crt ‖ sig ) | | + * + encrypted cert, app data and signature + | Sealed + * | + AEAD tag (128 bits) | | area + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+ + * * cipher_nid: NID value for symmetric cipher (0 = none) * kdf_nid: NID value for KDF function (0 = none) * md_nid: NID value for signature hash (0 = PQC/no signature) @@ -105,6 +127,11 @@ * Request: sig_len = total - 36 - crt_len - kex_len - data_len * Response: sig_len = total - 36 - crt_len - kex_len - data_len - hash_len * where hash_len = md_len(req_md_nid / sha384) + * + * The signed plaintext inside the seal is prefix ‖ data_len ‖ crt_len ‖ + * data ‖ crt ‖ sig; the cleartext prefix (fixed ‖ kex ‖ rsp_tag) is the + * AEAD AAD. Cleartext crt_len/data_len are 0 - the real lengths are sealed, + * hiding the cert and data sizes; oap_hdr_unseal reads them to split. 
*/ /* Parsed OAP header - buffers pointing to a single memory region */ @@ -120,12 +147,15 @@ struct oap_hdr { bool fmt; /* Format */ bool role; /* Role */ } kex_flags; + buffer_t id; buffer_t crt; buffer_t kex; buffer_t data; - buffer_t req_hash; /* H(request) - response only */ + buffer_t rsp_tag; /* key-confirm tag / H(req), rsp only */ buffer_t sig; + buffer_t sealed; /* wire ciphertext ‖ tag (sealed rsp) */ + buffer_t sealed_pt; /* prefix‖lens‖data‖crt‖sig, owned */ buffer_t hdr; }; @@ -141,13 +171,19 @@ void oap_hdr_fini(struct oap_hdr * oap_hdr); int oap_hdr_encode(struct oap_hdr * hdr, void * pkp, void * crt, - struct sec_config * kcfg, - buffer_t req_hash, - int req_md_nid); + struct sec_config * scfg, + buffer_t rsp_tag, + int req_md_nid, + const uint8_t * seal_key); int oap_hdr_decode(struct oap_hdr * hdr, buffer_t buf, - int req_md_nid); + int req_md_nid, + bool rekey); + +/* Decrypt a sealed response identity block; fills data, crt and sig. */ +int oap_hdr_unseal(struct oap_hdr * hdr, + const uint8_t * key); void debug_oap_hdr_rcv(const struct oap_hdr * hdr); diff --git a/src/irmd/oap/internal.h b/src/irmd/oap/internal.h index 6dd44d56..4a156723 100644 --- a/src/irmd/oap/internal.h +++ b/src/irmd/oap/internal.h @@ -36,12 +36,13 @@ int oap_check_hdr(const struct oap_hdr * hdr); -int oap_auth_peer(char * name, - const struct oap_hdr * local_hdr, - const struct oap_hdr * peer_hdr); +int oap_auth_peer(char * name, + const struct sec_config * cfg, + const struct oap_hdr * local_hdr, + const struct oap_hdr * peer_hdr); int oap_negotiate_cipher(const struct oap_hdr * peer_hdr, - struct sec_config * kcfg); + struct sec_config * scfg); #ifndef OAP_TEST_MODE int load_credentials(const char * name, @@ -49,7 +50,7 @@ int load_credentials(const char * name, void ** pkp, void ** crt); -int load_kex_config(const char * name, +int load_sec_config(const char * name, const char * path, struct sec_config * cfg); #endif @@ -59,7 +60,7 @@ int load_srv_credentials(const 
struct name_info * info, void ** pkp, void ** crt); -int load_srv_kex_config(const struct name_info * info, +int load_srv_sec_config(const struct name_info * info, struct sec_config * cfg); int load_server_kem_keypair(const char * name, @@ -69,7 +70,7 @@ int load_server_kem_keypair(const char * name, extern int load_srv_credentials(const struct name_info * info, void ** pkp, void ** crt); -extern int load_srv_kex_config(const struct name_info * info, +extern int load_srv_sec_config(const struct name_info * info, struct sec_config * cfg); extern int load_server_kem_keypair(const char * name, struct sec_config * cfg, @@ -78,7 +79,7 @@ extern int load_server_kem_keypair(const char * name, int do_server_kex(const struct name_info * info, struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, buffer_t * kex, struct crypt_sk * sk); @@ -87,7 +88,7 @@ int load_cli_credentials(const struct name_info * info, void ** pkp, void ** crt); -int load_cli_kex_config(const struct name_info * info, +int load_cli_sec_config(const struct name_info * info, struct sec_config * cfg); int load_server_kem_pk(const char * name, @@ -97,21 +98,21 @@ int load_server_kem_pk(const char * name, extern int load_cli_credentials(const struct name_info * info, void ** pkp, void ** crt); -extern int load_cli_kex_config(const struct name_info * info, +extern int load_cli_sec_config(const struct name_info * info, struct sec_config * cfg); extern int load_server_kem_pk(const char * name, struct sec_config * cfg, buffer_t * pk); #endif -int oap_client_kex_prepare(struct sec_config * kcfg, +int oap_client_kex_prepare(struct sec_config * scfg, buffer_t server_pk, buffer_t * kex, uint8_t * key, void ** ephemeral_pkp); int oap_client_kex_complete(const struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, void * pkp, uint8_t * key); diff --git a/src/irmd/oap/io.c b/src/irmd/oap/io.c index c2c91b91..b5daa432 100644 --- a/src/irmd/oap/io.c +++ 
b/src/irmd/oap/io.c @@ -50,11 +50,17 @@ static bool file_exists(const char * path) { struct stat s; - if (stat(path, &s) < 0 && errno == ENOENT) { + if (stat(path, &s) == 0) + return true; + + if (errno == ENOENT) { log_dbg("File %s does not exist.", path); return false; } + /* Can't stat for another reason; assume present, fail on load */ + log_warn("Failed to stat %s: %s.", path, strerror(errno)); + return true; } @@ -96,16 +102,16 @@ int load_credentials(const char * name, return -EAUTH; } -int load_kex_config(const char * name, +int load_sec_config(const char * name, const char * path, struct sec_config * cfg) { + void * pin; + assert(name != NULL); assert(cfg != NULL); - memset(cfg, 0, sizeof(*cfg)); - - /* Load encryption config */ + /* Load security config */ if (!file_exists(path)) log_dbg("No encryption %s for %s.", path, name); @@ -114,6 +120,15 @@ int load_kex_config(const char * name, return -1; } + if (cfg->a.cacert[0] != '\0') { + if (crypt_load_crt_file(cfg->a.cacert, &pin) < 0) { + log_err("Failed to load pinned CA %s for %s.", + cfg->a.cacert, name); + return -EAUTH; + } + crypt_free_crt(pin); + } + if (!IS_KEX_ALGO_SET(cfg)) { log_info("Key exchange not configured for %s.", name); return 0; @@ -125,8 +140,13 @@ int load_kex_config(const char * name, return -ENOTSUP; } #endif - if (cfg->c.nid == NID_undef) { - log_err("Invalid cipher for %s.", name); + if (crypt_kex_rank(cfg->x.nid) < 1) { + log_err("Key exchange not supported for %s.", name); + return -ENOTSUP; + } + + if (crypt_cipher_rank(cfg->c.nid) < 1) { + log_err("Cipher not supported for %s.", name); return -ECRYPT; } diff --git a/src/irmd/oap/io.h b/src/irmd/oap/io.h index 2d47c62f..953e3898 100644 --- a/src/irmd/oap/io.h +++ b/src/irmd/oap/io.h @@ -32,7 +32,7 @@ int load_credentials(const char * name, void ** pkp, void ** crt); -int load_kex_config(const char * name, +int load_sec_config(const char * name, const char * path, struct sec_config * cfg); #endif diff --git a/src/irmd/oap/srv.c 
b/src/irmd/oap/srv.c index afc54acc..5d631618 100644 --- a/src/irmd/oap/srv.c +++ b/src/irmd/oap/srv.c @@ -49,7 +49,7 @@ extern int load_srv_credentials(const struct name_info * info, void ** pkp, void ** crt); -extern int load_srv_kex_config(const struct name_info * info, +extern int load_srv_sec_config(const struct name_info * info, struct sec_config * cfg); extern int load_server_kem_keypair(const char * name, bool raw_fmt, @@ -67,13 +67,16 @@ int load_srv_credentials(const struct name_info * info, return load_credentials(info->name, &info->s, pkp, crt); } -int load_srv_kex_config(const struct name_info * info, +int load_srv_sec_config(const struct name_info * info, struct sec_config * cfg) { assert(info != NULL); assert(cfg != NULL); - return load_kex_config(info->name, info->s.enc, cfg); + memset(cfg, 0, sizeof(*cfg)); + + /* Client auth stays opt-in (mTLS); enable with auth=required */ + return load_sec_config(info->name, info->s.sec, cfg); } int load_server_kem_keypair(const char * name, @@ -135,7 +138,7 @@ static int get_algo_from_peer_key(const struct oap_hdr * peer_hdr, } static int negotiate_cipher(const struct oap_hdr * peer_hdr, - struct sec_config * kcfg) + struct sec_config * scfg) { uint8_t * id = peer_hdr->id.data; int cli_nid; @@ -143,27 +146,25 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr, int srv_rank; /* Cipher: select the strongest of client and server */ - cli_nid = peer_hdr->cipher_str != NULL - ? 
(int) crypt_str_to_nid(peer_hdr->cipher_str) - : NID_undef; + if (peer_hdr->cipher_str != NULL) + cli_nid = (int) crypt_str_to_nid(peer_hdr->cipher_str); + else + cli_nid = NID_undef; - if (cli_nid != NID_undef - && crypt_cipher_rank(cli_nid) < 0) { + if (cli_nid != NID_undef && crypt_cipher_rank(cli_nid) < 0) { log_err_id(id, "Unsupported cipher '%s'.", peer_hdr->cipher_str); return -ENOTSUP; } cli_rank = crypt_cipher_rank(cli_nid); - srv_rank = crypt_cipher_rank(kcfg->c.nid); + srv_rank = crypt_cipher_rank(scfg->c.nid); if (cli_rank > srv_rank) { - SET_KEX_CIPHER_NID(kcfg, cli_nid); - log_dbg_id(id, "Selected client cipher %s.", - kcfg->c.str); + SET_KEX_CIPHER_NID(scfg, cli_nid); + log_dbg_id(id, "Selected client cipher %s.", scfg->c.str); } else if (srv_rank > 0) { - log_dbg_id(id, "Selected server cipher %s.", - kcfg->c.str); + log_dbg_id(id, "Selected server cipher %s.", scfg->c.str); } else { log_err_id(id, "Encryption requested, no cipher."); return -ECRYPT; @@ -178,31 +179,27 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr, } cli_rank = crypt_kdf_rank(peer_hdr->kdf_nid); - srv_rank = crypt_kdf_rank(kcfg->k.nid); + srv_rank = crypt_kdf_rank(scfg->k.nid); - /* - * For client-encap KEM, the KDF is baked into - * the ciphertext. The server must use the client's - * KDF and can only verify the minimum. - */ + /* Client-encap KEM bakes KDF into ciphertext; verify min. 
*/ if (OAP_KEX_ROLE(peer_hdr) == KEM_MODE_CLIENT_ENCAP) { if (srv_rank > cli_rank) { log_err_id(id, "Client KDF too weak."); return -ECRYPT; } - SET_KEX_KDF_NID(kcfg, peer_hdr->kdf_nid); + SET_KEX_KDF_NID(scfg, peer_hdr->kdf_nid); } else if (cli_rank > srv_rank) { - SET_KEX_KDF_NID(kcfg, peer_hdr->kdf_nid); + SET_KEX_KDF_NID(scfg, peer_hdr->kdf_nid); log_dbg_id(id, "Selected client KDF %s.", - md_nid_to_str(kcfg->k.nid)); + md_nid_to_str(scfg->k.nid)); } else if (srv_rank > 0) { log_dbg_id(id, "Selected server KDF %s.", - md_nid_to_str(kcfg->k.nid)); + md_nid_to_str(scfg->k.nid)); } - if (IS_KEX_ALGO_SET(kcfg)) + if (IS_KEX_ALGO_SET(scfg)) log_info_id(id, "Negotiated %s + %s.", - kcfg->x.str, kcfg->c.str); + scfg->x.str, scfg->c.str); else log_info_id(id, "No key exchange."); @@ -211,7 +208,7 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr, static int do_server_kem_decap(const struct name_info * info, const struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, struct crypt_sk * sk) { buffer_t ct; @@ -228,7 +225,7 @@ static int do_server_kem_decap(const struct name_info * info, ct.data = peer_hdr->kex.data; ct.len = peer_hdr->kex.len; - ret = kex_kem_decap(server_pkp, ct, kcfg->k.nid, sk->key); + ret = kex_kem_decap(server_pkp, ct, scfg->k.nid, sk->key); crypt_free_key(server_pkp); @@ -243,7 +240,7 @@ static int do_server_kem_decap(const struct name_info * info, } static int do_server_kem_encap(const struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, buffer_t * kex, struct crypt_sk * sk) { @@ -254,12 +251,12 @@ static int do_server_kem_encap(const struct oap_hdr * peer_hdr, client_pk.data = peer_hdr->kex.data; client_pk.len = peer_hdr->kex.len; - if (IS_HYBRID_KEM(kcfg->x.str)) + if (IS_HYBRID_KEM(scfg->x.str)) ct_len = kex_kem_encap_raw(client_pk, kex->data, - kcfg->k.nid, sk->key); + scfg->k.nid, sk->key); else ct_len = kex_kem_encap(client_pk, kex->data, - kcfg->k.nid, sk->key); + 
scfg->k.nid, sk->key); if (ct_len < 0) { log_err_id(id, "Failed to encapsulate KEM."); @@ -275,26 +272,26 @@ static int do_server_kem_encap(const struct oap_hdr * peer_hdr, static int do_server_kex_kem(const struct name_info * info, struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, buffer_t * kex, struct crypt_sk * sk) { int ret; - kcfg->x.mode = peer_hdr->kex_flags.role; + scfg->x.mode = peer_hdr->kex_flags.role; - if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP) { - ret = do_server_kem_decap(info, peer_hdr, kcfg, sk); + if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP) { + ret = do_server_kem_decap(info, peer_hdr, scfg, sk); kex->len = 0; } else { - ret = do_server_kem_encap(peer_hdr, kcfg, kex, sk); + ret = do_server_kem_encap(peer_hdr, scfg, kex, sk); } return ret; } static int do_server_kex_dhe(const struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, buffer_t * kex, struct crypt_sk * sk) { @@ -303,7 +300,7 @@ static int do_server_kex_dhe(const struct oap_hdr * peer_hdr, int ret; uint8_t * id = peer_hdr->id.data; - key_len = kex_pkp_create(kcfg, &epkp, kex->data); + key_len = kex_pkp_create(scfg, &epkp, kex->data); if (key_len < 0) { log_err_id(id, "Failed to generate key pair."); return -ECRYPT; @@ -311,9 +308,9 @@ static int do_server_kex_dhe(const struct oap_hdr * peer_hdr, kex->len = (size_t) key_len; - log_dbg_id(id, "Generated %s ephemeral keys.", kcfg->x.str); + log_dbg_id(id, "Generated %s ephemeral keys.", scfg->x.str); - ret = kex_dhe_derive(kcfg, epkp, peer_hdr->kex, sk->key); + ret = kex_dhe_derive(scfg, epkp, peer_hdr->kex, sk->key); if (ret < 0) { log_err_id(id, "Failed to derive secret."); kex_pkp_destroy(epkp); @@ -327,7 +324,7 @@ static int do_server_kex_dhe(const struct oap_hdr * peer_hdr, int do_server_kex(const struct name_info * info, struct oap_hdr * peer_hdr, - struct sec_config * kcfg, + struct sec_config * scfg, buffer_t * kex, struct crypt_sk * sk) { @@ -339,60 +336,71 @@ int 
do_server_kex(const struct name_info * info, /* No KEX data from client */ if (peer_hdr->kex.len == 0) { - if (IS_KEX_ALGO_SET(kcfg)) { + if (IS_KEX_ALGO_SET(scfg)) { log_warn_id(id, "KEX requested without info."); return -ECRYPT; } return 0; } - if (negotiate_cipher(peer_hdr, kcfg) < 0) + if (negotiate_cipher(peer_hdr, scfg) < 0) return -ECRYPT; /* Save server's configured KEX before overwriting */ - srv_kex_nid = kcfg->x.nid; + srv_kex_nid = scfg->x.nid; if (OAP_KEX_ROLE(peer_hdr) != KEM_MODE_CLIENT_ENCAP) { /* Server encapsulation or DHE: extract algo from DER PK */ if (get_algo_from_peer_key(peer_hdr, algo_buf) < 0) return -ECRYPT; - SET_KEX_ALGO(kcfg, algo_buf); + SET_KEX_ALGO(scfg, algo_buf); /* Reject if client KEX is weaker than server's */ - if (crypt_kex_rank(kcfg->x.nid) + if (crypt_kex_rank(scfg->x.nid) < crypt_kex_rank(srv_kex_nid)) { log_err_id(id, "Client KEX %s too weak.", - kcfg->x.str); + scfg->x.str); return -ECRYPT; } } /* Dispatch based on algorithm type */ - if (IS_KEM_ALGORITHM(kcfg->x.str)) - return do_server_kex_kem(info, peer_hdr, kcfg, kex, sk); + if (IS_KEM_ALGORITHM(scfg->x.str)) + return do_server_kex_kem(info, peer_hdr, scfg, kex, sk); else - return do_server_kex_dhe(peer_hdr, kcfg, kex, sk); + return do_server_kex_dhe(peer_hdr, scfg, kex, sk); } int oap_srv_process(const struct name_info * info, buffer_t req_buf, buffer_t * rsp_buf, buffer_t * data, - struct crypt_sk * sk) + struct crypt_sk * sk, + bool rekey, + const buffer_t * cached_crt, + buffer_t * peer_crt) { struct oap_hdr peer_hdr; struct oap_hdr local_hdr; - struct sec_config kcfg; + struct sec_config scfg; uint8_t kex_buf[CRYPT_KEY_BUFSZ]; uint8_t hash_buf[MAX_HASH_SIZE]; - buffer_t req_hash = BUF_INIT; + uint8_t kc_buf[MAX_HASH_SIZE]; + uint8_t resp_hash_buf[MAX_HASH_SIZE]; + uint8_t hs_key[SYMMKEYSZ]; + const uint8_t * seal_key = NULL; + buffer_t req_hash = BUF_INIT; + buffer_t resp_hash = BUF_INIT; + buffer_t crt_der = BUF_INIT; + buffer_t rsp_tag = BUF_INIT; ssize_t 
hash_ret; - char cli_name[NAME_SIZE + 1]; /* TODO */ + char cli_name[NAME_SIZE + 1]; uint8_t * id; void * pkp = NULL; void * crt = NULL; int req_md_nid; + int ret; assert(info != NULL); assert(rsp_buf != NULL); @@ -412,13 +420,19 @@ int oap_srv_process(const struct name_info * info, goto fail_cred; } - if (load_srv_kex_config(info, &kcfg) < 0) { - log_err("Failed to load KEX config for %s.", info->name); + /* Re-key omits the cert; the peer verifies against its cache. */ + if (rekey && crt != NULL) { + crypt_free_crt(crt); + crt = NULL; + } + + if (load_srv_sec_config(info, &scfg) < 0) { + log_err("Failed to load security config for %s.", info->name); goto fail_kex; } /* Decode incoming header (NID_undef = request, no hash) */ - if (oap_hdr_decode(&peer_hdr, req_buf, NID_undef) < 0) { + if (oap_hdr_decode(&peer_hdr, req_buf, NID_undef, rekey) < 0) { log_err("Failed to decode OAP header."); goto fail_auth; } @@ -427,22 +441,38 @@ int oap_srv_process(const struct name_info * info, id = peer_hdr.id.data; /* Logging */ - if (oap_check_hdr(&peer_hdr) < 0) { - log_err_id(id, "OAP header failed replay check."); + ret = oap_check_hdr(&peer_hdr); + if (ret == -EREPLAY) { + log_warn_id(id, "OAP header failed replay check."); + goto fail_replay; + } + if (ret < 0) { + log_err_id(id, "OAP header check failed."); goto fail_auth; } oap_hdr_init(&local_hdr, peer_hdr.id, kex_buf, *data, NID_undef); - if (oap_auth_peer(cli_name, &local_hdr, &peer_hdr) < 0) { + if (oap_auth_peer(cli_name, &scfg, &local_hdr, &peer_hdr, + cached_crt) < 0) { log_err_id(id, "Failed to authenticate client."); goto fail_auth; } - if (do_server_kex(info, &peer_hdr, &kcfg, &local_hdr.kex, sk) < 0) + /* Surface the peer cert so the caller can cache it for re-key. 
*/ + if (peer_crt != NULL && peer_hdr.crt.len > 0) { + peer_crt->data = malloc(peer_hdr.crt.len); + if (peer_crt->data == NULL) + goto fail_auth; + + memcpy(peer_crt->data, peer_hdr.crt.data, peer_hdr.crt.len); + peer_crt->len = peer_hdr.crt.len; + } + + if (do_server_kex(info, &peer_hdr, &scfg, &local_hdr.kex, sk) < 0) goto fail_kex; - sk->nid = kcfg.c.nid; + sk->nid = scfg.c.nid; /* Build response header with hash of client request */ local_hdr.nid = sk->nid; @@ -458,10 +488,58 @@ int oap_srv_process(const struct name_info * info, goto fail_auth; } req_hash.data = hash_buf; - req_hash.len = (size_t) hash_ret; + req_hash.len = (size_t) hash_ret; - if (oap_hdr_encode(&local_hdr, pkp, crt, &kcfg, - req_hash, req_md_nid) < 0) { + rsp_tag = req_hash; + + /* Bind the key to the transcript and confirm it to the client */ + if (sk->nid != NID_undef) { + if (crt != NULL && crypt_crt_der(crt, &crt_der) < 0) { + log_err_id(id, "Failed to serialize cert."); + goto fail_auth; + } + + resp_hash.data = resp_hash_buf; + + ret = oap_resp_hash(req_md_nid, local_hdr.kex, *data, + crt_der, &resp_hash); + + freebuf(crt_der); + + if (ret < 0) { + log_err_id(id, "Failed to hash response."); + goto fail_auth; + } + + /* Derive the identity-seal key before bind mutates sk->key */ + if (oap_derive_hs_key(sk, req_hash, hs_key) < 0) { + log_err_id(id, "Failed to derive handshake key."); + goto fail_auth; + } + + seal_key = hs_key; + + if (oap_bind_session_key(sk, req_hash, resp_hash, + scfg.k.nid) < 0) { + log_err_id(id, "Failed to bind session key."); + goto fail_auth; + } + + if (oap_key_confirm_tag(sk, req_hash, resp_hash, kc_buf, + (size_t) hash_ret) < 0) { + log_err_id(id, "Failed to confirm session key."); + goto fail_auth; + } + + rsp_tag.data = kc_buf; + } + + ret = oap_hdr_encode(&local_hdr, pkp, crt, &scfg, + rsp_tag, req_md_nid, seal_key); + + crypt_secure_clear(hs_key, SYMMKEYSZ); + + if (ret < 0) { log_err_id(id, "Failed to create OAP response header."); goto fail_auth; } @@ 
-486,11 +564,17 @@ int oap_srv_process(const struct name_info * info, fail_data: oap_hdr_fini(&local_hdr); fail_auth: + crypt_secure_clear(hs_key, SYMMKEYSZ); crypt_free_crt(crt); crypt_free_key(pkp); fail_cred: return -EAUTH; + fail_replay: + crypt_free_crt(crt); + crypt_free_key(pkp); + return -EREPLAY; + fail_kex: crypt_free_crt(crt); crypt_free_key(pkp); diff --git a/src/irmd/oap/tests/common.c b/src/irmd/oap/tests/common.c index 0a1af100..49ea9187 100644 --- a/src/irmd/oap/tests/common.c +++ b/src/irmd/oap/tests/common.c @@ -29,39 +29,51 @@ #include <string.h> #include <stdio.h> -int load_srv_kex_config(const struct name_info * info, +int load_srv_sec_config(const struct name_info * info, struct sec_config * cfg) { (void) info; memset(cfg, 0, sizeof(*cfg)); + cfg->a.req = test_cfg.srv.req_auth; + if (test_cfg.srv.cacert != NULL) + strcpy(cfg->a.cacert, test_cfg.srv.cacert); + + /* Digest is kept without kex, as in parse_sec_config */ + SET_KEX_DIGEST_NID(cfg, test_cfg.srv.md); + if (test_cfg.srv.kex == NID_undef) return 0; SET_KEX_ALGO_NID(cfg, test_cfg.srv.kex); SET_KEX_CIPHER_NID(cfg, test_cfg.srv.cipher); SET_KEX_KDF_NID(cfg, test_cfg.srv.kdf); - SET_KEX_DIGEST_NID(cfg, test_cfg.srv.md); SET_KEX_KEM_MODE(cfg, test_cfg.srv.kem_mode); return 0; } -int load_cli_kex_config(const struct name_info * info, +int load_cli_sec_config(const struct name_info * info, struct sec_config * cfg) { (void) info; memset(cfg, 0, sizeof(*cfg)); + cfg->a.req = test_cfg.cli.req_auth; + if (test_cfg.cli.cacert != NULL) + strcpy(cfg->a.cacert, test_cfg.cli.cacert); + + /* Digest is kept without kex, as in parse_sec_config */ + SET_KEX_DIGEST_NID(cfg, test_cfg.cli.md); + if (test_cfg.cli.kex == NID_undef) return 0; SET_KEX_ALGO_NID(cfg, test_cfg.cli.kex); SET_KEX_CIPHER_NID(cfg, test_cfg.cli.cipher); SET_KEX_KDF_NID(cfg, test_cfg.cli.kdf); - SET_KEX_DIGEST_NID(cfg, test_cfg.cli.md); SET_KEX_KEM_MODE(cfg, test_cfg.cli.kem_mode); return 0; @@ -152,13 +164,15 @@ void 
oap_test_teardown(struct oap_test_ctx * ctx) if (ctx->cli.state != NULL) { res.key = ctx->cli.key; oap_cli_complete(ctx->cli.state, &ctx->cli.info, dummy, - &ctx->data, &res); + &ctx->data, &res, NULL, NULL); ctx->cli.state = NULL; } freebuf(ctx->data); freebuf(ctx->resp_hdr); freebuf(ctx->req_hdr); + freebuf(ctx->srv_crt); + freebuf(ctx->cli_crt); crypt_free_crt(ctx->im_ca); crypt_free_crt(ctx->root_ca); @@ -170,7 +184,7 @@ void oap_test_teardown(struct oap_test_ctx * ctx) int oap_cli_prepare_ctx(struct oap_test_ctx * ctx) { return oap_cli_prepare(&ctx->cli.state, &ctx->cli.info, &ctx->req_hdr, - ctx->data); + ctx->data, ctx->rekey); } int oap_srv_process_ctx(struct oap_test_ctx * ctx) @@ -179,7 +193,9 @@ int oap_srv_process_ctx(struct oap_test_ctx * ctx) int ret; ret = oap_srv_process(&ctx->srv.info, ctx->req_hdr, - &ctx->resp_hdr, &ctx->data, &res); + &ctx->resp_hdr, &ctx->data, &res, ctx->rekey, + ctx->rekey ? &ctx->srv_crt : NULL, + ctx->rekey ? NULL : &ctx->srv_crt); if (ret == 0) ctx->srv.nid = res.nid; @@ -192,7 +208,9 @@ int oap_cli_complete_ctx(struct oap_test_ctx * ctx) int ret; ret = oap_cli_complete(ctx->cli.state, &ctx->cli.info, ctx->resp_hdr, - &ctx->data, &res); + &ctx->data, &res, + ctx->rekey ? &ctx->cli_crt : NULL, + ctx->rekey ? NULL : &ctx->cli_crt); ctx->cli.state = NULL; if (ret == 0) @@ -243,6 +261,147 @@ int roundtrip_auth_only(const char * root_ca, return TEST_RC_FAIL; } +int roundtrip_rekey(const char * root_ca, + const char * im_ca_str) +{ + struct oap_test_ctx ctx; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca, im_ca_str) < 0) + goto fail; + + /* Initial handshake: the client caches the server cert. 
*/ + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Initial client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Initial server process failed.\n"); + goto fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) < 0) { + printf("Initial client complete failed.\n"); + goto fail_cleanup; + } + + if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) { + printf("Initial keys do not match.\n"); + goto fail_cleanup; + } + + if (ctx.cli_crt.len == 0) { + printf("Server cert was not cached for re-key.\n"); + goto fail_cleanup; + } + + /* Re-key: cert dropped on the wire, verified against the cache. */ + freebuf(ctx.req_hdr); + freebuf(ctx.resp_hdr); + freebuf(ctx.data); + + ctx.rekey = true; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Re-key client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Re-key server process failed.\n"); + goto fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) < 0) { + printf("Re-key client complete failed.\n"); + goto fail_cleanup; + } + + if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) { + printf("Re-key keys do not match.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int roundtrip_rekey_badcache(const char * root_ca, + const char * im_ca_str) +{ + struct oap_test_ctx ctx; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca, im_ca_str) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Initial client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Initial server process failed.\n"); + goto fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) < 0) { + printf("Initial client complete failed.\n"); + goto fail_cleanup; + } + + if (ctx.cli_crt.len == 0) { + printf("Server cert was not cached.\n"); + goto fail_cleanup; 
+ } + + /* Corrupt the cached cert: the re-key must fail closed. */ + ctx.cli_crt.data[ctx.cli_crt.len / 2] ^= 0xFF; + + freebuf(ctx.req_hdr); + freebuf(ctx.resp_hdr); + freebuf(ctx.data); + + ctx.rekey = true; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Re-key client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Re-key server process failed.\n"); + goto fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Re-key accepted a corrupted cached cert.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + int roundtrip_kex_only(void) { struct name_info cli_info; @@ -271,14 +430,15 @@ int roundtrip_kex_only(void) } if (oap_cli_prepare(&cli_state, &cli_info, &req_hdr, - data) < 0) { + data, false) < 0) { printf("Client prepare failed.\n"); goto fail_cleanup; } res.key = srv_key; - if (oap_srv_process(&srv_info, req_hdr, &resp_hdr, &data, &res) < 0) { + if (oap_srv_process(&srv_info, req_hdr, &resp_hdr, &data, &res, + false, NULL, NULL) < 0) { printf("Server process failed.\n"); goto fail_cleanup; } @@ -287,7 +447,8 @@ int roundtrip_kex_only(void) res.key = cli_key; - if (oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, &res) < 0) { + if (oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, &res, + NULL, NULL) < 0) { printf("Client complete failed.\n"); cli_state = NULL; goto fail_cleanup; @@ -316,7 +477,8 @@ int roundtrip_kex_only(void) fail_cleanup: if (cli_state != NULL) { res.key = cli_key; - oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, &res); + oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, + &res, NULL, NULL); } freebuf(resp_hdr); freebuf(req_hdr); @@ -396,7 +558,7 @@ int corrupted_response(const char * root_ca, res.key = ctx.cli.key; if (oap_cli_complete(ctx.cli.state, &ctx.cli.info, ctx.resp_hdr, - &ctx.data, &res) == 
0) { + &ctx.data, &res, NULL, NULL) == 0) { printf("Client should reject corrupted response.\n"); ctx.cli.state = NULL; goto fail_cleanup; diff --git a/src/irmd/oap/tests/common.h b/src/irmd/oap/tests/common.h index d4b6733a..c47096fb 100644 --- a/src/irmd/oap/tests/common.h +++ b/src/irmd/oap/tests/common.h @@ -32,12 +32,14 @@ /* Per-side security configuration for tests */ struct test_sec_cfg { - int kex; /* KEX algorithm NID */ - int cipher; /* Cipher NID for encryption */ - int kdf; /* KDF NID for key derivation */ - int md; /* Digest NID for signatures */ - int kem_mode; /* KEM encapsulation mode (0 for ECDH) */ - bool auth; /* Use authentication (certificates) */ + int kex; /* KEX algorithm NID */ + int cipher; /* Cipher NID for encryption */ + int kdf; /* KDF NID for key derivation */ + int md; /* Digest NID for signatures */ + int kem_mode; /* KEM encapsulation mode (0 for ECDH) */ + bool auth; /* Use authentication (certificates) */ + bool req_auth; /* Require peer authentication */ + const char * cacert; /* Pinned issuing CA path */ }; /* Test configuration - set by each test before running roundtrip */ @@ -69,6 +71,11 @@ struct oap_test_ctx { buffer_t data; void * root_ca; void * im_ca; + + /* Re-key (tier iii): drop the cert, verify against the cache. 
*/ + bool rekey; + buffer_t srv_crt; /* client cert cached by server */ + buffer_t cli_crt; /* server cert cached by client */ }; int oap_test_setup(struct oap_test_ctx * ctx, @@ -86,6 +93,12 @@ int oap_cli_complete_ctx(struct oap_test_ctx * ctx); int roundtrip_auth_only(const char * root_ca, const char * im_ca_str); +int roundtrip_rekey(const char * root_ca, + const char * im_ca_str); + +int roundtrip_rekey_badcache(const char * root_ca, + const char * im_ca_str); + int roundtrip_kex_only(void); int corrupted_request(const char * root_ca, diff --git a/src/irmd/oap/tests/oap_test.c b/src/irmd/oap/tests/oap_test.c index a324b586..fc10150b 100644 --- a/src/irmd/oap/tests/oap_test.c +++ b/src/irmd/oap/tests/oap_test.c @@ -32,6 +32,7 @@ #include <ouroboros/crypt.h> #include <ouroboros/endian.h> +#include <ouroboros/errno.h> #include <ouroboros/flow.h> #include <ouroboros/name.h> #include <ouroboros/random.h> @@ -41,9 +42,12 @@ #include <test/certs/ecdsa.h> #include "oap.h" +#include "oap/auth.h" #include "common.h" #include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> #include <string.h> #ifdef HAVE_OPENSSL @@ -174,6 +178,7 @@ static int test_oap_roundtrip(int kex) oap_test_teardown(&ctx); TEST_SUCCESS("(%s)", kex_str); + return TEST_RC_SUCCESS; fail_cleanup: @@ -198,6 +203,20 @@ static int test_oap_roundtrip_auth_only(void) return roundtrip_auth_only(root_ca_crt_ec, im_ca_crt_ec); } +static int test_oap_rekey(void) +{ + test_default_cfg(); + + return roundtrip_rekey(root_ca_crt_ec, im_ca_crt_ec); +} + +static int test_oap_rekey_badcache(void) +{ + test_default_cfg(); + + return roundtrip_rekey_badcache(root_ca_crt_ec, im_ca_crt_ec); +} + static int test_oap_roundtrip_kex_only(void) { memset(&test_cfg, 0, sizeof(test_cfg)); @@ -238,6 +257,7 @@ static int test_oap_piggyback_data(void) ctx.data.data = malloc(ctx.data.len); if (ctx.data.data == NULL) goto fail_cleanup; + memcpy(ctx.data.data, cli_data_str, ctx.data.len); if (oap_cli_prepare_ctx(&ctx) < 0) @@ 
-288,6 +308,7 @@ static int test_oap_piggyback_data(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -356,6 +377,7 @@ static int test_oap_inflated_length_field(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -400,6 +422,7 @@ static int test_oap_deflated_length_field(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -458,6 +481,7 @@ static int test_oap_nid_without_kex(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -509,6 +533,61 @@ static int test_oap_unsupported_nid(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Client rejects a response whose key-confirmation tag is tampered */ +static int test_oap_key_confirm_mismatch(void) +{ + struct oap_test_ctx ctx; + + TEST_START(); + + /* Unauthenticated + encrypted: response unsigned, KC is the gate */ + memset(&test_cfg, 0, sizeof(test_cfg)); + test_cfg.srv.kex = NID_X25519; + test_cfg.srv.cipher = NID_aes_256_gcm; + test_cfg.srv.kdf = NID_sha256; + test_cfg.srv.md = NID_sha256; + test_cfg.srv.auth = NO_AUTH; + test_cfg.cli.kex = NID_X25519; + test_cfg.cli.cipher = NID_aes_256_gcm; + test_cfg.cli.kdf = NID_sha256; + test_cfg.cli.md = NID_sha256; + test_cfg.cli.auth = NO_AUTH; + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_cleanup; + } + + /* The key-confirm tag is the last field of an unsigned response */ + ctx.resp_hdr.data[ctx.resp_hdr.len - 1] ^= 0xFF; + + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Client accepted a bad key-confirmation tag.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + 
TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -609,6 +688,7 @@ static int test_oap_cipher_mismatch(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -655,6 +735,7 @@ static int test_oap_srv_enc_cli_none(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -724,6 +805,7 @@ static int test_oap_cli_enc_srv_none(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -733,7 +815,7 @@ static int test_oap_cli_enc_srv_none(void) return TEST_RC_FAIL; } -/* Client rejects server response with downgraded cipher */ +/* Unauthenticated server: client floor-rejects a downgraded cipher */ static int test_oap_cli_rejects_downgrade(void) { struct oap_test_ctx ctx; @@ -747,7 +829,7 @@ static int test_oap_cli_rejects_downgrade(void) test_cfg.srv.cipher = NID_aes_256_gcm; test_cfg.srv.kdf = NID_sha256; test_cfg.srv.md = NID_sha256; - test_cfg.srv.auth = AUTH; + test_cfg.srv.auth = NO_AUTH; test_cfg.cli.kex = NID_X25519; test_cfg.cli.cipher = NID_aes_256_gcm; @@ -769,7 +851,7 @@ static int test_oap_cli_rejects_downgrade(void) } /* Tamper: replace cipher NID with weaker one */ - weak = hton16(NID_aes_128_ctr); + weak = hton16(NID_aes_128_gcm); memcpy(ctx.resp_hdr.data + OAP_CIPHER_NID_OFFSET, &weak, sizeof(weak)); @@ -782,6 +864,69 @@ static int test_oap_cli_rejects_downgrade(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* + * Suite binding: a cipher swapped to a higher rank clears the client floor + * check, but the bound key commits to the negotiated suite, so the swap must + * still fail key confirmation. 
+ */ +static int test_oap_cli_rejects_suite_swap(void) +{ + struct oap_test_ctx ctx; + uint16_t swap; + + TEST_START(); + + memset(&test_cfg, 0, sizeof(test_cfg)); + + /* Both AES-128-GCM: a swap to AES-256 outranks the client floor */ + test_cfg.srv.kex = NID_X25519; + test_cfg.srv.cipher = NID_aes_128_gcm; + test_cfg.srv.kdf = NID_sha256; + test_cfg.srv.md = NID_sha256; + test_cfg.srv.auth = NO_AUTH; + test_cfg.cli.kex = NID_X25519; + test_cfg.cli.cipher = NID_aes_128_gcm; + test_cfg.cli.kdf = NID_sha256; + test_cfg.cli.md = NID_sha256; + test_cfg.cli.auth = NO_AUTH; + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_cleanup; + } + + /* Swap the response cipher to a higher-ranked one */ + swap = hton16(NID_aes_256_gcm); + memcpy(ctx.resp_hdr.data + OAP_CIPHER_NID_OFFSET, + &swap, sizeof(swap)); + + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Client accepted a swapped cipher suite.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -831,6 +976,7 @@ static int test_oap_srv_rejects_weak_kex(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -890,6 +1036,7 @@ static int test_oap_roundtrip_md(int md) oap_test_teardown(&ctx); TEST_SUCCESS("(%s)", md_str ? 
md_str : "default"); + return TEST_RC_SUCCESS; fail_cleanup: @@ -955,6 +1102,7 @@ static int test_oap_outdated_packet(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -1003,6 +1151,7 @@ static int test_oap_future_packet(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -1053,15 +1202,16 @@ static int test_oap_replay_packet(void) freebuf(ctx.req_hdr); ctx.req_hdr = saved_req; - /* Replayed request should fail */ - if (oap_srv_process_ctx(&ctx) == 0) { - printf("Server should reject replayed packet.\n"); + /* Replay must return -EREPLAY so callers can drop silently. */ + if (oap_srv_process_ctx(&ctx) != -EREPLAY) { + printf("Replayed packet rejection != -EREPLAY.\n"); goto fail_cleanup; } oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -1071,6 +1221,150 @@ static int test_oap_replay_packet(void) return TEST_RC_FAIL; } +/* Encode a distinct OAP session ID from an index */ +static void make_id(uint8_t * id, + size_t idx) +{ + memset(id, 0, OAP_ID_SIZE); + memcpy(id, &idx, sizeof(idx)); +} + +/* + * Replay cache fails closed at capacity: a flood is rejected and no genuine + * entry is evicted (so it cannot be replayed). 
+ */ +static int test_oap_replay_cap(void) +{ + struct oap_hdr h; + struct timespec now; + uint8_t id[OAP_ID_SIZE]; + uint64_t stamp; + size_t i; + + TEST_START(); + + if (oap_auth_init() < 0) { + printf("Failed to init OAP.\n"); + goto fail; + } + + clock_gettime(CLOCK_REALTIME, &now); + stamp = TS_TO_UINT64(now); + + memset(&h, 0, sizeof(h)); + h.id.data = id; + h.id.len = OAP_ID_SIZE; + h.timestamp = stamp; + + /* Fill one generation bucket to capacity with distinct IDs */ + for (i = 0; i < OAP_REPLAY_MAX; i++) { + make_id(id, i); + if (oap_check_hdr(&h) != 0) { + printf("Distinct header %zu rejected.\n", i); + goto fail_fini; + } + } + + /* One past capacity fails closed (rejected, not evict-oldest) */ + make_id(id, OAP_REPLAY_MAX); + if (oap_check_hdr(&h) != -EAUTH) { + printf("Header past capacity not fail-closed.\n"); + goto fail_fini; + } + + /* No genuine entry was evicted: the oldest still reads as a replay */ + make_id(id, 0); + if (oap_check_hdr(&h) != -EREPLAY) { + printf("Genuine entry evicted under flood.\n"); + goto fail_fini; + } + + oap_auth_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_fini: + oap_auth_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* + * Distinct timestamp generations use separate buckets and are detected + * independently (covers the multi-generation / rotation path). 
+ */ +static int test_oap_replay_generations(void) +{ + struct oap_hdr h; + struct timespec now; + uint8_t id[OAP_ID_SIZE]; + uint64_t cur; + uint64_t gen_ns; + uint64_t stamp_a; + uint64_t stamp_b; + + TEST_START(); + + if (oap_auth_init() < 0) { + printf("Failed to init OAP.\n"); + goto fail; + } + + clock_gettime(CLOCK_REALTIME, &now); + cur = TS_TO_UINT64(now); + gen_ns = (uint64_t) OAP_REPLAY_TIMER * BILLION; + + /* stamp_a in the current generation, stamp_b one generation older */ + stamp_a = cur; + stamp_b = (cur / gen_ns) * gen_ns - 1; + + memset(&h, 0, sizeof(h)); + h.id.data = id; + h.id.len = OAP_ID_SIZE; + make_id(id, 1); + + /* First sighting in each generation is accepted */ + h.timestamp = stamp_a; + if (oap_check_hdr(&h) != 0) { + printf("Gen-A header rejected.\n"); + goto fail_fini; + } + + h.timestamp = stamp_b; + if (oap_check_hdr(&h) != 0) { + printf("Gen-B header rejected.\n"); + goto fail_fini; + } + + /* Each generation independently detects its own replay */ + h.timestamp = stamp_a; + if (oap_check_hdr(&h) != -EREPLAY) { + printf("Gen-A replay not detected.\n"); + goto fail_fini; + } + + h.timestamp = stamp_b; + if (oap_check_hdr(&h) != -EREPLAY) { + printf("Gen-B replay not detected.\n"); + goto fail_fini; + } + + oap_auth_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_fini: + oap_auth_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + /* Server rejects client certificate when root CA is missing from store */ static int test_oap_missing_root_ca(void) { @@ -1125,6 +1419,7 @@ static int test_oap_missing_root_ca(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_teardown: @@ -1173,6 +1468,355 @@ static int test_oap_server_name_mismatch(void) oap_test_teardown(&ctx); TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Client requiring auth rejects a response without certificate */ +static int 
test_oap_cli_requires_srv_auth(void) +{ + struct oap_test_ctx ctx; + + test_default_cfg(); + test_cfg.srv.auth = NO_AUTH; + test_cfg.cli.req_auth = true; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Client should reject unauthenticated server.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Server requiring auth rejects a request without certificate */ +static int test_oap_srv_requires_cli_auth(void) +{ + struct oap_test_ctx ctx; + + test_default_cfg(); + test_cfg.srv.req_auth = true; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) == 0) { + printf("Server should reject unauthenticated client.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Roundtrip succeeds when both sides require and provide auth */ +static int test_oap_mutual_req_auth(void) +{ + struct oap_test_ctx ctx; + + test_default_cfg(); + test_cfg.srv.req_auth = true; + test_cfg.cli.auth = AUTH; + test_cfg.cli.req_auth = true; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto 
fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) < 0) { + printf("Client complete failed.\n"); + goto fail_cleanup; + } + + if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) { + printf("Client and server keys do not match!\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Client rejects a server signature with a different digest */ +static int test_oap_cli_rejects_md_mismatch(void) +{ + struct oap_test_ctx ctx; + + test_default_cfg(); + test_cfg.srv.md = NID_sha384; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_cleanup; + } + + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Client should reject digest mismatch.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Server rejects a client signature with a different digest */ +static int test_oap_srv_rejects_md_mismatch(void) +{ + struct oap_test_ctx ctx; + + test_default_cfg(); + test_cfg.cli.auth = AUTH; + test_cfg.cli.md = NID_sha384; + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) == 0) { + printf("Server should reject digest mismatch.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Naive substring search over raw bytes 
(memmem is not portable here). */ +static bool buf_contains(const uint8_t * hay, + size_t hlen, + const uint8_t * needle, + size_t nlen) +{ + size_t i; + + if (nlen == 0 || nlen > hlen) + return false; + + for (i = 0; i + nlen <= hlen; i++) { + if (memcmp(hay + i, needle, nlen) == 0) + return true; + } + + return false; +} + +/* The server certificate must not appear in cleartext on the wire */ +static int test_oap_server_cert_hidden(void) +{ + struct oap_test_ctx ctx; + void * crt = NULL; + buffer_t der = BUF_INIT; + + test_default_cfg(); + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_cleanup; + } + + if (crypt_load_crt_str(signed_server_crt_ec, &crt) < 0) { + printf("Failed to load server crt.\n"); + goto fail_cleanup; + } + + if (crypt_crt_der(crt, &der) < 0) { + printf("Failed to DER-encode server crt.\n"); + goto fail_crt; + } + + if (der.len == 0 || der.len > ctx.resp_hdr.len) { + printf("Unexpected cert/response sizes.\n"); + goto fail_der; + } + + if (buf_contains(ctx.resp_hdr.data, ctx.resp_hdr.len, + der.data, der.len)) { + printf("Server certificate found in cleartext.\n"); + goto fail_der; + } + + /* The handshake must still complete and agree on a key */ + if (oap_cli_complete_ctx(&ctx) < 0) { + printf("Client complete failed.\n"); + goto fail_der; + } + + if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) { + printf("Client and server keys do not match!\n"); + goto fail_der; + } + + freebuf(der); + crypt_free_crt(crt); + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + + fail_der: + freebuf(der); + fail_crt: + crypt_free_crt(crt); + fail_cleanup: + oap_test_teardown(&ctx); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Tampering the sealed identity block fails the handshake */ 
+static int test_oap_sealed_tamper(void) +{ + struct oap_test_ctx ctx; + size_t pos; + + test_default_cfg(); + + TEST_START(); + + if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0) + goto fail; + + if (oap_cli_prepare_ctx(&ctx) < 0) { + printf("Client prepare failed.\n"); + goto fail_cleanup; + } + + if (oap_srv_process_ctx(&ctx) < 0) { + printf("Server process failed.\n"); + goto fail_cleanup; + } + + if (ctx.resp_hdr.len < 64) { + printf("Response too short for test.\n"); + goto fail_cleanup; + } + + /* Flip a byte inside the sealed ciphertext, before the AEAD tag */ + pos = ctx.resp_hdr.len - 32; + ctx.resp_hdr.data[pos] ^= 0xFF; + + if (oap_cli_complete_ctx(&ctx) == 0) { + printf("Client accepted a tampered identity block.\n"); + goto fail_cleanup; + } + + oap_test_teardown(&ctx); + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; fail_cleanup: @@ -1191,17 +1835,22 @@ int oap_test(int argc, (void) argv; ret |= test_oap_auth_init_fini(); + ret |= test_oap_replay_cap(); + ret |= test_oap_replay_generations(); #ifdef HAVE_OPENSSL ret |= test_oap_roundtrip_auth_only(); ret |= test_oap_roundtrip_kex_only(); ret |= test_oap_piggyback_data(); + ret |= test_oap_rekey(); + ret |= test_oap_rekey_badcache(); ret |= test_oap_roundtrip_all(); ret |= test_oap_roundtrip_md_all(); ret |= test_oap_corrupted_request(); ret |= test_oap_corrupted_response(); + ret |= test_oap_key_confirm_mismatch(); ret |= test_oap_truncated_request(); ret |= test_oap_inflated_length_field(); ret |= test_oap_deflated_length_field(); @@ -1212,6 +1861,7 @@ int oap_test(int argc, ret |= test_oap_srv_enc_cli_none(); ret |= test_oap_cli_enc_srv_none(); ret |= test_oap_cli_rejects_downgrade(); + ret |= test_oap_cli_rejects_suite_swap(); ret |= test_oap_srv_rejects_weak_kex(); ret |= test_oap_outdated_packet(); @@ -1219,6 +1869,17 @@ int oap_test(int argc, ret |= test_oap_replay_packet(); ret |= test_oap_missing_root_ca(); ret |= test_oap_server_name_mismatch(); + + ret |= 
test_oap_cli_requires_srv_auth(); + ret |= test_oap_srv_requires_cli_auth(); + ret |= test_oap_mutual_req_auth(); + + + ret |= test_oap_cli_rejects_md_mismatch(); + ret |= test_oap_srv_rejects_md_mismatch(); + + ret |= test_oap_server_cert_hidden(); + ret |= test_oap_sealed_tamper(); #else (void) test_oap_roundtrip_auth_only; (void) test_oap_roundtrip_kex_only; @@ -1229,6 +1890,7 @@ int oap_test(int argc, (void) test_oap_roundtrip_md_all; (void) test_oap_corrupted_request; (void) test_oap_corrupted_response; + (void) test_oap_key_confirm_mismatch; (void) test_oap_truncated_request; (void) test_oap_inflated_length_field; (void) test_oap_deflated_length_field; @@ -1238,12 +1900,23 @@ int oap_test(int argc, (void) test_oap_srv_enc_cli_none; (void) test_oap_cli_enc_srv_none; (void) test_oap_cli_rejects_downgrade; + (void) test_oap_cli_rejects_suite_swap; (void) test_oap_srv_rejects_weak_kex; (void) test_oap_outdated_packet; (void) test_oap_future_packet; (void) test_oap_replay_packet; + (void) test_oap_replay_generations; (void) test_oap_missing_root_ca; (void) test_oap_server_name_mismatch; + (void) test_oap_cli_requires_srv_auth; + (void) test_oap_srv_requires_cli_auth; + (void) test_oap_mutual_req_auth; + (void) test_oap_cli_rejects_md_mismatch; + (void) test_oap_srv_rejects_md_mismatch; + (void) test_oap_server_cert_hidden; + (void) test_oap_sealed_tamper; + (void) test_oap_rekey; + (void) test_oap_rekey_badcache; ret = TEST_RC_SKIP; #endif diff --git a/src/irmd/oap/tests/oap_test_ml_dsa.c b/src/irmd/oap/tests/oap_test_ml_dsa.c index 81b307ab..8691aa00 100644 --- a/src/irmd/oap/tests/oap_test_ml_dsa.c +++ b/src/irmd/oap/tests/oap_test_ml_dsa.c @@ -179,6 +179,7 @@ int load_server_kem_pk(const char * name, pk->data = malloc(test_kem_pk_len); if (pk->data == NULL) return -1; + memcpy(pk->data, test_kem_pk, test_kem_pk_len); pk->len = test_kem_pk_len; @@ -237,6 +238,39 @@ static int test_oap_roundtrip_auth_only(void) return roundtrip_auth_only(root_ca_crt_ml, 
im_ca_crt_ml); } +/* Digest pin does not apply to PQC: the digest is intrinsic */ +static int test_oap_cli_md_pin_exempts_pqc(void) +{ + test_cfg_init(NID_undef, NID_undef, NID_undef, 0, NO_CLI_AUTH); + test_cfg.cli.md = NID_sha256; + + return roundtrip_auth_only(root_ca_crt_ml, im_ca_crt_ml); +} + +static int test_oap_srv_md_pin_exempts_pqc(void) +{ + test_cfg_init(NID_undef, NID_undef, NID_undef, 0, CLI_AUTH); + test_cfg.srv.md = NID_sha256; + + return roundtrip_auth_only(root_ca_crt_ml, im_ca_crt_ml); +} + +static int test_oap_rekey(void) +{ + test_cfg_init(NID_X25519, NID_aes_256_gcm, NID_sha256, + 0, NO_CLI_AUTH); + + return roundtrip_rekey(root_ca_crt_ml, im_ca_crt_ml); +} + +static int test_oap_rekey_badcache(void) +{ + test_cfg_init(NID_X25519, NID_aes_256_gcm, NID_sha256, + 0, NO_CLI_AUTH); + + return roundtrip_rekey_badcache(root_ca_crt_ml, im_ca_crt_ml); +} + static int test_oap_corrupted_request(void) { test_cfg_init(NID_MLKEM768, NID_aes_256_gcm, get_random_kdf(), @@ -422,6 +456,8 @@ int oap_test_ml_dsa(int argc, #ifdef HAVE_OPENSSL_ML_KEM ret |= test_oap_roundtrip_auth_only(); + ret |= test_oap_cli_md_pin_exempts_pqc(); + ret |= test_oap_srv_md_pin_exempts_pqc(); ret |= test_oap_roundtrip_kem_all(); @@ -430,8 +466,15 @@ int oap_test_ml_dsa(int argc, ret |= test_oap_corrupted_request(); ret |= test_oap_corrupted_response(); ret |= test_oap_truncated_request(); + + ret |= test_oap_rekey(); + ret |= test_oap_rekey_badcache(); #else (void) test_oap_roundtrip_auth_only; + (void) test_oap_cli_md_pin_exempts_pqc; + (void) test_oap_srv_md_pin_exempts_pqc; + (void) test_oap_rekey; + (void) test_oap_rekey_badcache; (void) test_oap_roundtrip_kem; (void) test_oap_roundtrip_kem_all; (void) test_oap_kem_srv_uncfg; diff --git a/src/irmd/reg/flow.c b/src/irmd/reg/flow.c index 93c3e128..8be2dfc7 100644 --- a/src/irmd/reg/flow.c +++ b/src/irmd/reg/flow.c @@ -24,6 +24,7 @@ #define OUROBOROS_PREFIX "reg/flow" +#include <ouroboros/crypt.h> #include <ouroboros/logs.h> 
#include "flow.h" @@ -32,6 +33,7 @@ #include <errno.h> #include <stdbool.h> #include <stdlib.h> +#include <string.h> struct reg_flow * reg_flow_create(const struct flow_info * info) { @@ -42,6 +44,7 @@ struct reg_flow * reg_flow_create(const struct flow_info * info) assert(info->n_pid != 0); assert(info->n_1_pid == 0); assert(info->mpl == 0); + assert(info->mtu == 0); assert(info->state == FLOW_INIT); flow = malloc(sizeof(*flow)); @@ -67,10 +70,12 @@ static void destroy_rbuffs(struct reg_flow * flow) { if (flow->n_rb != NULL) ssm_rbuff_destroy(flow->n_rb); + flow->n_rb = NULL; if (flow->n_1_rb != NULL) ssm_rbuff_destroy(flow->n_1_rb); + flow->n_1_rb = NULL; } @@ -78,6 +83,11 @@ void reg_flow_destroy(struct reg_flow * flow) { assert(flow != NULL); + if (flow->rk.pending_seed != NULL) + crypt_secure_free(flow->rk.pending_seed, SYMMKEYSZ); + + freebuf(flow->rk.peer_crt); + switch(flow->info.state) { case FLOW_ACCEPT_PENDING: clrbuf(flow->req_data); @@ -160,6 +170,7 @@ int reg_flow_update(struct reg_flow * flow, assert(info->mpl != 0); flow->info.mpl = info->mpl; + flow->info.mtu = info->mtu; if (flow->info.state == FLOW_ALLOC_PENDING) break; diff --git a/src/irmd/reg/flow.h b/src/irmd/reg/flow.h index 9a4046d3..166bed61 100644 --- a/src/irmd/reg/flow.h +++ b/src/irmd/reg/flow.h @@ -49,6 +49,22 @@ struct reg_flow { bool direct; + /* Tier-2 re-key state (encrypted flows only) */ + struct { + bool encrypted; /* flow carries a cipher */ + uint8_t epoch; /* last epoch installed by app */ + bool initiator; /* OAP initiator (role 0) */ + bool in_flight; /* a re-key is in progress */ + bool req_queued; /* a peer REQ is in the inbox */ + bool resp_queued; /* a peer RESP is in the inbox */ + uint8_t * pending_seed; /* secure heap; NULL until set */ + uint8_t pending_epoch; + bool pending_initiator; /* pending seed: oap_cli side */ + bool has_pending; /* new seed awaits app pull */ + uint8_t pulled; /* direct: per-app pull mask */ + buffer_t peer_crt; /* peer cert DER, cached at 
HS */ + } rk; + struct ssm_rbuff * n_rb; struct ssm_rbuff * n_1_rb; }; diff --git a/src/irmd/reg/reg.c b/src/irmd/reg/reg.c index 0025f695..ebf3959d 100644 --- a/src/irmd/reg/reg.c +++ b/src/irmd/reg/reg.c @@ -25,6 +25,7 @@ The IPC Resource Manager - Registry #define OUROBOROS_PREFIX "reg" #include <ouroboros/bitmap.h> +#include <ouroboros/crypt.h> #include <ouroboros/errno.h> #include <ouroboros/list.h> #include <ouroboros/logs.h> @@ -871,6 +872,7 @@ int reg_list_ipcps(ipcp_list_msg_t *** ipcps) fail: while (i-- > 0) ipcp_list_msg__free_unpacked((*ipcps)[i], NULL); + free(*ipcps); fail_malloc: pthread_mutex_unlock(®.mtx); @@ -1032,6 +1034,20 @@ int reg_get_name_for_flow_id(char * buf, return f == NULL ? -ENOENT : 0; } +void reg_set_name_for_flow_id(const char * name, + int flow_id) +{ + struct reg_flow * f; + + pthread_mutex_lock(®.mtx); + + f = __reg_get_flow(flow_id); + if (f != NULL) + strcpy(f->name, name); + + pthread_mutex_unlock(®.mtx); +} + int reg_list_names(name_info_msg_t *** names) { struct list_head * p; @@ -1076,6 +1092,7 @@ int reg_list_names(name_info_msg_t *** names) fail: while (i-- > 0) name_info_msg__free_unpacked((*names)[i], NULL); + free(*names); fail_malloc: pthread_mutex_unlock(®.mtx); @@ -1820,7 +1837,11 @@ int reg_respond_alloc(struct flow_info * info, goto fail_flow; } - assert(flow->info.state == FLOW_ALLOC_PENDING); + if (flow->info.state != FLOW_ALLOC_PENDING) { + log_warn("Flow %d already responded.", info->id); + goto fail_flow; + } + assert(flow->rsp_data.len == 0); assert(flow->rsp_data.data == NULL); @@ -2098,6 +2119,511 @@ bool reg_flow_is_direct(int flow_id) return ret; } +void reg_flow_set_rekey(int flow_id, + bool initiator, + buffer_t peer_crt) +{ + struct reg_flow * flow; + uint8_t * crt = NULL; + + /* Copy the cert outside the lock; publish it with rk.encrypted. 
*/ + if (peer_crt.len > 0) { + crt = malloc(peer_crt.len); + if (crt != NULL) + memcpy(crt, peer_crt.data, peer_crt.len); + else + log_warn("Failed to cache peer cert for re-key."); + } + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) { + flow->rk.encrypted = true; + flow->rk.initiator = initiator; + flow->rk.epoch = 0; + if (crt != NULL) { + freebuf(flow->rk.peer_crt); + flow->rk.peer_crt.data = crt; + flow->rk.peer_crt.len = peer_crt.len; + crt = NULL; + } + } + + pthread_mutex_unlock(®.mtx); + + free(crt); +} + +int reg_flow_get_peer_crt(int flow_id, + buffer_t * crt) +{ + struct reg_flow * flow; + int ret = -ENOENT; + + assert(crt != NULL); + + clrbuf(*crt); + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL && flow->rk.peer_crt.len > 0) { + crt->data = malloc(flow->rk.peer_crt.len); + if (crt->data == NULL) { + ret = -ENOMEM; + } else { + memcpy(crt->data, flow->rk.peer_crt.data, + flow->rk.peer_crt.len); + crt->len = flow->rk.peer_crt.len; + ret = 0; + } + } + + pthread_mutex_unlock(®.mtx); + + return ret; +} + +int reg_flow_get_epoch(int flow_id) +{ + struct reg_flow * flow; + int epoch = -1; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL && flow->rk.encrypted) + epoch = flow->rk.epoch; + + pthread_mutex_unlock(®.mtx); + + return epoch; +} + +bool reg_flow_rekey_pending(int flow_id) +{ + struct reg_flow * flow; + bool ret = false; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) + ret = flow->rk.has_pending; + + pthread_mutex_unlock(®.mtx); + + return ret; +} + +pid_t reg_flow_get_n_1_pid(int flow_id) +{ + struct reg_flow * flow; + pid_t pid = -1; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) + pid = flow->info.n_1_pid; + + pthread_mutex_unlock(®.mtx); + + return pid; +} + +int reg_flow_snapshot_rekey_due(struct rekey_info * snap, + int max) +{ + struct list_head * p; + 
int n = 0; + + pthread_mutex_lock(®.mtx); + + llist_for_each(p, ®.flows) { + struct reg_flow * f; + + if (n == max) + break; + + f = list_entry(p, struct reg_flow, next); + + if (f->info.state != FLOW_ALLOCATED) + continue; + + if (!f->rk.encrypted) + continue; + + /* Direct flows have no IPCP initiator; either side drives. */ + if (!f->direct && !f->rk.initiator) + continue; + + if (f->rk.in_flight || f->rk.has_pending) + continue; + + f->rk.in_flight = true; + + snap[n].flow_id = f->info.id; + snap[n].n_pid = f->info.n_pid; + snap[n].n_1_pid = f->info.n_1_pid; + snap[n].epoch = f->rk.epoch; + snap[n].direct = f->direct; + strcpy(snap[n].name, f->name); + ++n; + } + + pthread_mutex_unlock(®.mtx); + + return n; +} + +void reg_flow_clear_in_flight(int flow_id) +{ + struct reg_flow * flow; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) + flow->rk.in_flight = false; + + pthread_mutex_unlock(®.mtx); +} + +/* Test-and-set the in-flight latch; refuse if a re-key is already active. */ +bool reg_flow_rekey_begin(int flow_id) +{ + struct reg_flow * flow; + bool ret = false; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL && flow->rk.encrypted) { + if (!flow->rk.in_flight && !flow->rk.has_pending) { + flow->rk.in_flight = true; + ret = true; + } + } + + pthread_mutex_unlock(®.mtx); + + return ret; +} + +/* Initiator yields the responder role while driving its own exchange. 
*/ +bool reg_flow_rekey_should_yield(int flow_id) +{ + struct reg_flow * flow; + bool ret = false; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) + ret = flow->rk.initiator && flow->rk.in_flight; + + pthread_mutex_unlock(®.mtx); + + return ret; +} + +int reg_flow_store_pending(int flow_id, + const uint8_t * seed, + uint8_t epoch, + bool initiator) +{ + struct reg_flow * flow; + int ret = -ENOENT; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) { + /* Exchange done: release the latch regardless of parking. */ + flow->rk.in_flight = false; + + if (flow->rk.pending_seed == NULL) + flow->rk.pending_seed = crypt_secure_malloc(SYMMKEYSZ); + + if (flow->rk.pending_seed != NULL) { + memcpy(flow->rk.pending_seed, seed, SYMMKEYSZ); + flow->rk.pending_epoch = epoch; + flow->rk.pending_initiator = initiator; + flow->rk.has_pending = true; + /* Doorbell raised only after the seed is parked. */ + if (flow->n_rb != NULL) + ssm_rbuff_set_bits(flow->n_rb, RB_REKEY); + ret = 0; + } else { + ret = -ENOMEM; + } + } + + pthread_mutex_unlock(®.mtx); + + return ret; +} + +/* Direct re-key: which of the two local apps has pulled the seed. */ +#define RK_N_PID 0x1 /* acceptor (n_pid) pulled the seed */ +#define RK_N_1_PID 0x2 /* allocator (n_1_pid) pulled the seed */ +#define RK_PID_MASK (RK_N_PID | RK_N_1_PID) + +/* + * Park a single re-key seed for a direct flow and ring BOTH apps' + * doorbells. The seed is the one shared secret; each app pulls it once + * (reg_flow_take_pending), so it is held until both have taken it. + */ +int reg_flow_store_pending_direct(int flow_id, + const uint8_t * seed, + uint8_t epoch) +{ + struct reg_flow * flow; + int ret = -ENOENT; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow == NULL) + goto out; + + /* Exchange done: release the latch regardless of parking. 
*/ + flow->rk.in_flight = false; + + if (flow->rk.pending_seed == NULL) + flow->rk.pending_seed = crypt_secure_malloc(SYMMKEYSZ); + + if (flow->rk.pending_seed == NULL) { + ret = -ENOMEM; + goto out; + } + + memcpy(flow->rk.pending_seed, seed, SYMMKEYSZ); + flow->rk.pending_epoch = epoch; + flow->rk.has_pending = true; + flow->rk.pulled = 0; + + /* A departed peer never pulls; treat its side as already done. */ + if (flow->info.n_pid <= 0) + flow->rk.pulled |= RK_N_PID; + + if (flow->info.n_1_pid <= 0) + flow->rk.pulled |= RK_N_1_PID; + + if (flow->n_rb != NULL && !(flow->rk.pulled & RK_N_PID)) + ssm_rbuff_set_bits(flow->n_rb, RB_REKEY); + + if (flow->n_1_rb != NULL && !(flow->rk.pulled & RK_N_1_PID)) + ssm_rbuff_set_bits(flow->n_1_rb, RB_REKEY); + + ret = 0; + out: + pthread_mutex_unlock(®.mtx); + + return ret; +} + +/* A caller may act on a flow if it is privileged or owns the flow. */ +static bool uid_may_access(uid_t caller, + uid_t owner) +{ + return is_ouroboros_member_uid(caller) || caller == owner; +} + +/* + * Caller holds reg.mtx. The direct seed is shared by both apps, so the + * per-app initiator role is resolved from the verified caller pid (the + * allocator is n_1_pid), and the seed is held until both have pulled. 
+ */ +static void __take_pending_direct(struct reg_flow * flow, + pid_t cpid, + uint8_t * seed, + uint8_t * epoch, + bool * initiator) +{ + bool allocator; + + allocator = cpid == flow->info.n_1_pid; + + memcpy(seed, flow->rk.pending_seed, SYMMKEYSZ); + *epoch = flow->rk.pending_epoch; + *initiator = allocator; + flow->rk.epoch = flow->rk.pending_epoch; + + if (allocator) { + flow->rk.pulled |= RK_N_1_PID; + if (flow->n_1_rb != NULL) + ssm_rbuff_clr_bits(flow->n_1_rb, RB_REKEY); + } else { + flow->rk.pulled |= RK_N_PID; + if (flow->n_rb != NULL) + ssm_rbuff_clr_bits(flow->n_rb, RB_REKEY); + } + + if ((flow->rk.pulled & RK_PID_MASK) != RK_PID_MASK) + return; + + flow->rk.has_pending = false; + flow->rk.pulled = 0; + crypt_secure_clear(flow->rk.pending_seed, SYMMKEYSZ); +} + +int reg_flow_take_pending(int flow_id, + uid_t uid, + pid_t cpid, + uint8_t * seed, + uint8_t * epoch, + bool * initiator) +{ + struct reg_flow * flow; + int ret = -ENOENT; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow == NULL || !flow->rk.has_pending) + goto out; + + if (!uid_may_access(uid, flow->info.uid)) { + ret = -EPERM; + goto out; + } + + if (flow->direct) { + __take_pending_direct(flow, cpid, seed, epoch, initiator); + ret = 0; + goto out; + } + + memcpy(seed, flow->rk.pending_seed, SYMMKEYSZ); + *epoch = flow->rk.pending_epoch; + *initiator = flow->rk.pending_initiator; + flow->rk.epoch = flow->rk.pending_epoch; + flow->rk.has_pending = false; + crypt_secure_clear(flow->rk.pending_seed, SYMMKEYSZ); + if (flow->n_rb != NULL) + ssm_rbuff_clr_bits(flow->n_rb, RB_REKEY); + + ret = 0; + out: + pthread_mutex_unlock(®.mtx); + + return ret; +} + +/* + * Admit a peer-driven re-key arrival before a worker event is allocated: + * the flow must exist, carry a cipher, and the update must come from its + * own lower IPCP. Coalesces to one queued REQ and one queued RESP per flow + * so a flooding peer cannot grow the inbox without bound. 
+ */ +bool reg_flow_rekey_arr_admit(int flow_id, + pid_t n_1_pid, + bool is_req) +{ + struct reg_flow * flow; + bool admit = false; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL && flow->rk.encrypted + && flow->info.n_1_pid == n_1_pid) { + if (is_req && !flow->rk.req_queued) { + flow->rk.req_queued = true; + admit = true; + } else if (!is_req && flow->rk.in_flight + && !flow->rk.resp_queued) { + flow->rk.resp_queued = true; + admit = true; + } + } + + pthread_mutex_unlock(®.mtx); + + return admit; +} + +void reg_flow_rekey_arr_done(int flow_id, + bool is_req) +{ + struct reg_flow * flow; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) { + if (is_req) + flow->rk.req_queued = false; + else + flow->rk.resp_queued = false; + } + + pthread_mutex_unlock(®.mtx); +} + +bool reg_flow_owned_by(int flow_id, + uid_t uid) +{ + struct reg_flow * flow; + bool ret = false; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) + ret = uid_may_access(uid, flow->info.uid); + + pthread_mutex_unlock(®.mtx); + + return ret; +} + +/* Caller holds reg.mtx. */ +static void __notify_proc(pid_t pid, + int flow_id, + int event) +{ + struct reg_proc * proc; + + proc = __reg_get_proc(pid); + if (proc != NULL) + ssm_flow_set_notify(proc->set, flow_id, event); +} + +void reg_notify_flow(int flow_id, + int event) +{ + struct reg_flow * flow; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) + __notify_proc(flow->info.n_pid, flow_id, event); + + pthread_mutex_unlock(®.mtx); +} + +/* Wake both endpoints of a direct flow (acceptor and allocator). 
*/ +void reg_notify_flow_peers(int flow_id, + int event) +{ + struct reg_flow * flow; + + pthread_mutex_lock(®.mtx); + + flow = __reg_get_flow(flow_id); + if (flow != NULL) { + __notify_proc(flow->info.n_pid, flow_id, event); + __notify_proc(flow->info.n_1_pid, flow_id, event); + } + + pthread_mutex_unlock(®.mtx); +} + int reg_respond_flow_direct(int flow_id, buffer_t * pbuf) { diff --git a/src/irmd/reg/reg.h b/src/irmd/reg/reg.h index 6b576471..8a313d46 100644 --- a/src/irmd/reg/reg.h +++ b/src/irmd/reg/reg.h @@ -109,6 +109,9 @@ int reg_get_name_for_hash(char * buf, int reg_get_name_for_flow_id(char * buf, int flow_id); +void reg_set_name_for_flow_id(const char * name, + int flow_id); + /* TODO don't rely on protobuf here */ int reg_list_names(name_info_msg_t *** names); @@ -163,6 +166,70 @@ int reg_wait_flow_direct(int flow_id, bool reg_flow_is_direct(int flow_id); +/* Per-flow snapshot for the re-key timer */ +struct rekey_info { + int flow_id; + pid_t n_pid; + pid_t n_1_pid; + char name[NAME_SIZE + 1]; + uint8_t epoch; + bool direct; +}; + +void reg_flow_set_rekey(int flow_id, + bool initiator, + buffer_t peer_crt); + +int reg_flow_get_peer_crt(int flow_id, + buffer_t * crt); + +int reg_flow_get_epoch(int flow_id); + +bool reg_flow_rekey_pending(int flow_id); + +pid_t reg_flow_get_n_1_pid(int flow_id); + +int reg_flow_snapshot_rekey_due(struct rekey_info * snap, + int max); + +void reg_flow_clear_in_flight(int flow_id); + +bool reg_flow_rekey_begin(int flow_id); + +bool reg_flow_rekey_should_yield(int flow_id); + +int reg_flow_store_pending(int flow_id, + const uint8_t * seed, + uint8_t epoch, + bool initiator); + +int reg_flow_store_pending_direct(int flow_id, + const uint8_t * seed, + uint8_t epoch); + +int reg_flow_take_pending(int flow_id, + uid_t uid, + pid_t cpid, + uint8_t * seed, + uint8_t * epoch, + bool * initiator); + +bool reg_flow_rekey_arr_admit(int flow_id, + pid_t n_1_pid, + bool is_req); + +void reg_flow_rekey_arr_done(int flow_id, + bool 
is_req); + +bool reg_flow_owned_by(int flow_id, + uid_t uid); + +void reg_notify_flow(int flow_id, + int event); + +void reg_notify_flow_peers(int flow_id, + int event); + void reg_dealloc_flow(struct flow_info * info); void reg_dealloc_flow_resp(struct flow_info * info); diff --git a/src/irmd/reg/tests/flow_test.c b/src/irmd/reg/tests/flow_test.c index 7e1c1360..18214078 100644 --- a/src/irmd/reg/tests/flow_test.c +++ b/src/irmd/reg/tests/flow_test.c @@ -122,6 +122,21 @@ static int test_reg_flow_create_has_mpl(void) { return TEST_RC_SUCCESS; } +static int test_reg_flow_create_has_mtu(void) { + struct flow_info info = { + .id = 1, + .n_pid = 1, + .n_1_pid = 0, + .mtu = 1400, + .qs = qos_raw, + .state = FLOW_ALLOC_PENDING + }; + + reg_flow_create(&info); /* assert fail */ + + return TEST_RC_SUCCESS; +} + static int test_reg_flow_update(void) { struct reg_flow * f; @@ -136,7 +151,7 @@ static int test_reg_flow_update(void) struct flow_info upd = { .id = 1, .n_pid = 1, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_DEALLOCATED }; @@ -179,7 +194,7 @@ static int test_reg_flow_update_wrong_id(void) struct flow_info upd = { .id = 2, .n_pid = 1, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_DEALLOCATED }; @@ -210,6 +225,7 @@ static int test_reg_flow_assert_fails(void) ret |= test_assert_fail(test_reg_flow_create_has_n_1_pid); ret |= test_assert_fail(test_reg_flow_create_wrong_state); ret |= test_assert_fail(test_reg_flow_create_has_mpl); + ret |= test_assert_fail(test_reg_flow_create_has_mtu); ret |= test_assert_fail(test_reg_flow_update_wrong_id); return ret; diff --git a/src/irmd/reg/tests/reg_test.c b/src/irmd/reg/tests/reg_test.c index f4b0188b..a8c1b1fa 100644 --- a/src/irmd/reg/tests/reg_test.c +++ b/src/irmd/reg/tests/reg_test.c @@ -31,6 +31,7 @@ #define TEST_N_1_PID 3999 #define TEST_FAKE_ID 9128349 #define TEST_MPL 5 +#define TEST_MTU 1400 #define TEST_PROG "reg_test" /* own binary for binary check */ #define TEST_IPCP "testipcp" #define TEST_NAME "testname" 
@@ -239,7 +240,7 @@ static int test_reg_accept_flow_success(void) struct flow_info n_1_info = { .n_1_pid = TEST_N_1_PID, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */ }; @@ -266,6 +267,7 @@ static int test_reg_accept_flow_success(void) n_1_info.id = info.id; n_1_info.mpl = 1; + n_1_info.mtu = TEST_MTU; pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info); @@ -284,6 +286,11 @@ static int test_reg_accept_flow_success(void) goto fail; } + if (info.mtu != TEST_MTU) { + printf("MTU not propagated.\n"); + goto fail; + } + if (rbuf.data == NULL) { printf("rbuf data not returned.\n"); goto fail; @@ -336,7 +343,7 @@ static int test_reg_accept_flow_success_no_crypt(void) struct flow_info n_1_info = { .n_1_pid = TEST_N_1_PID, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */ }; @@ -363,6 +370,7 @@ static int test_reg_accept_flow_success_no_crypt(void) n_1_info.id = info.id; n_1_info.mpl = 1; + n_1_info.mtu = TEST_MTU; pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info); @@ -381,6 +389,11 @@ static int test_reg_accept_flow_success_no_crypt(void) goto fail; } + if (info.mtu != TEST_MTU) { + printf("MTU not propagated.\n"); + goto fail; + } + if (rbuf.data == NULL) { printf("rbuf data was not returned.\n"); goto fail; @@ -431,7 +444,7 @@ static int test_reg_allocate_flow_fail(void) struct flow_info n_1_info = { .n_1_pid = TEST_N_1_PID, - .qs = qos_data, + .qs = qos_msg, .state = FLOW_DEALLOCATED /* RESPONSE FAIL */ }; @@ -489,6 +502,93 @@ static int test_reg_allocate_flow_fail(void) return TEST_RC_FAIL; } +static int test_reg_respond_alloc_duplicate(void) +{ + pthread_t thr; + struct timespec abstime; + struct timespec timeo = TIMESPEC_INIT_S(1); + buffer_t rbuf = BUF_INIT; + buffer_t empty = BUF_INIT; + struct flow_info dup_info; + + struct flow_info info = { + .n_pid = TEST_PID, + .qs = qos_raw + }; + + struct flow_info n_1_info = { + .n_1_pid = TEST_N_1_PID, + .qs = qos_msg, + .state = 
FLOW_ALLOCATED /* RESPONSE SUCCESS */ + }; + + TEST_START(); + + clock_gettime(PTHREAD_COND_CLOCK, &abstime); + ts_add(&abstime, &timeo, &abstime); + + if (reg_init() < 0) { + printf("Failed to init registry.\n"); + goto fail; + } + + if (reg_create_flow(&info) < 0) { + printf("Failed to add flow.\n"); + goto fail; + } + + info.n_1_pid = TEST_N_1_PID; + + if (reg_prepare_flow_alloc(&info) < 0) { + printf("Failed to prepare flow for alloc.\n"); + goto fail; + } + + n_1_info.id = info.id; + n_1_info.mpl = 1; + n_1_info.mtu = TEST_MTU; + + pthread_create(&thr, NULL, test_flow_respond_alloc, &n_1_info); + + if (reg_wait_flow_allocated(&info, &rbuf, &abstime) < 0) { + printf("Flow allocation failed.\n"); + pthread_join(thr, NULL); + reg_destroy_flow(info.id); + reg_fini(); + goto fail; + } + + pthread_join(thr, NULL); + freebuf(rbuf); + + if (info.mtu != TEST_MTU) { + printf("MTU not propagated.\n"); + goto fail; + } + + /* Duplicate reply on an already-ALLOCATED flow must not assert. */ + dup_info = n_1_info; + dup_info.state = FLOW_DEALLOCATED; + + if (reg_respond_alloc(&dup_info, &empty, -EREPLAY) != -1) { + printf("Duplicate respond_alloc should return -1.\n"); + goto fail; + } + + reg_dealloc_flow(&info); + reg_dealloc_flow_resp(&info); + reg_destroy_flow(n_1_info.id); + + reg_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + REG_TEST_FAIL(); + return TEST_RC_FAIL; +} + struct direct_alloc_info { struct flow_info info; buffer_t rsp; @@ -564,7 +664,7 @@ static int test_reg_direct_flow_success(void) dai.info.id = info.id; dai.info.n_1_pid = TEST_N_1_PID; dai.info.mpl = TEST_MPL; - dai.info.qs = qos_data; + dai.info.qs = qos_msg; dai.info.state = FLOW_ALLOCATED; dai.rsp.len = 0; dai.rsp.data = NULL; @@ -671,6 +771,167 @@ static int test_reg_direct_flow_success(void) return TEST_RC_FAIL; } +/* + * Direct-flow re-key: one shared seed is parked for both local apps. 
The + * per-app initiator role is resolved from the verified caller pid (the + * allocator is n_1_pid), and the seed is held until both have pulled it. + */ +static int test_reg_direct_flow_rekey(void) +{ + pthread_t thr; + struct timespec abstime; + struct timespec timeo = TIMESPEC_INIT_S(1); + buffer_t rbuf = BUF_INIT; + buffer_t rsp; + buffer_t no_crt = BUF_INIT; + struct direct_alloc_info dai; + uint8_t seed[SYMMKEYSZ]; + uint8_t out[SYMMKEYSZ]; + uint8_t epoch; + bool initiator; + size_t i; + + struct flow_info info = { + .n_pid = TEST_PID, + .qs = qos_raw + }; + + TEST_START(); + + for (i = 0; i < SYMMKEYSZ; ++i) + seed[i] = (uint8_t) i; + + clock_gettime(PTHREAD_COND_CLOCK, &abstime); + + ts_add(&abstime, &timeo, &abstime); + + if (reg_init() < 0) { + printf("Failed to init registry.\n"); + goto fail; + } + + if (reg_create_flow(&info) < 0) { + printf("Failed to add flow.\n"); + goto fail; + } + + if (reg_prepare_flow_accept(&info) < 0) { + printf("Failed to prepare for accept.\n"); + goto fail; + } + + dai.info.id = info.id; + dai.info.n_1_pid = TEST_N_1_PID; + dai.info.mpl = TEST_MPL; + dai.info.qs = qos_msg; + dai.info.state = FLOW_ALLOCATED; + dai.rsp.len = 0; + dai.rsp.data = NULL; + dai.abstime = abstime; + + pthread_create(&thr, NULL, test_flow_alloc_direct, &dai); + + if (reg_wait_flow_accepted(&info, &rbuf, &abstime) < 0) { + printf("Flow accept failed.\n"); + pthread_join(thr, NULL); + goto fail; + } + + freebuf(rbuf); + + rsp.data = (uint8_t *) strdup(TEST_DATA2); + if (rsp.data == NULL) { + printf("Failed to strdup rsp data.\n"); + pthread_join(thr, NULL); + goto fail; + } + rsp.len = strlen(TEST_DATA2) + 1; + + if (reg_respond_flow_direct(info.id, &rsp) < 0) { + printf("Failed to respond direct.\n"); + freebuf(rsp); + pthread_join(thr, NULL); + goto fail; + } + + pthread_join(thr, NULL); + + freebuf(dai.rsp); + + if (!reg_flow_is_direct(info.id)) { + printf("Flow not marked direct.\n"); + goto fail; + } + + reg_flow_set_rekey(info.id, false, 
no_crt); + + if (reg_flow_store_pending_direct(info.id, seed, 5) < 0) { + printf("Failed to store pending direct seed.\n"); + goto fail; + } + + if (!reg_flow_rekey_pending(info.id)) { + printf("Seed not pending after store.\n"); + goto fail; + } + + /* Allocator (n_1_pid) pulls: initiator role, seed still held. */ + if (reg_flow_take_pending(info.id, 0, TEST_N_1_PID, out, + &epoch, &initiator) != 0) { + printf("Allocator failed to take pending seed.\n"); + goto fail; + } + + if (!initiator || epoch != 5 || memcmp(out, seed, SYMMKEYSZ) != 0) { + printf("Allocator got wrong seed/role/epoch.\n"); + goto fail; + } + + if (!reg_flow_rekey_pending(info.id)) { + printf("Seed cleared before both apps pulled.\n"); + goto fail; + } + + /* Acceptor (n_pid) pulls: responder role, seed now released. */ + if (reg_flow_take_pending(info.id, 0, TEST_PID, out, + &epoch, &initiator) != 0) { + printf("Acceptor failed to take pending seed.\n"); + goto fail; + } + + if (initiator || epoch != 5 || memcmp(out, seed, SYMMKEYSZ) != 0) { + printf("Acceptor got wrong seed/role/epoch.\n"); + goto fail; + } + + if (reg_flow_rekey_pending(info.id)) { + printf("Seed still pending after both pulled.\n"); + goto fail; + } + + if (reg_flow_get_epoch(info.id) != 5) { + printf("Flow epoch not advanced.\n"); + goto fail; + } + + info.n_pid = TEST_PID; + reg_dealloc_flow(&info); + + info.n_pid = TEST_N_1_PID; + reg_dealloc_flow(&info); + + reg_destroy_flow(info.id); + + reg_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + REG_TEST_FAIL(); + return TEST_RC_FAIL; +} + static int test_reg_flow(void) { int rc = 0; @@ -679,7 +940,9 @@ static int test_reg_flow(void) { rc |= test_reg_accept_flow_success(); rc |= test_reg_accept_flow_success_no_crypt(); rc |= test_reg_allocate_flow_fail(); + rc |= test_reg_respond_alloc_duplicate(); rc |= test_reg_direct_flow_success(); + rc |= test_reg_direct_flow_rekey(); return rc; } @@ -774,6 +1037,7 @@ static int test_reg_list_ipcps(void) while (len-- > 
0) ipcp_list_msg__free_unpacked(ipcps[len], NULL); + free(ipcps); for (i = 0; i < 10; i++) @@ -840,6 +1104,7 @@ static int test_insert_ipcps(void) while (len-- > 0) ipcp_list_msg__free_unpacked(ipcps[len], NULL); + free(ipcps); reg_clear(); @@ -1017,6 +1282,7 @@ static int test_reg_list_names(void) for (i = 0; i < len; i++) name_info_msg__free_unpacked(names[i], NULL); + free(names); for (i = 0; i < 10; i++) { diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 79263924..3abf39d0 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -17,8 +17,12 @@ protobuf_generate_c(IPCP_PROTO_SRCS IPCP_PROTO_HDRS set(SOURCE_FILES_COMMON bitmap.c btree.c - crc32.c + crc/crc8.c + crc/crc16.c + crc/crc32.c + crc/crc64.c crypt.c + crypt/keyrot.c hash.c lockfile.c logs.c @@ -36,6 +40,7 @@ set(SOURCE_FILES_COMMON ssm/pool.c sockets.c tpm.c + tw.c utils.c ) @@ -88,6 +93,13 @@ if(HAVE_FUSE) target_link_libraries(ouroboros-common PRIVATE Fuse::Fuse) endif() +if(HAVE_LIBURCU) + target_link_libraries(ouroboros-common PRIVATE Urcu::Urcu) + # urcu headers require C99; override the global -std=c89 for this TU only. + set_source_files_properties(crypt/keyrot.c PROPERTIES + COMPILE_OPTIONS "-std=gnu99") +endif() + install(TARGETS ouroboros-common EXPORT OuroborosTargets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) @@ -155,5 +167,6 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/ssm/ssm.h.in" if(BUILD_TESTS) add_subdirectory(tests) + add_subdirectory(crc/tests) add_subdirectory(ssm/tests) endif() diff --git a/src/lib/config.h.in b/src/lib/config.h.in index 08e9baf6..26ebe56b 100644 --- a/src/lib/config.h.in +++ b/src/lib/config.h.in @@ -20,6 +20,14 @@ * Foundation, Inc., http://www.fsf.org/about/contact/. 
*/ +#ifndef MILLION +#define MILLION 1000000LL +#endif + +#ifndef BILLION +#define BILLION 1000000000LL +#endif + #cmakedefine HAVE_SYS_RANDOM #cmakedefine HAVE_EXPLICIT_BZERO #cmakedefine HAVE_LIBGCRYPT @@ -29,7 +37,7 @@ #cmakedefine HAVE_OPENSSL_ML_DSA #cmakedefine HAVE_OPENSSL_SLH_DSA #define HAVE_ENCRYPTION -#define SECMEM_GUARD @SECMEM_GUARD@ +#define SECMEM_MINSIZE @SECMEM_MINSIZE@ #endif #define PROC_SECMEM_MAX @PROC_SECMEM_MAX@ @@ -37,6 +45,8 @@ #cmakedefine QOS_DISABLE_CRC #cmakedefine HAVE_OPENSSL_RNG +#cmakedefine HAVE_PCLMUL +#cmakedefine HAVE_PMULL #define SHM_LOCKFILE_NAME "@SHM_LOCKFILE_NAME@" #define FLOW_ALLOC_TIMEOUT @FLOW_ALLOC_TIMEOUT@ @@ -60,16 +70,20 @@ #cmakedefine PROC_FLOW_STATS #endif +#cmakedefine HAVE_LIBURCU + +#cmakedefine FRCT_DEBUG_STDOUT + #define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@ -#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@ -#define PROG_RES_FDS @PROG_RES_FDS@ -#define PROG_MAX_FQUEUES @PROG_MAX_FQUEUES@ +#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@ +#define PROC_RES_FDS @PROC_RES_FDS@ +#define PROC_MAX_FQUEUES @PROC_MAX_FQUEUES@ /* Default Delta-t parameters */ #cmakedefine FRCT_LINUX_RTT_ESTIMATOR -#define DELT_A (@DELTA_T_ACK@) /* ns */ -#define DELT_R (@DELTA_T_RTX@) /* ns */ +#define DELT_A (@DELTA_T_ACK@) /* ms */ +#define DELT_R (@DELTA_T_RTX@) /* ms */ #define RQ_SIZE (@FRCT_REORDER_QUEUE_SIZE@) #define START_WINDOW (@FRCT_START_WINDOW@) @@ -80,9 +94,6 @@ #define TICTIME (@FRCT_TICK_TIME@ * 1000) /* ns */ /* Retransmission tuning */ -#cmakedefine RXM_BUFFER_ON_HEAP -#cmakedefine RXM_BLOCKING - #define RXMQ_RES (@RXM_MIN_RESOLUTION@) /* 2^N ns */ #define RXMQ_BUMP (@RXM_WHEEL_MULTIPLIER@) #define RXMQ_LVLS (@RXM_WHEEL_LEVELS@) @@ -91,4 +102,9 @@ #define ACKQ_SLOTS (@ACK_WHEEL_SLOTS@) #define ACKQ_RES (@ACK_WHEEL_RESOLUTION@) /* 2^N ns */ -#define KEY_ROTATION_BIT (@KEY_ROTATION_BIT@) /* Bit for key rotation */ +#define KEY_LEAF_BITS (@KEY_LEAF_BITS@) /* pkts/leaf-key = 2^n */ +#define KEY_NODE_BITS (@KEY_NODE_BITS@) /* 
leaf-keys/node = 2^n */ +#define KEY_NODE_COUNT (@KEY_NODE_COUNT@) /* node keys/batch N */ +#define KEY_REKEY_WATERMARK (@KEY_REKEY_WATERMARK@) /* node-keys-left trig */ +#define KEY_REPLAY_WINDOW (@KEY_REPLAY_WINDOW@) /* rx replay win pkts */ +#define FLOW_WM_CHECK (1u << @KEY_REKEY_WM_CHECK_BITS@) /* wm chk/n wr */ diff --git a/src/lib/crc/crc16.c b/src/lib/crc/crc16.c new file mode 100644 index 00000000..9dc59429 --- /dev/null +++ b/src/lib/crc/crc16.c @@ -0,0 +1,61 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * CRC-16/CCITT-FALSE (reveng catalog, alias CRC-16/IBM-3740): + * poly = 0x1021 + * init = 0xffff + * refin = false + * refout = false + * xorout = 0x0000 + * check = crc16_ccitt_false("123456789") == 0x29b1 + */ + +#include "config.h" + +#include <ouroboros/crc16.h> + +/* Bit-by-bit MSB-first CRC. 
*/ +void crc16_ccitt_false(uint16_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint16_t c; + size_t n; + int i; + + p = (const uint8_t *) buf; + c = *crc ^ 0xffff; /* XORs 0xffff into the caller's value on every call; *crc == 0 yields init 0xffff */ + + for (n = 0; n < len; n++) { + c ^= ((uint16_t) p[n]) << 8; /* next byte enters at the top of the register (MSB-first) */ + for (i = 0; i < 8; i++) { + if (c & 0x8000) + c = (uint16_t) ((c << 1) ^ 0x1021); + else + c = (uint16_t) (c << 1); + } + } + + *crc = c; /* no output XOR (xorout 0x0000). NOTE(review): a result fed back in gets 0xffff XORed in again, so chaining over split buffers looks unsupported - confirm with callers */ +} diff --git a/src/lib/crc32.c b/src/lib/crc/crc32.c index 0fdb62b1..0fdb62b1 100644 --- a/src/lib/crc32.c +++ b/src/lib/crc/crc32.c diff --git a/src/lib/crc/crc64.c b/src/lib/crc/crc64.c new file mode 100644 index 00000000..1b6fb5f6 --- /dev/null +++ b/src/lib/crc/crc64.c @@ -0,0 +1,363 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * 64-bit Cyclic Redundancy Check (NVMe variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * CRC-64/NVMe (reveng catalog): + * poly = 0xad93d23594c93659 + * init = 0xffffffffffffffff + * refin = true + * refout = true + * xorout = 0xffffffffffffffff + * check = crc64_nvme("123456789") == 0xae8b14860a799888 + */ + +#include "config.h" + +#include <ouroboros/crc64.h> + +/* + * Reflected CRC-64/NVMe table. Polynomial in reflected form: + * 0x9a6c9329ac4bc9b5 (bitrev of 0xad93d23594c93659). 
+ */ +static const uint64_t crc64_nvme_tab[256] = { + 0x0000000000000000ULL, 0x7f6ef0c830358979ULL, + 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL, + 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL, + 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL, + 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL, + 0x58c10d24087fec87ULL, 0x27affdec384a65feULL, + 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL, + 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL, + 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL, + 0x863d1eabe8d57d73ULL, 0xf953ee63d8e0f40aULL, + 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL, + 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL, + 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL, + 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL, + 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL, + 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL, + 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL, + 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL, + 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL, + 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL, + 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL, + 0xa900f35319033385ULL, 0xd66e039b2936bafcULL, + 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL, + 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL, + 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL, + 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL, + 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL, + 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL, + 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL, + 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL, + 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL, + 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL, + 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL, + 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL, + 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL, + 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL, + 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL, + 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL, + 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL, + 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL, + 0xafba2586f2d042eeULL, 
0xd0d4d54ec2e5cb97ULL, + 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL, + 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL, + 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL, + 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL, + 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL, + 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL, + 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL, + 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL, + 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL, + 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL, + 0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL, + 0x8087c87e03060c18ULL, 0xffe938b633338561ULL, + 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL, + 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL, + 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL, + 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL, + 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL, + 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL, + 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL, + 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL, + 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL, + 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL, + 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL, + 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL, + 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL, + 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL, + 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL, + 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL, + 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL, + 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL, + 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL, + 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL, + 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL, + 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL, + 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL, + 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL, + 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL, + 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL, + 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL, + 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL, + 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL, + 
0xa2cf882d2576a038ULL, 0xdda178e515432941ULL, + 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL, + 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL, + 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL, + 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL, + 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL, + 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL, + 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL, + 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL, + 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL, + 0xb5517ed15d9d8743ULL, 0xca3f8e196da80e3aULL, + 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL, + 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL, + 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL, + 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL, + 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL, + 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL, + 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL, + 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL, + 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL, + 0x224840532670ac20ULL, 0x5d26b09b16452559ULL, + 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL, + 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL, + 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL, + 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL, + 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL, + 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL, + 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL, + 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL, + 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL, + 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL, + 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL, + 0x759552905f188d57ULL, 0x0afba2586f2d042eULL, + 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL, + 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL, + 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL, + 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL, + 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL, + 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL, + 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL, + 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL, + 0xf3a84c3bb7cdf024ULL, 
0x8cc6bcf387f8795dULL, + 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL, + 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL, + 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL, + 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL +}; + +static __inline__ uint64_t crc64_nvme_step(uint64_t c, + const uint8_t * p, + size_t len) +{ + size_t n; + + for (n = 0; n < len; n++) + c = crc64_nvme_tab[(c ^ p[n]) & 0xff] ^ (c >> 8); + + return c; +} + +void crc64_nvme_table(uint64_t * crc, + const void * buf, + size_t len) +{ + uint64_t c; + + c = crc64_nvme_step(*crc ^ UINT64_MAX, + (const uint8_t *) buf, len); + + *crc = c ^ UINT64_MAX; +} + +#ifdef HAVE_PCLMUL + +#include <smmintrin.h> +#include <wmmintrin.h> + +/* + * Fold-by-16 constants for reflected CRC-64/NVMe. Properties of the + * polynomial; identical between the PCLMUL and PMULL backends. + * k3 = bitrev64(x^(128+64) mod P) << 1 + * k4 = bitrev64(x^(128+0) mod P) << 1 + */ +static const uint64_t k3_clmul = 0xeadc41fd2ba3d420ULL; +static const uint64_t k4_clmul = 0x21e9761e252621acULL; + +__attribute__((target("pclmul,sse4.1"))) +static __m128i fold16(__m128i x, + __m128i k) +{ + __m128i lo; + __m128i hi; + + lo = _mm_clmulepi64_si128(x, k, 0x00); + hi = _mm_clmulepi64_si128(x, k, 0x11); + return _mm_xor_si128(lo, hi); +} + +/* + * Fold-by-16 over 16-byte chunks; the 128-bit folded state is then + * emitted as 16 little-endian bytes and run through the byte-table + * loop together with any tail (<=15 bytes). The 16-byte minimum on + * the bulk loop is why the short-input path uses the table directly. 
+ */ +__attribute__((target("pclmul,sse4.1"))) +static void crc64_nvme_clmul(uint64_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint64_t seed; + uint64_t c; + size_t off; + __m128i x; + __m128i k; + uint8_t post[16]; + + p = (const uint8_t *) buf; + seed = *crc; + + if (len < 16) { + c = crc64_nvme_step(seed ^ UINT64_MAX, p, len); + *crc = c ^ UINT64_MAX; + return; + } + + x = _mm_loadu_si128((const __m128i *) p); + x = _mm_xor_si128(x, _mm_cvtsi64_si128((int64_t) + (seed ^ UINT64_MAX))); + + k = _mm_set_epi64x((int64_t) k4_clmul, (int64_t) k3_clmul); + + off = 16; + while (off + 16 <= len) { + __m128i d; + + d = _mm_loadu_si128((const __m128i *) (p + off)); + x = _mm_xor_si128(fold16(x, k), d); + off += 16; + } + + _mm_storeu_si128((__m128i *) post, x); + + c = crc64_nvme_step(0, post, 16); + c = crc64_nvme_step(c, p + off, len - off); + + *crc = c ^ UINT64_MAX; +} + +#endif /* HAVE_PCLMUL */ + +#ifdef HAVE_PMULL + +#include <arm_neon.h> + +/* Same fold-by-16 constants as the PCLMUL path (poly properties). 
*/ +static const uint64_t k3_pmull = 0xeadc41fd2ba3d420ULL; +static const uint64_t k4_pmull = 0x21e9761e252621acULL; + +__attribute__((target("+crypto"))) +static uint64x2_t fold16_pmull(uint64x2_t x, + uint64x2_t k) +{ + poly64x2_t xp; + poly64x2_t kp; + uint64x2_t lo; + uint64x2_t hi; + + xp = vreinterpretq_p64_u64(x); + kp = vreinterpretq_p64_u64(k); + lo = vreinterpretq_u64_p128( + vmull_p64((poly64_t) vgetq_lane_u64(x, 0), + (poly64_t) vgetq_lane_u64(k, 0))); + hi = vreinterpretq_u64_p128(vmull_high_p64(xp, kp)); + return veorq_u64(lo, hi); +} + +__attribute__((target("+crypto"))) +static void crc64_nvme_pmull(uint64_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint64_t seed; + uint64_t c; + size_t off; + uint64x2_t x; + uint64x2_t k; + uint64_t seed_lane[2]; + uint64_t k_lanes[2]; + uint8_t post[16]; + + p = (const uint8_t *) buf; + seed = *crc; + + if (len < 16) { + c = crc64_nvme_step(seed ^ UINT64_MAX, p, len); + *crc = c ^ UINT64_MAX; + return; + } + + x = vld1q_u64((const uint64_t *) p); + seed_lane[0] = seed ^ UINT64_MAX; + seed_lane[1] = 0; + x = veorq_u64(x, vld1q_u64(seed_lane)); + + k_lanes[0] = k3_pmull; + k_lanes[1] = k4_pmull; + k = vld1q_u64(k_lanes); + + off = 16; + while (off + 16 <= len) { + uint64x2_t d; + + d = vld1q_u64((const uint64_t *) (p + off)); + x = veorq_u64(fold16_pmull(x, k), d); + off += 16; + } + + vst1q_u8(post, vreinterpretq_u8_u64(x)); + + c = crc64_nvme_step(0, post, 16); + c = crc64_nvme_step(c, p + off, len - off); + + *crc = c ^ UINT64_MAX; +} +#endif /* HAVE_PMULL */ + +void crc64_nvme(uint64_t * crc, + const void * buf, + size_t len) +{ +#ifdef HAVE_PCLMUL + crc64_nvme_clmul(crc, buf, len); +#elif defined(HAVE_PMULL) + crc64_nvme_pmull(crc, buf, len); +#else + crc64_nvme_table(crc, buf, len); +#endif +} diff --git a/src/lib/crc/crc8.c b/src/lib/crc/crc8.c new file mode 100644 index 00000000..20976b29 --- /dev/null +++ b/src/lib/crc/crc8.c @@ -0,0 +1,62 @@ +/* + * Ouroboros - Copyright (C) 2016 - 
2026 + * + * 8-bit Cyclic Redundancy Check (AUTOSAR variant) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +/* + * CRC-8/AUTOSAR (reveng catalog): + * poly = 0x2f + * init = 0xff + * refin = false + * refout = false + * xorout = 0xff + * check = crc8_autosar("123456789") == 0xdf + */ + +#include "config.h" + +#include <ouroboros/crc8.h> + + + /* Bit-by-bit MSB-first CRC. 
*/ +void crc8_autosar(uint8_t * crc, + const void * buf, + size_t len) +{ + const uint8_t * p; + uint8_t c; + size_t n; + int i; + + p = (const uint8_t *) buf; + c = *crc ^ 0xff; + + for (n = 0; n < len; n++) { + c ^= p[n]; + for (i = 0; i < 8; i++) { + if (c & 0x80) + c = (uint8_t) ((c << 1) ^ 0x2f); + else + c = (uint8_t) (c << 1); + } + } + + *crc = c ^ 0xff; +} diff --git a/src/lib/crc/tests/CMakeLists.txt b/src/lib/crc/tests/CMakeLists.txt new file mode 100644 index 00000000..11daca5a --- /dev/null +++ b/src/lib/crc/tests/CMakeLists.txt @@ -0,0 +1,21 @@ +get_filename_component(PARENT_PATH ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) +get_filename_component(PARENT_DIR ${PARENT_PATH} NAME) + +compute_test_prefix() + +create_test_sourcelist(${PARENT_DIR}_tests test_suite.c + # Add new tests here + crc8_test.c + crc16_test.c + crc32_test.c + crc64_test.c + ) + +add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests}) + +disable_test_logging_for_target(${PARENT_DIR}_test) +target_link_libraries(${PARENT_DIR}_test ouroboros-common) + +add_dependencies(build_tests ${PARENT_DIR}_test) + +ouroboros_register_tests(TARGET ${PARENT_DIR}_test TESTS ${${PARENT_DIR}_tests}) diff --git a/src/lib/crc/tests/crc16_test.c b/src/lib/crc/tests/crc16_test.c new file mode 100644 index 00000000..03a5b504 --- /dev/null +++ b/src/lib/crc/tests/crc16_test.c @@ -0,0 +1,67 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the CRC-16/CCITT-FALSE function + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#include "config.h" + +#include <ouroboros/crc16.h> + +#include <test/test.h> + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +/* reveng-catalog smoke vectors. */ +static int test_crc16_ccitt_false_basic(void) +{ + uint16_t crc; + + TEST_START(); + + crc = 0; + crc16_ccitt_false(&crc, "", 0); + if (crc != 0xffff) + goto fail; + + crc = 0; + crc16_ccitt_false(&crc, "123456789", 9); + if (crc != 0x29b1) + goto fail; + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int crc16_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_crc16_ccitt_false_basic(); + return ret; +} diff --git a/src/lib/tests/crc32_test.c b/src/lib/crc/tests/crc32_test.c index 5a1ddd87..5a1ddd87 100644 --- a/src/lib/tests/crc32_test.c +++ b/src/lib/crc/tests/crc32_test.c diff --git a/src/lib/crc/tests/crc64_test.c b/src/lib/crc/tests/crc64_test.c new file mode 100644 index 00000000..cf3f5ca3 --- /dev/null +++ b/src/lib/crc/tests/crc64_test.c @@ -0,0 +1,126 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the CRC-64/NVMe function + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#include "config.h" + +#include <ouroboros/crc64.h> +#include <ouroboros/random.h> + +#include <test/test.h> + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +/* Reference impl, internal to libouroboros-common. */ +extern void crc64_nvme_table(uint64_t * crc, + const void * buf, + size_t len); + +/* reveng-catalog smoke vectors plus a 16-byte fold-boundary check. */ +static int test_crc64_nvme_basic(void) +{ + uint64_t crc; + + TEST_START(); + + crc = 0; + crc64_nvme(&crc, "", 0); + if (crc != 0x0000000000000000ULL) + goto fail; + + crc = 0; + crc64_nvme(&crc, "123456789", 9); + if (crc != 0xae8b14860a799888ULL) + goto fail; + + crc = 0; + crc64_nvme(&crc, "0123456789abcdef", 16); + if (crc != 0x091485ca7018730eULL) + goto fail; + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL) +/* Cross-check the accelerated dispatcher path against the byte-table. 
*/ +static int test_crc64_nvme_random(void) +{ + static const size_t lens[] = { + 0, 1, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, + 129, 255, 256, 257, 1023, 1024, 1025, 4096 + }; + uint8_t buf[4096]; + size_t i; + uint64_t ref; + uint64_t got; + + TEST_START(); + + if (random_buffer(buf, sizeof(buf)) < 0) { + printf("Failed to generate random data.\n"); + goto fail; + } + + for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) { + ref = 0; + crc64_nvme_table(&ref, buf, lens[i]); + + got = 0; + crc64_nvme(&got, buf, lens[i]); + + if (ref == got) + continue; + + printf("Mismatch at len=%zu: table=0x%016lx disp=0x%016lx\n", + lens[i], + (unsigned long) ref, + (unsigned long) got); + goto fail; + } + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} +#endif + +int crc64_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_crc64_nvme_basic(); +#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL) + ret |= test_crc64_nvme_random(); +#endif + return ret; +} diff --git a/src/lib/crc/tests/crc8_test.c b/src/lib/crc/tests/crc8_test.c new file mode 100644 index 00000000..f7bb33b8 --- /dev/null +++ b/src/lib/crc/tests/crc8_test.c @@ -0,0 +1,67 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the CRC-8/AUTOSAR function + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#include "config.h" + +#include <ouroboros/crc8.h> + +#include <test/test.h> + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +/* reveng-catalog smoke vectors. */ +static int test_crc8_autosar_basic(void) +{ + uint8_t crc; + + TEST_START(); + + crc = 0; + crc8_autosar(&crc, "", 0); + if (crc != 0x00) + goto fail; + + crc = 0; + crc8_autosar(&crc, "123456789", 9); + if (crc != 0xdf) + goto fail; + + TEST_SUCCESS(); + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int crc8_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_crc8_autosar_basic(); + return ret; +} diff --git a/src/lib/crypt.c b/src/lib/crypt.c index 71197f6e..5da9d392 100644 --- a/src/lib/crypt.c +++ b/src/lib/crypt.c @@ -27,10 +27,14 @@ #include <config.h> #include <ouroboros/errno.h> +#include <ouroboros/pthread.h> #include <ouroboros/random.h> #include <ouroboros/crypt.h> +#include "crypt/keyrot.h" + #ifdef HAVE_OPENSSL +#include <openssl/crypto.h> #include <openssl/evp.h> #include "crypt/openssl.h" #endif @@ -50,18 +54,12 @@ static const struct nid_map cipher_nid_map[] = { {NID_aes_192_gcm, "aes-192-gcm"}, {NID_aes_256_gcm, "aes-256-gcm"}, {NID_chacha20_poly1305, "chacha20-poly1305"}, - {NID_aes_128_ctr, "aes-128-ctr"}, - {NID_aes_192_ctr, "aes-192-ctr"}, - {NID_aes_256_ctr, "aes-256-ctr"}, {NID_undef, NULL} }; /* Ordered in strength preference, lowest first */ const uint16_t crypt_supported_nids[] = { #ifdef HAVE_OPENSSL - NID_aes_128_ctr, - NID_aes_192_ctr, - NID_aes_256_ctr, NID_aes_128_gcm, NID_aes_192_gcm, NID_aes_256_gcm, @@ -87,23 +85,23 @@ static const struct nid_map kex_nid_map[] = { {NID_undef, NULL} }; -/* Ordered in strength preference, lowest first */ +/* Ordered in strength preference, lowest 
first (NIST SP 800-57 levels) */ const uint16_t kex_supported_nids[] = { #ifdef HAVE_OPENSSL - NID_ffdhe2048, - NID_X9_62_prime256v1, - NID_X25519, - NID_ffdhe3072, - NID_secp384r1, - NID_ffdhe4096, - NID_X448, - NID_secp521r1, + NID_ffdhe2048, /* FFDHE-2048, ~112-bit */ + NID_X9_62_prime256v1, /* ECDH P-256, 128-bit */ + NID_X25519, /* ECDH X25519, 128-bit */ + NID_ffdhe3072, /* FFDHE-3072, ~128-bit */ + NID_ffdhe4096, /* FFDHE-4096, ~152-bit */ + NID_secp384r1, /* ECDH P-384, 192-bit */ + NID_X448, /* ECDH X448, 224-bit */ + NID_secp521r1, /* ECDH P-521, 256-bit */ #ifdef HAVE_OPENSSL_ML_KEM - NID_MLKEM512, - NID_MLKEM768, - NID_MLKEM1024, - NID_X25519MLKEM768, - NID_X448MLKEM1024, + NID_MLKEM512, /* ML-KEM-512, PQC L1 (~AES-128) */ + NID_MLKEM768, /* ML-KEM-768, PQC L3 (~AES-192) */ + NID_MLKEM1024, /* ML-KEM-1024, PQC L5 (~AES-256) */ + NID_X25519MLKEM768, /* X25519 + ML-KEM-768, PQC L3 */ + NID_X448MLKEM1024, /* X448 + ML-KEM-1024, PQC L5 */ #endif #endif NID_undef @@ -137,11 +135,13 @@ const uint16_t md_supported_nids[] = { }; struct crypt_ctx { - void * ctx; /* Encryption context */ + struct keyrot * kr; /* backend-independent key rotation */ + void * cipher; /* backend AEAD cipher context */ }; struct auth_ctx { - void * store; + void * store; /* trusted anchors */ + void * chain; /* untrusted build-only interm */ }; static int parse_kex_value(const char * value, @@ -162,6 +162,7 @@ int parse_sec_config(struct sec_config * cfg, char * equals; char * key; char * value; + bool no_enc = false; assert(cfg != NULL); assert(fp != NULL); @@ -172,6 +173,7 @@ int parse_sec_config(struct sec_config * cfg, SET_KEX_KDF_NID(cfg, NID_sha256); SET_KEX_CIPHER_NID(cfg, NID_aes_256_gcm); SET_KEX_DIGEST_NID(cfg, NID_sha256); + /* a.req is seeded per-role by the caller; only auth= overrides it */ while (fgets(line, sizeof(line), fp) != NULL) { char * trimmed; @@ -180,12 +182,10 @@ int parse_sec_config(struct sec_config * cfg, if (line[0] == '#' || line[0] == '\n') continue; - 
/* Check for 'none' keyword */ + /* Bare 'none' keyword replaced by encryption=none */ trimmed = trim_whitespace(line); - if (strcmp(trimmed, "none") == 0) { - memset(cfg, 0, sizeof(*cfg)); - return 0; - } + if (strcmp(trimmed, "none") == 0) + return -EINVAL; /* Find the = separator */ equals = strchr(line, '='); @@ -221,9 +221,34 @@ int parse_sec_config(struct sec_config * cfg, } else { return -EINVAL; } + } else if (strcmp(key, "auth") == 0) { + if (strcmp(value, "required") == 0) { + cfg->a.req = true; + } else if (strcmp(value, "optional") == 0) { + cfg->a.req = false; + } else { + return -EINVAL; + } + } else if (strcmp(key, "cacert") == 0) { + if (strlen(value) >= sizeof(cfg->a.cacert)) + return -EINVAL; + strcpy(cfg->a.cacert, value); + } else if (strcmp(key, "encryption") == 0) { + if (strcmp(value, "none") != 0) + return -EINVAL; + no_enc = true; + } else { + return -EINVAL; } } + if (no_enc) { + /* Digest stays: it belongs to the auth axis */ + CLEAR_KEX_ALGO(cfg); + CLEAR_KEX_KDF(cfg); + CLEAR_KEX_CIPHER(cfg); + } + return 0; } @@ -239,12 +264,17 @@ int load_sec_config_file(struct sec_config * cfg, fp = fopen(path, "r"); if (fp == NULL) { - /* File doesn't exist - disable encryption */ - CLEAR_KEX_ALGO(cfg); - return 0; + /* Absent config disables encryption; other errors fail */ + if (errno == ENOENT) { + CLEAR_KEX_ALGO(cfg); + return 0; + } + return -errno; } + pthread_cleanup_push(__cleanup_fclose, fp); ret = parse_sec_config(cfg, fp); + pthread_cleanup_pop(0); fclose(fp); @@ -592,19 +622,71 @@ int crypt_kex_rank(int nid) return -1; } -/* Hash length now returned by md_digest() */ +/* AEAD primitive: 1:1 backend wrappers used by the data path below. 
*/ +static int crypt_seal(void * cipher, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + uint8_t * out, + uint8_t * tag) +{ +#ifdef HAVE_OPENSSL + return openssl_seal(cipher, key, nonce, aad, in, out, tag); +#else + (void) cipher; + (void) key; + (void) nonce; + (void) aad; + (void) in; + (void) out; + (void) tag; -int crypt_encrypt(struct crypt_ctx * ctx, - buffer_t in, - buffer_t * out) + return -ECRYPT; +#endif +} + +static int crypt_open(void * cipher, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + const uint8_t * tag, + buffer_t * out) { - assert(ctx != NULL); - assert(ctx->ctx != NULL); +#ifdef HAVE_OPENSSL + return openssl_open(cipher, key, nonce, aad, in, tag, out); +#else + (void) cipher; + (void) key; + (void) nonce; + (void) aad; + (void) in; + (void) tag; + (void) out; + + return -ECRYPT; +#endif +} + +int crypt_oneshot_seal(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out) +{ + assert(key != NULL); + assert(nonce != NULL); + assert(out != NULL); #ifdef HAVE_OPENSSL - return openssl_encrypt(ctx->ctx, in, out); + return openssl_oneshot_seal(nid, key, nonce, aad, in, out); #else - (void) ctx; + (void) nid; + (void) key; + (void) nonce; + (void) aad; (void) in; (void) out; @@ -612,17 +694,24 @@ int crypt_encrypt(struct crypt_ctx * ctx, #endif } -int crypt_decrypt(struct crypt_ctx * ctx, - buffer_t in, - buffer_t * out) +int crypt_oneshot_open(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out) { - assert(ctx != NULL); - assert(ctx->ctx != NULL); + assert(key != NULL); + assert(nonce != NULL); + assert(out != NULL); #ifdef HAVE_OPENSSL - return openssl_decrypt(ctx->ctx, in, out); + return openssl_oneshot_open(nid, key, nonce, aad, in, out); #else - (void) ctx; + (void) nid; + (void) key; + (void) nonce; + (void) aad; (void) in; (void) out; @@ -630,6 +719,115 @@ int 
crypt_decrypt(struct crypt_ctx * ctx, #endif } +/* + * Data-path encrypt: rotate the key, frame selector ‖ ct ‖ tag, seal. + * Backend-agnostic: composed from keyrot_*, crypt_seal and crypt_get_tagsz. + */ +int crypt_encrypt(struct crypt_ctx * ctx, + buffer_t in, + buffer_t * out) +{ + uint8_t nonce[KR_NONCE_LEN]; + const uint8_t * key; + uint8_t * ct; + buffer_t aad; + int tagsz; + int out_sz; + + assert(ctx != NULL); + assert(ctx->kr != NULL); + + tagsz = crypt_get_tagsz(ctx); + if (tagsz < 0) + return -ECRYPT; + + out->data = malloc(KR_SELECTOR_LEN + in.len + (size_t) tagsz); + if (out->data == NULL) + goto fail_malloc; + + ct = out->data + KR_SELECTOR_LEN; + + /* keyrot writes the selector into the wire header (== AAD). */ + if (keyrot_tx_next(ctx->kr, out->data, &key, nonce) != 0) + goto fail_encrypt; + + aad.data = out->data; + aad.len = KR_SELECTOR_LEN; + + out_sz = crypt_seal(ctx->cipher, key, nonce, aad, in, ct, ct + in.len); + if (out_sz < 0) + goto fail_encrypt; + + out->len = KR_SELECTOR_LEN + (size_t) out_sz + (size_t) tagsz; + + return 0; + fail_encrypt: + free(out->data); + fail_malloc: + clrbuf(*out); + return -ECRYPT; +} + +/* + * Data-path decrypt: look up the rotated key from the selector, open, and + * commit the replay window only after the tag verifies. + */ +int crypt_decrypt(struct crypt_ctx * ctx, + buffer_t in, + buffer_t * out) +{ + uint8_t nonce[KR_NONCE_LEN]; + const uint8_t * key; + const uint8_t * tag; + struct kr_rx rx; + buffer_t aad; + buffer_t ct; + int tagsz; + int in_sz; + + assert(ctx != NULL); + assert(ctx->kr != NULL); + + tagsz = crypt_get_tagsz(ctx); + if (tagsz < 0) + return -ECRYPT; + + if (in.len < (size_t) (KR_SELECTOR_LEN + tagsz)) + return -ECRYPT; + + if (keyrot_rx_lookup(ctx->kr, in.data, &key, nonce, &rx) != 0) + return -ECRYPT; + + in_sz = (int) in.len - KR_SELECTOR_LEN - tagsz; + + /* +1 keeps malloc(0) defined for an empty (zero-length) frame. 
*/ + out->data = malloc((size_t) in_sz + 1); + if (out->data == NULL) + goto fail_malloc; + + aad.data = in.data; + aad.len = KR_SELECTOR_LEN; + + ct.data = in.data + KR_SELECTOR_LEN; + ct.len = (size_t) in_sz; + + tag = in.data + KR_SELECTOR_LEN + in_sz; + + if (crypt_open(ctx->cipher, key, nonce, aad, ct, tag, out) < 0) + goto fail_decrypt; + + /* Commit replay state only after the tag verifies. */ + if (keyrot_rx_commit(ctx->kr, &rx) != 0) + goto fail_decrypt; + + return 0; + fail_decrypt: + free(out->data); + fail_malloc: + clrbuf(*out); + return -ECRYPT; +} + struct crypt_ctx * crypt_create_ctx(struct crypt_sk * sk) { struct crypt_ctx * crypt; @@ -643,16 +841,23 @@ struct crypt_ctx * crypt_create_ctx(struct crypt_sk * sk) memset(crypt, 0, sizeof(*crypt)); + crypt->kr = keyrot_create(sk->key, sk->epoch, sk->role); + if (crypt->kr == NULL) + goto fail_kr; + #ifdef HAVE_OPENSSL - crypt->ctx = openssl_crypt_create_ctx(sk); - if (crypt->ctx == NULL) - goto fail_ctx; + crypt->cipher = openssl_crypt_create_ctx(sk); + if (crypt->cipher == NULL) + goto fail_cipher; #endif return crypt; + #ifdef HAVE_OPENSSL - fail_ctx: - free(crypt); + fail_cipher: + keyrot_destroy(crypt->kr); #endif + fail_kr: + free(crypt); fail_crypt: return NULL; } @@ -662,43 +867,70 @@ void crypt_destroy_ctx(struct crypt_ctx * crypt) if (crypt == NULL) return; + keyrot_destroy(crypt->kr); #ifdef HAVE_OPENSSL - assert(crypt->ctx != NULL); - openssl_crypt_destroy_ctx(crypt->ctx); -#else - assert(crypt->ctx == NULL); + openssl_crypt_destroy_ctx(crypt->cipher); #endif free(crypt); } -int crypt_get_ivsz(struct crypt_ctx * ctx) +int crypt_get_headsz(struct crypt_ctx * ctx) { - if (ctx == NULL) - return -EINVAL; + assert(ctx != NULL); + assert(ctx->kr != NULL); -#ifdef HAVE_OPENSSL - assert(ctx->ctx != NULL); - return openssl_crypt_get_ivsz(ctx->ctx); -#else - assert(ctx->ctx == NULL); - return -ENOTSUP; -#endif + (void) ctx; /* validated only; header size is a constant */ + + return KR_SELECTOR_LEN; +} 
+ +int crypt_rekey(struct crypt_ctx * ctx, + struct crypt_sk * sk) +{ + assert(ctx != NULL); + assert(sk != NULL); + assert(ctx->kr != NULL); + + return keyrot_rekey(ctx->kr, sk->key, sk->epoch) == 0 ? 0 : -ECRYPT; } int crypt_get_tagsz(struct crypt_ctx * ctx) { - if (ctx == NULL) - return -EINVAL; + assert(ctx != NULL); + assert(ctx->cipher != NULL); #ifdef HAVE_OPENSSL - assert(ctx->ctx != NULL); - return openssl_crypt_get_tagsz(ctx->ctx); + return openssl_crypt_get_tagsz(ctx->cipher); #else - assert(ctx->ctx == NULL); + (void) ctx; return -ENOTSUP; #endif } +int crypt_nodes_left(struct crypt_ctx * ctx) +{ + assert(ctx != NULL); + assert(ctx->kr != NULL); + + return (int) keyrot_tx_nodes_left(ctx->kr); +} + +int crypt_peer_synced(struct crypt_ctx * ctx) +{ + assert(ctx != NULL); + assert(ctx->kr != NULL); + + return keyrot_peer_switched(ctx->kr) ? 1 : 0; +} + +void crypt_tx_promote(struct crypt_ctx * ctx) +{ + assert(ctx != NULL); + assert(ctx->kr != NULL); + + keyrot_tx_promote(ctx->kr); +} + int crypt_load_privkey_file(const char * path, void ** key) { @@ -801,6 +1033,25 @@ int crypt_load_privkey_raw_file(const char * path, #endif } +int crypt_ct_cmp(const void * a, + const void * b, + size_t len) +{ +#ifdef HAVE_OPENSSL + return CRYPTO_memcmp(a, b, len); +#else + const volatile uint8_t * pa = a; + const volatile uint8_t * pb = b; + uint8_t d = 0; + size_t i; + + for (i = 0; i < len; i++) + d |= pa[i] ^ pb[i]; + + return d != 0; +#endif +} + int crypt_cmp_key(const void * key1, const void * key2) { @@ -967,9 +1218,15 @@ struct auth_ctx * auth_create_ctx(void) ctx->store = openssl_auth_create_store(); if (ctx->store == NULL) goto fail_store; + + ctx->chain = openssl_auth_create_chain(); + if (ctx->chain == NULL) + goto fail_chain; #endif return ctx; #ifdef HAVE_OPENSSL + fail_chain: + openssl_auth_destroy_store(ctx->store); fail_store: free(ctx); #endif @@ -982,6 +1239,7 @@ void auth_destroy_ctx(struct auth_ctx * ctx) if (ctx == NULL) return; #ifdef HAVE_OPENSSL 
+ openssl_auth_destroy_chain(ctx->chain); openssl_auth_destroy_store(ctx->store); #endif free(ctx); @@ -1003,11 +1261,27 @@ int auth_add_crt_to_store(struct auth_ctx * ctx, #endif } +int auth_add_crt_to_chain(struct auth_ctx * ctx, + void * crt) +{ + assert(ctx != NULL); + assert(crt != NULL); + +#ifdef HAVE_OPENSSL + return openssl_auth_add_crt_to_chain(ctx->chain, crt); +#else + (void) ctx; + (void) crt; + + return 0; +#endif +} + int auth_verify_crt(struct auth_ctx * ctx, void * crt) { #ifdef HAVE_OPENSSL - return openssl_verify_crt(ctx->store, crt); + return openssl_verify_crt(ctx->store, ctx->chain, crt); #else (void) ctx; (void) crt; @@ -1016,6 +1290,32 @@ int auth_verify_crt(struct auth_ctx * ctx, #endif } +int auth_verify_crt_pin(struct auth_ctx * ctx, + void * crt, + void * pin) +{ +#ifdef HAVE_OPENSSL + return openssl_verify_crt_pin(ctx->store, ctx->chain, crt, pin); +#else + (void) ctx; + (void) crt; + (void) pin; + + return 0; +#endif +} + +bool crypt_pk_requires_md(const void * pk) +{ +#ifdef HAVE_OPENSSL + return openssl_pk_requires_md((const EVP_PKEY *) pk); +#else + (void) pk; + + return false; +#endif +} + int auth_sign(void * pkp, int md_nid, buffer_t msg, @@ -1077,10 +1377,25 @@ ssize_t md_len(int md_nid) #endif } +int crypt_hkdf_expand(buffer_t key, + buffer_t info, + buffer_t out) +{ +#ifdef HAVE_OPENSSL + return openssl_hkdf_expand(key, info, out) == 0 ? 
0 : -ECRYPT; +#else + (void) key; + (void) info; + (void) out; + + return -ECRYPT; +#endif +} + int crypt_secure_malloc_init(size_t max) { #ifdef HAVE_OPENSSL - return openssl_secure_malloc_init(max, SECMEM_GUARD); + return openssl_secure_malloc_init(max, SECMEM_MINSIZE); #else (void) max; return 0; diff --git a/src/lib/crypt/keyrot.c b/src/lib/crypt/keyrot.c new file mode 100644 index 00000000..8b0d9429 --- /dev/null +++ b/src/lib/crypt/keyrot.c @@ -0,0 +1,741 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Data-plane key-rotation schedule (node/leaf keys, selector) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. 
+ */ + +#define _POSIX_C_SOURCE 200809L + +#include <config.h> + +#include <ouroboros/atomics.h> +#include <ouroboros/crypt.h> +#include <ouroboros/pthread.h> +#include <ouroboros/rcu.h> + +#include "crypt/keyrot.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +/* + * Per-flow keys are addressed by (epoch, node, leaf) and derived as: + * root = per-batch HKDF PRK from the OAP exchange, wiped once expanded + * nodes = HKDF-Expand(root, "o7s-keyrot-node") -> KEY_NODE_COUNT keys + * leaf = HKDF-Expand(node, "o7s-keyrot-leaf"|dir|leaf) -> AEAD key + * The epoch is a small wrapping counter, carried in the selector, that picks + * the live batch; a Tier-2 OAP re-key advances it. The "dir" byte forks the + * leaf keys per direction. + * + * Concurrency: cur/prev batch pointers are published by a re-key and read on + * the data path under an rcu_guard (lock-free RCU with liburcu, else a per- + * keyrot rwlock). The per-batch TX counter is atomic, so the (epoch, counter) + * nonce is unique without serialising TX. Leaf caches are THREAD-LOCAL (an app + * writer and the FRCT retransmit timer never share cache state), keyed on a + * global batch id and direct-mapped. 
+ */ + +#define KR_WITHIN_BITS (KEY_LEAF_BITS + KEY_NODE_BITS) +#define KR_WITHIN_MASK (((uint64_t) 1 << KR_WITHIN_BITS) - 1) +#define KR_N (KEY_NODE_COUNT) +#define KR_LEAVES (1u << KEY_NODE_BITS) +#define KR_BATCH_MAX ((uint64_t) KR_N << KR_WITHIN_BITS) +#define KR_NODES_SZ ((size_t) KR_N * SYMMKEYSZ) +#define KR_TCACHE_WAYS 16 /* per-thread cache slots per direction (pow2) */ +#define KR_EPOCHS 16 /* 4-bit wire epoch: gens before wrap */ + +#define KR_RP_WORDS (KEY_REPLAY_WINDOW / 64) /* pow2; RFC 6479 bitmap */ +#define KR_RP_SHIFT 6 +#define KR_RP_MASK 63 +#define KR_RP_WINDOW (KEY_REPLAY_WINDOW - 64) /* reserve 1 slack word */ + +static const char kr_node_label[] = "o7s-keyrot-node"; +static const char kr_leaf_label[] = "o7s-keyrot-leaf"; + +struct kr_batch { + uint64_t id; /* process-global, unique; cache key (no ABA) */ + uint8_t epoch; /* 4-bit wire selector */ + uint8_t * nodes; /* KR_NODES_SZ in secure heap; NULL if empty */ + uint64_t tx_ctr; /* atomic; per-batch so nonces never collide */ + + struct { /* RFC 6479-like anti-replay window */ + uint64_t last; /* highest accepted ctr + 1 */ + uint64_t bits[KR_RP_WORDS]; + pthread_mutex_t mtx; + } rp; +}; + +struct kr_keycache { + uint8_t * key; /* SYMMKEYSZ, points into the per-thread slab */ + uint64_t id; /* batch the cached key belongs to */ + uint16_t node; + uint8_t leaf; + uint8_t dir; + bool valid; +}; + +struct keyrot { + struct kr_batch * cur; /* published; read on data path */ + struct kr_batch * prev; /* NULL = none */ + struct rcu_guard guard; /* re-key vs readers */ + uint8_t role; + uint8_t tx_epoch; /* epoch TX currently stamps */ + bool peer_switched; /* peer is on the cur epoch */ +}; + +/* Per-thread leaf-key caches, freed by the thread-exit destructor. 
*/ +struct kr_tcache { + struct kr_keycache tx[KR_TCACHE_WAYS]; + struct kr_keycache rx[KR_TCACHE_WAYS]; + uint8_t * slab; /* 2*KR_TCACHE_WAYS*SYMMKEYSZ secure heap */ +}; + +static struct { + uint64_t next_id; /* batch-id allocator (atomic) */ + pthread_key_t tcache_key; /* per-thread leaf-key caches */ + pthread_once_t tcache_once; +} kr_g = { 0, 0, PTHREAD_ONCE_INIT }; + +static void kr_tcache_free(void * p) +{ + struct kr_tcache * t = p; + + if (t == NULL) + return; + + crypt_secure_free(t->slab, 2 * KR_TCACHE_WAYS * SYMMKEYSZ); + free(t); +} + +static void kr_tcache_init(void) +{ + pthread_key_create(&kr_g.tcache_key, kr_tcache_free); +} + +static struct kr_tcache * kr_tcache_get(void) +{ + struct kr_tcache * t; + size_t i; + + pthread_once(&kr_g.tcache_once, kr_tcache_init); + + t = pthread_getspecific(kr_g.tcache_key); + if (t != NULL) + return t; + + t = malloc(sizeof(*t)); + if (t == NULL) + goto fail_alloc; + + memset(t, 0, sizeof(*t)); + + t->slab = crypt_secure_malloc(2 * KR_TCACHE_WAYS * SYMMKEYSZ); + if (t->slab == NULL) + goto fail_slab; + + for (i = 0; i < KR_TCACHE_WAYS; i++) { + t->tx[i].key = t->slab + i * SYMMKEYSZ; + t->rx[i].key = t->slab + (KR_TCACHE_WAYS + i) * SYMMKEYSZ; + } + + if (pthread_setspecific(kr_g.tcache_key, t) != 0) + goto fail_set; + + return t; + + fail_set: + crypt_secure_free(t->slab, 2 * KR_TCACHE_WAYS * SYMMKEYSZ); + fail_slab: + free(t); + fail_alloc: + return NULL; +} + +static uint8_t * kr_expand_nodes(const uint8_t * root) +{ + uint8_t * nodes; + buffer_t prk; + buffer_t info; + buffer_t okm; + + nodes = crypt_secure_malloc(KR_NODES_SZ); + if (nodes == NULL) + return NULL; + + prk.len = SYMMKEYSZ; + prk.data = (uint8_t *) root; + info.len = sizeof(kr_node_label) - 1; + info.data = (uint8_t *) kr_node_label; + okm.len = KR_NODES_SZ; + okm.data = nodes; + + if (crypt_hkdf_expand(prk, info, okm) != 0) + goto fail_expand; + + return nodes; + + fail_expand: + crypt_secure_free(nodes, KR_NODES_SZ); + return NULL; +} + 
+static int kr_leaf_key(const uint8_t * node, + uint8_t leaf, + uint8_t dir, + uint8_t * out) +{ + uint8_t info_buf[sizeof(kr_leaf_label) - 1 + 2]; + buffer_t prk; + buffer_t info; + buffer_t okm; + size_t n = sizeof(kr_leaf_label) - 1; + + memcpy(info_buf, kr_leaf_label, n); + info_buf[n] = dir; + info_buf[n + 1] = leaf; + + prk.len = SYMMKEYSZ; + prk.data = (uint8_t *) node; + info.len = n + 2; + info.data = info_buf; + okm.len = SYMMKEYSZ; + okm.data = out; + + return crypt_hkdf_expand(prk, info, okm); +} + +static __inline__ bool kr_kc_hit(const struct kr_keycache * kc, + const struct kr_batch * b, + uint16_t node, + uint8_t leaf, + uint8_t dir) +{ + if (!kc->valid) + return false; + + if (kc->id != b->id) + return false; + + if (kc->node != node) + return false; + + if (kc->leaf != leaf) + return false; + + return kc->dir == dir; +} + +/* Fetch the leaf key; derive into the (direct-mapped) slot on a miss. */ +static const uint8_t * kr_kc_get(struct kr_keycache * cache, + const struct kr_batch * b, + uint16_t node, + uint8_t leaf, + uint8_t dir) +{ + struct kr_keycache * kc; + uint8_t * nkey; + + kc = &cache[b->id & (KR_TCACHE_WAYS - 1)]; + + if (kr_kc_hit(kc, b, node, leaf, dir)) + return kc->key; + + nkey = b->nodes + (size_t) node * SYMMKEYSZ; + if (kr_leaf_key(nkey, leaf, dir, kc->key) != 0) + return NULL; + + kc->valid = true; + kc->id = b->id; + kc->node = node; + kc->leaf = leaf; + kc->dir = dir; + + return kc->key; +} + +static void kr_sel_enc(uint8_t epoch, + uint16_t node, + uint32_t seq, + uint8_t sel[KR_SELECTOR_LEN]) +{ + sel[0] = (uint8_t) ((epoch << 4) | ((node >> 8) & 0x0F)); + sel[1] = (uint8_t) (node & 0xFF); + sel[2] = (uint8_t) (seq >> 24); + sel[3] = (uint8_t) (seq >> 16); + sel[4] = (uint8_t) (seq >> 8); + sel[5] = (uint8_t) (seq); +} + +static void kr_sel_dec(const uint8_t sel[KR_SELECTOR_LEN], + uint8_t * epoch, + uint16_t * node, + uint32_t * seq) +{ + *epoch = (uint8_t) (sel[0] >> 4); + *node = (uint16_t) (((sel[0] & 0x0F) << 8) | 
sel[1]); + *seq = ((uint32_t) sel[2] << 24) | ((uint32_t) sel[3] << 16) | + ((uint32_t) sel[4] << 8) | (uint32_t) sel[5]; +} + +static uint64_t kr_ctr(uint16_t node, + uint32_t seq) +{ + return ((uint64_t) node << KR_WITHIN_BITS) | + ((uint64_t) seq & KR_WITHIN_MASK); +} + +static void kr_nonce(uint64_t ctr, + uint8_t * nonce) +{ + size_t i; + + memset(nonce, 0, KR_NONCE_LEN); + + /* ctr big-endian in the low 8 bytes; high bytes stay zero */ + for (i = 0; i < 8; i++) + nonce[i] = (uint8_t) (ctr >> (56 - 8 * i)); +} + +static struct kr_batch * kr_batch_create(uint8_t epoch, + const uint8_t * root) +{ + struct kr_batch * b; + + b = malloc(sizeof(*b)); + if (b == NULL) + goto fail_alloc; + + b->nodes = kr_expand_nodes(root); + if (b->nodes == NULL) + goto fail_nodes; + + b->id = FETCH_ADD_RELAXED(&kr_g.next_id, 1); + b->epoch = epoch; + b->tx_ctr = 0; + if (pthread_mutex_init(&b->rp.mtx, NULL) != 0) + goto fail_lock; + + b->rp.last = 0; + memset(b->rp.bits, 0, sizeof(b->rp.bits)); + + return b; + + fail_lock: + crypt_secure_free(b->nodes, KR_NODES_SZ); + free(b); + return NULL; + fail_nodes: + free(b); + fail_alloc: + return NULL; +} + +static void kr_batch_free(struct kr_batch * b) +{ + if (b == NULL) + return; + + pthread_mutex_destroy(&b->rp.mtx); + crypt_secure_free(b->nodes, KR_NODES_SZ); + free(b); +} + +/* + * RFC 6479 anti-replay window keyed on the per-batch counter, with + * seq = ctr + 1 so 0 means "nothing accepted yet". Returns 0 if the + * packet is fresh (and records it), -1 on a replay or a too-old ctr. 
+ */ +static int kr_rp_commit(struct kr_batch * b, + uint64_t ctr) +{ + uint64_t seq; + uint64_t idx; + uint64_t cur; + uint64_t diff; + + seq = ctr + 1; + + pthread_mutex_lock(&b->rp.mtx); + + if (seq > b->rp.last) { + idx = seq >> KR_RP_SHIFT; + cur = b->rp.last >> KR_RP_SHIFT; + diff = idx - cur; + if (diff > KR_RP_WORDS) + diff = KR_RP_WORDS; + + while (diff-- > 0) { + cur++; + b->rp.bits[cur & (KR_RP_WORDS - 1)] = 0; + } + + b->rp.bits[idx & (KR_RP_WORDS - 1)] |= + (uint64_t) 1 << (seq & KR_RP_MASK); + b->rp.last = seq; + goto finish; + } + + if (b->rp.last - seq >= KR_RP_WINDOW) + goto fail; + + idx = seq >> KR_RP_SHIFT; + if (b->rp.bits[idx & (KR_RP_WORDS - 1)] + & ((uint64_t) 1 << (seq & KR_RP_MASK))) + goto fail; + + b->rp.bits[idx & (KR_RP_WORDS - 1)] |= + (uint64_t) 1 << (seq & KR_RP_MASK); + finish: + pthread_mutex_unlock(&b->rp.mtx); + + return 0; + fail: + pthread_mutex_unlock(&b->rp.mtx); + + return -1; +} + +struct keyrot * keyrot_create(const uint8_t * root, + uint8_t epoch, + uint8_t role) +{ + struct keyrot * kr; + + assert(root != NULL); + assert(role <= 1); + + if (epoch >= KR_EPOCHS) + goto fail_kr; + + kr = malloc(sizeof(*kr)); + if (kr == NULL) + goto fail_kr; + + memset(kr, 0, sizeof(*kr)); + + kr->role = role; + kr->tx_epoch = epoch; + kr->peer_switched = true; + kr->prev = NULL; + + kr->cur = kr_batch_create(epoch, root); + if (kr->cur == NULL) + goto fail_cur; + + if (rcu_guard_init(&kr->guard)) + goto fail_guard; + + return kr; + + fail_guard: + kr_batch_free(kr->cur); + fail_cur: + free(kr); + fail_kr: + return NULL; +} + +void keyrot_destroy(struct keyrot * kr) +{ + if (kr == NULL) + return; + + /* Wait out any in-flight reader before freeing batches. 
*/ + rcu_drain(&kr->guard); + + kr_batch_free(kr->cur); + kr_batch_free(kr->prev); + + rcu_guard_fini(&kr->guard); + + free(kr); +} + +int keyrot_rekey(struct keyrot * kr, + const uint8_t * root, + uint8_t epoch) +{ + struct kr_batch * nb; + struct kr_batch * old_prev; + + assert(kr != NULL); + assert(root != NULL); + + if (epoch >= KR_EPOCHS) + return -1; + + nb = kr_batch_create(epoch, root); + if (nb == NULL) + return -1; + + rcu_wrlock(&kr->guard); + + old_prev = kr->prev; + rcu_assign(kr->prev, kr->cur); + rcu_publish(nb); + rcu_assign(kr->cur, nb); + + /* TX keeps the old epoch until the peer is seen on the new one. */ + STORE_RELEASE(&kr->peer_switched, false); + + rcu_wrunlock(&kr->guard); + + /* old_prev is unreachable now; reclaim past any live reader. */ + rcu_reclaim(&kr->guard); + kr_batch_free(old_prev); + + return 0; +} + +void keyrot_tx_promote(struct keyrot * kr) +{ + assert(kr != NULL); + + /* Serialise with keyrot_rekey so tx_epoch tracks a consistent cur. */ + rcu_wrlock(&kr->guard); + STORE_RELAXED(&kr->tx_epoch, rcu_deref(kr->cur)->epoch); + rcu_wrunlock(&kr->guard); +} + +int keyrot_tx_next(struct keyrot * kr, + uint8_t sel[KR_SELECTOR_LEN], + const uint8_t ** key, + uint8_t nonce[KR_NONCE_LEN]) +{ + struct kr_tcache * tc; + struct kr_batch * cur; + struct kr_batch * prev; + struct kr_batch * b; + uint64_t ctr; + uint16_t node; + uint8_t leaf; + uint8_t txe; + uint8_t epoch; + uint32_t seq; + const uint8_t * k; + + assert(kr != NULL); + assert(key != NULL); + + tc = kr_tcache_get(); + if (tc == NULL) + return -1; + + rcu_rdlock(&kr->guard); + + cur = rcu_deref(kr->cur); + prev = rcu_deref(kr->prev); + rcu_consume(cur); + rcu_consume(prev); + txe = LOAD_RELAXED(&kr->tx_epoch); + + if (cur->epoch == txe) + b = cur; + else if (prev != NULL && prev->epoch == txe) + b = prev; + else + b = NULL; + + if (b == NULL) { + rcu_rdunlock(&kr->guard); + return -1; /* tx_epoch batch gone; next promote resyncs */ + } + + /* Slot reserved even if exhausted; 
tx_nodes_left clamps the count. */ + ctr = FETCH_ADD_RELAXED(&b->tx_ctr, 1); + if (ctr >= KR_BATCH_MAX) { + rcu_rdunlock(&kr->guard); + return -1; /* batch exhausted */ + } + + node = (uint16_t) (ctr >> KR_WITHIN_BITS); + leaf = (uint8_t) ((ctr >> KEY_LEAF_BITS) & (KR_LEAVES - 1)); + seq = (uint32_t) (ctr & KR_WITHIN_MASK); + epoch = b->epoch; + + k = kr_kc_get(tc->tx, b, node, leaf, kr->role); + + rcu_rdunlock(&kr->guard); + + if (k == NULL) + return -1; + + kr_sel_enc(epoch, node, seq, sel); + kr_nonce(ctr, nonce); + + *key = k; + + return 0; +} + +int keyrot_rx_lookup(struct keyrot * kr, + const uint8_t sel[KR_SELECTOR_LEN], + const uint8_t ** key, + uint8_t nonce[KR_NONCE_LEN], + struct kr_rx * rx) +{ + struct kr_tcache * tc; + struct kr_batch * cur; + struct kr_batch * prev; + struct kr_batch * b; + uint8_t epoch; + uint16_t node; + uint32_t seq; + uint64_t ctr; + uint8_t leaf; + const uint8_t * k; + + assert(kr != NULL); + assert(key != NULL); + + kr_sel_dec(sel, &epoch, &node, &seq); + + if (node >= KR_N) + return -1; + + tc = kr_tcache_get(); + if (tc == NULL) + return -1; + + rcu_rdlock(&kr->guard); + + cur = rcu_deref(kr->cur); + prev = rcu_deref(kr->prev); + rcu_consume(cur); + rcu_consume(prev); + + if (epoch == cur->epoch) { + b = cur; + } else if (prev != NULL && epoch == prev->epoch) { + b = prev; + } else { + rcu_rdunlock(&kr->guard); + return -1; /* unknown epoch */ + } + + ctr = kr_ctr(node, seq); + leaf = (uint8_t) ((ctr >> KEY_LEAF_BITS) & (KR_LEAVES - 1)); + + /* peer's tx direction */ + k = kr_kc_get(tc->rx, b, node, leaf, (uint8_t) (kr->role ^ 1)); + + rx->id = b->id; + rx->ctr = ctr; + + rcu_rdunlock(&kr->guard); + + if (k == NULL) + return -1; + + kr_nonce(ctr, nonce); + + *key = k; + + return 0; +} + +/* + * Commit a packet that authenticated under the batch keyrot_rx_lookup + * selected. 
Re-finds that batch by id (epoch may have advanced) and, + * if still resident, advances the replay window and records that the + * peer is on the current batch. Runs only post-AEAD so a forged or + * replayed packet can mutate no receiver state. Returns -1 on replay. + */ +int keyrot_rx_commit(struct keyrot * kr, + const struct kr_rx * rx) +{ + struct kr_batch * cur; + struct kr_batch * prev; + struct kr_batch * b; + int rc; + + assert(kr != NULL); + assert(rx != NULL); + + rcu_rdlock(&kr->guard); + + cur = rcu_deref(kr->cur); + prev = rcu_deref(kr->prev); + rcu_consume(cur); + rcu_consume(prev); + + if (cur->id == rx->id) + b = cur; + else if (prev != NULL && prev->id == rx->id) + b = prev; + else + b = NULL; + + if (b == NULL) { + rcu_rdunlock(&kr->guard); + return 0; /* batch evicted post-auth; nothing to protect */ + } + + rc = kr_rp_commit(b, rx->ctr); + if (rc == 0 && b == cur) + STORE_RELEASE(&kr->peer_switched, true); + + rcu_rdunlock(&kr->guard); + + return rc; +} + +bool keyrot_peer_switched(const struct keyrot * kr) +{ + assert(kr != NULL); + + return LOAD_ACQUIRE(&kr->peer_switched); +} + +unsigned keyrot_tx_nodes_left(struct keyrot * kr) +{ + struct kr_batch * cur; + struct kr_batch * prev; + struct kr_batch * b; + uint64_t ctr; + unsigned used; + uint8_t txe; + + assert(kr != NULL); + + rcu_rdlock(&kr->guard); + cur = rcu_deref(kr->cur); + prev = rcu_deref(kr->prev); + rcu_consume(cur); + rcu_consume(prev); + txe = LOAD_RELAXED(&kr->tx_epoch); + + if (cur->epoch == txe) + b = cur; + else if (prev != NULL && prev->epoch == txe) + b = prev; + else + b = NULL; + + ctr = b != NULL ? 
LOAD_RELAXED(&b->tx_ctr) : KR_BATCH_MAX; + rcu_rdunlock(&kr->guard); + + used = (unsigned) (ctr >> KR_WITHIN_BITS); + if (used >= KR_N) + return 0; + + return KR_N - used; +} diff --git a/src/lib/crypt/keyrot.h b/src/lib/crypt/keyrot.h new file mode 100644 index 00000000..6a598f76 --- /dev/null +++ b/src/lib/crypt/keyrot.h @@ -0,0 +1,74 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Data-plane key-rotation schedule (node/leaf keys, selector) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#ifndef OUROBOROS_LIB_CRYPT_KEYROT_H +#define OUROBOROS_LIB_CRYPT_KEYROT_H + +#include <ouroboros/crypt.h> /* SYMMKEYSZ, NONCESZ */ + +#include <stdbool.h> +#include <stdint.h> + +#define KR_SELECTOR_LEN 6 +#define KR_NONCE_LEN NONCESZ + +struct keyrot; + +struct kr_rx { + uint64_t id; /* batch id of the matched epoch */ + uint64_t ctr; /* packet counter for replay check */ +}; + +struct keyrot * keyrot_create(const uint8_t * root, + uint8_t epoch, + uint8_t role); + +void keyrot_destroy(struct keyrot * kr); + +int keyrot_rekey(struct keyrot * kr, + const uint8_t * root, + uint8_t epoch); + +/* Promote TX to the installed (new) batch once the peer is on it. 
*/ +void keyrot_tx_promote(struct keyrot * kr); + +int keyrot_tx_next(struct keyrot * kr, + uint8_t sel[KR_SELECTOR_LEN], + const uint8_t ** key, + uint8_t nonce[KR_NONCE_LEN]); + +int keyrot_rx_lookup(struct keyrot * kr, + const uint8_t sel[KR_SELECTOR_LEN], + const uint8_t ** key, + uint8_t nonce[KR_NONCE_LEN], + struct kr_rx * rx); + +/* Commit an authenticated packet: replay window + peer-switched. */ +int keyrot_rx_commit(struct keyrot * kr, + const struct kr_rx * rx); + +/* True once an RX packet under the current batch has been observed. */ +bool keyrot_peer_switched(const struct keyrot * kr); + +unsigned keyrot_tx_nodes_left(struct keyrot * kr); + +#endif /* OUROBOROS_LIB_CRYPT_KEYROT_H */ diff --git a/src/lib/crypt/openssl.c b/src/lib/crypt/openssl.c index 5916e3cb..d5d9adf5 100644 --- a/src/lib/crypt/openssl.c +++ b/src/lib/crypt/openssl.c @@ -30,6 +30,8 @@ #include <ouroboros/errno.h> #include <ouroboros/crypt.h> #include <ouroboros/hash.h> +#include <ouroboros/name.h> +#include <ouroboros/pthread.h> #include <ouroboros/random.h> #include <ouroboros/utils.h> @@ -52,27 +54,14 @@ #define HKDF_INFO_DHE "o7s-ossl-dhe" #define HKDF_INFO_ENCAP "o7s-ossl-encap" -#define HKDF_INFO_ROTATION "o7s-key-rotation" #define HKDF_SALT_LEN 32 /* SHA-256 output size */ +#define AEAD_NONCE_LEN 12 /* 96-bit deterministic IV (SP 800-38D) */ +#define AEAD_TAG_LEN 16 /* 128-bit AEAD authentication tag */ struct ossl_crypt_ctx { EVP_CIPHER_CTX * evp_ctx; const EVP_CIPHER * cipher; - int ivsz; int tagsz; - - struct { - uint8_t * cur; /* current key */ - uint8_t * prv; /* rotated key */ - } keys; - - struct { - uint32_t cntr; /* counter */ - uint32_t mask; /* phase mask */ - uint32_t age; /* counter within epoch */ - uint8_t phase; /* current key phase */ - uint8_t salt[HKDF_SALT_LEN]; - } rot; /* rotation logic */ }; struct kdf_info { @@ -83,17 +72,6 @@ struct kdf_info { buffer_t key; }; -/* Key rotation macros */ -#define HAS_PHASE_BIT_TOGGLED(ctx) \ - (((ctx)->rot.cntr & 
(ctx)->rot.mask) != \ - (((ctx)->rot.cntr - 1) & (ctx)->rot.mask)) - -#define HAS_GRACE_EXPIRED(ctx) \ - ((ctx)->rot.age >= ((ctx)->rot.mask >> 1)) - -#define ROTATION_TOO_RECENT(ctx) \ - ((ctx)->rot.age < ((ctx)->rot.mask - ((ctx)->rot.mask >> 2))) - /* Convert hash NID to OpenSSL digest name string for HKDF */ static const char * hash_nid_to_digest_name(int nid) { @@ -102,11 +80,11 @@ static const char * hash_nid_to_digest_name(int nid) md = EVP_get_digestbynid(nid); if (md == NULL) - return "SHA256"; /* fallback to SHA-256 */ + return NULL; name = EVP_MD_get0_name(md); if (name == NULL) - return "SHA256"; /* fallback to SHA-256 */ + return NULL; return name; } @@ -144,21 +122,20 @@ static int get_pk_bytes_from_key(EVP_PKEY * key, } /* Derive salt from public key bytes by hashing them */ -static int derive_salt_from_pk_bytes(buffer_t pk, - uint8_t * salt, - size_t salt_len) +static int derive_salt_from_pk_bytes(buffer_t pk, + buffer_t salt) { uint8_t hash[EVP_MAX_MD_SIZE]; unsigned hash_len; assert(pk.data != NULL); - assert(salt != NULL); + assert(salt.data != NULL); if (EVP_Digest(pk.data, pk.len, hash, &hash_len, EVP_sha256(), NULL) != 1) goto fail_digest; - memcpy(salt, hash, salt_len < hash_len ? salt_len : hash_len); + memcpy(salt.data, hash, salt.len < hash_len ? 
salt.len : hash_len); return 0; fail_digest: @@ -166,10 +143,9 @@ static int derive_salt_from_pk_bytes(buffer_t pk, } /* Derive salt from two public key byte buffers (DHE) in canonical order */ -static int derive_salt_from_pk_bytes_dhe(buffer_t local, - buffer_t remote, - uint8_t * salt, - size_t salt_len) +static int derive_salt_from_pk_bytes_dhe(buffer_t local, + buffer_t remote, + buffer_t salt) { uint8_t * concat; size_t concat_len; @@ -180,7 +156,7 @@ static int derive_salt_from_pk_bytes_dhe(buffer_t local, assert(local.data != NULL); assert(remote.data != NULL); - assert(salt != NULL); + assert(salt.data != NULL); concat_len = local.len + remote.len; concat = OPENSSL_malloc(concat_len); @@ -204,7 +180,7 @@ static int derive_salt_from_pk_bytes_dhe(buffer_t local, OPENSSL_free(concat); - memcpy(salt, hash, salt_len < hash_len ? salt_len : hash_len); + memcpy(salt.data, hash, salt.len < hash_len ? salt.len : hash_len); return 0; fail_digest: @@ -225,6 +201,8 @@ static int derive_key_hkdf(struct kdf_info * ki) int idx; digest = hash_nid_to_digest_name(ki->nid); + if (digest == NULL) + goto fail_fetch; kdf = EVP_KDF_fetch(NULL, "HKDF", NULL); if (kdf == NULL) @@ -258,117 +236,144 @@ static int derive_key_hkdf(struct kdf_info * ki) return -ECRYPT; } -/* Key rotation helper functions implementation */ -static int should_rotate_key_rx(struct ossl_crypt_ctx * ctx, - uint8_t rx_phase) +int openssl_hkdf_expand(buffer_t key, + buffer_t info, + buffer_t out) { - assert(ctx != NULL); + EVP_KDF * kdf; + EVP_KDF_CTX * kctx; + OSSL_PARAM params[5]; + int mode = EVP_KDF_HKDF_MODE_EXPAND_ONLY; + int idx = 0; + int ret = -1; - /* Phase must have changed */ - if (rx_phase == ctx->rot.phase) - return 0; + kdf = EVP_KDF_fetch(NULL, "HKDF", NULL); + if (kdf == NULL) + goto fail_fetch; - if (ROTATION_TOO_RECENT(ctx)) - return 0; + kctx = EVP_KDF_CTX_new(kdf); + if (kctx == NULL) + goto fail_ctx; + + params[idx++] = OSSL_PARAM_construct_utf8_string( + "digest", (char *) "SHA256", 0); 
+ params[idx++] = OSSL_PARAM_construct_int("mode", &mode); + params[idx++] = OSSL_PARAM_construct_octet_string( + "key", key.data, key.len); + params[idx++] = OSSL_PARAM_construct_octet_string( + "info", info.data, info.len); + params[idx] = OSSL_PARAM_construct_end(); - return 1; + if (EVP_KDF_derive(kctx, out.data, out.len, params) == 1) + ret = 0; + + EVP_KDF_CTX_free(kctx); + fail_ctx: + EVP_KDF_free(kdf); + fail_fetch: + return ret; } -static int rotate_key(struct ossl_crypt_ctx * ctx) +/* AEAD seal: encrypt in with key/nonce, bind aad, append tag */ +int openssl_seal(struct ossl_crypt_ctx * ctx, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + uint8_t * out, + uint8_t * tag) { - struct kdf_info ki; - uint8_t * tmp; + int out_sz; + int tmp_sz; assert(ctx != NULL); + assert(ctx->tagsz > 0); /* AEAD mandated at ctx creation */ - /* Swap keys - move current to prev */ - tmp = ctx->keys.prv; - ctx->keys.prv = ctx->keys.cur; + EVP_CIPHER_CTX_reset(ctx->evp_ctx); - if (tmp != NULL) { - /* Reuse old prev_key memory for new key */ - ctx->keys.cur = tmp; - } else { - /* First rotation - allocate new memory */ - ctx->keys.cur = OPENSSL_secure_malloc(SYMMKEYSZ); - if (ctx->keys.cur == NULL) - return -ECRYPT; - } + if (EVP_EncryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL, + NULL, NULL) != 1) + return -1; - /* Derive new key from previous key using HKDF */ - ki.secret.data = ctx->keys.prv; - ki.secret.len = SYMMKEYSZ; - ki.nid = NID_sha256; - ki.salt.data = ctx->rot.salt; - ki.salt.len = HKDF_SALT_LEN; - ki.info.data = (uint8_t *) HKDF_INFO_ROTATION; - ki.info.len = strlen(HKDF_INFO_ROTATION); - ki.key.data = ctx->keys.cur; - ki.key.len = SYMMKEYSZ; + /* Pin the AEAD nonce to 96 bits (SP 800-38D deterministic IV). 
*/ + if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_IVLEN, + AEAD_NONCE_LEN, NULL) != 1) + return -1; - if (derive_key_hkdf(&ki) != 0) - return -ECRYPT; + if (EVP_EncryptInit_ex(ctx->evp_ctx, NULL, NULL, + key, nonce) != 1) + return -1; - ctx->rot.age = 0; - ctx->rot.phase = !ctx->rot.phase; + if (EVP_EncryptUpdate(ctx->evp_ctx, NULL, &tmp_sz, + aad.data, (int) aad.len) != 1) + return -1; - return 0; -} + if (EVP_EncryptUpdate(ctx->evp_ctx, out, &out_sz, + in.data, (int) in.len) != 1) + return -1; -static void cleanup_old_key(struct ossl_crypt_ctx * ctx) -{ - assert(ctx != NULL); + if (EVP_EncryptFinal_ex(ctx->evp_ctx, out + out_sz, &tmp_sz) != 1) + return -1; - if (ctx->keys.prv == NULL) - return; + out_sz += tmp_sz; - if (!HAS_GRACE_EXPIRED(ctx)) - return; + if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_GET_TAG, + ctx->tagsz, tag) != 1) + return -1; - OPENSSL_secure_clear_free(ctx->keys.prv, SYMMKEYSZ); - ctx->keys.prv = NULL; + return out_sz; } -static int try_decrypt(struct ossl_crypt_ctx * ctx, - uint8_t * key, - uint8_t * iv, - uint8_t * input, - int in_sz, - uint8_t * out, - int * out_sz) +/* AEAD open: decrypt in with key/nonce, verify aad and tag */ +int openssl_open(struct ossl_crypt_ctx * ctx, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + const uint8_t * tag, + buffer_t * out) { - uint8_t * tag; - int tmp_sz; - int ret; + int out_sz; + int tmp_sz; - tag = input + in_sz; + assert(ctx != NULL); + assert(ctx->tagsz > 0); /* AEAD mandated at ctx creation */ EVP_CIPHER_CTX_reset(ctx->evp_ctx); - ret = EVP_DecryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL, key, iv); - if (ret != 1) + if (EVP_DecryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL, + NULL, NULL) != 1) return -1; - if (ctx->tagsz > 0) { - ret = EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_TAG, - ctx->tagsz, tag); - if (ret != 1) - return -1; - } + /* Pin the AEAD nonce to 96 bits (SP 800-38D deterministic IV). 
*/ + if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_IVLEN, + AEAD_NONCE_LEN, NULL) != 1) + return -1; - ret = EVP_DecryptUpdate(ctx->evp_ctx, out, &tmp_sz, input, in_sz); - if (ret != 1) + if (EVP_DecryptInit_ex(ctx->evp_ctx, NULL, NULL, key, nonce) != 1) return -1; - *out_sz = tmp_sz; + if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_TAG, + ctx->tagsz, (void *) tag) != 1) + return -1; - ret = EVP_DecryptFinal_ex(ctx->evp_ctx, out + tmp_sz, &tmp_sz); - if (ret != 1) + if (EVP_DecryptUpdate(ctx->evp_ctx, NULL, &tmp_sz, + aad.data, (int) aad.len) != 1) return -1; - *out_sz += tmp_sz; + if (EVP_DecryptUpdate(ctx->evp_ctx, out->data, &out_sz, + in.data, (int) in.len) != 1) + return -1; - return 0; + if (EVP_DecryptFinal_ex(ctx->evp_ctx, out->data + out_sz, + &tmp_sz) != 1) + return -1; + + out_sz += tmp_sz; + + out->len = (size_t) out_sz; + + return out_sz; } /* @@ -396,11 +401,14 @@ static int __openssl_dhe_derive(EVP_PKEY * pkp, ret = i2d_PUBKEY(pkp, &local_pk.data); if (ret <= 0) goto fail_local; + local_pk.len = (size_t) ret; + ki.salt.len = HKDF_SALT_LEN; + ki.salt.data = salt_buf; + /* Derive salt from both public keys */ - if (derive_salt_from_pk_bytes_dhe(local_pk, remote_pk, salt_buf, - HKDF_SALT_LEN) < 0) + if (derive_salt_from_pk_bytes_dhe(local_pk, remote_pk, ki.salt) < 0) goto fail_salt; ctx = EVP_PKEY_CTX_new(pkp, NULL); @@ -437,13 +445,11 @@ static int __openssl_dhe_derive(EVP_PKEY * pkp, ki.info.data = (uint8_t *) HKDF_INFO_DHE; ki.key.len = SYMMKEYSZ; ki.key.data = s; - ki.salt.len = HKDF_SALT_LEN; - ki.salt.data = salt_buf; /* Derive symmetric key from shared secret using HKDF */ ret = derive_key_hkdf(&ki); - OPENSSL_free(secret); + OPENSSL_clear_free(secret, secret_len); EVP_PKEY_CTX_free(ctx); OPENSSL_free(local_pk.data); @@ -452,7 +458,7 @@ static int __openssl_dhe_derive(EVP_PKEY * pkp, return 0; fail_derive: - OPENSSL_free(secret); + OPENSSL_clear_free(secret, secret_len); fail_ctx: EVP_PKEY_CTX_free(ctx); fail_salt: @@ -624,14 
+630,22 @@ ssize_t openssl_pkp_create(const char * algo, if (raw.len == 0) goto fail_pubkey; + if (raw.len > CRYPT_KEY_BUFSZ) { + OPENSSL_free(raw.data); + goto fail_pubkey; + } + memcpy(pk, raw.data, raw.len); OPENSSL_free(raw.data); return (ssize_t) raw.len; } else { /* DER encode standard algorithms */ + len = i2d_PUBKEY(*pkp, NULL); /* pre-flight length */ + if (len < 0 || len > CRYPT_KEY_BUFSZ) + goto fail_pubkey; + pos = pk; /* i2d_PUBKEY increments the ptr, don't use pk! */ - len = i2d_PUBKEY(*pkp, &pos); - if (len < 0) + if (i2d_PUBKEY(*pkp, &pos) < 0) goto fail_pubkey; return len; @@ -692,7 +706,7 @@ static ssize_t __openssl_kem_encap(EVP_PKEY * pub, /* Derive symmetric key from shared secret using HKDF */ ret = derive_key_hkdf(&ki); - OPENSSL_free(secret); + OPENSSL_clear_free(secret, secret_len); EVP_PKEY_CTX_free(ctx); if (ret != 0) @@ -701,7 +715,7 @@ static ssize_t __openssl_kem_encap(EVP_PKEY * pub, return (ssize_t) ct_len; fail_secret: - OPENSSL_free(secret); + OPENSSL_clear_free(secret, secret_len); fail_encap: EVP_PKEY_CTX_free(ctx); fail_ctx: @@ -717,13 +731,17 @@ ssize_t openssl_kem_encap(buffer_t pk, EVP_PKEY * pub; uint8_t * pos; uint8_t salt[HKDF_SALT_LEN]; + buffer_t salt_b; ssize_t ret; assert(pk.data != NULL); assert(ct != NULL); assert(s != NULL); - if (derive_salt_from_pk_bytes(pk, salt, HKDF_SALT_LEN) < 0) + salt_b.len = HKDF_SALT_LEN; + salt_b.data = salt; + + if (derive_salt_from_pk_bytes(pk, salt_b) < 0) goto fail_salt; pos = pk.data; @@ -749,13 +767,17 @@ ssize_t openssl_kem_encap_raw(buffer_t pk, EVP_PKEY * pub; const char * algo; uint8_t salt[HKDF_SALT_LEN]; + buffer_t salt_b; ssize_t ret; assert(pk.data != NULL); assert(ct != NULL); assert(s != NULL); - if (derive_salt_from_pk_bytes(pk, salt, HKDF_SALT_LEN) < 0) + salt_b.len = HKDF_SALT_LEN; + salt_b.data = salt; + + if (derive_salt_from_pk_bytes(pk, salt_b) < 0) goto fail_salt; algo = __openssl_hybrid_algo_from_len(pk.len); @@ -789,12 +811,16 @@ int openssl_kem_decap(EVP_PKEY * 
priv, size_t secret_len; int ret; uint8_t salt[HKDF_SALT_LEN]; + buffer_t salt_b; /* Extract public key bytes from private key */ if (get_pk_bytes_from_key(priv, &pk) < 0) goto fail_pk; - if (derive_salt_from_pk_bytes(pk, salt, HKDF_SALT_LEN) < 0) + salt_b.len = HKDF_SALT_LEN; + salt_b.data = salt; + + if (derive_salt_from_pk_bytes(pk, salt_b) < 0) goto fail_salt; ctx = EVP_PKEY_CTX_new(priv, NULL); @@ -833,7 +859,7 @@ int openssl_kem_decap(EVP_PKEY * priv, /* Derive symmetric key from shared secret using HKDF */ ret = derive_key_hkdf(&ki); - OPENSSL_free(secret); + OPENSSL_clear_free(secret, secret_len); EVP_PKEY_CTX_free(ctx); OPENSSL_free(pk.data); @@ -843,7 +869,7 @@ int openssl_kem_decap(EVP_PKEY * priv, return 0; fail_secret: - OPENSSL_free(secret); + OPENSSL_clear_free(secret, secret_len); fail_ctx: EVP_PKEY_CTX_free(ctx); fail_salt: @@ -857,13 +883,14 @@ void openssl_pkp_destroy(EVP_PKEY * pkp) EVP_PKEY_free(pkp); } -int __openssl_get_curve(EVP_PKEY * pub, - char * algo) +static int openssl_get_curve(EVP_PKEY * pub, + char * algo) { int ret; size_t len = KEX_ALGO_BUFSZ; ret = EVP_PKEY_get_utf8_string_param(pub, "group", algo, len, &len); + return ret == 1 ? 0 : -ECRYPT; } @@ -888,9 +915,10 @@ int openssl_get_algo_from_pk_der(buffer_t pk, strcpy(algo, type_str); - if ((IS_EC_GROUP(algo) || IS_DH_GROUP(algo)) && - __openssl_get_curve(pub, algo) < 0) - goto fail_pub; + if (IS_EC_GROUP(algo) || IS_DH_GROUP(algo)) { + if (openssl_get_curve(pub, algo) < 0) + goto fail_pub; + } EVP_PKEY_free(pub); return 0; @@ -948,141 +976,122 @@ int openssl_dhe_derive(EVP_PKEY * pkp, return -ECRYPT; } -int openssl_encrypt(struct ossl_crypt_ctx * ctx, - buffer_t in, - buffer_t * out) +/* Set up a fresh AEAD cipher ctx for nid: reject non-AEAD / oversized IV. 
*/ +static int ossl_cipher_ctx_init(struct ossl_crypt_ctx * ctx, + int nid) { - uint8_t * ptr; - uint8_t * iv; - int in_sz; - int out_sz; - int tmp_sz; - int ret; - - assert(ctx != NULL); - - in_sz = (int) in.len; - - out->data = malloc(in.len + EVP_MAX_BLOCK_LENGTH + \ - ctx->ivsz + ctx->tagsz); - if (out->data == NULL) - goto fail_malloc; - - iv = out->data; - ptr = out->data + ctx->ivsz; + ctx->cipher = EVP_get_cipherbynid(nid); + if (ctx->cipher == NULL) + return -1; - if (random_buffer(iv, ctx->ivsz) < 0) - goto fail_encrypt; + /* IV must fit the NONCESZ nonce buffer. */ + if (EVP_CIPHER_get_iv_length(ctx->cipher) > NONCESZ) + return -1; - /* Set IV bit 7 to current key phase (KEY_ROTATION_BIT of counter) */ - if (ctx->rot.cntr & ctx->rot.mask) - iv[0] |= 0x80; - else - iv[0] &= 0x7F; + /* Authenticated encryption is mandatory; reject non-AEAD ciphers. */ + if ((EVP_CIPHER_flags(ctx->cipher) & EVP_CIPH_FLAG_AEAD_CIPHER) == 0) + return -1; - EVP_CIPHER_CTX_reset(ctx->evp_ctx); + ctx->tagsz = AEAD_TAG_LEN; - ret = EVP_EncryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL, - ctx->keys.cur, iv); - if (ret != 1) - goto fail_encrypt; + ctx->evp_ctx = EVP_CIPHER_CTX_new(); + if (ctx->evp_ctx == NULL) + return -1; - ret = EVP_EncryptUpdate(ctx->evp_ctx, ptr, &tmp_sz, in.data, in_sz); - if (ret != 1) - goto fail_encrypt; + return 0; +} - out_sz = tmp_sz; - ret = EVP_EncryptFinal_ex(ctx->evp_ctx, ptr + tmp_sz, &tmp_sz); - if (ret != 1) - goto fail_encrypt; +/* One-shot AEAD seal over an explicit key/nonce (no keyrot). out = ct ‖ tag. 
*/ +int openssl_oneshot_seal(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out) +{ + struct ossl_crypt_ctx ctx; + int out_sz; - out_sz += tmp_sz; + assert(key != NULL); + assert(nonce != NULL); + assert(out != NULL); - /* For AEAD ciphers, get and append the authentication tag */ - if (ctx->tagsz > 0) { - ret = EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_GET_TAG, - ctx->tagsz, ptr + out_sz); - if (ret != 1) - goto fail_encrypt; - out_sz += ctx->tagsz; - } + memset(&ctx, 0, sizeof(ctx)); - assert(out_sz >= in_sz); + if (ossl_cipher_ctx_init(&ctx, nid) < 0) + goto fail_cipher; - out->len = (size_t) out_sz + ctx->ivsz; + out->data = malloc(in.len + EVP_MAX_BLOCK_LENGTH + ctx.tagsz); + if (out->data == NULL) + goto fail_ctx; - /* Increment packet counter and check for key rotation */ - ctx->rot.cntr++; - ctx->rot.age++; + out_sz = openssl_seal(&ctx, key, nonce, aad, in, + out->data, out->data + in.len); + if (out_sz < 0) + goto fail_seal; - if (HAS_PHASE_BIT_TOGGLED(ctx)) { - if (rotate_key(ctx) != 0) - goto fail_encrypt; - } + out->len = (size_t) out_sz + ctx.tagsz; - cleanup_old_key(ctx); + EVP_CIPHER_CTX_free(ctx.evp_ctx); return 0; - fail_encrypt: + + fail_seal: free(out->data); - fail_malloc: + fail_ctx: + EVP_CIPHER_CTX_free(ctx.evp_ctx); + fail_cipher: clrbuf(*out); return -ECRYPT; } -int openssl_decrypt(struct ossl_crypt_ctx * ctx, - buffer_t in, - buffer_t * out) +/* One-shot AEAD open; in = ct ‖ tag, verifies aad and tag. 
*/ +int openssl_oneshot_open(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out) { - uint8_t * iv; - uint8_t * input; - uint8_t rx_phase; - int out_sz; - int in_sz; - - assert(ctx != NULL); + struct ossl_crypt_ctx ctx; + buffer_t ct; + const uint8_t * tag; + int in_sz; - in_sz = (int) in.len - ctx->ivsz; - if (in_sz < ctx->tagsz) - return -ECRYPT; - - in_sz -= ctx->tagsz; + assert(key != NULL); + assert(nonce != NULL); + assert(out != NULL); - out->data = malloc(in_sz + EVP_MAX_BLOCK_LENGTH); - if (out->data == NULL) - goto fail_malloc; + memset(&ctx, 0, sizeof(ctx)); - iv = in.data; - input = in.data + ctx->ivsz; + if (ossl_cipher_ctx_init(&ctx, nid) < 0) + goto fail_cipher; - /* Extract phase from IV bit 7 and check for key rotation */ - rx_phase = (iv[0] & 0x80) ? 1 : 0; + if (in.len < (size_t) ctx.tagsz) + goto fail_ctx; - if (should_rotate_key_rx(ctx, rx_phase)) { - if (rotate_key(ctx) != 0) - goto fail_decrypt; - } + in_sz = (int) in.len - ctx.tagsz; - ctx->rot.cntr++; - ctx->rot.age++; + out->data = malloc((size_t) in_sz + EVP_MAX_BLOCK_LENGTH); + if (out->data == NULL) + goto fail_ctx; - if (try_decrypt(ctx, ctx->keys.cur, iv, input, in_sz, out->data, - &out_sz) != 0) { - if (ctx->keys.prv == NULL) - goto fail_decrypt; - if (try_decrypt(ctx, ctx->keys.prv, iv, input, in_sz, - out->data, &out_sz) != 0) - goto fail_decrypt; - } + ct.data = in.data; + ct.len = (size_t) in_sz; + tag = in.data + in_sz; - assert(out_sz <= in_sz); + if (openssl_open(&ctx, key, nonce, aad, ct, tag, out) < 0) + goto fail_open; - out->len = (size_t) out_sz; + EVP_CIPHER_CTX_free(ctx.evp_ctx); return 0; - fail_decrypt: + + fail_open: free(out->data); - fail_malloc: + fail_ctx: + EVP_CIPHER_CTX_free(ctx.evp_ctx); + fail_cipher: clrbuf(*out); return -ECRYPT; } @@ -1093,51 +1102,19 @@ struct ossl_crypt_ctx * openssl_crypt_create_ctx(struct crypt_sk * sk) assert(sk != NULL); assert(sk->key != NULL); - assert(sk->rot_bit > 0 && 
sk->rot_bit < 32); ctx = malloc(sizeof(*ctx)); if (ctx == NULL) - goto fail_malloc; + goto fail_malloc; memset(ctx, 0, sizeof(*ctx)); - ctx->keys.cur = OPENSSL_secure_malloc(SYMMKEYSZ); - if (ctx->keys.cur == NULL) - goto fail_key; - - memcpy(ctx->keys.cur, sk->key, SYMMKEYSZ); - - ctx->keys.prv = NULL; - - /* Derive rotation salt from initial shared secret */ - if (EVP_Digest(sk->key, SYMMKEYSZ, ctx->rot.salt, NULL, - EVP_sha256(), NULL) != 1) - goto fail_cipher; - - ctx->cipher = EVP_get_cipherbynid(sk->nid); - if (ctx->cipher == NULL) - goto fail_cipher; - - ctx->ivsz = EVP_CIPHER_iv_length(ctx->cipher); - - /* Set tag size for AEAD ciphers (GCM, CCM, OCB, ChaCha20-Poly1305) */ - if (EVP_CIPHER_flags(ctx->cipher) & EVP_CIPH_FLAG_AEAD_CIPHER) - ctx->tagsz = 16; /* Standard AEAD tag length (128 bits) */ - - ctx->rot.cntr = 0; - ctx->rot.mask = (1U << sk->rot_bit); - ctx->rot.age = 0; - ctx->rot.phase = 0; - - ctx->evp_ctx = EVP_CIPHER_CTX_new(); - if (ctx->evp_ctx == NULL) + if (ossl_cipher_ctx_init(ctx, sk->nid) < 0) goto fail_cipher; return ctx; fail_cipher: - OPENSSL_secure_clear_free(ctx->keys.cur, SYMMKEYSZ); - fail_key: free(ctx); fail_malloc: return NULL; @@ -1148,23 +1125,10 @@ void openssl_crypt_destroy_ctx(struct ossl_crypt_ctx * ctx) if (ctx == NULL) return; - if (ctx->keys.cur != NULL) - OPENSSL_secure_clear_free(ctx->keys.cur, SYMMKEYSZ); - - if (ctx->keys.prv != NULL) - OPENSSL_secure_clear_free(ctx->keys.prv, SYMMKEYSZ); - EVP_CIPHER_CTX_free(ctx->evp_ctx); free(ctx); } -int openssl_crypt_get_ivsz(struct ossl_crypt_ctx * ctx) -{ - assert(ctx != NULL); - - return ctx->ivsz; -} - int openssl_crypt_get_tagsz(struct ossl_crypt_ctx * ctx) { assert(ctx != NULL); @@ -1184,7 +1148,12 @@ int openssl_load_crt_file(const char * path, if (fp == NULL) goto fail_file; + pthread_cleanup_push(__cleanup_fclose, fp); + xcrt = PEM_read_X509(fp, NULL, NULL, NULL); + + pthread_cleanup_pop(false); + if (xcrt == NULL) goto fail_crt; @@ -1200,35 +1169,58 @@ int 
openssl_load_crt_file(const char * path, return -1; } -int openssl_load_crt_str(const char * str, - void ** crt) +static void * rd_crt_bio(BIO * bio) +{ + return PEM_read_bio_X509(bio, NULL, NULL, NULL); +} + +static void * rd_privkey_bio(BIO * bio) +{ + return PEM_read_bio_PrivateKey(bio, NULL, NULL, ""); +} + +static void * rd_pubkey_bio(BIO * bio) +{ + return PEM_read_bio_PUBKEY(bio, NULL, NULL, NULL); +} + +/* Decode a PEM object from an in-memory string via rd. */ +static int load_pem_str(const char * str, + void * (* rd)(BIO *), + void ** out) { BIO * bio; - X509 * xcrt; + void * obj; bio = BIO_new(BIO_s_mem()); if (bio == NULL) goto fail_bio; if (BIO_write(bio, str, strlen(str)) < 0) - goto fail_crt; + goto fail_obj; - xcrt = PEM_read_bio_X509(bio, NULL, NULL, NULL); - if (xcrt == NULL) - goto fail_crt; + obj = rd(bio); + if (obj == NULL) + goto fail_obj; BIO_free(bio); - *crt = (void *) xcrt; + *out = obj; return 0; - fail_crt: + fail_obj: BIO_free(bio); fail_bio: - *crt = NULL; + *out = NULL; return -1; } +int openssl_load_crt_str(const char * str, + void ** crt) +{ + return load_pem_str(str, rd_crt_bio, crt); +} + int openssl_load_crt_der(buffer_t buf, void ** crt) { @@ -1288,7 +1280,12 @@ int openssl_load_privkey_file(const char * path, if (fp == NULL) goto fail_file; + pthread_cleanup_push(__cleanup_fclose, fp); + pkey = PEM_read_PrivateKey(fp, NULL, NULL, ""); + + pthread_cleanup_pop(false); + if (pkey == NULL) goto fail_key; @@ -1307,30 +1304,7 @@ int openssl_load_privkey_file(const char * path, int openssl_load_privkey_str(const char * str, void ** key) { - BIO * bio; - EVP_PKEY * pkey; - - bio = BIO_new(BIO_s_mem()); - if (bio == NULL) - goto fail_bio; - - if (BIO_write(bio, str, strlen(str)) < 0) - goto fail_key; - - pkey = PEM_read_bio_PrivateKey(bio, NULL, NULL, NULL); - if (pkey == NULL) - goto fail_key; - - BIO_free(bio); - - *key = (void *) pkey; - - return 0; - fail_key: - BIO_free(bio); - fail_bio: - *key = NULL; - return -1; + return 
load_pem_str(str, rd_privkey_bio, key); } int openssl_load_pubkey_file(const char * path, @@ -1343,7 +1317,12 @@ int openssl_load_pubkey_file(const char * path, if (fp == NULL) goto fail_file; + pthread_cleanup_push(__cleanup_fclose, fp); + pkey = PEM_read_PUBKEY(fp, NULL, NULL, NULL); + + pthread_cleanup_pop(false); + if (pkey == NULL) goto fail_key; @@ -1375,7 +1354,12 @@ int openssl_load_pubkey_file_to_der(const char * path, if (fp == NULL) goto fail_file; + pthread_cleanup_push(__cleanup_fclose, fp); + pkey = PEM_read_PUBKEY(fp, NULL, NULL, NULL); + + pthread_cleanup_pop(false); + if (pkey == NULL) goto fail_key; @@ -1402,30 +1386,7 @@ int openssl_load_pubkey_file_to_der(const char * path, int openssl_load_pubkey_str(const char * str, void ** key) { - BIO * bio; - EVP_PKEY * pkey; - - bio = BIO_new(BIO_s_mem()); - if (bio == NULL) - goto fail_bio; - - if (BIO_write(bio, str, strlen(str)) < 0) - goto fail_key; - - pkey = PEM_read_bio_PUBKEY(bio, NULL, NULL, NULL); - if (pkey == NULL) - goto fail_key; - - BIO_free(bio); - - *key = (void *) pkey; - - return 0; - fail_key: - BIO_free(bio); - fail_bio: - *key = NULL; - return -1; + return load_pem_str(str, rd_pubkey_bio, key); } int openssl_load_pubkey_raw_file(const char * path, @@ -1443,7 +1404,12 @@ int openssl_load_pubkey_raw_file(const char * path, if (fp == NULL) goto fail_file; + pthread_cleanup_push(__cleanup_fclose, fp); + bytes_read = fread(tmp_buf, 1, CRYPT_KEY_BUFSZ, fp); + + pthread_cleanup_pop(false); + if (bytes_read == 0) goto fail_read; @@ -1485,11 +1451,17 @@ static const char * __openssl_hybrid_algo_from_sk_len(size_t len) return NULL; } +/* Wipe the raw-key staging buffer if a cancel aborts the read. 
*/ +static void __cleanse_key_buf(void * o) +{ + OPENSSL_cleanse(o, CRYPT_KEY_BUFSZ); +} + int openssl_load_privkey_raw_file(const char * path, void ** key) { FILE * fp; - uint8_t tmp_buf[4096]; + uint8_t tmp_buf[CRYPT_KEY_BUFSZ]; size_t bytes_read; const char * algo; EVP_PKEY * pkey; @@ -1501,7 +1473,14 @@ int openssl_load_privkey_raw_file(const char * path, if (fp == NULL) goto fail_file; + pthread_cleanup_push(__cleanup_fclose, fp); + pthread_cleanup_push(__cleanse_key_buf, tmp_buf); + bytes_read = fread(tmp_buf, 1, sizeof(tmp_buf), fp); + + pthread_cleanup_pop(false); + pthread_cleanup_pop(false); + if (bytes_read == 0) goto fail_read; @@ -1552,65 +1531,71 @@ void openssl_free_key(EVP_PKEY * key) int openssl_check_crt_name(void * crt, const char * name) { - char * subj; - char * cn; - X509 * xcrt; + const unsigned char * cn; + ASN1_STRING * val; + X509_NAME * nm; + int idx; + int len; - xcrt = (X509 *) crt; + nm = X509_get_subject_name((X509 *) crt); + if (nm == NULL) + return -1; + + idx = X509_NAME_get_index_by_NID(nm, NID_commonName, -1); + if (idx < 0) + return -1; - subj = X509_NAME_oneline(X509_get_subject_name(xcrt), NULL, 0); - if (subj == NULL) - goto fail_subj; + val = X509_NAME_ENTRY_get_data(X509_NAME_get_entry(nm, idx)); + cn = ASN1_STRING_get0_data(val); + len = ASN1_STRING_length(val); - cn = strstr(subj, "CN="); - if (cn == NULL) - goto fail_cn; + if (len < 0 || (size_t) len != strlen(name)) + return -1; - if (strcmp(cn + 3, name) != 0) - goto fail_cn; + if (memchr(cn, '\0', (size_t) len) != NULL) + return -1; - free(subj); + if (memcmp(cn, name, (size_t) len) != 0) + return -1; return 0; - fail_cn: - free(subj); - fail_subj: - return -1; } int openssl_get_crt_name(void * crt, char * name) { - char * subj; - char * cn; - char * end; - X509 * xcrt; + const unsigned char * cn; + ASN1_STRING * val; + X509_NAME * nm; + int idx; + int len; - xcrt = (X509 *) crt; + nm = X509_get_subject_name((X509 *) crt); + if (nm == NULL) + return -1; - subj = 
X509_NAME_oneline(X509_get_subject_name(xcrt), NULL, 0); - if (subj == NULL) - goto fail_subj; + idx = X509_NAME_get_index_by_NID(nm, NID_commonName, -1); + if (idx < 0) + return -1; - cn = strstr(subj, "CN="); - if (cn == NULL) - goto fail_cn; + val = X509_NAME_ENTRY_get_data(X509_NAME_get_entry(nm, idx)); + cn = ASN1_STRING_get0_data(val); + len = ASN1_STRING_length(val); - cn += 3; /* Skip "CN=" */ + if (len < 0) + return -1; - /* Find end of CN (comma or slash for next field) */ - end = strpbrk(cn, ",/"); - if (end != NULL) - *end = '\0'; + if ((size_t) len > NAME_SIZE) + return -ENAME; - strcpy(name, cn); - free(subj); + /* Reject an embedded NUL that would truncate the parsed name. */ + if (memchr(cn, '\0', (size_t) len) != NULL) + return -1; + + memcpy(name, cn, (size_t) len); + name[len] = '\0'; return 0; - fail_cn: - free(subj); - fail_subj: - return -1; } int openssl_crt_str(const void * crt, @@ -1695,12 +1680,43 @@ int openssl_auth_add_crt_to_store(void * store, return ret == 1 ? 
0 : -1; } -int openssl_verify_crt(void * store, - void * crt) +void * openssl_auth_create_chain(void) +{ + return sk_X509_new_null(); +} + +void openssl_auth_destroy_chain(void * chain) +{ + sk_X509_pop_free((STACK_OF(X509) *) chain, X509_free); +} + +int openssl_auth_add_crt_to_chain(void * chain, + void * crt) +{ + if (X509_up_ref((X509 *) crt) != 1) + goto fail_ref; + + if (sk_X509_push((STACK_OF(X509) *) chain, (X509 *) crt) == 0) + goto fail_push; + + return 0; + fail_push: + X509_free((X509 *) crt); + fail_ref: + return -1; +} + +int openssl_verify_crt_pin(void * store, + void * untrusted, + void * crt, + void * pin) { X509_STORE_CTX * ctx; X509_STORE * _store; X509* _crt; + STACK_OF(X509) * chain; + int i; + int n; int ret; _store = (X509_STORE *) store; @@ -1710,7 +1726,8 @@ int openssl_verify_crt(void * store, if (ctx == NULL) goto fail_store_ctx; - ret = X509_STORE_CTX_init(ctx, _store, _crt, NULL); + ret = X509_STORE_CTX_init(ctx, _store, _crt, + (STACK_OF(X509) *) untrusted); if (ret != 1) goto fail_ca; @@ -1718,13 +1735,39 @@ int openssl_verify_crt(void * store, if (ret != 1) goto fail_ca; + /* Peer cert only verifies a signature; gate on sig KU, not role. 
*/ + if ((X509_get_key_usage(_crt) & KU_DIGITAL_SIGNATURE) == 0) + goto fail_ca; + + if (pin != NULL) { + chain = X509_STORE_CTX_get0_chain(ctx); + if (chain == NULL) + goto fail_ca; + n = sk_X509_num(chain); + for (i = 1; i < n; i++) /* Skip the leaf */ + if (X509_cmp(sk_X509_value(chain, i), pin) == 0) + break; + if (i == n) + goto fail_pin; + } + X509_STORE_CTX_free(ctx); return 0; + fail_pin: + X509_STORE_CTX_free(ctx); + return -ENOENT; fail_ca: X509_STORE_CTX_free(ctx); fail_store_ctx: - return -1; + return -EAUTH; +} + +int openssl_verify_crt(void * store, + void * untrusted, + void * crt) +{ + return openssl_verify_crt_pin(store, untrusted, crt, NULL); } static const EVP_MD * select_md(EVP_PKEY * pkey, @@ -1739,6 +1782,12 @@ static const EVP_MD * select_md(EVP_PKEY * pkey, return EVP_get_digestbynid(nid); } +bool openssl_pk_requires_md(const EVP_PKEY * pk) +{ + /* Provider-based (PQC) signatures have an intrinsic digest */ + return EVP_PKEY_get_id(pk) >= 0; +} + int openssl_sign(EVP_PKEY * pkp, int nid, buffer_t msg, @@ -1866,9 +1915,10 @@ void * openssl_secure_malloc(size_t size) return OPENSSL_secure_malloc(size); } -void openssl_secure_free(void * ptr) +void openssl_secure_free(void * ptr, + size_t size) { - OPENSSL_secure_free(ptr); + OPENSSL_secure_clear_free(ptr, size); } void openssl_secure_clear(void * ptr, @@ -1876,6 +1926,7 @@ void openssl_secure_clear(void * ptr, { OPENSSL_cleanse(ptr, size); } + void openssl_cleanup(void) { OPENSSL_cleanup(); diff --git a/src/lib/crypt/openssl.h b/src/lib/crypt/openssl.h index af285232..e5cc35f7 100644 --- a/src/lib/crypt/openssl.h +++ b/src/lib/crypt/openssl.h @@ -61,20 +61,44 @@ int openssl_get_algo_from_pk_der(buffer_t pk, int openssl_get_algo_from_pk_raw(buffer_t pk, char * algo); -int openssl_encrypt(struct ossl_crypt_ctx * ctx, - buffer_t in, - buffer_t * out); - -int openssl_decrypt(struct ossl_crypt_ctx * ctx, - buffer_t in, - buffer_t * out); +int openssl_seal(struct ossl_crypt_ctx * ctx, + const 
uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + uint8_t * out, + uint8_t * tag); + +int openssl_open(struct ossl_crypt_ctx * ctx, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + const uint8_t * tag, + buffer_t * out); + +int openssl_oneshot_seal(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out); + +int openssl_oneshot_open(int nid, + const uint8_t * key, + const uint8_t * nonce, + buffer_t aad, + buffer_t in, + buffer_t * out); + +int openssl_hkdf_expand(buffer_t key, + buffer_t info, + buffer_t out); struct ossl_crypt_ctx * openssl_crypt_create_ctx(struct crypt_sk * sk); void openssl_crypt_destroy_ctx(struct ossl_crypt_ctx * ctx); -int openssl_crypt_get_ivsz(struct ossl_crypt_ctx * ctx); - int openssl_crypt_get_tagsz(struct ossl_crypt_ctx * ctx); /* AUTHENTICATION */ @@ -136,9 +160,24 @@ void openssl_auth_destroy_store(void * store); int openssl_auth_add_crt_to_store(void * store, void * crt); +void * openssl_auth_create_chain(void); + +void openssl_auth_destroy_chain(void * chain); + +int openssl_auth_add_crt_to_chain(void * chain, + void * crt); + int openssl_verify_crt(void * store, + void * untrusted, void * crt); +int openssl_verify_crt_pin(void * store, + void * untrusted, + void * crt, + void * pin); + +bool openssl_pk_requires_md(const EVP_PKEY * pk); + int openssl_sign(EVP_PKEY * pkp, int md_nid, buffer_t msg, diff --git a/src/lib/dev.c b/src/lib/dev.c index 9cfc24ee..d0997273 100644 --- a/src/lib/dev.c +++ b/src/lib/dev.c @@ -29,10 +29,13 @@ #include "config.h" #include "ssm.h" +#include <ouroboros/atomics.h> #include <ouroboros/bitmap.h> #include <ouroboros/cep.h> +#include <ouroboros/crc16.h> #include <ouroboros/crypt.h> #include <ouroboros/dev.h> +#include <ouroboros/endian.h> #include <ouroboros/errno.h> #include <ouroboros/fccntl.h> #include <ouroboros/flow.h> @@ -45,32 +48,33 @@ #include <ouroboros/np1_flow.h> #include 
<ouroboros/pthread.h> #include <ouroboros/random.h> +#ifdef PROC_FLOW_STATS +#include <ouroboros/rib.h> +#endif #include <ouroboros/serdes-irm.h> +#include <ouroboros/sockets.h> #include <ouroboros/ssm_flow_set.h> #include <ouroboros/ssm_pool.h> #include <ouroboros/ssm_rbuff.h> -#include <ouroboros/sockets.h> +#include <ouroboros/tw.h> #include <ouroboros/utils.h> -#ifdef PROC_FLOW_STATS -#include <ouroboros/rib.h> -#endif +#include <assert.h> #ifdef HAVE_LIBGCRYPT #include <gcrypt.h> #endif -#include <assert.h> -#include <stdlib.h> -#include <string.h> -#include <stdio.h> #include <stdarg.h> #include <stdbool.h> +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> #include <sys/types.h> #ifndef CLOCK_REALTIME_COARSE #define CLOCK_REALTIME_COARSE CLOCK_REALTIME #endif -/* Partial read information. */ #define NO_PART -1 #define DONE_PART -2 @@ -78,19 +82,12 @@ #define SECMEMSZ 16384 #define MSGBUFSZ 2048 -/* map flow_ids to flow descriptors; track state of the flow */ struct fmap { int fd; - /* TODO: use actual flow state */ enum flow_state state; }; -#define frcti_to_flow(frcti) \ - ((struct flow *)((uint8_t *) frcti - offsetof(struct flow, frcti))) - struct flow { - struct list_head next; - struct flow_info info; struct ssm_rbuff * rx_rb; @@ -101,8 +98,14 @@ struct flow { ssize_t part_idx; struct crypt_ctx * crypt; - int headsz; /* IV */ - int tailsz; /* Tag + CRC */ + int headsz; /* Selector */ + int tailsz; /* Tag + CRC */ + + struct timespec rk_grace; /* TX-promote deadline (0 = none) */ + struct timespec rk_attempt; /* Last re-key attempt (backoff) */ + bool rk_wm_inflight; /* Re-key trigger in flight */ + uint32_t rk_wm_ctr; /* Throttles the consult */ + bool rk_initiator; /* OAP initiator this re-key */ struct timespec snd_act; struct timespec rcv_act; @@ -135,16 +138,10 @@ struct { struct flow * flows; struct fmap * id_to_fd; - struct list_head flow_list; pthread_mutex_t mtx; pthread_cond_t cond; - pthread_t tx; - pthread_t 
rx; - size_t n_frcti; - fset_t * frct_set; - pthread_rwlock_t lock; } proc; @@ -243,7 +240,7 @@ static int proc_announce(const struct proc_info * proc) return irm__irm_result_des(&msg); } -/* IRMd will clean up the mess if this fails */ +/* IRMd cleans up on failure. */ static void proc_exit(void) { uint8_t buf[SOCK_BUF_SIZE]; @@ -264,7 +261,7 @@ static int spb_encrypt(struct flow * flow, uint8_t * tail; if (flow->crypt == NULL) - return 0; /* No encryption */ + return 0; in.data = ssm_pk_buff_head(spb); in.len = ssm_pk_buff_len(spb); @@ -272,11 +269,11 @@ static int spb_encrypt(struct flow * flow, if (crypt_encrypt(flow->crypt, in, &out) < 0) goto fail_encrypt; - head = ssm_pk_buff_head_alloc(spb, flow->headsz); + head = ssm_pk_buff_push(spb, flow->headsz); if (head == NULL) goto fail_alloc; - tail = ssm_pk_buff_tail_alloc(spb, flow->tailsz); + tail = ssm_pk_buff_push_tail(spb, flow->tailsz); if (tail == NULL) goto fail_alloc; @@ -299,17 +296,16 @@ static int spb_decrypt(struct flow * flow, uint8_t * head; if (flow->crypt == NULL) - return 0; /* No decryption */ + return 0; in.data = ssm_pk_buff_head(spb); in.len = ssm_pk_buff_len(spb); if (crypt_decrypt(flow->crypt, in, &out) < 0) - return -ENOMEM; - + return -ECRYPT; - head = ssm_pk_buff_head_release(spb, flow->headsz) + flow->headsz; - ssm_pk_buff_tail_release(spb, flow->tailsz); + head = ssm_pk_buff_pop(spb, flow->headsz) + flow->headsz; + ssm_pk_buff_pop_tail(spb, flow->tailsz); memcpy(head, out.data, out.len); @@ -318,130 +314,357 @@ static int spb_decrypt(struct flow * flow, return 0; } -#include "frct.c" +/* tw_move under proc.lock rdlock; gates teardown vs in-flight fires. 
*/ +static void tw_move_safe(void) +{ + pthread_rwlock_rdlock(&proc.lock); + + pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock); + + tw_move(); + + pthread_cleanup_pop(1); +} -void * flow_tx(void * o) +static int crc_add(struct ssm_pk_buff * spb, + size_t head_skip) { - struct timespec tic = TIMESPEC_INIT_NS(TICTIME); + uint8_t * head; + uint8_t * tail; - (void) o; + tail = ssm_pk_buff_push_tail(spb, CRCLEN); + if (tail == NULL) + return -ENOMEM; - while (true) { - timerwheel_move(); + head = ssm_pk_buff_head(spb) + head_skip; - nanosleep(&tic, NULL); - } + mem_hash(HASH_CRC32, tail, head, tail - head); - return (void *) 0; + return 0; } -static void flow_send_keepalive(struct flow * flow, - struct timespec now) +static int crc_check(struct ssm_pk_buff * spb, + size_t head_skip) { - struct ssm_pk_buff * spb; - ssize_t idx; - uint8_t * ptr; + uint32_t crc; + uint8_t * head; + uint8_t * tail; - idx = ssm_pool_alloc(proc.pool, 0, &ptr, &spb); - if (idx < 0) - return; + if (ssm_pk_buff_len(spb) < head_skip + CRCLEN) + return 1; - pthread_rwlock_wrlock(&proc.lock); + head = ssm_pk_buff_head(spb) + head_skip; + tail = ssm_pk_buff_pop_tail(spb, CRCLEN); - flow->snd_act = now; + mem_hash(HASH_CRC32, &crc, head, tail - head); - if (ssm_rbuff_write(flow->tx_rb, idx)) - ssm_pool_remove(proc.pool, idx); + return !(crc == *((uint32_t *) tail)); +} + +/* FRCT included here so it can use proc and dev.c statics directly. */ +#include "frct.c" + +/* Decrypt before any check so the plaintext is authoritative. 
*/ +static bool invalid_pkt(struct flow * flow, + struct ssm_pk_buff * spb) +{ + const struct frct_pci * pci; + uint16_t flags; + size_t pci_total; + + if (spb == NULL || ssm_pk_buff_len(spb) == 0) + return true; + + if (spb_decrypt(flow, spb) < 0) + return true; + + if (flow->frcti == NULL) { + if (flow->info.qs.ber == 0 && crc_check(spb, 0) != 0) + return true; + return false; + } + + if (ssm_pk_buff_len(spb) < FRCT_PCILEN) + return true; + + pci = (const struct frct_pci *) ssm_pk_buff_head(spb); + flags = ntoh16(pci->flags); + + /* Untrusted flag read; mismatch on HCS will drop on corrupt. */ + if (flags & FRCT_DATA) + pci_total = frcti_data_hdr_len(flow->frcti); else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + pci_total = frcti_ctrl_hdr_len(flow->frcti); - pthread_rwlock_unlock(&proc.lock); + if (ssm_pk_buff_len(spb) < pci_total) + return true; + + if (frct_hcs_check(pci, flow->frcti) != 0) + return true; + + /* HCS valid: CRC32 on SACK; or on DATA if ber = 0. */ + if (flags & FRCT_SACK) { + if (crc_check(spb, pci_total) != 0) + return true; + + } else if ((flags & FRCT_DATA) && flow->info.qs.ber == 0) { + if (crc_check(spb, pci_total) != 0) + return true; + } + + return false; } -/* Needs rdlock on proc. */ -static void _flow_keepalive(struct flow * flow) +static bool deadline_passed(const struct timespec * abs) { - struct timespec now; - struct timespec s_act; - struct timespec r_act; - int flow_id; - time_t timeo; - uint32_t acl; + struct timespec now; - s_act = flow->snd_act; - r_act = flow->rcv_act; + if (abs == NULL) + return false; - flow_id = flow->info.id; - timeo = flow->info.qs.timeout; + clock_gettime(PTHREAD_COND_CLOCK, &now); - acl = ssm_rbuff_get_acl(flow->rx_rb); - if (timeo == 0 || acl & (ACL_FLOWPEER | ACL_FLOWDOWN)) - return; + return ts_diff_ns(&now, abs) >= 0; +} + +/* Clamp the wait by min(dl, next tw expiry, now + TICTIME). 
*/ +static void compute_wait_deadline(const struct timespec * dl, + struct timespec * out) +{ + struct timespec now; + struct timespec cap; + struct timespec expiry; + struct timespec tic = TIMESPEC_INIT_NS(TICTIME); clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, &tic, &cap); - if (ts_diff_ns(&now, &r_act) > (int64_t) timeo * MILLION) { - ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWPEER); - ssm_flow_set_notify(proc.fqset, flow_id, FLOW_PEER); - return; - } + tw_next_expiry(&expiry); - if (ts_diff_ns(&now, &s_act) > (int64_t) timeo * (MILLION >> 2)) { - pthread_rwlock_unlock(&proc.lock); + *out = (ts_diff_ns(&cap, &expiry) < 0) ? expiry : cap; + if (dl != NULL && ts_diff_ns(out, dl) > 0) + *out = *dl; +} - flow_send_keepalive(flow, now); +static void flow_drain_rx_nb(struct flow * flow) +{ + ssize_t idx; + struct ssm_pk_buff * spb; + struct ssm_rbuff * rx_rb; + struct frcti * frcti; +#ifdef PROC_FLOW_STATS + struct timespec t_a; + struct timespec t_b; +#endif + + if (flow->frcti != NULL) + STAT_BUMP(flow->frcti, drain_calls); + while (true) { pthread_rwlock_rdlock(&proc.lock); + + rx_rb = flow->rx_rb; + if (rx_rb == NULL) { + pthread_rwlock_unlock(&proc.lock); + return; + } + + idx = ssm_rbuff_read(rx_rb); + if (idx < 0) { + pthread_rwlock_unlock(&proc.lock); + return; + } + + spb = ssm_pool_get(proc.pool, idx); + if (invalid_pkt(flow, spb)) { + ssm_pool_remove(proc.pool, idx); + pthread_rwlock_unlock(&proc.lock); + continue; + } + + frcti = flow->frcti; + if (frcti != NULL) { +#ifdef PROC_FLOW_STATS + clock_gettime(CLOCK_MONOTONIC, &t_a); + FRCTI_RCV(frcti, spb); + clock_gettime(CLOCK_MONOTONIC, &t_b); + STAT_ADD(frcti, rcv_proc_ns, + (size_t) ts_diff_ns(&t_b, &t_a)); +#else + FRCTI_RCV(frcti, spb); +#endif + } else { + ssm_pool_remove(proc.pool, idx); + } + + pthread_rwlock_unlock(&proc.lock); + + /* Per-packet so the delayed-ACK fires on time in a burst. 
*/ +#ifdef PROC_FLOW_STATS + clock_gettime(CLOCK_MONOTONIC, &t_a); + tw_move_safe(); + clock_gettime(CLOCK_MONOTONIC, &t_b); + if (frcti != NULL) + STAT_ADD(frcti, tw_move_ns, + (size_t) ts_diff_ns(&t_b, &t_a)); +#else + tw_move_safe(); +#endif } } -static void handle_keepalives(void) +/* TX-promotion grace when the peer's install latency is unknown (raw). */ +#define REKEY_GRACE_MS 1000 + +/* Last-resort promote within N node-keys of exhaustion (< watermark). */ +#define REKEY_PROMOTE_FLOOR 1 + +/* Throttle re-key retries so a failed attempt can't storm the IRMd. */ +#define REKEY_BACKOFF_NS (250 * MILLION) + +/* proc.lock (rd) only guards teardown; crypt_rekey self-synchronises. */ +static void flow_rekey(struct flow * flow) { - struct list_head * p; - struct list_head * h; + struct flow_info info; + struct crypt_sk sk; + struct timespec now; + struct timespec intv; + time_t ms; + uint8_t key[SYMMKEYSZ]; + uint8_t buf[SOCK_BUF_SIZE]; + buffer_t msg = {SOCK_BUF_SIZE, buf}; + bool has_key; + bool initiator = false; pthread_rwlock_rdlock(&proc.lock); + if (flow->info.id < 0 || flow->crypt == NULL) { + pthread_rwlock_unlock(&proc.lock); + return; + } - list_for_each_safe(p, h, &proc.flow_list) { - struct flow * flow; - flow = list_entry(p, struct flow, next); - _flow_keepalive(flow); + /* Back off so a failed attempt can't storm the IRMd per syscall. 
*/ + clock_gettime(PTHREAD_COND_CLOCK, &now); + if (ts_diff_ns(&now, &flow->rk_attempt) < REKEY_BACKOFF_NS) { + pthread_rwlock_unlock(&proc.lock); + return; } + flow->rk_attempt = now; + info = flow->info; pthread_rwlock_unlock(&proc.lock); -} -static void __cleanup_fqueue_destroy(void * fq) -{ - fqueue_destroy((fqueue_t *) fq); + if (flow_update__irm_req_ser(&msg, &info, false) < 0) + return; + + if (send_recv_msg(&msg) < 0) + return; + + sk.key = key; + if (flow_rekey__irm_result_des(&msg, &sk, &has_key, &initiator) < 0) + return; + + if (!has_key) + return; + + pthread_rwlock_rdlock(&proc.lock); + if (flow->info.id == info.id && flow->crypt != NULL) { + if (crypt_rekey(flow->crypt, &sk) == 0) { + flow->rk_initiator = initiator; + /* Hold TX on the old epoch until the peer installs. */ + ms = flow->info.mpl > 0 ? flow->info.mpl * 3 + : REKEY_GRACE_MS; + intv.tv_sec = ms / 1000; + intv.tv_nsec = (ms % 1000) * MILLION; + clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, &intv, &flow->rk_grace); + } + /* Re-arm the watermark even if the install was a no-op. */ + STORE_RELAXED(&flow->rk_wm_inflight, false); + } + pthread_rwlock_unlock(&proc.lock); + + crypt_secure_clear(key, SYMMKEYSZ); } -void * flow_rx(void * o) +/* A clamp-timeout means tw work is due, not the caller deadline. */ +static int flow_rx_one(struct flow * flow, + struct timespec * abs) { - struct timespec tic = TIMESPEC_INIT_NS(TICTIME); - int ret; - struct fqueue * fq; + struct timespec wait_abs; + struct ssm_pk_buff * spb; + struct ssm_rbuff * rx_rb; + ssize_t idx; - (void) o; + while (true) { + compute_wait_deadline(abs, &wait_abs); - fq = fqueue_create(); + /* rdlock gates flow_fini; FLOWDOWN preempts the block. 
*/ + pthread_rwlock_rdlock(&proc.lock); - pthread_cleanup_push(__cleanup_fqueue_destroy, fq); + rx_rb = flow->rx_rb; + if (rx_rb == NULL) { + pthread_rwlock_unlock(&proc.lock); + return -EFLOWDOWN; + } - /* fevent will filter all FRCT packets for us */ - while ((ret = fevent(proc.frct_set, fq, &tic)) != 0) { - if (ret == -ETIMEDOUT) { - handle_keepalives(); + /* Pull a parked re-key before re-blocking (idle reader). */ + if (flow->crypt != NULL + && (ssm_rbuff_get_flags(rx_rb) & RB_REKEY)) { + pthread_rwlock_unlock(&proc.lock); + flow_rekey(flow); continue; } - while (fqueue_next(fq) >= 0) - ; /* no need to act */ + idx = ssm_rbuff_read_b(rx_rb, &wait_abs); + if (idx == -ETIMEDOUT) { + pthread_rwlock_unlock(&proc.lock); + if (deadline_passed(abs)) + return -ETIMEDOUT; + tw_move_safe(); + continue; + } + if (idx < 0) { + pthread_rwlock_unlock(&proc.lock); + return idx; + } + + spb = ssm_pool_get(proc.pool, idx); + if (invalid_pkt(flow, spb)) { + ssm_pool_remove(proc.pool, idx); + pthread_rwlock_unlock(&proc.lock); + continue; + } + + if (flow->frcti != NULL) + FRCTI_RCV(flow->frcti, spb); + else + ssm_pool_remove(proc.pool, idx); + + pthread_rwlock_unlock(&proc.lock); + + tw_move_safe(); + return 0; } +} - pthread_cleanup_pop(true); +/* 0 = window open; -EAGAIN = !block and would block; else flow_rx_one rc. */ +static __inline__ int flow_wait_window(struct flow * flow, + size_t n, + bool block, + struct timespec * dl) +{ + int rc; - return (void *) 0; + while (true) { + flow_drain_rx_nb(flow); + if (FRCTI_IS_WINDOW_OPEN_N(flow->frcti, n)) + return 0; + if (!block) + return -EAGAIN; + rc = flow_rx_one(flow, dl); + if (rc < 0) + return rc; + } } static void flow_clear(int fd) @@ -451,36 +674,36 @@ static void flow_clear(int fd) proc.flows[fd].info.id = -1; } -static void __flow_fini(int fd) +/* Order before flow_fini's wrlock, which blocks on rdlock holders. 
*/ +static void flow_quiesce(int fd) { - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + struct ssm_rbuff * rx_rb = proc.flows[fd].rx_rb; + struct ssm_rbuff * tx_rb = proc.flows[fd].tx_rb; - if (proc.flows[fd].frcti != NULL) { - proc.n_frcti--; - if (proc.n_frcti == 0) { - pthread_cancel(proc.tx); - pthread_join(proc.tx, NULL); - } + if (rx_rb != NULL) + ssm_rbuff_set_bits(rx_rb, RB_FLOWDOWN); + + if (tx_rb != NULL) + ssm_rbuff_set_bits(tx_rb, RB_FLOWDOWN); +} - ssm_flow_set_del(proc.fqset, 0, proc.flows[fd].info.id); +static void do_flow_fini(int fd) +{ + assert(fd >= 0 && fd < PROC_MAX_FLOWS); + if (proc.flows[fd].frcti != NULL) frcti_destroy(proc.flows[fd].frcti); - } if (proc.flows[fd].info.id != -1) { flow_destroy(&proc.id_to_fd[proc.flows[fd].info.id]); bmp_release(proc.fds, fd); } - if (proc.flows[fd].rx_rb != NULL) { - ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN); + if (proc.flows[fd].rx_rb != NULL) ssm_rbuff_close(proc.flows[fd].rx_rb); - } - if (proc.flows[fd].tx_rb != NULL) { - ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN); + if (proc.flows[fd].tx_rb != NULL) ssm_rbuff_close(proc.flows[fd].tx_rb); - } if (proc.flows[fd].set != NULL) { ssm_flow_set_notify(proc.flows[fd].set, @@ -491,24 +714,40 @@ static void __flow_fini(int fd) crypt_destroy_ctx(proc.flows[fd].crypt); - list_del(&proc.flows[fd].next); - flow_clear(fd); } static void flow_fini(int fd) { + flow_quiesce(fd); + pthread_rwlock_wrlock(&proc.lock); - __flow_fini(fd); + do_flow_fini(fd); pthread_rwlock_unlock(&proc.lock); } #define IS_ENCRYPTED(crypt) ((crypt)->nid != NID_undef) -#define IS_ORDERED(flow) (flow.qs.in_order != 0) +#define IS_ORDERED(info) ((info)->qs.service != SVC_RAW) +#define IS_STREAM(info) ((info)->qs.service == SVC_STREAM) + +/* Raw MTU minus the wrapping (IV/Tag + optional CRC) dev.c adds. 
*/ +static __inline__ size_t flow_user_mtu(const struct flow * flow, + size_t raw) +{ + size_t hdr; + + hdr = flow->headsz + flow->tailsz; + if (flow->info.qs.ber == 0 && flow->crypt == NULL) + hdr += CRCLEN; + + return raw > hdr ? raw - hdr : 0; +} + static int flow_init(struct flow_info * info, - struct crypt_sk * sk) + struct crypt_sk * sk, + time_t rtt_hint) { struct timespec now; struct flow * flow; @@ -550,33 +789,25 @@ static int flow_init(struct flow_info * info, flow->tailsz = 0; if (IS_ENCRYPTED(sk)) { - /* Set to lower value in tests, should we make configurable? */ - sk->rot_bit = KEY_ROTATION_BIT; flow->crypt = crypt_create_ctx(sk); if (flow->crypt == NULL) goto fail_crypt; - flow->headsz = crypt_get_ivsz(flow->crypt); + flow->headsz = crypt_get_headsz(flow->crypt); flow->tailsz = crypt_get_tagsz(flow->crypt); } assert(flow->frcti == NULL); - if (IS_ORDERED(flow->info)) { - flow->frcti = frcti_create(fd, DELT_A, DELT_R, info->mpl); + if (IS_ORDERED(&flow->info)) { + uint32_t frct_mtu = flow_user_mtu(flow, info->mtu); + + flow->frcti = frcti_create(fd, DELT_A, DELT_R, + info->mpl, rtt_hint, + info->qs, frct_mtu); if (flow->frcti == NULL) goto fail_frcti; - - if (ssm_flow_set_add(proc.fqset, 0, info->id)) - goto fail_flow_set_add; - - ++proc.n_frcti; - if (proc.n_frcti == 1 && - pthread_create(&proc.tx, NULL, flow_tx, NULL) < 0) - goto fail_tx_thread; } - list_add_tail(&flow->next, &proc.flow_list); - proc.id_to_fd[info->id].fd = fd; flow_set_state(&proc.id_to_fd[info->id], FLOW_ALLOCATED); @@ -585,10 +816,6 @@ static int flow_init(struct flow_info * info, return fd; - fail_tx_thread: - ssm_flow_set_del(proc.fqset, 0, info->id); - fail_flow_set_add: - frcti_destroy(flow->frcti); fail_frcti: crypt_destroy_ctx(flow->crypt); fail_crypt: @@ -655,13 +882,13 @@ static void init(int argc, gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0); } #endif - proc.fds = bmp_create(PROG_MAX_FLOWS - PROG_RES_FDS, PROG_RES_FDS); + proc.fds = bmp_create(PROC_MAX_FLOWS - 
PROC_RES_FDS, PROC_RES_FDS); if (proc.fds == NULL) { fprintf(stderr, "FATAL: Could not create fd bitmap.\n"); goto fail_fds; } - proc.fqueues = bmp_create(PROG_MAX_FQUEUES, 0); + proc.fqueues = bmp_create(PROC_MAX_FQUEUES, 0); if (proc.fqueues == NULL) { fprintf(stderr, "FATAL: Could not create fqueue bitmap.\n"); goto fail_fqueues; @@ -677,13 +904,13 @@ static void init(int argc, goto fail_rdrb; } - proc.flows = malloc(sizeof(*proc.flows) * PROG_MAX_FLOWS); + proc.flows = malloc(sizeof(*proc.flows) * PROC_MAX_FLOWS); if (proc.flows == NULL) { fprintf(stderr, "FATAL: Could not malloc flows.\n"); goto fail_flows; } - for (i = 0; i < PROG_MAX_FLOWS; ++i) + for (i = 0; i < PROC_MAX_FLOWS; ++i) flow_clear(i); proc.id_to_fd = malloc(sizeof(*proc.id_to_fd) * SYS_MAX_FLOWS); @@ -716,20 +943,14 @@ static void init(int argc, goto fail_fqset; } - proc.frct_set = fset_create(); - if (proc.frct_set == NULL || proc.frct_set->idx != 0) { - fprintf(stderr, "FATAL: Could not create FRCT set.\n"); - goto fail_frct_set; - } - - if (timerwheel_init() < 0) { + if (tw_init() < 0) { fprintf(stderr, "FATAL: Could not initialize timerwheel.\n"); goto fail_timerwheel; } if (crypt_secure_malloc_init(PROC_SECMEM_MAX) < 0) { fprintf(stderr, "FATAL: Could not init secure malloc.\n"); - goto fail_timerwheel; + goto fail_secmem; } #if defined PROC_FLOW_STATS @@ -741,24 +962,15 @@ static void init(int argc, } } #endif - if (pthread_create(&proc.rx, NULL, flow_rx, NULL) < 0) { - fprintf(stderr, "FATAL: Could not start monitor thread.\n"); - goto fail_monitor; - } - - list_head_init(&proc.flow_list); - return; - fail_monitor: #if defined PROC_FLOW_STATS - rib_fini(); fail_rib_init: + crypt_secure_malloc_fini(); #endif - timerwheel_fini(); + fail_secmem: + tw_fini(); fail_timerwheel: - fset_destroy(proc.frct_set); - fail_frct_set: ssm_flow_set_close(proc.fqset); fail_fqset: pthread_rwlock_destroy(&proc.lock); @@ -789,19 +1001,20 @@ static void fini(void) if (proc.fds == NULL) return; - 
pthread_cancel(proc.rx); - pthread_join(proc.rx, NULL); + /* Wake all in-flight readers/writers BEFORE wrlock acquire. */ + for (i = 0; i < PROC_MAX_FLOWS; ++i) + if (proc.flows[i].info.id != -1) + flow_quiesce(i); pthread_rwlock_wrlock(&proc.lock); - for (i = 0; i < PROG_MAX_FLOWS; ++i) { + for (i = 0; i < PROC_MAX_FLOWS; ++i) { struct flow * flow = &proc.flows[i]; if (flow->info.id != -1) { ssize_t idx; - ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWDOWN); while ((idx = ssm_rbuff_read(flow->rx_rb)) >= 0) ssm_pool_remove(proc.pool, idx); - __flow_fini(i); + do_flow_fini(i); } } @@ -813,9 +1026,9 @@ static void fini(void) #ifdef PROC_FLOW_STATS rib_fini(); #endif - timerwheel_fini(); + crypt_secure_malloc_fini(); - fset_destroy(proc.frct_set); + tw_fini(); ssm_flow_set_close(proc.fqset); @@ -860,6 +1073,10 @@ int flow_accept(qosspec_t * qs, if (qs != NULL) qs->ber = 1; #endif + /* STREAM cannot tolerate loss: drops create silent gaps. */ + if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0) + return -EINVAL; + memset(&flow, 0, sizeof(flow)); flow.n_pid = getpid(); @@ -872,13 +1089,16 @@ int flow_accept(qosspec_t * qs, if (err < 0) return err; - crypt.key = key; + crypt.key = key; + crypt.epoch = 0; + crypt.role = CRYPT_ROLE_RESP; err = flow__irm_result_des(&msg, &flow, &crypt); if (err < 0) return err; - fd = flow_init(&flow, &crypt); + /* No RTT in accept; rtt_hint=0 bootstraps from first ACK. */ + fd = flow_init(&flow, &crypt, 0); crypt_secure_clear(key, SYMMKEYSZ); @@ -899,11 +1119,16 @@ int flow_alloc(const char * dst, uint8_t key[SYMMKEYSZ]; int fd; int err; + struct timespec t0; + struct timespec t1; #ifdef QOS_DISABLE_CRC if (qs != NULL) qs->ber = 1; #endif + /* STREAM cannot tolerate loss: drops create silent gaps. 
*/ + if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0) + return -EINVAL; memset(&flow, 0, sizeof(flow)); @@ -913,19 +1138,23 @@ int flow_alloc(const char * dst, if (flow_alloc__irm_req_ser(&msg, &flow, dst, timeo)) return -ENOMEM; + clock_gettime(PTHREAD_COND_CLOCK, &t0); + err = send_recv_msg(&msg); - if (err < 0) { - printf("send_recv_msg error %d\n", err); + if (err < 0) return err; - } - crypt.key = key; + clock_gettime(PTHREAD_COND_CLOCK, &t1); + + crypt.key = key; + crypt.epoch = 0; + crypt.role = CRYPT_ROLE_INIT; err = flow__irm_result_des(&msg, &flow, &crypt); if (err < 0) return err; - fd = flow_init(&flow, &crypt); + fd = flow_init(&flow, &crypt, ts_diff_ns(&t1, &t0)); crypt_secure_clear(key, SYMMKEYSZ); @@ -958,13 +1187,15 @@ int flow_join(const char * dst, if (err < 0) return err; - crypt.key = key; + crypt.key = key; + crypt.epoch = 0; + crypt.role = CRYPT_ROLE_INIT; err = flow__irm_result_des(&msg, &flow, &crypt); if (err < 0) return err; - fd = flow_init(&flow, &crypt); + fd = flow_init(&flow, &crypt, 0); crypt_secure_clear(key, SYMMKEYSZ); @@ -983,10 +1214,10 @@ int flow_dealloc(int fd) struct flow * flow; int err; - if (fd < 0 || fd >= SYS_MAX_FLOWS ) + if (fd < 0 || fd >= PROC_MAX_FLOWS ) return -EINVAL; - memset(&info, 0, sizeof(flow)); + memset(&info, 0, sizeof(info)); flow = &proc.flows[fd]; @@ -1008,9 +1239,8 @@ int flow_dealloc(int fd) pthread_rwlock_rdlock(&proc.lock); - timeo.tv_sec = frcti_dealloc(flow->frcti); - while (timeo.tv_sec < 0) { /* keep the flow active for rtx */ - ssize_t ret; + while (FRCTI_LINGERING(flow->frcti)) { + ssize_t ret; pthread_rwlock_unlock(&proc.lock); @@ -1018,12 +1248,12 @@ int flow_dealloc(int fd) pthread_rwlock_rdlock(&proc.lock); - timeo.tv_sec = frcti_dealloc(flow->frcti); - - if (ret == -EFLOWDOWN && timeo.tv_sec < 0) - timeo.tv_sec = -timeo.tv_sec; + if (ret == -EFLOWDOWN) + break; } + timeo.tv_sec = FRCTI_DEALLOC(flow->frcti); + pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock); 
ssm_rbuff_fini(flow->tx_rb); @@ -1033,15 +1263,18 @@ int flow_dealloc(int fd) info.id = flow->info.id; info.n_pid = getpid(); - if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) - return -ENOMEM; + if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) { + err = -ENOMEM; + goto out; + } err = send_recv_msg(&msg); if (err < 0) - return err; + goto out; err = irm__irm_result_des(&msg); + out: flow_fini(fd); return err; @@ -1055,12 +1288,12 @@ int ipcp_flow_dealloc(int fd) struct flow * flow; int err; - if (fd < 0 || fd >= SYS_MAX_FLOWS ) + if (fd < 0 || fd >= PROC_MAX_FLOWS ) return -EINVAL; flow = &proc.flows[fd]; - memset(&info, 0, sizeof(flow)); + memset(&info, 0, sizeof(info)); pthread_rwlock_rdlock(&proc.lock); @@ -1074,15 +1307,18 @@ int ipcp_flow_dealloc(int fd) pthread_rwlock_unlock(&proc.lock); - if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) - return -ENOMEM; + if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) { + err = -ENOMEM; + goto out; + } err = send_recv_msg(&msg); if (err < 0) - return err; + goto out; err = irm__irm_result_des(&msg); + out: flow_fini(fd); return err; @@ -1098,12 +1334,20 @@ int fccntl(int fd, va_list l; struct timespec * timeo; qosspec_t * qs; - uint32_t rx_acl; - uint32_t tx_acl; size_t * qlen; struct flow * flow; - - if (fd < 0 || fd >= SYS_MAX_FLOWS) + uint16_t old_acc; + uint16_t new_acc; + size_t max; + size_t * maxp; + size_t rsz; + size_t * rszp; + time_t rto; + time_t * rtop; + int rc; + bool emit_eos = false; + + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -EBADF; flow = &proc.flows[fd]; @@ -1167,36 +1411,44 @@ int fccntl(int fd, qlen = va_arg(l, size_t *); *qlen = ssm_rbuff_queued(flow->tx_rb); break; + case FLOWGMTU: + maxp = va_arg(l, size_t *); + if (maxp == NULL) + goto einval; + *maxp = flow_user_mtu(flow, flow->info.mtu); + break; case FLOWSFLAGS: + old_acc = flow->oflags & FLOWFACCMODE; flow->oflags = va_arg(l, uint32_t); - rx_acl = ssm_rbuff_get_acl(flow->rx_rb); - tx_acl = 
ssm_rbuff_get_acl(flow->rx_rb); - /* - * Making our own flow write only means making the - * the other side of the flow read only. - */ + new_acc = flow->oflags & FLOWFACCMODE; + + /* Defer EOS emit until after proc.lock is dropped: */ + /* frcti_fin_snd may block on shm-pool/tx-rb. */ + if (new_acc == FLOWFRDONLY + && old_acc != FLOWFRDONLY + && flow->frcti != NULL) + emit_eos = true; + + /* Our flow write-only -> peer's read-only; restore on RDWR. */ if (flow->oflags & FLOWFWRONLY) - rx_acl |= ACL_RDONLY; - if (flow->oflags & FLOWFRDWR) - rx_acl |= ACL_RDWR; + ssm_rbuff_clr_bits(flow->rx_rb, RB_WR); + else + ssm_rbuff_set_bits(flow->rx_rb, RB_WR); if (flow->oflags & FLOWFDOWN) { - rx_acl |= ACL_FLOWDOWN; - tx_acl |= ACL_FLOWDOWN; + ssm_rbuff_set_bits(flow->rx_rb, RB_FLOWDOWN); + ssm_rbuff_set_bits(flow->tx_rb, RB_FLOWDOWN); ssm_flow_set_notify(flow->set, flow->info.id, FLOW_DOWN); } else { - rx_acl &= ~ACL_FLOWDOWN; - tx_acl &= ~ACL_FLOWDOWN; + ssm_rbuff_clr_bits(flow->rx_rb, RB_FLOWDOWN); + ssm_rbuff_clr_bits(flow->tx_rb, RB_FLOWDOWN); ssm_flow_set_notify(flow->set, flow->info.id, FLOW_UP); } - ssm_rbuff_set_acl(flow->rx_rb, rx_acl); - ssm_rbuff_set_acl(flow->tx_rb, tx_acl); - break; case FLOWGFLAGS: fflags = va_arg(l, uint32_t *); @@ -1218,6 +1470,59 @@ int fccntl(int fd, goto eperm; *cflags = frcti_getflags(flow->frcti); break; + case FRCTSMAXSDU: + max = va_arg(l, size_t); + if (flow->frcti == NULL) + goto eperm; + if (frcti_set_max_rcv_sdu(flow->frcti, max) < 0) + goto einval; + break; + case FRCTGMAXSDU: + maxp = va_arg(l, size_t *); + if (maxp == NULL) + goto einval; + if (flow->frcti == NULL) + goto eperm; + *maxp = frcti_get_max_rcv_sdu(flow->frcti); + break; + case FRCTSRRINGSZ: + rsz = va_arg(l, size_t); + if (flow->frcti == NULL) + goto eperm; + rc = frcti_set_rcv_ring_sz(flow->frcti, rsz); + if (rc < 0) { + pthread_rwlock_unlock(&proc.lock); + va_end(l); + return rc; + } + break; + case FRCTGRRINGSZ: + rszp = va_arg(l, size_t *); + if (rszp == NULL) 
+ goto einval; + if (flow->frcti == NULL) + goto eperm; + *rszp = frcti_get_rcv_ring_sz(flow->frcti); + break; + case FRCTSRTOMIN: + if (flow->frcti == NULL) + goto eperm; + rto = va_arg(l, time_t); + rc = frcti_set_rto_min(flow->frcti, rto); + if (rc < 0) { + pthread_rwlock_unlock(&proc.lock); + va_end(l); + return rc; + } + break; + case FRCTGRTOMIN: + if (flow->frcti == NULL) + goto eperm; + rtop = va_arg(l, time_t *); + if (rtop == NULL) + goto einval; + *rtop = frcti_get_rto_min(flow->frcti); + break; default: pthread_rwlock_unlock(&proc.lock); va_end(l); @@ -1227,6 +1532,9 @@ int fccntl(int fd, pthread_rwlock_unlock(&proc.lock); + if (emit_eos) + frcti_fin_snd(flow->frcti); + va_end(l); return 0; @@ -1241,86 +1549,275 @@ int fccntl(int fd, return -EPERM; } -static int chk_crc(struct ssm_pk_buff * spb) -{ - uint32_t crc; - uint8_t * head = ssm_pk_buff_head(spb); - uint8_t * tail = ssm_pk_buff_tail_release(spb, CRCLEN); - - mem_hash(HASH_CRC32, &crc, head, tail - head); - - return !(crc == *((uint32_t *) tail)); -} - -static int add_crc(struct ssm_pk_buff * spb) -{ - uint8_t * head; - uint8_t * tail; - - tail = ssm_pk_buff_tail_alloc(spb, CRCLEN); - if (tail == NULL) - return -ENOMEM; - - head = ssm_pk_buff_head(spb); - mem_hash(HASH_CRC32, tail, head, tail - head); - - return 0; -} - static int flow_tx_spb(struct flow * flow, struct ssm_pk_buff * spb, + uint16_t flags, bool block, struct timespec * abstime) { struct timespec now; ssize_t idx; + size_t pci_total; int ret; clock_gettime(PTHREAD_COND_CLOCK, &now); - - pthread_rwlock_wrlock(&proc.lock); - flow->snd_act = now; - pthread_rwlock_unlock(&proc.lock); - - idx = ssm_pk_buff_get_idx(spb); - - pthread_rwlock_rdlock(&proc.lock); + idx = ssm_pk_buff_get_off(spb); if (ssm_pk_buff_len(spb) > 0) { - if (frcti_snd(flow->frcti, spb) < 0) + if (FRCTI_SND(flow->frcti, spb, flags) < 0) goto enomem; - if (spb_encrypt(flow, spb) < 0) - goto enomem; + if (flow->info.qs.ber == 0) { + pci_total = flow->frcti != NULL + ? 
frcti_data_hdr_len(flow->frcti) : 0; + if (crc_add(spb, pci_total) != 0) + goto enomem; + } - if (flow->info.qs.ber == 0 && add_crc(spb) != 0) + if (spb_encrypt(flow, spb) < 0) goto enomem; } - pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock); - if (!block) ret = ssm_rbuff_write(flow->tx_rb, idx); else ret = ssm_rbuff_write_b(flow->tx_rb, idx, abstime); if (ret < 0) - ssm_pool_remove(proc.pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); - - pthread_cleanup_pop(true); + return ret; + ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); return 0; -enomem: - pthread_rwlock_unlock(&proc.lock); - ssm_pool_remove(proc.pool, idx); + enomem: return -ENOMEM; } +/* Per-fragment role for fragment i out of n; n == 1 yields SOLE. */ +static __inline__ uint16_t flow_frag_role(size_t i, size_t n) +{ + if (n == 1) + return FRCT_FR_SOLE; + + if (i == 0) + return FRCT_FR_FIRST; + + if (i + 1 == n) + return FRCT_FR_LAST; + + return FRCT_FR_MID; +} + +static ssize_t flow_write_stream(struct flow * flow, + const void * buf, + size_t count, + int oflags, + struct timespec * dl) +{ + const uint8_t * src = buf; + size_t payload; + size_t off = 0; + bool block = !(oflags & FLOWFWNOBLOCK); + + if (!FRCTI_IS_FRTX(flow->frcti)) + return -EMSGSIZE; + + payload = FRCTI_PAYLOAD_CAP(flow->frcti); + + while (off < count) { + struct ssm_pk_buff * spb; + uint8_t * ptr; + ssize_t idx; + size_t clen; + int ret; + + ret = flow_wait_window(flow, 1, block, dl); + if (ret < 0) + return off > 0 ? (ssize_t) off : (ssize_t) ret; + + clen = MIN(count - off, payload); + + if (block) + idx = ssm_pool_alloc_b(proc.pool, clen, &ptr, + &spb, dl); + else + idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb); + if (idx < 0) + return off > 0 ? (ssize_t) off : idx; + + memcpy(ptr, src + off, clen); + + ret = flow_tx_spb(flow, spb, 0, block, dl); + if (ret < 0) { + ssm_pool_remove(proc.pool, idx); + return off > 0 ? 
(ssize_t) off : (ssize_t) ret; + } + + off += clen; + } + + return (ssize_t) count; +} + +/* Per-fragment flow_tx_spb loop. Raw flows refuse; FRCT splits the SDU. */ +static ssize_t flow_write_frag(struct flow * flow, + const void * buf, + size_t count, + int oflags, + struct timespec * dl) +{ + const uint8_t * src = buf; + size_t frag_payload; + size_t n; + size_t off = 0; + size_t i; + int ret; + bool block = !(oflags & FLOWFWNOBLOCK); + + /* Raw flows carry no PCI; cannot fragment. */ + if (flow->frcti == NULL) + return -EMSGSIZE; + + frag_payload = FRCTI_PAYLOAD_CAP(flow->frcti); + + /* Guard the ceil-divide against size_t overflow. */ + if (count > SIZE_MAX - frag_payload + 1) + return -EMSGSIZE; + + n = (count + frag_payload - 1) / frag_payload; + + /* SDU larger than the FC window can ever offer would deadlock. */ + if (n > RQ_SIZE) + return -EMSGSIZE; + + /* SDU-atomic FC: wait for n seqnos to avoid overshoot mid-SDU. */ + ret = flow_wait_window(flow, n, block, dl); + if (ret < 0) + return (ssize_t) ret; + + STAT_BUMP(flow->frcti, sdu_snd_frag); + + for (i = 0; i < n; ++i) { + struct ssm_pk_buff * spb; + uint8_t * ptr; + ssize_t idx; + size_t clen; + + clen = (i + 1 == n) ? (count - off) : frag_payload; + + if (block) + idx = ssm_pool_alloc_b(proc.pool, clen, &ptr, + &spb, dl); + else + idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb); + if (idx < 0) { + if (off > 0) + STAT_BUMP(flow->frcti, sdu_snd_alloc); + return off > 0 ? (ssize_t) off : idx; + } + + memcpy(ptr, src + off, clen); + + ret = flow_tx_spb(flow, spb, flow_frag_role(i, n), + block, dl); + if (ret < 0) { + ssm_pool_remove(proc.pool, idx); + if (off > 0) + STAT_BUMP(flow->frcti, sdu_snd_tx); + return off > 0 ? (ssize_t) off : (ssize_t) ret; + } + + off += clen; + } + + return (ssize_t) count; +} + +/* + * Initiator promotes on the install grace (it holds the key-confirm + * tag); responder waits for peer_synced, with a near-exhaustion floor. 
+ */ +static void flow_tx_promote(struct flow * flow) +{ + struct timespec now; + int nodes_left; + bool promote; + + if (flow->crypt == NULL) + return; + + if (flow->rk_grace.tv_sec == 0 && flow->rk_grace.tv_nsec == 0) + return; + + promote = crypt_peer_synced(flow->crypt); + + if (!promote && flow->rk_initiator) { + clock_gettime(PTHREAD_COND_CLOCK, &now); + promote = ts_diff_ns(&now, &flow->rk_grace) >= 0; + } + + if (!promote && !flow->rk_initiator) { + nodes_left = crypt_nodes_left(flow->crypt); + promote = nodes_left >= 0 && nodes_left <= REKEY_PROMOTE_FLOOR; + } + + if (!promote) + return; + + crypt_tx_promote(flow->crypt); + flow->rk_grace.tv_sec = 0; + flow->rk_grace.tv_nsec = 0; +} + +/* The reply carries no key; the seed arrives later over RB_REKEY. */ +static int flow_rekey_trigger(struct flow * flow) +{ + struct flow_info info; + uint8_t buf[SOCK_BUF_SIZE]; + buffer_t msg = {SOCK_BUF_SIZE, buf}; + + pthread_rwlock_rdlock(&proc.lock); + if (flow->info.id < 0 || flow->crypt == NULL) { + pthread_rwlock_unlock(&proc.lock); + return -1; + } + info = flow->info; + pthread_rwlock_unlock(&proc.lock); + + if (flow_update__irm_req_ser(&msg, &info, true) < 0) + return -1; + + if (send_recv_msg(&msg) < 0) + return -1; + + return 0; +} + +static bool flow_wm_due(struct flow * flow) +{ + uint32_t tick; + + if (KEY_REKEY_WATERMARK == 0) + return false; + + if (flow->crypt == NULL) + return false; + + if (LOAD_RELAXED(&flow->rk_wm_inflight)) + return false; + + tick = FETCH_ADD_RELAXED(&flow->rk_wm_ctr, 1); + if ((tick & (FLOW_WM_CHECK - 1)) != 0) + return false; + + if (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY) + return false; + + return crypt_nodes_left(flow->crypt) <= KEY_REKEY_WATERMARK; +} + ssize_t flow_write(int fd, const void * buf, size_t count) @@ -1330,74 +1827,90 @@ ssize_t flow_write(int fd, int ret; int flags; struct timespec abs; - struct timespec * abstime = NULL; + struct timespec now; + struct timespec * dl = NULL; struct ssm_pk_buff * spb; 
uint8_t * ptr; if (buf == NULL && count != 0) return -EINVAL; - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -EBADF; flow = &proc.flows[fd]; - clock_gettime(PTHREAD_COND_CLOCK, &abs); - - pthread_rwlock_wrlock(&proc.lock); + pthread_rwlock_rdlock(&proc.lock); if (flow->info.id < 0) { pthread_rwlock_unlock(&proc.lock); return -ENOTALLOC; } + flags = flow->oflags; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + if (flow->snd_timesout) { - ts_add(&abs, &flow->snd_timeo, &abs); - abstime = &abs; + ts_add(&now, &flow->snd_timeo, &abs); + dl = &abs; } - flags = flow->oflags; - pthread_rwlock_unlock(&proc.lock); if ((flags & FLOWFACCMODE) == FLOWFRDONLY) return -EPERM; - if (flags & FLOWFWNOBLOCK) { - if (!frcti_is_window_open(flow->frcti)) - return -EAGAIN; - idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb); - } else { - ret = frcti_window_wait(flow->frcti, abstime); + if (flow->crypt != NULL + && (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY)) + flow_rekey(flow); + + flow_tx_promote(flow); + + /* Pre-empt TX key exhaustion; the timer is the backstop. */ + if (flow_wm_due(flow)) { + STORE_RELAXED(&flow->rk_wm_inflight, true); + if (flow_rekey_trigger(flow) < 0) + STORE_RELAXED(&flow->rk_wm_inflight, false); + } + + tw_move_safe(); + + if (flow->frcti != NULL) { + /* Pump rx_rb so a pure-writer processes ACKs. */ + ret = flow_wait_window(flow, 1, !(flags & FLOWFWNOBLOCK), dl); if (ret < 0) return ret; - idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, abstime); + + if (count > 0 && FRCTI_IS_STREAM(flow->frcti)) + return flow_write_stream(flow, buf, count, flags, dl); + + if (FRCTI_NEEDS_FRAG(flow->frcti, count)) + return flow_write_frag(flow, buf, count, flags, dl); + } else if (flow->info.mtu > 0 + && count > flow_user_mtu(flow, flow->info.mtu)) { + /* Raw flows carry no PCI; refuse anything > one n-1 frame. 
*/ + return -EMSGSIZE; } + if (flags & FLOWFWNOBLOCK) + idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb); + else + idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, dl); if (idx < 0) return idx; if (count > 0) memcpy(ptr, buf, count); - ret = flow_tx_spb(flow, spb, !(flags & FLOWFWNOBLOCK), abstime); - - return ret < 0 ? (ssize_t) ret : (ssize_t) count; -} - -static bool invalid_pkt(struct flow * flow, - struct ssm_pk_buff * spb) -{ - if (spb == NULL || ssm_pk_buff_len(spb) == 0) - return true; - - if (flow->info.qs.ber == 0 && chk_crc(spb) != 0) - return true; - - if (spb_decrypt(flow, spb) < 0) - return true; + ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, + !(flags & FLOWFWNOBLOCK), dl); + if (ret < 0) { + ssm_pool_remove(proc.pool, idx); + return (ssize_t) ret; + } - return false; + return (ssize_t) count; } static ssize_t flow_rx_spb(struct flow * flow, @@ -1408,19 +1921,14 @@ static ssize_t flow_rx_spb(struct flow * flow, ssize_t idx; struct timespec now; - idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime) : - ssm_rbuff_read(flow->rx_rb); + idx = block ? 
ssm_rbuff_read_b(flow->rx_rb, abstime) + : ssm_rbuff_read(flow->rx_rb); if (idx < 0) return idx; clock_gettime(PTHREAD_COND_CLOCK, &now); - - pthread_rwlock_wrlock(&proc.lock); - flow->rcv_act = now; - pthread_rwlock_unlock(&proc.lock); - *spb = ssm_pool_get(proc.pool, idx); if (invalid_pkt(flow, *spb)) { @@ -1431,28 +1939,128 @@ static ssize_t flow_rx_spb(struct flow * flow, return idx; } +static ssize_t raw_flow_read_pkt(struct flow * flow, + bool block, + struct timespec * dl) +{ + struct ssm_pk_buff * spb; + struct timespec wait_abs; + ssize_t idx; + + while (true) { + if (flow->crypt != NULL + && (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY)) + flow_rekey(flow); + + if (!block) { + idx = ssm_rbuff_read(flow->rx_rb); + if (idx < 0) + return -EAGAIN; + } else { + compute_wait_deadline(dl, &wait_abs); + idx = ssm_rbuff_read_b(flow->rx_rb, &wait_abs); + if (idx == -ETIMEDOUT) { + if (deadline_passed(dl)) + return -ETIMEDOUT; + continue; + } + if (idx < 0) + return idx; + } + + spb = ssm_pool_get(proc.pool, idx); + if (!invalid_pkt(flow, spb)) + return idx; + + ssm_pool_remove(proc.pool, idx); + if (!block) + return -EAGAIN; + } +} + +static ssize_t deliver_pkt(struct flow * flow, + struct ssm_pk_buff * spb, + ssize_t idx, + void * buf, + size_t count, + bool partrd) +{ + uint8_t * packet = ssm_pk_buff_head(spb); + ssize_t n = ssm_pk_buff_len(spb); + + assert(n >= 0); + + if (n <= (ssize_t) count) { + memcpy(buf, packet, n); + ipcp_spb_release(spb); + if (partrd && n == (ssize_t) count) + flow->part_idx = DONE_PART; + else + flow->part_idx = NO_PART; + + return n; + } + + if (partrd) { + memcpy(buf, packet, count); + ssm_pk_buff_pop(spb, n); + flow->part_idx = idx; + return count; + } + + ipcp_spb_release(spb); + return -EMSGSIZE; +} + +/* Drive frcti_consume until it delivers or errors. 
*/ +static ssize_t flow_read_frcti(struct flow * flow, + void * buf, + size_t count, + bool block, + struct timespec * dl) +{ + struct timespec now; + ssize_t bytes; + int rc; + + while (true) { + flow_drain_rx_nb(flow); + bytes = FRCTI_CONSUME(flow->frcti, buf, count); + if (bytes >= 0) + break; + if (bytes != -EAGAIN) + return bytes; + if (!block) + return -EAGAIN; + rc = flow_rx_one(flow, dl); + if (rc < 0) + return rc; + } + + clock_gettime(PTHREAD_COND_CLOCK, &now); + flow->rcv_act = now; + + return bytes; +} + ssize_t flow_read(int fd, void * buf, size_t count) { - ssize_t idx; - ssize_t n; - uint8_t * packet; + struct flow * flow; struct ssm_pk_buff * spb; struct timespec abs; struct timespec now; - struct timespec * abstime = NULL; - struct flow * flow; + struct timespec * dl = NULL; + ssize_t idx; bool block; bool partrd; - if (fd < 0 || fd >= PROG_MAX_FLOWS) + if (fd < 0 || fd >= PROC_MAX_FLOWS) return -EBADF; flow = &proc.flows[fd]; - clock_gettime(PTHREAD_COND_CLOCK, &now); - pthread_rwlock_rdlock(&proc.lock); if (flow->info.id < 0) { @@ -1461,8 +2069,8 @@ ssize_t flow_read(int fd, } if (flow->part_idx == DONE_PART) { - pthread_rwlock_unlock(&proc.lock); flow->part_idx = NO_PART; + pthread_rwlock_unlock(&proc.lock); return 0; } @@ -1470,75 +2078,40 @@ ssize_t flow_read(int fd, partrd = !(flow->oflags & FLOWFRNOPART); if (flow->rcv_timesout) { + clock_gettime(PTHREAD_COND_CLOCK, &now); ts_add(&now, &flow->rcv_timeo, &abs); - abstime = &abs; - } - - idx = flow->part_idx; - if (idx < 0) { - while ((idx = frcti_queued_pdu(flow->frcti)) < 0) { - pthread_rwlock_unlock(&proc.lock); - - idx = flow_rx_spb(flow, &spb, block, abstime); - if (idx < 0) { - if (block && idx != -EAGAIN) - return idx; - if (!block) - return idx; - - pthread_rwlock_rdlock(&proc.lock); - continue; - } - - pthread_rwlock_rdlock(&proc.lock); - - frcti_rcv(flow->frcti, spb); - } + dl = &abs; } - spb = ssm_pool_get(proc.pool, idx); - pthread_rwlock_unlock(&proc.lock); - packet = 
ssm_pk_buff_head(spb); + if (flow->crypt != NULL + && (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY)) + flow_rekey(flow); - n = ssm_pk_buff_len(spb); + /* Advance TX off a stale epoch even on recv-mostly (ACK-only) flows. */ + flow_tx_promote(flow); - assert(n >= 0); + tw_move_safe(); - if (n <= (ssize_t) count) { - memcpy(buf, packet, n); - ipcp_spb_release(spb); - - pthread_rwlock_wrlock(&proc.lock); - - flow->part_idx = (partrd && n == (ssize_t) count) ? - DONE_PART : NO_PART; + idx = flow->part_idx; + if (idx < 0 && flow->frcti != NULL) + return flow_read_frcti(flow, buf, count, block, dl); - flow->rcv_act = now; + if (idx < 0) { + idx = raw_flow_read_pkt(flow, block, dl); + if (idx < 0) + return idx; + } - pthread_rwlock_unlock(&proc.lock); - return n; - } else { - if (partrd) { - memcpy(buf, packet, count); - ssm_pk_buff_head_release(spb, n); - pthread_rwlock_wrlock(&proc.lock); - flow->part_idx = idx; + spb = ssm_pool_get(proc.pool, idx); - flow->rcv_act = now; + clock_gettime(PTHREAD_COND_CLOCK, &now); + flow->rcv_act = now; - pthread_rwlock_unlock(&proc.lock); - return count; - } else { - ipcp_spb_release(spb); - return -EMSGSIZE; - } - } + return deliver_pkt(flow, spb, idx, buf, count, partrd); } -/* fqueue functions. 
*/ - struct flow_set * fset_create(void) { struct flow_set * set; @@ -1614,7 +2187,7 @@ int fset_add(struct flow_set * set, struct flow * flow; int ret; - if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS) + if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS) return -EINVAL; flow = &proc.flows[fd]; @@ -1650,7 +2223,7 @@ void fset_del(struct flow_set * set, { struct flow * flow; - if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS) + if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS) return; flow = &proc.flows[fd]; @@ -1661,7 +2234,7 @@ void fset_del(struct flow_set * set, ssm_flow_set_del(proc.fqset, set->idx, flow->info.id); if (flow->frcti != NULL) - ssm_flow_set_add(proc.fqset, 0, proc.flows[fd].info.id); + ssm_flow_set_add(proc.fqset, 0, flow->info.id); pthread_rwlock_unlock(&proc.lock); } @@ -1672,7 +2245,7 @@ bool fset_has(const struct flow_set * set, struct flow * flow; bool ret; - if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS) + if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS) return false; flow = &proc.flows[fd]; @@ -1691,61 +2264,71 @@ bool fset_has(const struct flow_set * set, return ret; } -/* Filter fqueue events for non-data packets */ static int fqueue_filter(struct fqueue * fq) { struct ssm_pk_buff * spb; int fd; ssize_t idx; struct frcti * frcti; + int ret = 0; + + /* proc.lock rdlock gates frcti_destroy via flow_fini wrlock. */ + pthread_rwlock_rdlock(&proc.lock); while (fq->next < fq->fqsize) { - if (fq->fqueue[fq->next].event != FLOW_PKT) - return 1; + if (fq->fqueue[fq->next].event == FLOW_UPD) { + /* Re-key doorbell: pull internally, never surface. 
*/ + fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd; + ++fq->next; + if (fd >= 0) { + pthread_rwlock_unlock(&proc.lock); + flow_rekey(&proc.flows[fd]); + pthread_rwlock_rdlock(&proc.lock); + } + continue; + } - pthread_rwlock_rdlock(&proc.lock); + if (fq->fqueue[fq->next].event != FLOW_PKT) { + ret = 1; + goto out; + } fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd; if (fd < 0) { ++fq->next; - pthread_rwlock_unlock(&proc.lock); continue; } frcti = proc.flows[fd].frcti; if (frcti == NULL) { - pthread_rwlock_unlock(&proc.lock); - return 1; + ret = 1; + goto out; } - if (__frcti_pdu_ready(frcti) >= 0) { - pthread_rwlock_unlock(&proc.lock); - return 1; + if (FRCTI_PDU_READY(frcti)) { + ret = 1; + goto out; } - pthread_rwlock_unlock(&proc.lock); - idx = flow_rx_spb(&proc.flows[fd], &spb, false, NULL); if (idx < 0) - return 0; - - pthread_rwlock_rdlock(&proc.lock); + goto out; spb = ssm_pool_get(proc.pool, idx); - __frcti_rcv(frcti, spb); + FRCTI_RCV(frcti, spb); - if (__frcti_pdu_ready(frcti) >= 0) { - pthread_rwlock_unlock(&proc.lock); - return 1; + if (FRCTI_PDU_READY(frcti)) { + ret = 1; + goto out; } - pthread_rwlock_unlock(&proc.lock); - ++fq->next; } - return 0; + out: + pthread_rwlock_unlock(&proc.lock); + return ret; } int fqueue_next(struct fqueue * fq) @@ -1792,7 +2375,8 @@ ssize_t fevent(struct flow_set * set, { ssize_t ret = 0; struct timespec abs; - struct timespec * t = NULL; + struct timespec * dl = NULL; + struct timespec wait_abs; if (set == NULL || fq == NULL) return -EINVAL; @@ -1800,17 +2384,26 @@ ssize_t fevent(struct flow_set * set, if (fq->fqsize > 0 && fq->next != fq->fqsize) return 1; - clock_gettime(PTHREAD_COND_CLOCK, &abs); - if (timeo != NULL) { - ts_add(&abs, timeo, &abs); - t = &abs; + struct timespec now; + clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, timeo, &abs); + dl = &abs; } while (ret == 0) { - ret = ssm_flow_set_wait(proc.fqset, set->idx, fq->fqueue, t); - if (ret == -ETIMEDOUT) - return -ETIMEDOUT; + 
tw_move_safe(); + + compute_wait_deadline(dl, &wait_abs); + + ret = ssm_flow_set_wait(proc.fqset, set->idx, + fq->fqueue, &wait_abs); + if (ret == -ETIMEDOUT) { + if (deadline_passed(dl)) + return -ETIMEDOUT; + ret = 0; + continue; + } fq->fqsize = ret; fq->next = 0; @@ -1823,13 +2416,12 @@ ssize_t fevent(struct flow_set * set, return 1; } -/* ipcp-dev functions. */ - int np1_flow_alloc(pid_t n_pid, int flow_id) { struct flow_info flow; - struct crypt_sk crypt = { .nid = NID_undef, .key = NULL }; + struct crypt_sk crypt = { .nid = NID_undef, .key = NULL, + .epoch = 0, .role = CRYPT_ROLE_INIT }; memset(&flow, 0, sizeof(flow)); @@ -1837,9 +2429,10 @@ int np1_flow_alloc(pid_t n_pid, flow.n_pid = getpid(); flow.qs = qos_np1; flow.mpl = 0; - flow.n_1_pid = n_pid; /* This "flow" is upside-down! */ + /* np1 flow: n_1_pid is the upper. */ + flow.n_1_pid = n_pid; - return flow_init(&flow, &crypt); + return flow_init(&flow, &crypt, 0); } int np1_flow_dealloc(int flow_id, @@ -1847,12 +2440,7 @@ int np1_flow_dealloc(int flow_id, { int fd; - /* - * TODO: Don't pass timeo to the IPCP but wait in IRMd. - * This will need async ops, waiting until we bootstrap - * the IRMd over ouroboros. - */ - + /* TODO: wait in IRMd, not here; needs async ops. 
*/ sleep(timeo); pthread_rwlock_rdlock(&proc.lock); @@ -1881,6 +2469,38 @@ int np1_flow_resp(int flow_id, return fd; } +int np1_flow_fd(int flow_id) +{ + int fd; + + if (flow_id < 0 || flow_id >= SYS_MAX_FLOWS) + return -1; + + pthread_rwlock_rdlock(&proc.lock); + + fd = proc.id_to_fd[flow_id].fd; + + pthread_rwlock_unlock(&proc.lock); + + return fd; +} + +int np1_flow_id(int fd) +{ + int flow_id; + + if (fd < 0 || fd >= PROC_MAX_FLOWS) + return -1; + + pthread_rwlock_rdlock(&proc.lock); + + flow_id = proc.flows[fd].info.id; + + pthread_rwlock_unlock(&proc.lock); + + return flow_id; +} + int ipcp_create_r(const struct ipcp_info * info) { uint8_t buf[SOCK_BUF_SIZE]; @@ -1900,6 +2520,7 @@ int ipcp_create_r(const struct ipcp_info * info) int ipcp_flow_req_arr(const buffer_t * dst, qosspec_t qs, time_t mpl, + uint32_t mtu, const buffer_t * data) { struct flow_info flow; @@ -1916,6 +2537,7 @@ int ipcp_flow_req_arr(const buffer_t * dst, flow.n_1_pid = getpid(); flow.qs = qs; flow.mpl = mpl; + flow.mtu = mtu; if (ipcp_flow_req_arr__irm_req_ser(&msg, dst, &flow, data) < 0) return -ENOMEM; @@ -1924,28 +2546,56 @@ int ipcp_flow_req_arr(const buffer_t * dst, if (err < 0) return err; - crypt.key = key; + crypt.key = key; + crypt.epoch = 0; + crypt.role = CRYPT_ROLE_INIT; err = flow__irm_result_des(&msg, &flow, &crypt); if (err < 0) return err; - assert(crypt.nid == NID_undef); /* np1 flows are not encrypted */ + /* np1 flows are not encrypted. */ + assert(crypt.nid == NID_undef); - /* inverted for np1_flow */ + /* Inverted for np1_flow. 
*/ flow.n_1_pid = flow.n_pid; flow.n_pid = getpid(); flow.mpl = 0; + flow.mtu = 0; flow.qs = qos_np1; crypt.nid = NID_undef; - return flow_init(&flow, &crypt); + return flow_init(&flow, &crypt, 0); +} + +int ipcp_flow_update_arr(int flow_id, + const buffer_t * data) +{ + struct flow_info flow; + uint8_t buf[SOCK_BUF_SIZE]; + buffer_t msg = {SOCK_BUF_SIZE, buf}; + int err; + + memset(&flow, 0, sizeof(flow)); + + flow.id = flow_id; + flow.n_1_pid = getpid(); + + if (ipcp_flow_update_arr__irm_req_ser(&msg, &flow, data) < 0) + return -ENOMEM; + + err = send_recv_msg(&msg); + if (err < 0) + return err; + + return irm__irm_result_des(&msg); } int ipcp_flow_alloc_reply(int fd, int response, time_t mpl, + uint32_t mtu, const buffer_t * data) { struct flow_info flow; @@ -1953,7 +2603,7 @@ int ipcp_flow_alloc_reply(int fd, buffer_t msg = {SOCK_BUF_SIZE, buf}; int err; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); pthread_rwlock_rdlock(&proc.lock); @@ -1962,6 +2612,7 @@ int ipcp_flow_alloc_reply(int fd, pthread_rwlock_unlock(&proc.lock); flow.mpl = mpl; + flow.mtu = mtu; if (ipcp_flow_alloc_reply__irm_msg_ser(&msg, &flow, response, data) < 0) return -ENOMEM; @@ -1979,7 +2630,7 @@ int ipcp_flow_read(int fd, struct flow * flow; ssize_t idx = -1; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; @@ -1988,7 +2639,14 @@ int ipcp_flow_read(int fd, assert(flow->info.id >= 0); - while (frcti_queued_pdu(flow->frcti) < 0) { + /* Raw flow: deliver the popped pkt directly (no FRCT rq). */ + if (flow->frcti == NULL) { + pthread_rwlock_unlock(&proc.lock); + idx = flow_rx_spb(flow, spb, false, NULL); + return idx < 0 ? 
(int) idx : 0; + } + + while (!FRCTI_PDU_READY(flow->frcti)) { pthread_rwlock_unlock(&proc.lock); idx = flow_rx_spb(flow, spb, false, NULL); @@ -1997,7 +2655,7 @@ int ipcp_flow_read(int fd, pthread_rwlock_rdlock(&proc.lock); - frcti_rcv(flow->frcti, *spb); + FRCTI_RCV(flow->frcti, *spb); } pthread_rwlock_unlock(&proc.lock); @@ -2011,12 +2669,12 @@ int ipcp_flow_write(int fd, struct flow * flow; int ret; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; - pthread_rwlock_wrlock(&proc.lock); + pthread_rwlock_rdlock(&proc.lock); if (flow->info.id < 0) { pthread_rwlock_unlock(&proc.lock); @@ -2030,30 +2688,28 @@ int ipcp_flow_write(int fd, pthread_rwlock_unlock(&proc.lock); - ret = flow_tx_spb(flow, spb, true, NULL); + ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, true, NULL); return ret; } -static int pool_copy_spb(struct ssm_pool * src_pool, - ssize_t src_idx, - struct ssm_pool * dst_pool, - struct ssm_pk_buff ** dst_spb) +/* Copy src into dst_pool without consuming src. Caller owns both halves. 
*/ +static int pool_dup_spb(struct ssm_pool * src_pool, + size_t src_off, + struct ssm_pool * dst_pool, + struct ssm_pk_buff ** dst_spb) { struct ssm_pk_buff * src; uint8_t * ptr; size_t len; - src = ssm_pool_get(src_pool, src_idx); + src = ssm_pool_get(src_pool, src_off); len = ssm_pk_buff_len(src); - if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) { - ssm_pool_remove(src_pool, src_idx); + if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) return -ENOMEM; - } memcpy(ptr, ssm_pk_buff_head(src), len); - ssm_pool_remove(src_pool, src_idx); return 0; } @@ -2063,9 +2719,9 @@ int np1_flow_read(int fd, struct ssm_pool * pool) { struct flow * flow; - ssize_t idx = -1; + ssize_t off; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; @@ -2074,20 +2730,23 @@ int np1_flow_read(int fd, pthread_rwlock_rdlock(&proc.lock); - idx = ssm_rbuff_read(flow->rx_rb); - if (idx < 0) { + off = ssm_rbuff_read(flow->rx_rb); + if (off < 0) { pthread_rwlock_unlock(&proc.lock); - return idx; + return off; } pthread_rwlock_unlock(&proc.lock); if (pool == NULL) { - *spb = ssm_pool_get(proc.pool, idx); + *spb = ssm_pool_get(proc.pool, off); } else { /* Cross-pool copy: PUP -> GSPP */ - if (pool_copy_spb(pool, idx, proc.pool, spb) < 0) + if (pool_dup_spb(pool, off, proc.pool, spb) < 0) { + ssm_pool_remove(pool, off); return -ENOMEM; + } + ssm_pool_remove(pool, off); } return 0; @@ -2100,9 +2759,10 @@ int np1_flow_write(int fd, struct flow * flow; struct ssm_pk_buff * dst; int ret; - ssize_t idx; + size_t off; + size_t dst_off; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(spb); flow = &proc.flows[fd]; @@ -2121,45 +2781,47 @@ int np1_flow_write(int fd, pthread_rwlock_unlock(&proc.lock); - idx = ssm_pk_buff_get_idx(spb); + off = ssm_pk_buff_get_off(spb); if (pool == NULL) { - ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL); + ret = ssm_rbuff_write_b(flow->tx_rb, off, NULL); if 
(ret < 0) - ssm_pool_remove(proc.pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + return ret; + ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); } else { - /* Cross-pool copy: GSPP -> PUP */ - if (pool_copy_spb(proc.pool, idx, pool, &dst) < 0) + /* Cross-pool copy: GSPP -> PUP. Src kept on error. */ + if (pool_dup_spb(proc.pool, off, pool, &dst) < 0) return -ENOMEM; - idx = ssm_pk_buff_get_idx(dst); - ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL); - if (ret < 0) - ssm_pool_remove(pool, idx); - else - ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + dst_off = ssm_pk_buff_get_off(dst); + ret = ssm_rbuff_write_b(flow->tx_rb, dst_off, NULL); + if (ret < 0) { + ssm_pool_remove(pool, dst_off); + return ret; + } + ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT); + ssm_pool_remove(proc.pool, off); } - return ret; + return 0; } int ipcp_spb_reserve(struct ssm_pk_buff ** spb, size_t len) { - return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 ? -1 : 0; + return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 + ? 
-1 : 0; } void ipcp_spb_release(struct ssm_pk_buff * spb) { - ssm_pool_remove(proc.pool, ssm_pk_buff_get_idx(spb)); + ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); } int ipcp_flow_fini(int fd) { struct ssm_rbuff * rx_rb; - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); pthread_rwlock_rdlock(&proc.lock); @@ -2168,8 +2830,8 @@ int ipcp_flow_fini(int fd) return -1; } - ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN); - ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN); + ssm_rbuff_set_bits(proc.flows[fd].rx_rb, RB_FLOWDOWN); + ssm_rbuff_set_bits(proc.flows[fd].tx_rb, RB_FLOWDOWN); ssm_flow_set_notify(proc.flows[fd].set, proc.flows[fd].info.id, @@ -2188,7 +2850,7 @@ int ipcp_flow_fini(int fd) int ipcp_flow_get_qoscube(int fd, qoscube_t * cube) { - assert(fd >= 0 && fd < SYS_MAX_FLOWS); + assert(fd >= 0 && fd < PROC_MAX_FLOWS); assert(cube); pthread_rwlock_rdlock(&proc.lock); @@ -2227,7 +2889,7 @@ int local_flow_transfer(int src_fd, struct ssm_pk_buff * dst_spb; struct ssm_pool * sp; struct ssm_pool * dp; - ssize_t idx; + ssize_t off; int ret; assert(src_fd >= 0); @@ -2241,15 +2903,15 @@ int local_flow_transfer(int src_fd, pthread_rwlock_rdlock(&proc.lock); - idx = ssm_rbuff_read(src_flow->rx_rb); - if (idx < 0) { + off = ssm_rbuff_read(src_flow->rx_rb); + if (off < 0) { pthread_rwlock_unlock(&proc.lock); - return idx; + return off; } if (dst_flow->info.id < 0) { pthread_rwlock_unlock(&proc.lock); - ssm_pool_remove(sp, idx); + ssm_pool_remove(sp, off); return -ENOTALLOC; } @@ -2257,21 +2919,24 @@ int local_flow_transfer(int src_fd, if (sp == dp) { /* Same pool: zero-copy */ - ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL); + ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL); if (ret < 0) - ssm_pool_remove(sp, idx); + ssm_pool_remove(sp, off); else ssm_flow_set_notify(dst_flow->set, dst_flow->info.id, FLOW_PKT); } else { /* Different pools: single copy */ - if (pool_copy_spb(sp, idx, dp, &dst_spb) < 0) + if 
(pool_dup_spb(sp, off, dp, &dst_spb) < 0) { + ssm_pool_remove(sp, off); return -ENOMEM; + } - idx = ssm_pk_buff_get_idx(dst_spb); - ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL); + ssm_pool_remove(sp, off); + off = ssm_pk_buff_get_off(dst_spb); + ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL); if (ret < 0) - ssm_pool_remove(dp, idx); + ssm_pool_remove(dp, off); else ssm_flow_set_notify(dst_flow->set, dst_flow->info.id, FLOW_PKT); diff --git a/src/lib/frct.c b/src/lib/frct.c index fad2cf69..c055433d 100644 --- a/src/lib/frct.c +++ b/src/lib/frct.c @@ -1,7 +1,7 @@ /* * Ouroboros - Copyright (C) 2016 - 2026 * - * Flow and Retransmission Control + * Flow and Retransmission Control Task (FRCT) * * Dimitri Staessens <dimitri@ouroboros.rocks> * Sander Vrijders <sander@ouroboros.rocks> @@ -20,97 +20,416 @@ * Foundation, Inc., http://www.fsf.org/about/contact/. */ -#include <ouroboros/endian.h> +/* Included by dev.c; uses dev.c statics (proc, spb_encrypt, ...). */ #define DELT_RDV (100 * MILLION) /* ns */ -#define MAX_RDV (1 * BILLION) /* ns */ +#define MAX_RDV (1 * BILLION) /* ns */ + +#define MAX_RTO_MUL 8 /* caps the RTO backoff shift */ +#define MAX_TLP_PER_EP 2 /* RFC 8985 §7.3: up to 2 TLPs */ +#define INITIAL_RTO (1 * BILLION) /* RFC 6298 §2.1: 1 s default */ +#define RTT_BOOT_NS (10 * MILLION) /* rtt_hint floor + initial mdev */ +#define SRTT_FLOOR_NS 1000L /* 1 us; smoothed RTT floor */ +#define MDEV_FLOOR_NS 100L /* 100 ns; mdev sanity floor */ +#define RTT_CLAMP_MUL 16 /* probe sample cap = N * srtt */ +#define MIN_RTT_WIN_NS (300ULL * BILLION) /* 5 min, Linux tcp default */ +#define NACK_COOLDOWN_NS (100 * MILLION) /* pre-DRF NACK cooldown */ +#define FRCT_TX_TIMEO_NS (250 * 1000) /* tx ring write deadline */ +#define ACK_DELAY_NS (2ULL * TICTIME) /* delayed-ACK fire delay */ #define FRCT "frct" #define FRCT_PCILEN (sizeof(struct frct_pci)) #define FRCT_NAME_STRLEN 32 -struct frct_cr { - uint32_t lwe; /* Left window edge */ - uint32_t rwe; /* Right 
window edge */ +/* Wire-protocol cap on SACK blocks per packet; binds both peers. */ +#define SACK_MAX_BLOCKS 2048 +#define SACK_BLOCK_SIZE (2 * sizeof(uint32_t)) +/* 2B count + 2B pad to 4-byte align the block list. */ +#define SACK_HDR_SIZE (sizeof(uint32_t)) +#define SACK_MIN_GAP_NS (250u * 1000u) /* 250 us SACK gap */ +#define MIN_REORDER_NS (250u * 1000u) /* 250 us RACK floor */ +#define SACK_RXM_MAX 32 /* Cap on retransmits staged from single SACK.*/ +#define DUP_THRESH 3 /* RFC 8985 §6.2 step 2.2 SACK count gate. */ + +/* RFC 8985 §7.2 RACK reorder-window scaling cap. */ +#define REO_WND_MULT_MAX 20 +/* RFC 8985 §7.2 step 5: round trips of no DSACK before halving. */ +#define REO_DECAY_PKTS 16 +/* DSACK seqno sanity: reject reports older/farther than one rcv window. */ +#define MAX_DSACK_LAG RQ_SIZE + +/* Signed ns elapsed; negative under concurrent update (no underflow). */ +static __inline__ int64_t ts_age_ns(uint64_t now_ns, + uint64_t then_ns) +{ + return (int64_t)(now_ns - then_ns); +} - uint8_t cflags; - uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */ +/* True iff strictly more than thr_ns elapsed since then_ns. */ +static __inline__ bool ts_aged_ns(uint64_t now_ns, + uint64_t then_ns, + uint64_t thr_ns) +{ + return ts_age_ns(now_ns, then_ns) > (int64_t) thr_ns; +} - struct timespec act; /* Last seen activity */ - time_t inact; /* Inactivity (s) */ -}; +/* FRCT r-timer: do not retransmit packet older than t_r (from first send). */ +#define RXM_AGED_OUT(t0, now_ns, t_r) \ + ts_aged_ns((now_ns), (t0), (uint64_t)(t_r)) -struct frcti { - int fd; +/* FRCT a-timer: do not (re)transmit ACK after t_a from last data receive. 
*/ +#define ACK_AGED_OUT(act, now_ns, t_a) \ + ts_aged_ns((now_ns), (act), (uint64_t)(t_a)) - time_t mpl; - time_t a; - time_t r; - time_t rdv; - - time_t srtt; /* Smoothed rtt */ - time_t mdev; /* Deviation */ - time_t rto; /* Retransmission timeout */ - uint32_t rttseq; - struct timespec t_probe; /* Probe time */ - bool probe; /* Probe active */ -#ifdef PROC_FLOW_STATS - size_t n_rtx; /* Number of rxm packets */ - size_t n_prb; /* Number of rtt probes */ - size_t n_rtt; /* Number of estimates */ - size_t n_dup; /* Duplicates received */ - size_t n_dak; /* Delayed ACKs received */ - size_t n_rdv; /* Number of rdv packets */ - size_t n_out; /* Packets out of window */ - size_t n_rqo; /* Packets out of rqueue */ -#endif - struct frct_cr snd_cr; - struct frct_cr rcv_cr; +struct sack_args { + uint16_t n; + bool dsack; /* RFC 2883: block[0] is a DSACK report */ + uint32_t ack; + uint32_t rwe; + uint32_t blocks[][2]; /* flexible — sized at alloc time */ +}; +/* NewReno-careful (RFC 6582) exit pad; gates RTT samples post-signal. */ +#define RTT_QUARANTINE 32 +#define RTTP_NONCE_LEN 16 - ssize_t rq[RQ_SIZE]; - pthread_rwlock_t lock; +/* RTT-probe wire payload (after the FRCT PCI). */ +struct frct_rttp { + uint32_t probe_id; /* sender counter; 0 on reply */ + uint32_t echo_id; /* peer's probe_id; 0 outbound */ + uint8_t nonce[RTTP_NONCE_LEN]; /* random; echoed verbatim */ +} __attribute__((packed)); - bool open; /* Window open/closed */ - struct timespec t_wnd; /* Window closed time */ - struct timespec t_rdvs; /* Last rendez-vous sent */ - pthread_cond_t cond; - pthread_mutex_t mtx; -}; +#define RTTP_PAYLOAD sizeof(struct frct_rttp) +#define RTTP_POS(id) ((id) & (RTTP_RING - 1)) +/* + * Flag values are assigned MSB-first on the wire (RFC convention): + * bit 0 = 0x8000 occupies wire-position 0 of the 16-bit flags + * field, bit 12 = 0x0008 is the last assigned bit, and the three + * LSBs (0x0007) are reserved. 
+ */ enum frct_flags { - FRCT_DATA = 0x01, /* PDU carries data */ - FRCT_DRF = 0x02, /* Data run flag */ - FRCT_ACK = 0x04, /* ACK field valid */ - FRCT_FC = 0x08, /* FC window valid */ - FRCT_RDVS = 0x10, /* Rendez-vous */ - FRCT_FFGM = 0x20, /* First Fragment */ - FRCT_MFGM = 0x40, /* More fragments */ + FRCT_DATA = 0x8000, /* PDU carries data */ + FRCT_DRF = 0x4000, /* Data run flag */ + FRCT_ACK = 0x2000, /* ACK field valid */ + FRCT_NACK = 0x1000, /* Neg-ACK: pci->seqno is arrival_seqno - 1 */ + FRCT_FC = 0x0800, /* FC window valid */ + FRCT_RDVS = 0x0400, /* Rendez-vous */ + FRCT_FFGM = 0x0200, /* First fragment (begin) */ + FRCT_LFGM = 0x0100, /* Last fragment (end) */ + FRCT_RXM = 0x0080, /* Retransmission */ + FRCT_SACK = 0x0040, /* SACK block list follows */ + FRCT_RTTP = 0x0020, /* RTT probe / echo */ + FRCT_KA = 0x0010, /* Keepalive */ + FRCT_FIN = 0x0008, /* End of stream */ }; -struct frct_pci { - uint8_t flags; +/* + * DATA-packet fragment role (FFGM = begin, LFGM = end), SCTP-style: + * 1 1 = sole / un-fragmented SDU (begin AND end) + * 1 0 = first fragment of a multi-fragment SDU + * 0 0 = middle fragment + * 0 1 = last fragment + */ +#define FRCT_FR_MASK (FRCT_FFGM | FRCT_LFGM) +#define FRCT_FR_SOLE (FRCT_FFGM | FRCT_LFGM) +#define FRCT_FR_FIRST (FRCT_FFGM) +#define FRCT_FR_MID (0) +#define FRCT_FR_LAST (FRCT_LFGM) + +/* Default cap on a single reassembled SDU. App can raise via FRCTSMAXSDU */ +#define FRCT_MAX_SDU (1U << 20) + +/* Stream-mode PCI extension: [start, end) byte range on every DATA pkt. */ +struct frct_pci_stream { + uint32_t start; + uint32_t end; +} __attribute__((packed)); + +#define FRCT_PCI_STREAM_LEN (sizeof(struct frct_pci_stream)) - uint8_t pad; /* 24 bit window! */ - uint16_t window; +/* Bytes following PCI: SACK list / RTTP nonce / control payload. */ +#define FRCT_BODY(pci) ((uint8_t *) (pci) + FRCT_PCILEN) +/* Typed access to the stream PCI extension on stream DATA packets. 
*/ +#define FRCT_SPCI(pci) \ + ((struct frct_pci_stream *) ((uint8_t *) (pci) + FRCT_PCILEN)) +/* Push the FRCT header onto spb's head. */ +#define FRCT_HDR_PUSH(spb, frcti) \ + ((struct frct_pci *) ssm_pk_buff_push((spb), \ + frcti_data_hdr_len(frcti))) + +/* Pop a fixed-size header off spb's head; cast to type *. */ +#define FRCT_HDR_POP(spb, type) \ + ((struct type *) ssm_pk_buff_pop((spb), sizeof(struct type))) + +/* Default / max per-flow stream rx ring (pow2); min N * per_pkt. */ +#define FRCT_STREAM_RING_MIN_PKTS 4 +#define FRCT_STREAM_RING_SZ (1U << 20) /* 1 MiB default */ +#define FRCT_STREAM_RING_SZ_MAX (1U << 27) /* 128 MiB */ + +struct frct_pci { + uint16_t flags; + uint16_t hcs; + + uint32_t window; uint32_t seqno; uint32_t ackno; } __attribute__((packed)); +/* Stat counters; fold to no-ops without PROC_FLOW_STATS. */ +#ifdef PROC_FLOW_STATS +struct frcti_stat { + size_t rxm_rto; /* RTO-timer driven retransmits */ + size_t rxm_rcv; /* RXM packets received (all) */ + size_t rxm_dup_rcv; /* RXM dups (peer already had it) */ + size_t rxm_sack; /* SACK-mechanism retransmits */ + size_t rxm_rack; /* RACK-driven retransmits */ + size_t rxm_dupthresh; /* DupThresh-driven retransmits */ + size_t rxm_nack; /* NACK-pulled retransmits */ + size_t rxm_due_count; /* rxm_due entries (pre-bail) */ + size_t rxm_due_acked; /* bail: seqno < snd_lwe */ + size_t rxm_due_unowned; /* bail: slot.rxm replaced */ + size_t rxm_due_aged; /* bail: r->t0 + t_r < now */ + size_t rxm_due_defer; /* bail: non-HoL, deferred to HoL */ + size_t rxm_arm_fail; /* rxm_arm: malloc failed */ + size_t rxm_cancel; /* entries cancelled at teardown */ + size_t rxm_tx_dead; /* RXM tx into terminal flow */ + size_t tx_drop; /* frct_tx fail (any cause) */ + size_t tx_drop_ack; /* bare ACK dropped */ + size_t tx_drop_sack; /* SACK dropped */ + size_t tx_drop_ka; /* keepalive dropped */ + size_t tx_drop_rttp; /* RTT probe/echo dropped */ + size_t tx_drop_nack; /* pre-DRF NACK dropped */ + size_t 
tx_drop_rdv; /* rendez-vous dropped */ + size_t tx_drop_other; /* anything not matched above */ + size_t ack_snd; /* ACK packets sent (bare + SACK) */ + size_t ack_fire; /* delayed-ACK timer fires */ + size_t ack_supp_seqno; /* fire suppressed: seqno */ + size_t ack_supp_inact; /* fire suppressed: inact */ + size_t ack_supp_rate; /* fire suppressed: rate */ + size_t ack_rcv; /* ACK packets received */ + size_t ack_rtt; /* ACKs that fed RTT estimator */ + size_t ack_dup_rcv; /* ACK packet wire dups dropped */ + size_t dup_rcv; /* duplicates received */ + size_t out_rcv; /* pkts out of window */ + size_t rqo_rcv; /* pkts out of rqueue */ + size_t ooo_rcv; /* OOO arrivals */ + size_t sack_snd; /* SACK packets sent */ + size_t sack_rcv; /* SACK packets received */ + size_t dsack_snd; /* SACK pkts carrying a DSACK */ + size_t dsack_rcv; /* DSACK blocks parsed */ + size_t dsack_drop; /* DSACK blocks past MAX_DSACK_LAG */ + size_t nack_snd; /* pre-DRF NACKs sent */ + size_t nack_rcv; /* pre-DRF NACKs received */ + size_t tlp_snd; /* tail loss probes sent */ + size_t inact_drop; /* inactivity drop (NACK on cd) */ + size_t drf_rebase; /* DRF-triggered window rebase */ + size_t rq_released; /* slots cleared by release_rq */ + size_t rttp_snd; /* RTT probes sent */ + size_t rttp_rcv; /* RTT probe replies rcvd */ + size_t rtt_smpl; /* RTT estimator samples */ + size_t rdv_snd; /* rendez-vous packets sent */ + size_t rdv_rcv; /* rendez-vous packets rcvd */ + size_t ka_snd; /* keepalives sent */ + size_t ka_rcv; /* keepalives received */ + size_t sdu_snd_frag; /* writes that fragmented */ + size_t sdu_snd_alloc; /* alloc fail truncated SDU send */ + size_t sdu_snd_tx; /* tx fail truncated SDU send */ + size_t frag_snd; /* fragments sent: FIRST/MID/LAST */ + size_t frag_rcv; /* fragments stashed in rq[] */ + size_t sdu_reasm; /* SDUs delivered reassembled */ + size_t sdu_sole; /* SOLE SDUs delivered (n==1) */ + size_t frag_drop; /* dropped at malformed run */ + size_t 
strm_snd_byte; /* bytes sent on stream */ + size_t strm_rcv_byte; /* bytes copied to ring */ + size_t strm_dlv_byte; /* bytes delivered to reader */ + size_t strm_drop; /* stream rcvs dropped */ + size_t strm_fin_drop; /* stream FIN packets rejected */ + /* Profiling instrumentation. */ + size_t rcv_proc_ns; /* time inside FRCTI_RCV (ns) */ + size_t tw_move_ns; /* time inside tw_move (ns) */ + size_t drain_calls; /* flow_drain_rx_nb invocations */ +}; + +#define STAT_BUMP(frcti, field) FETCH_ADD_RELAXED(&(frcti)->stat.field, 1) +#define STAT_ADD(frcti, field, v) FETCH_ADD_RELAXED(&(frcti)->stat.field, (v)) +#define STAT_LOAD(frcti, field) LOAD_RELAXED(&(frcti)->stat.field) +#else +#define STAT_BUMP(frcti, field) ((void) (frcti)) +#define STAT_ADD(frcti, field, v) ((void) (frcti)) +#define STAT_LOAD(frcti, field) ((void) (frcti), (size_t) 0) +#endif + +#define frcti_to_flow(f) (&proc.flows[(f)->fd]) + +#define RTTP_RING 8 +#define RTTP_COLD_NS (100 * MILLION) /* cold-probe cadence */ +#define RQ_SLOT(seqno) ((seqno) & (RQ_SIZE - 1)) + +struct rxm_entry; + +enum snd_slot_flags { + SND_RTX = 0x01, /* Any retransmit; Karn skips next RTT sample. */ + SND_FAST_RXM = 0x02, /* Fast-retx one-shot gate per loss event. */ + SND_TLP = 0x04, /* Tail loss probe; ACK resets rto_mul. */ +}; + +struct snd_slot { + struct rxm_entry * rxm; /* RXM entry, NULL if none. */ + uint64_t time; /* ts_to_ns of last send (any kind). */ + uint8_t flags; /* SND_* bits above. */ +}; + +/* Per-seqno reorder slot (FRTX) and stream-mode byte/FIN metadata. 
*/ +struct rcv_slot { + ssize_t idx; /* spb idx; -1 = empty */ + uint32_t start; /* stream byte start */ + uint32_t end; /* stream byte end */ + uint8_t fin; /* stream FIN bit */ +}; + +struct frct_cr { + uint32_t lwe; /* Left window edge */ + uint32_t rwe; /* Right window edge */ + + uint8_t cflags; + uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */ + uint32_t ackno; /* snd: ACK-pkt seqno; rcv: dedup */ + + uint64_t act; /* ts_to_ns of last activity */ + uint64_t inact; /* Inactivity threshold (ns) */ +}; + +struct frcti { + /* IMM: set once in frcti_create; read-only thereafter. */ + int fd; + uint64_t t_mpl; /* MPL (ns) */ + uint64_t t_a; /* a-timer (ns) */ + uint64_t t_r; /* r-timer (ns) */ + uint64_t t_rdv; /* RDV cooldown (ns) */ + time_t ber; /* cached qs.ber */ + bool lossy; /* qs.loss != 0 */ + time_t qs_timeout; /* cached qs.timeout (ms) */ + size_t frag_mtu; /* max FRCT pkt: PCI + payload */ + uint16_t sack_n_max; /* SACK blocks that fit MTU */ + bool stream; + + /* All fields below are protected by lock (rwlock/LOAD_ACQUIRE). 
*/ + struct { + struct frct_cr snd_cr; + struct frct_cr rcv_cr; + + /* RTT/RACK estimator */ + time_t srtt; /* smoothed RTT */ + time_t mdev; /* mean deviation */ + time_t min_rtt; /* RACK base, ns */ + uint64_t t_min_rtt; /* min_rtt last set */ + time_t rto; /* retransmit TO */ + time_t rto_min; /* RTO floor (ns) */ + uint8_t rto_mul; /* RTO backoff bits */ + uint32_t rtt_lwe; /* RTT-sample fence */ + uint64_t t_rcv_rtt; /* last RTT feed */ + uint64_t t_snd_probe; /* last probe sent */ + uint64_t t_latest_ack; /* RACK.fack snd-ts */ + uint32_t probe_id_next; + struct { + uint32_t id; + uint64_t ts; /* ts_to_ns send */ + uint8_t nonce[RTTP_NONCE_LEN]; /* echoed back */ + } probes[RTTP_RING]; + + /* rcv reassembly */ + size_t max_rcv_sdu; /* max reasm bytes */ + uint8_t * rcv_ring; /* lazy alloc */ + size_t rcv_ring_sz; /* power of 2 */ + uint32_t ring_seq_cap; /* ring/per_pkt */ + + uint32_t snd_byte_next; + bool snd_fin_sent; + uint32_t snd_fin_seqno; + uint32_t rcv_byte_next; + uint32_t rcv_byte_high; /* contiguous high */ + uint32_t rcv_byte_fin; /* set when FIN */ + bool rcv_fin_seen; + + struct rcv_slot rcv_slots[RQ_SIZE]; + struct snd_slot snd_slots[RQ_SIZE]; /* .rxm is ATOM */ + + /* rcv SACK dedup */ + uint64_t t_snd_sack; + uint32_t sack_lwe; /* rcv lwe at SACK */ + uint16_t sack_n; /* SACK block count */ + + /* RFC 2883 D-SACK: pending report (single-slot, latest). */ + uint32_t dsack_seqno; + bool dsack_valid; + + /* RFC 8985 §7.2 RACK reorder-window scaling. 
*/ + uint8_t reo_wnd_mult; /* REO_WND_MULT_MAX */ + uint32_t dsack_lwe_snap; /* lwe @ last DSACK */ + uint64_t t_last_reo_widen; /* once-per-RTT */ + + uint32_t dup_thresh; /* RFC 8985 */ + uint32_t tlp_high_seq; /* §7.3: 0 = none */ + uint8_t tlp_count; /* §7.3 per-episode */ + uint64_t t_nack; + bool open; /* FC window state */ + bool in_recovery; + uint32_t recovery_high; /* seqno @ entry */ + uint32_t rack_fired_lwe; /* lwe @ last RACK */ + struct timespec t_wnd; /* window-closed ts */ + struct timespec t_last_rdv; /* last RDV sent */ + struct list_head rxm_list; /* live rxm entries */ + + pthread_rwlock_t lock; + }; + + /* Read/written via __atomic without holding lock. */ + uint64_t t_ka_rcv; /* ts_to_ns of last KA rx */ + uint8_t ack_pending; /* delayed-ACK dedup */ + uint8_t tlp_pending; /* TLP arm dedup (lazy) */ + + /* Timer entries; ownership belongs to the tw module. */ + struct tw_entry ack_tw; /* delayed-ACK timer */ + struct tw_entry ka_tw; /* keepalive timer */ + struct tw_entry tlp_tw; /* tail-loss probe timer */ + +#ifdef PROC_FLOW_STATS + /* STAT: lock-free relaxed atomic counters. */ + struct frcti_stat stat; +#endif +}; + #ifdef PROC_FLOW_STATS +__attribute__((cold)) static int frct_rib_read(const char * path, char * buf, size_t len) { + struct frcti * frcti; struct timespec now; + uint64_t now_ns; char * entry; - struct flow * flow; - struct frcti * frcti; int fd; - - (void) len; + int written; + /* Snapshot under the locks; format outside (pure userspace). 
*/ + struct { + uint64_t t_mpl; + uint64_t t_a; + uint64_t t_r; + time_t srtt; + time_t mdev; + time_t rto; + time_t min_rtt; + struct frct_cr snd_cr; + struct frct_cr rcv_cr; + size_t rx_q_now; + size_t tx_q_now; + struct frcti_stat stat; + } s; entry = strstr(path, RIB_SEPARATOR); assert(entry); @@ -118,23 +437,50 @@ static int frct_rib_read(const char * path, fd = atoi(path); - flow = &proc.flows[fd]; - clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + if (fd < 0 || fd >= PROC_MAX_FLOWS) + return 0; pthread_rwlock_rdlock(&proc.lock); - frcti = flow->frcti; + frcti = proc.flows[fd].frcti; + if (frcti == NULL) { + pthread_rwlock_unlock(&proc.lock); + return 0; + } + + s.t_mpl = frcti->t_mpl; + s.t_a = frcti->t_a; + s.t_r = frcti->t_r; + + s.rx_q_now = proc.flows[fd].rx_rb != NULL + ? ssm_rbuff_queued(proc.flows[fd].rx_rb) : 0; + s.tx_q_now = proc.flows[fd].tx_rb != NULL + ? ssm_rbuff_queued(proc.flows[fd].tx_rb) : 0; pthread_rwlock_rdlock(&frcti->lock); - sprintf(buf, - "Maximum packet lifetime (ns): %20ld\n" - "Max time to Ack (ns): %20ld\n" - "Max time to Retransmit (ns): %20ld\n" + s.srtt = frcti->srtt; + s.mdev = frcti->mdev; + s.rto = frcti->rto; + s.min_rtt = frcti->min_rtt; + s.snd_cr = frcti->snd_cr; + s.rcv_cr = frcti->rcv_cr; + s.stat = frcti->stat; + + pthread_rwlock_unlock(&frcti->lock); + pthread_rwlock_unlock(&proc.lock); + + written = snprintf(buf, len, + "Maximum packet lifetime (ns): %20" PRIu64 "\n" + "Max time to Ack (ns): %20" PRIu64 "\n" + "Max time to Retransmit (ns): %20" PRIu64 "\n" "Smoothed rtt (ns): %20ld\n" "RTT standard deviation (ns): %20ld\n" "Retransmit timeout RTO (ns): %20ld\n" + "Minimum RTT (RACK base, ns): %20ld\n" "Sender left window edge: %20u\n" "Sender right window edge: %20u\n" "Sender inactive (ns): %20lld\n" @@ -143,44 +489,132 @@ static int frct_rib_read(const char * path, "Receiver right window edge: %20u\n" "Receiver inactive (ns): %20lld\n" "Receiver last ack: %20u\n" - "Number of pkt 
retransmissions: %20zu\n" - "Number of rtt probes: %20zu\n" - "Number of rtt estimates: %20zu\n" - "Number of duplicates received: %20zu\n" - "Number of delayed acks received: %20zu\n" - "Number of rendez-vous sent: %20zu\n" - "Number of packets out of window: %20zu\n" - "Number of packets out of rqueue: %20zu\n", - frcti->mpl, - frcti->a, - frcti->r, - frcti->srtt, - frcti->mdev, - frcti->rto, - frcti->snd_cr.lwe, - frcti->snd_cr.rwe, - ts_diff_ns(&now, &frcti->snd_cr.act), - frcti->snd_cr.seqno, - frcti->rcv_cr.lwe, - frcti->rcv_cr.rwe, - ts_diff_ns(&now, &frcti->rcv_cr.act), - frcti->rcv_cr.seqno, - frcti->n_rtx, - frcti->n_prb, - frcti->n_rtt, - frcti->n_dup, - frcti->n_dak, - frcti->n_rdv, - frcti->n_out, - frcti->n_rqo); - - pthread_rwlock_unlock(&flow->frcti->lock); + "RXM (RTO-driven) sent: %20zu\n" + "RXM packets received: %20zu\n" + " duplicates received: %20zu\n" + "RXM (SACK mechanism) sent: %20zu\n" + "RXM (RACK-driven) sent: %20zu\n" + "RXM (DupThresh-driven) sent: %20zu\n" + "RXM (NACK-driven) sent: %20zu\n" + "ACK packets sent: %20zu\n" + "Delayed-ACK timer fires: %20zu\n" + " suppressed (seqno): %20zu\n" + " suppressed (inact): %20zu\n" + " suppressed (rate): %20zu\n" + "ACK packets received: %20zu\n" + " fed RTT estimator: %20zu\n" + " wire dups dropped: %20zu\n" + "Duplicates received: %20zu\n" + "Out-of-window pkts received: %20zu\n" + "Out-of-rqueue pkts received: %20zu\n" + "OOO arrivals: %20zu\n" + "SACKs sent: %20zu\n" + "SACKs received: %20zu\n" + "D-SACKs sent: %20zu\n" + "D-SACKs received: %20zu\n" + "D-SACK out-of-range dropped: %20zu\n" + "Pre-DRF NACKs sent: %20zu\n" + "Pre-DRF NACKs received: %20zu\n" + "Tail loss probes sent: %20zu\n" + "Inactivity drops (silent): %20zu\n" + "DRF window rebases: %20zu\n" + "rq slots cleared by release_rq: %20zu\n" + "RTT probes sent: %20zu\n" + "RTT probe replies received: %20zu\n" + "RTT estimator samples: %20zu\n" + "Rendez-vous packets sent: %20zu\n" + "Rendez-vous packets received: %20zu\n" + 
"Keepalives sent: %20zu\n" + "Keepalives received: %20zu\n" + "SDU writes fragmented: %20zu\n" + " alloc fail mid-SDU: %20zu\n" + " tx fail mid-SDU: %20zu\n" + "Fragments sent: %20zu\n" + "Fragments received: %20zu\n" + "SDUs delivered reassembled: %20zu\n" + "SDUs delivered (SOLE): %20zu\n" + "Fragments dropped (malformed): %20zu\n" + "Stream bytes sent: %20zu\n" + "Stream bytes received: %20zu\n" + "Stream bytes delivered: %20zu\n" + "Stream packets dropped: %20zu\n" + "Stream FINs dropped: %20zu\n" + "FRCTI_RCV time (ns): %20zu\n" + "tw_move time (ns): %20zu\n" + "drain_rx_nb calls: %20zu\n" + "RX rbuff queued: %20zu\n" + "TX rbuff queued: %20zu\n" + "RXM-due entries: %20zu\n" + " bail (acked): %20zu\n" + " bail (unowned): %20zu\n" + " bail (aged): %20zu\n" + " bail (defer): %20zu\n" + "RXM-arm malloc failures: %20zu\n" + "RXM cancels (teardown): %20zu\n" + "RXM tx into dead flow: %20zu\n" + "Tx ring drops (any cause): %20zu\n" + " ack: %20zu\n" + " sack: %20zu\n" + " ka: %20zu\n" + " rttp: %20zu\n" + " nack: %20zu\n" + " rdv: %20zu\n" + " other: %20zu\n", + /* Check getattr size below when adding stats. 
*/ + s.t_mpl, s.t_a, s.t_r, + s.srtt, s.mdev, s.rto, s.min_rtt, + s.snd_cr.lwe, s.snd_cr.rwe, + (long long)(now_ns - s.snd_cr.act), + s.snd_cr.seqno, + s.rcv_cr.lwe, s.rcv_cr.rwe, + (long long)(now_ns - s.rcv_cr.act), + s.rcv_cr.seqno, + s.stat.rxm_rto, s.stat.rxm_rcv, s.stat.rxm_dup_rcv, + s.stat.rxm_sack, s.stat.rxm_rack, s.stat.rxm_dupthresh, + s.stat.rxm_nack, + s.stat.ack_snd, s.stat.ack_fire, + s.stat.ack_supp_seqno, s.stat.ack_supp_inact, + s.stat.ack_supp_rate, + s.stat.ack_rcv, s.stat.ack_rtt, s.stat.ack_dup_rcv, + s.stat.dup_rcv, s.stat.out_rcv, s.stat.rqo_rcv, + s.stat.ooo_rcv, + s.stat.sack_snd, s.stat.sack_rcv, + s.stat.dsack_snd, s.stat.dsack_rcv, s.stat.dsack_drop, + s.stat.nack_snd, s.stat.nack_rcv, s.stat.tlp_snd, + s.stat.inact_drop, s.stat.drf_rebase, s.stat.rq_released, + s.stat.rttp_snd, s.stat.rttp_rcv, s.stat.rtt_smpl, + s.stat.rdv_snd, s.stat.rdv_rcv, + s.stat.ka_snd, s.stat.ka_rcv, + s.stat.sdu_snd_frag, s.stat.sdu_snd_alloc, s.stat.sdu_snd_tx, + s.stat.frag_snd, s.stat.frag_rcv, + s.stat.sdu_reasm, s.stat.sdu_sole, s.stat.frag_drop, + s.stat.strm_snd_byte, s.stat.strm_rcv_byte, + s.stat.strm_dlv_byte, + s.stat.strm_drop, s.stat.strm_fin_drop, + s.stat.rcv_proc_ns, s.stat.tw_move_ns, + s.stat.drain_calls, + s.rx_q_now, s.tx_q_now, + s.stat.rxm_due_count, + s.stat.rxm_due_acked, s.stat.rxm_due_unowned, + s.stat.rxm_due_aged, s.stat.rxm_due_defer, + s.stat.rxm_arm_fail, + s.stat.rxm_cancel, + s.stat.rxm_tx_dead, s.stat.tx_drop, + s.stat.tx_drop_ack, s.stat.tx_drop_sack, + s.stat.tx_drop_ka, s.stat.tx_drop_rttp, + s.stat.tx_drop_nack, s.stat.tx_drop_rdv, + s.stat.tx_drop_other); + + if (written < 0) + return 0; - pthread_rwlock_unlock(&proc.lock); + if ((size_t) written >= len) + return (int) (len - 1); - return strlen(buf); + return written; } +__attribute__((cold)) static int frct_rib_readdir(char *** buf) { *buf = malloc(sizeof(**buf)); @@ -199,13 +633,14 @@ static int frct_rib_readdir(char *** buf) return -ENOMEM; } +__attribute__((cold)) 
static int frct_rib_getattr(const char * path, struct rib_attr * attr) { (void) path; - (void) attr; - attr->size = 1189; + /* Must be >= the snprintf output in frct_rib_read. */ + attr->size = 8192; attr->mtime = 0; return 0; @@ -220,128 +655,1168 @@ static struct rib_ops r_ops = { #endif /* PROC_FLOW_STATS */ -static bool before(uint32_t seq1, - uint32_t seq2) +static __inline__ bool before(uint32_t s1, uint32_t s2) { - return (int32_t)(seq1 - seq2) < 0; + return (int32_t)(s1 - s2) < 0; } -static bool after(uint32_t seq1, - uint32_t seq2) +static __inline__ bool after(uint32_t s1, uint32_t s2) { - return (int32_t)(seq2 - seq1) < 0; + return (int32_t)(s2 - s1) < 0; } -static void __send_frct_pkt(int fd, - uint8_t flags, - uint32_t ackno, - uint32_t rwe) +static __inline__ bool within(uint32_t seq, uint32_t lo, uint32_t hi) { - struct ssm_pk_buff * spb; - struct frct_pci * pci; - ssize_t idx; - struct flow * f; + return after(seq, lo) && !after(seq, hi); +} - /* Raw calls needed to bypass frcti. */ -#ifdef RXM_BLOCKING - idx = ssm_pool_alloc_b(proc.pool, sizeof(*pci), NULL, &spb, NULL); -#else - idx = ssm_pool_alloc(proc.pool, sizeof(*pci), NULL, &spb); -#endif - if (idx < 0) +static __inline__ bool in_window(uint32_t seq, const struct frct_cr * cr) +{ + return !before(seq, cr->lwe) && before(seq, cr->rwe); +} + +/* DRF arrival that stays within the current receive epoch. */ +static __inline__ bool same_epoch_drf(uint32_t seq, + uint16_t flags, + const struct frct_cr * cr) +{ + if (cr->lwe == cr->rwe) + return false; + + return (flags & FRCT_RXM) || in_window(seq, cr); +} + +/* + * RACK reorder window R (RFC 8985 §6.2): + * R = MIN(reo_wnd_mult * RACK.min_RTT / 4, SRTT) + * reo_wnd_mult scales on D-SACK evidence of under-tolerance (§7.2). + * Fall back to srtt when no min_rtt sample exists yet; MIN_REORDER_NS + * floor guards collapse below tick resolution; the SRTT cap applies last.
+ */ +static __inline__ uint64_t rack_reorder_window(struct frcti * frcti) +{ + uint64_t mult = frcti->reo_wnd_mult > 0 ? frcti->reo_wnd_mult : 1; + uint64_t base = frcti->min_rtt > 0 ? (uint64_t) frcti->min_rtt + : (uint64_t) frcti->srtt; + uint64_t R = mult * (base / 4); + + R = MAX(R, (uint64_t) MIN_REORDER_NS); + R = MIN(R, (uint64_t) frcti->srtt); + + return R; +} + +static __inline__ int frct_spb_reserve(size_t len, + struct ssm_pk_buff ** spb) +{ + ssize_t idx = ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL); + + return idx < 0 ? (int) idx : 0; +} + +static __inline__ void frct_spb_release(struct ssm_pk_buff * spb) +{ + ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); +} + +static __inline__ void frct_spb_release_idx(size_t idx) +{ + ssm_pool_remove(proc.pool, idx); +} + +/* Fetch the spb stashed at the rq slot for seqno. */ +static __inline__ struct ssm_pk_buff * rq_frag(const struct frcti * frcti, + uint32_t seqno) +{ + return ssm_pool_get(proc.pool, frcti->rcv_slots[RQ_SLOT(seqno)].idx); +} + +static __inline__ size_t frcti_data_hdr_len(const struct frcti * frcti) +{ + return FRCT_PCILEN + (frcti->stream ? FRCT_PCI_STREAM_LEN : 0); +} + +static __inline__ size_t frcti_ctrl_hdr_len(const struct frcti * frcti) +{ + (void) frcti; + + return FRCT_PCILEN; +} + +/* + * HCS at offset 2 inside PCI. Covers flags (bytes 0..1) and + * window/seqno/ackno (bytes 4..15), plus SPCI for stream DATA. + */ +static void frct_hcs_set(struct frct_pci * pci, + bool stream) +{ + uint16_t hcs = 0; + size_t tail; + + tail = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs); + if (stream) + tail += FRCT_PCI_STREAM_LEN; + + crc16_ccitt_false(&hcs, pci, sizeof(pci->flags)); + crc16_ccitt_false(&hcs, &pci->window, tail); + + pci->hcs = hton16(hcs); +} + +static int frct_hcs_check(const struct frct_pci * pci, + const struct frcti * frcti) +{ + uint16_t hcs = 0; + uint16_t flags; + size_t tail; + + /* Untrusted flag read; mismatch on HCS will drop on corrupt. 
*/ + flags = ntoh16(pci->flags); + + tail = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs); + if (frcti->stream && (flags & FRCT_DATA)) + tail += FRCT_PCI_STREAM_LEN; + + crc16_ccitt_false(&hcs, pci, sizeof(pci->flags)); + crc16_ccitt_false(&hcs, &pci->window, tail); + + return hcs != ntoh16(pci->hcs); +} + +/* Bump tx_drop plus the per-frame-type counter matching `flags`. */ +static void frct_tx_drop_bump(struct frcti * frcti, + uint16_t flags) +{ + STAT_BUMP(frcti, tx_drop); + + if (flags & FRCT_SACK) { + STAT_BUMP(frcti, tx_drop_sack); return; + } - pci = (struct frct_pci *) ssm_pk_buff_head(spb); - memset(pci, 0, sizeof(*pci)); + if (flags & FRCT_KA) { + STAT_BUMP(frcti, tx_drop_ka); + return; + } - *((uint32_t *) pci) = hton32(rwe); + if (flags & FRCT_RTTP) { + STAT_BUMP(frcti, tx_drop_rttp); + return; + } - pci->flags = flags; - pci->ackno = hton32(ackno); + if (flags & FRCT_NACK) { + STAT_BUMP(frcti, tx_drop_nack); + return; + } + + if (flags & FRCT_RDVS) { + STAT_BUMP(frcti, tx_drop_rdv); + return; + } + + if (flags & FRCT_ACK) { + STAT_BUMP(frcti, tx_drop_ack); + return; + } + + STAT_BUMP(frcti, tx_drop_other); +} - f = &proc.flows[fd]; +static int frct_tx(struct frcti * frcti, struct ssm_pk_buff * spb) +{ + struct flow * f = frcti_to_flow(frcti); + const struct frct_pci * pci; + const struct timespec * dl = NULL; + struct timespec now; + struct timespec intv = TIMESPEC_INIT_NS(FRCT_TX_TIMEO_NS); + struct timespec deadline; + uint16_t flags; + ssize_t idx; + int ret = -ENOMEM; + + pci = (const struct frct_pci *) ssm_pk_buff_head(spb); + flags = ntoh16(pci->flags); + + /* CRC32 covers plaintext body; PCI is in HCS. Pre-encrypt. 
*/ + if (flags & FRCT_SACK) { + if (crc_add(spb, frcti_ctrl_hdr_len(frcti)) != 0) + goto fail; + } else if ((flags & FRCT_DATA) && f->info.qs.ber == 0) { + if (crc_add(spb, frcti_data_hdr_len(frcti)) != 0) + goto fail; + } if (spb_encrypt(f, spb) < 0) goto fail; -#ifdef RXM_BLOCKING - if (ssm_rbuff_write_b(f->tx_rb, idx, NULL)) -#else - if (ssm_rbuff_write(f->tx_rb, idx)) -#endif + idx = ssm_pk_buff_get_off(spb); + + /* DATA blocks; control times out so a full ring can't stall wheel. */ + if (!(flags & FRCT_DATA)) { + clock_gettime(PTHREAD_COND_CLOCK, &now); + ts_add(&now, &intv, &deadline); + dl = &deadline; + } + + ret = ssm_rbuff_write_b(f->tx_rb, idx, dl); + if (ret < 0) goto fail; ssm_flow_set_notify(f->set, f->info.id, FLOW_PKT); - return; + return 0; fail: - ipcp_spb_release(spb); - return; + frct_tx_drop_bump(frcti, flags); + ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); + return ret; +} + +__attribute__((cold)) +static void frct_mark_flow_down(struct frcti * frcti) +{ + struct flow * f = frcti_to_flow(frcti); + + if (f->rx_rb != NULL) + ssm_rbuff_set_bits(f->rx_rb, RB_FLOWDOWN); + + if (f->tx_rb != NULL) + ssm_rbuff_set_bits(f->tx_rb, RB_FLOWDOWN); +} + +__attribute__((cold)) +static void frct_mark_peer_dead(struct frcti * frcti) +{ + struct flow * f = frcti_to_flow(frcti); + + if (f->rx_rb != NULL) + ssm_rbuff_set_bits(f->rx_rb, RB_FLOWPEER); + + if (proc.fqset != NULL) + ssm_flow_set_notify(proc.fqset, f->info.id, FLOW_PEER); +} + +static __inline__ int frct_ctrl_alloc(struct ssm_pk_buff ** spb, + struct frct_pci ** pci, + size_t payload_len) +{ + if (frct_spb_reserve(FRCT_PCILEN + payload_len, spb) < 0) + return -1; + + *pci = (struct frct_pci *) ssm_pk_buff_head(*spb); + memset(*pci, 0, FRCT_PCILEN); + + return 0; +} + +/* + * Advertised rwe. Stream mode clamps to lwe + ring_seq_cap so the + * byte-equivalent fits the rx ring. Caller holds at least the rdlock. 
+ */ +static __inline__ uint32_t frcti_advert_rwe(struct frcti * frcti) +{ + uint32_t rwe; + uint32_t cap; + + rwe = frcti->rcv_cr.rwe; + + if (!frcti->stream) + return rwe; + + cap = frcti->rcv_cr.lwe + frcti->ring_seq_cap; + + return before(cap, rwe) ? cap : rwe; +} + +static void frcti_pkt_snd(struct frcti * frcti, + uint16_t flags, + uint32_t ackno, + uint32_t rwe) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + + if (frct_ctrl_alloc(&spb, &pci, 0) < 0) + return; + + pci->flags = hton16(flags); + pci->window = hton32(rwe); + pci->ackno = hton32(ackno); + if (flags & FRCT_ACK) { + /* reuse ackno for the sequence number of delayed ACK */ + ackno = FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1); + pci->seqno = hton32(ackno + 1); + } + + frct_hcs_set(pci, false); + + frct_tx(frcti, spb); +} + +/* RTO floor scales with srtt; hard floor rto_min guards sub-ms RTT. */ +static void rtt_init(struct frcti * frcti, + time_t rtt_hint) +{ + time_t floor; + + if (rtt_hint > 0) { + rtt_hint = MAX(rtt_hint, (time_t) RTT_BOOT_NS); + frcti->srtt = rtt_hint; + frcti->mdev = rtt_hint >> 3; + floor = MAX(frcti->rto_min, 2 * frcti->srtt); + frcti->rto = MAX(floor, rtt_hint + (frcti->mdev << MDEV_MUL)); + frcti->min_rtt = rtt_hint; + } else { + /* Boot from first ACK. */ + frcti->srtt = 0; + frcti->mdev = RTT_BOOT_NS; + frcti->rto = MAX((time_t) INITIAL_RTO, frcti->rto_min); + frcti->min_rtt = 0; + } + + frcti->rto_mul = 0; +} + +/* RFC 8985 §6.2: replace min_RTT on unset, smaller sample, or expiry. */ +static __inline__ bool min_rtt_stale(struct frcti * frcti, + time_t mrtt, + uint64_t now_ns) +{ + if (frcti->min_rtt == 0) + return true; + + if (mrtt < frcti->min_rtt) + return true; + + return ts_aged_ns(now_ns, frcti->t_min_rtt, MIN_RTT_WIN_NS); +} + +/* Linux-style windowed-min refresh of RACK.min_RTT. 
*/ +static __inline__ void min_rtt_update(struct frcti * frcti, + time_t mrtt, + uint64_t now_ns) +{ + if (!min_rtt_stale(frcti, mrtt, now_ns)) + return; + + frcti->min_rtt = mrtt; + frcti->t_min_rtt = now_ns; +} + +static void rtt_update(struct frcti * frcti, + time_t mrtt, + uint64_t now_ns) +{ + time_t srtt = frcti->srtt; + time_t rttvar = frcti->mdev; + time_t floor; + time_t rto; + + if (srtt == 0) { + srtt = mrtt; + rttvar = mrtt >> 1; + } else { + /* RFC 6298 symmetric EWMA. */ + time_t delta = mrtt - srtt; + srtt += (delta >> 3); + delta = (ABS(delta) - rttvar) >> 2; +#ifdef FRCT_LINUX_RTT_ESTIMATOR + if (delta < 0) + delta >>= 3; +#endif + rttvar += delta; + } + STAT_BUMP(frcti, rtt_smpl); + frcti->srtt = MAX(SRTT_FLOOR_NS, srtt); + frcti->mdev = MAX(MDEV_FLOOR_NS, rttvar); + + min_rtt_update(frcti, mrtt, now_ns); + + floor = MAX(frcti->rto_min, 2 * frcti->srtt); + rto = MAX(floor, frcti->srtt + (frcti->mdev << MDEV_MUL)); + + STORE_RELEASE(&frcti->rto, rto); + STORE_RELEASE(&frcti->rto_mul, 0); +} + +/* Fill probes[pos], return new probe_id; 0 on entropy failure. Wrlock. */ +static uint32_t rttp_alloc_probe(struct frcti * frcti, + uint64_t now_ns, + uint8_t nonce[RTTP_NONCE_LEN]) +{ + uint32_t probe_id; + size_t pos; + + if (random_buffer(nonce, RTTP_NONCE_LEN) < 0) + return 0; + + probe_id = frcti->probe_id_next++; + if (probe_id == 0) + probe_id = frcti->probe_id_next++; + + pos = RTTP_POS(probe_id); + frcti->probes[pos].id = probe_id; + frcti->probes[pos].ts = now_ns; + memcpy(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN); + frcti->t_snd_probe = now_ns; + + STAT_BUMP(frcti, rttp_snd); + + return probe_id; +} + +/* Caller wrlock; out args valid on true (caller emits post-unlock). 
*/ +static bool rtt_probe_arm(struct frcti * frcti, + uint64_t now_ns, + uint32_t * probe_id, + uint8_t nonce[RTTP_NONCE_LEN]) +{ + if (frcti->srtt == 0) + return false; + + if (!after(frcti->snd_cr.seqno, frcti->snd_cr.lwe)) + return false; + + if (!ts_aged_ns(now_ns, frcti->t_rcv_rtt, + 2u * (uint64_t) frcti->srtt)) + return false; + + if (!ts_aged_ns(now_ns, frcti->t_snd_probe, + (uint64_t) frcti->srtt)) + return false; + + *probe_id = rttp_alloc_probe(frcti, now_ns, nonce); + + return *probe_id != 0; +} + +static void frcti_rttp_snd(struct frcti * frcti, + uint32_t probe_id, + uint32_t echo_id, + const uint8_t * nonce) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + struct frct_rttp * rttp; + + if (frct_ctrl_alloc(&spb, &pci, RTTP_PAYLOAD) < 0) + return; + + pci->flags = hton16(FRCT_RTTP); + + frct_hcs_set(pci, false); + + rttp = (struct frct_rttp *) FRCT_BODY(pci); + rttp->probe_id = hton32(probe_id); + rttp->echo_id = hton32(echo_id); + memcpy(rttp->nonce, nonce, sizeof(rttp->nonce)); + + frct_tx(frcti, spb); +} + +struct rxm_entry { + struct tw_entry tw; + struct list_head next; /* in frcti->rxm_list */ + struct frcti * frcti; + uint32_t seqno; + uint64_t t0; + size_t len; + uint8_t pkt[]; /* flexible — sized at alloc time */ +}; + +static void rxm_entry_destroy(struct rxm_entry * r) +{ + free(r); +} + +static bool rxm_still_owned(struct frcti * frcti, + size_t pos, + struct rxm_entry * r) +{ + return LOAD_ACQUIRE(&frcti->snd_slots[pos].rxm) == r; +} + +/* + * All in-flight slots share the HoL backoff; otherwise non-HoL timers + * cycle at base RTO and storm the wire while HoL is still backing off. + */ +static uint64_t rxm_next_deadline(struct frcti * frcti, + uint64_t now_ns) +{ + time_t rto = LOAD_RELAXED(&frcti->rto); + uint8_t rto_mul = LOAD_RELAXED(&frcti->rto_mul); + + return now_ns + ((uint64_t) rto << rto_mul); +} + +/* Copy pkt, set FRCT_RXM, refresh ackno, re-seal HCS. 
*/ +static struct ssm_pk_buff * rxm_pkt_prepare(const void * pkt, + size_t len, + uint32_t rcv_lwe, + bool stream) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + uint16_t flags; + + if (frct_spb_reserve(len, &spb) < 0) + return NULL; + + pci = (struct frct_pci *) ssm_pk_buff_head(spb); + memcpy(pci, pkt, len); + + flags = ntoh16(pci->flags) | FRCT_RXM; + pci->flags = hton16(flags); + pci->ackno = hton32(rcv_lwe); + + frct_hcs_set(pci, stream); + + return spb; +} + +/* Caller must NOT hold frcti->lock. */ +static void rxm_snd(struct frcti * frcti, + uint32_t seqno, + const void * pkt, + size_t len) +{ + struct ssm_pk_buff * spb; + struct timespec now; + struct snd_slot * slot; + uint32_t snd_lwe; + uint32_t rcv_lwe; + size_t pos; + int ret; + + snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe); + rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + pthread_rwlock_wrlock(&frcti->lock); + + pos = RQ_SLOT(seqno); + slot = &frcti->snd_slots[pos]; + + slot->time = TS_TO_UINT64(now); + /* RTO supersedes any pending TLP/fast-rxm on this slot. */ + slot->flags = (slot->flags & ~(SND_FAST_RXM | SND_TLP)) | SND_RTX; + /* §7.3: RTO supersedes TLP probes and ends the probe episode. */ + frcti->tlp_high_seq = 0; + frcti->tlp_count = 0; + + frcti->rtt_lwe = seqno + 1; + + /* Only the HoL retransmit bumps the global RTO backoff. */ + if (seqno == snd_lwe && frcti->rto_mul < MAX_RTO_MUL) + STORE_RELEASE(&frcti->rto_mul, frcti->rto_mul + 1); + + /* RFC 8985 §7.2 step 4: RTO on HoL resets RACK reo scaling. */ + if (seqno == snd_lwe) + frcti->reo_wnd_mult = 1; + + pthread_rwlock_unlock(&frcti->lock); + + STAT_BUMP(frcti, rxm_rto); + + spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream); + if (spb == NULL) + return; + + /* ETIMEDOUT/ENOMEM: let r-timer drive teardown. 
*/ + ret = frct_tx(frcti, spb); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); +} + +static void rxm_due(void * arg) +{ + struct rxm_entry * r = arg; + struct frcti * frcti = r->frcti; + struct timespec now; + uint64_t now_ns; + uint32_t snd_lwe; + size_t pos = RQ_SLOT(r->seqno); + + STAT_BUMP(frcti, rxm_due_count); + + snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe); + + /* Already ACK'd: expected for the steady-state majority. */ + if (before(r->seqno, snd_lwe)) { + STAT_BUMP(frcti, rxm_due_acked); + goto cleanup; + } + + /* SACK/RACK-cleared the slot (caller NULL'd snd_slots[pos].rxm). */ + if (!rxm_still_owned(frcti, pos, r)) { + STAT_BUMP(frcti, rxm_due_unowned); + goto cleanup; + } + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + /* R-timer expired: peer unreachable. */ + if (RXM_AGED_OUT(r->t0, now_ns, frcti->t_r)) { + STAT_BUMP(frcti, rxm_due_aged); + frct_mark_flow_down(frcti); + goto cleanup; + } + + /* HoL-only retx; defer at base rto so HoL transitions react. */ + if (r->seqno != snd_lwe) { + STAT_BUMP(frcti, rxm_due_defer); + tw_post(&r->tw, now_ns + LOAD_RELAXED(&frcti->rto), + rxm_due, r); + return; + } + + rxm_snd(frcti, r->seqno, r->pkt, r->len); + + /* Re-check ownership: fire path may have replaced our entry. */ + if (rxm_still_owned(frcti, pos, r)) { + uint64_t anchor; + + /* Per-slot anchor breaks co-fire re-bin. */ + anchor = frcti->snd_slots[pos].time; + tw_post(&r->tw, rxm_next_deadline(frcti, anchor), rxm_due, r); + return; + } + + cleanup: + pthread_rwlock_wrlock(&frcti->lock); + + if (rxm_still_owned(frcti, pos, r)) + STORE_RELEASE(&frcti->snd_slots[pos].rxm, NULL); + + list_del(&r->next); + + pthread_rwlock_unlock(&frcti->lock); + + rxm_entry_destroy(r); +} + +/* Pre-allocate rxm entry so frcti_snd can fail before committing seqno. 
*/ +static struct rxm_entry * rxm_alloc(struct frcti * frcti, + size_t pkt_len) +{ + struct rxm_entry * r; + + r = malloc(sizeof(*r) + pkt_len); + if (r == NULL) { + STAT_BUMP(frcti, rxm_arm_fail); + return NULL; + } + + r->frcti = frcti; + tw_init_entry(&r->tw); + + return r; +} + +static void rxm_arm(struct frcti * frcti, + uint32_t seqno, + struct rxm_entry * r, + const struct ssm_pk_buff * spb) +{ + struct timespec now; + time_t rto; + uint8_t rto_mul; + uint64_t deadline; + size_t len = ssm_pk_buff_len(spb); + + memcpy(r->pkt, ssm_pk_buff_head(spb), len); + r->len = len; + r->seqno = seqno; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + r->t0 = TS_TO_UINT64(now); + + rto = LOAD_RELAXED(&frcti->rto); + rto_mul = LOAD_RELAXED(&frcti->rto_mul); + deadline = r->t0 + ((uint64_t) rto << rto_mul); + + pthread_rwlock_wrlock(&frcti->lock); + + assert(before(seqno, frcti->snd_cr.lwe + RQ_SIZE)); + + list_add_tail(&r->next, &frcti->rxm_list); + STORE_RELEASE(&frcti->snd_slots[RQ_SLOT(seqno)].rxm, r); + + pthread_rwlock_unlock(&frcti->lock); + + tw_post(&r->tw, deadline, rxm_due, r); +} + +static void rxm_cancel_all(struct frcti * frcti) +{ + struct list_head * p; + struct list_head * t; + + list_for_each_safe(p, t, &frcti->rxm_list) { + struct rxm_entry * r = list_entry(p, struct rxm_entry, next); + list_del(&r->next); + tw_cancel(&r->tw); + rxm_entry_destroy(r); + STAT_BUMP(frcti, rxm_cancel); + } +} + +static __inline__ void sack_block_put(uint8_t * payload, + uint16_t i, + uint32_t s, + uint32_t e) +{ + uint32_t * blk = (uint32_t *) + (payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE); + + blk[0] = hton32(s); + blk[1] = hton32(e); +} + +static __inline__ void sack_block_get(const uint8_t * payload, + uint16_t i, + uint32_t * s, + uint32_t * e) +{ + const uint32_t * blk = (const uint32_t *) + (payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE); + + *s = ntoh32(blk[0]); + *e = ntoh32(blk[1]); } -static void send_frct_pkt(struct frcti * frcti) +/* + * Build SACK blocks for 
ranges *above* rcv_cr.lwe. Wire invariant + * (see doc/frct.txt §1.3): every block produced here satisfies + * blocks[i].start > rcv_cr.lwe = ackno, which makes the "first block + * below ackno" convention used to mark a D-SACK (RFC 2883 §4 case 1) + * unambiguous. Caller holds frcti->lock. + */ +static uint16_t sack_blocks_build(struct frcti * frcti, + uint32_t blocks[][2], + uint16_t max_n) +{ + const struct rcv_slot * slots = frcti->rcv_slots; + uint32_t s; + uint32_t end; + uint16_t n = 0; + + s = frcti->rcv_cr.lwe + 1; + end = frcti->rcv_cr.lwe + RQ_SIZE; + if (after(end, frcti->rcv_cr.rwe)) + end = frcti->rcv_cr.rwe; + + while (before(s, end) && n < max_n) { + while (before(s, end) && slots[RQ_SLOT(s)].idx == -1) + ++s; + + if (!before(s, end)) + break; + + blocks[n][0] = s; + while (before(s, end) && slots[RQ_SLOT(s)].idx != -1) + ++s; + blocks[n][1] = s; + ++n; + } + + return n; +} + +/* + * Prepend the pending D-SACK report (if any) as block[0]; clear flag. + * Returns the number of slots consumed at the head (0 or 1). Caller + * holds wrlock. + */ +static __inline__ uint16_t dsack_consume(struct frcti * frcti, + uint32_t blocks[][2]) +{ + if (!frcti->dsack_valid || frcti->sack_n_max == 0) + return 0; + + blocks[0][0] = frcti->dsack_seqno; + blocks[0][1] = frcti->dsack_seqno + 1; + frcti->dsack_valid = false; + return 1; +} + +/* Caller must NOT hold frcti->lock. 
*/ +static void frcti_sack_snd(struct frcti * frcti, + const struct sack_args * sa) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + buffer_t buf; + uint16_t i; + + assert(sa->n <= SACK_MAX_BLOCKS); + + buf.len = SACK_HDR_SIZE + sa->n * SACK_BLOCK_SIZE; + + if (frct_ctrl_alloc(&spb, &pci, buf.len) < 0) + return; + + pci->flags = hton16(FRCT_ACK | FRCT_FC | FRCT_SACK); + pci->window = hton32(sa->rwe); + pci->ackno = hton32(sa->ack); + pci->seqno = hton32(FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1) + 1); + + frct_hcs_set(pci, false); + + buf.data = FRCT_BODY(pci); + memset(buf.data, 0, SACK_HDR_SIZE); + *(uint16_t *) buf.data = hton16(sa->n); + for (i = 0; i < sa->n; ++i) + sack_block_put(buf.data, i, sa->blocks[i][0], sa->blocks[i][1]); + + frct_tx(frcti, spb); +} + +static void ack_snd(struct frcti * frcti, + bool with_sack) { struct timespec now; + uint64_t now_ns; time_t diff; uint32_t ackno; uint32_t rwe; - int fd; + struct sack_args * sa = NULL; + size_t sa_sz; + bool sacking = false; assert(frcti); + STAT_BUMP(frcti, ack_fire); + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + if (with_sack && frcti->sack_n_max > 0) { + sa_sz = sizeof(*sa) + frcti->sack_n_max * sizeof(sa->blocks[0]); + sa = malloc(sa_sz); + /* If alloc fails, fall through and send a bare cum-ACK. */ + } pthread_rwlock_wrlock(&frcti->lock); - if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)) { + /* D-SACK rides through cum-ACK freshness; signal is the duplicate. 
*/ + if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno) + && !frcti->dsack_valid) { pthread_rwlock_unlock(&frcti->lock); - return; + STAT_BUMP(frcti, ack_supp_seqno); + goto out; } - fd = frcti->fd; ackno = frcti->rcv_cr.lwe; - rwe = frcti->rcv_cr.rwe; + rwe = frcti_advert_rwe(frcti); - diff = ts_diff_ns(&now, &frcti->rcv_cr.act); - if (diff > frcti->a) { + if (ACK_AGED_OUT(frcti->rcv_cr.act, now_ns, frcti->t_a)) { pthread_rwlock_unlock(&frcti->lock); - return; + STAT_BUMP(frcti, ack_supp_inact); + goto out; } - diff = ts_diff_ns(&now, &frcti->snd_cr.act); - if (diff < TICTIME) { + diff = (time_t) ts_age_ns(now_ns, frcti->snd_cr.act); + if (diff < TICTIME && !frcti->dsack_valid) { pthread_rwlock_unlock(&frcti->lock); - return; + STAT_BUMP(frcti, ack_supp_rate); + goto out; } + /* RFC 2018: piggyback SACK on timer ACK; dedup unchanged board. */ + if (sa == NULL || (frcti->sack_n == 0 && !frcti->dsack_valid)) + goto no_sack; + + sa->dsack = false; + sa->n = dsack_consume(frcti, sa->blocks); + if (sa->n == 1) + sa->dsack = true; + + sa->n += sack_blocks_build(frcti, sa->blocks + sa->n, + frcti->sack_n_max - sa->n); + if (sa->n == 0) + goto no_sack; + + if (!sa->dsack && ackno == frcti->sack_lwe && sa->n == frcti->sack_n) + goto no_sack; + + sa->ack = ackno; + sa->rwe = rwe; + frcti->sack_lwe = ackno; + frcti->sack_n = sa->n; + frcti->t_snd_sack = now_ns; + sacking = true; + + no_sack: frcti->rcv_cr.seqno = frcti->rcv_cr.lwe; pthread_rwlock_unlock(&frcti->lock); - __send_frct_pkt(fd, FRCT_ACK | FRCT_FC, ackno, rwe); + STAT_BUMP(frcti, ack_snd); + + if (sacking) { + STAT_BUMP(frcti, sack_snd); + if (sa->dsack) + STAT_BUMP(frcti, dsack_snd); + frcti_sack_snd(frcti, sa); + } else { + frcti_pkt_snd(frcti, FRCT_ACK | FRCT_FC, ackno, rwe); + } + + out: + free(sa); } -static void __send_rdv(int fd) +/* Delayed-ACK timer: per-flow, dedup'd via atomic test-and-set. 
*/ +static void ack_due(void * arg) { - __send_frct_pkt(fd, FRCT_RDVS, 0, 0); + struct frcti * frcti = arg; + + __atomic_clear(&frcti->ack_pending, __ATOMIC_RELAXED); + + ack_snd(frcti, true); } -static struct frcti * frcti_create(int fd, - time_t a, - time_t r, - time_t mpl) +static int ack_arm(struct frcti * frcti) { - struct frcti * frcti; - ssize_t idx; - struct timespec now; - pthread_condattr_t cattr; + struct timespec now; + uint64_t deadline; + + if (__atomic_test_and_set(&frcti->ack_pending, __ATOMIC_RELAXED)) + return 0; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + deadline = TS_TO_UINT64(now) + ACK_DELAY_NS; + + tw_post(&frcti->ack_tw, deadline, ack_due, frcti); + + return 0; +} + +/* Forward decl breaks the keepalive cycle: ka_arm <-> ka_due. */ +static void ka_due(void * arg); + +static int ka_arm(struct frcti * frcti) +{ + struct timespec now; + uint64_t now_ns; + uint64_t timeo_ns; + uint64_t snd_ns; + uint64_t rcv_ns; + uint64_t deadline; + + timeo_ns = (uint64_t) frcti->qs_timeout * MILLION; /* IMM */ + snd_ns = LOAD_RELAXED(&frcti->snd_cr.act) + timeo_ns / 4; + rcv_ns = LOAD_RELAXED(&frcti->rcv_cr.act) + timeo_ns; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + deadline = MIN(snd_ns, rcv_ns); + if (deadline <= now_ns) + deadline = now_ns + timeo_ns / 4; + + tw_post(&frcti->ka_tw, deadline, ka_due, frcti); + + return 0; +} + +__attribute__((cold)) +static void ka_snd(struct frcti * frcti) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + struct timespec now; + uint64_t now_ns; + time_t timeo_ns; + uint64_t rcv_act; + uint64_t ka_rcv; + int64_t rcv_idle; + int64_t snd_idle; + uint32_t ackno; + + assert(frcti); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + timeo_ns = (time_t)(frcti->qs_timeout) * MILLION; /* IMM */ + rcv_act = LOAD_RELAXED(&frcti->rcv_cr.act); + ka_rcv = LOAD_RELAXED(&frcti->t_ka_rcv); + rcv_idle = ts_age_ns(now_ns, rcv_act > ka_rcv ? 
rcv_act : ka_rcv); + snd_idle = ts_age_ns(now_ns, LOAD_RELAXED(&frcti->snd_cr.act)); + + if (rcv_idle > timeo_ns) { + frct_mark_peer_dead(frcti); + return; + } + + if (snd_idle <= timeo_ns / 4) { + ka_arm(frcti); + return; + } + + if (frct_ctrl_alloc(&spb, &pci, 0) < 0) { + ka_arm(frcti); + return; + } + + ackno = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + pci->flags = hton16(FRCT_KA | FRCT_ACK); + pci->ackno = hton32(ackno); + + frct_hcs_set(pci, false); + + STAT_BUMP(frcti, ka_snd); + frct_tx(frcti, spb); + + ka_arm(frcti); +} + +/* Keepalive timer: re-posted by the fire callback itself. */ +static void ka_due(void * arg) +{ + ka_snd((struct frcti *) arg); +} + +static void frcti_rdv_snd(struct frcti * frcti) +{ + frcti_pkt_snd(frcti, FRCT_RDVS, 0, 0); +} + +#define HAS_RESCNTL(cr) ((cr)->cflags & FRCTFRESCNTL) +static bool frcti_is_window_open(struct frcti * frcti) +{ + struct frct_cr * snd_cr = &frcti->snd_cr; + struct timespec now; + time_t diff; + bool ret = false; + + if (!HAS_RESCNTL(snd_cr)) + return true; + + if (before(snd_cr->seqno, LOAD_RELAXED(&snd_cr->rwe))) + return true; + + /* Window may be closed; wrlock for RDV state mutations. */ + pthread_rwlock_wrlock(&frcti->lock); + + if (before(snd_cr->seqno, snd_cr->rwe)) { + ret = true; + goto unlock; + } + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + if (frcti->open) { + frcti->open = false; + frcti->t_wnd = now; + frcti->t_last_rdv = now; + goto unlock; + } + + diff = ts_diff_ns(&now, &frcti->t_wnd); + if (diff > MAX_RDV) + goto unlock; + + diff = ts_diff_ns(&now, &frcti->t_last_rdv); + if (diff > (time_t) frcti->t_rdv) { + frcti->t_last_rdv = now; + frcti_rdv_snd(frcti); + STAT_BUMP(frcti, rdv_snd); + } + unlock: + pthread_rwlock_unlock(&frcti->lock); + + return ret; +} + +/* n contiguous seqnos free? No RDV: the n=1 path drives it. 
*/ +static bool frcti_is_window_open_n(struct frcti * frcti, + size_t n) +{ + struct frct_cr * snd_cr = &frcti->snd_cr; + + if (!HAS_RESCNTL(snd_cr)) + return true; + + if (n <= 1) + return frcti_is_window_open(frcti); + + return before(snd_cr->seqno + (uint32_t)(n - 1), + LOAD_RELAXED(&snd_cr->rwe)); +} + +static void release_rq(struct frcti * frcti) +{ + size_t i; + + for (i = 0; i < RQ_SIZE; ++i) { + if (frcti->rcv_slots[i].idx == -1) + continue; + + /* Stream rq entries are sentinels (no spb owned). */ + if (!frcti->stream) + frct_spb_release_idx(frcti->rcv_slots[i].idx); + + frcti->rcv_slots[i].idx = -1; + STAT_BUMP(frcti, rq_released); + } +} + +static __inline__ bool stream_ring_sz_ok(struct frcti * frcti, + size_t n) +{ + size_t per_pkt; + + if (n > FRCT_STREAM_RING_SZ_MAX) + return false; + + if ((n & (n - 1)) != 0) + return false; + + per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti); + + return n >= FRCT_STREAM_RING_MIN_PKTS * per_pkt; +} + +/* Default ring sized for full RQ_SIZE seqno window; pow2, capped. 
*/ +static size_t default_stream_ring_sz(size_t per_pkt) +{ + size_t need; + size_t sz; + + need = (size_t) RQ_SIZE * per_pkt; + sz = FRCT_STREAM_RING_SZ; + + while (sz < need && sz < FRCT_STREAM_RING_SZ_MAX) + sz <<= 1; + + return sz; +} + +struct frcti * frcti_create(int fd, + uint64_t a, + uint64_t r, + uint64_t mpl, + time_t rtt_hint, + qosspec_t qs, + uint32_t mtu) +{ + struct frcti * frcti; + ssize_t idx; + struct timespec now; + uint64_t now_ns; + size_t bb; + size_t per_pkt; #ifdef PROC_FLOW_STATS - char frctstr[FRCT_NAME_STRLEN + 1]; + char frctstr[FRCT_NAME_STRLEN + 1]; #endif - mpl *= MILLION; - a *= BILLION; - r *= BILLION; + mpl *= MILLION; /* ms -> ns */ + a *= MILLION; /* ms -> ns */ + r *= MILLION; /* ms -> ns */ frcti = malloc(sizeof(*frcti)); if (frcti == NULL) @@ -349,56 +1824,76 @@ static struct frcti * frcti_create(int fd, memset(frcti, 0, sizeof(*frcti)); + list_head_init(&frcti->rxm_list); + if (pthread_rwlock_init(&frcti->lock, NULL)) goto fail_lock; - if (pthread_mutex_init(&frcti->mtx, NULL)) - goto fail_mutex; - - if (pthread_condattr_init(&cattr)) - goto fail_cattr; -#ifndef __APPLE__ - pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK); -#endif - if (pthread_cond_init(&frcti->cond, &cattr)) - goto fail_cond; - #ifdef PROC_FLOW_STATS sprintf(frctstr, "%d", fd); if (rib_reg(frctstr, &r_ops)) goto fail_rib_reg; #endif - pthread_condattr_destroy(&cattr); for (idx = 0; idx < RQ_SIZE; ++idx) - frcti->rq[idx] = -1; + frcti->rcv_slots[idx].idx = -1; clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + frcti->t_mpl = mpl; + frcti->t_a = a; + frcti->t_r = r; + frcti->t_rdv = DELT_RDV; + frcti->fd = fd; + frcti->ber = (time_t) qs.ber; + frcti->lossy = (qs.loss != 0); + frcti->qs_timeout = (time_t) qs.timeout; + + frcti->frag_mtu = (size_t) mtu; + + /* Cap blocks per SACK at what fits in the per-flow frag_mtu. 
*/ + bb = (frcti->frag_mtu - FRCT_PCILEN - SACK_HDR_SIZE) + / SACK_BLOCK_SIZE; + if (bb > SACK_MAX_BLOCKS) + bb = SACK_MAX_BLOCKS; + frcti->sack_n_max = (uint16_t) bb; + + frcti->max_rcv_sdu = FRCT_MAX_SDU; + + frcti->stream = (qs.service == SVC_STREAM); + if (frcti->stream) { + per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti); + frcti->rcv_ring_sz = default_stream_ring_sz(per_pkt); + frcti->ring_seq_cap = + (uint32_t) (frcti->rcv_ring_sz / per_pkt); + } - frcti->mpl = mpl; - frcti->a = a; - frcti->r = r; - frcti->rdv = DELT_RDV; - frcti->fd = fd; - - - frcti->rttseq = 0; - frcti->probe = false; - - frcti->srtt = 0; /* Updated on first ACK */ - frcti->mdev = 10 * MILLION; /* Updated on first ACK */ - frcti->rto = BILLION; /* Initial rxm will be after 1 s */ -#ifdef PROC_FLOW_STATS - frcti->n_rtx = 0; - frcti->n_prb = 0; - frcti->n_rtt = 0; - frcti->n_dup = 0; - frcti->n_dak = 0; - frcti->n_rdv = 0; - frcti->n_out = 0; - frcti->n_rqo = 0; -#endif - if (proc.flows[fd].info.qs.loss == 0) { + frcti->rto_min = (time_t) MAX(RTO_MIN, 1ULL << RXMQ_RES); + rtt_init(frcti, rtt_hint); + frcti->t_min_rtt = now_ns; + frcti->probe_id_next = 1; + frcti->t_rcv_rtt = now_ns; + frcti->t_snd_probe = now_ns; + frcti->t_snd_sack = 0; + frcti->sack_lwe = 0; + frcti->sack_n = 0; + frcti->dsack_seqno = 0; + frcti->dsack_valid = false; + frcti->reo_wnd_mult = 1; + frcti->dsack_lwe_snap = 0; + frcti->t_last_reo_widen = 0; + /* So the first pre-DRF NACK fires without waiting cooldown. 
*/ + frcti->t_nack = now_ns - BILLION; + frcti->in_recovery = false; + frcti->recovery_high = 0; + frcti->rack_fired_lwe = 0; + + tw_init_entry(&frcti->ack_tw); + tw_init_entry(&frcti->ka_tw); + tw_init_entry(&frcti->tlp_tw); + + if (!frcti->lossy) { frcti->snd_cr.cflags |= FRCTFRTX | FRCTFLINGER; frcti->rcv_cr.cflags |= FRCTFRTX; } @@ -406,24 +1901,31 @@ static struct frcti * frcti_create(int fd, frcti->snd_cr.cflags |= FRCTFRESCNTL; frcti->snd_cr.rwe = START_WINDOW; + if (frcti->lossy) + frcti->snd_cr.rwe = RQ_SIZE; + + frcti->snd_cr.inact = 3 * mpl + a + r + BILLION; /* ns */ + frcti->snd_cr.act = now_ns - frcti->snd_cr.inact - BILLION; - frcti->snd_cr.inact = (3 * mpl + a + r) / BILLION + 1; /* s */ - frcti->snd_cr.act.tv_sec = now.tv_sec - (frcti->snd_cr.inact + 1); + frcti->rcv_cr.inact = 2 * mpl + a + r + BILLION; /* ns */ + frcti->rcv_cr.act = now_ns - frcti->rcv_cr.inact - BILLION; - frcti->rcv_cr.inact = (2 * mpl + a + r) / BILLION + 1; /* s */ - frcti->rcv_cr.act.tv_sec = now.tv_sec - (frcti->rcv_cr.inact + 1); + frcti->t_ka_rcv = now_ns; + + /* qs_timeout == 0: no KA, silent peer crash goes undetected. */ + if (frcti->qs_timeout > 0) { + if (ka_arm(frcti) < 0) + goto fail_ka_arm; + } return frcti; + fail_ka_arm: #ifdef PROC_FLOW_STATS + sprintf(frctstr, "%d", fd); + rib_unreg(frctstr); fail_rib_reg: - pthread_cond_destroy(&frcti->cond); #endif - fail_cond: - pthread_condattr_destroy(&cattr); - fail_cattr: - pthread_mutex_destroy(&frcti->mtx); - fail_mutex: pthread_rwlock_destroy(&frcti->lock); fail_lock: free(frcti); @@ -431,21 +1933,55 @@ static struct frcti * frcti_create(int fd, return NULL; } -static void frcti_destroy(struct frcti * frcti) +void frcti_destroy(struct frcti * frcti) { #ifdef PROC_FLOW_STATS char frctstr[FRCT_NAME_STRLEN + 1]; +#endif + /* Drop every wheel entry referencing frcti before freeing it. 
*/ + rxm_cancel_all(frcti); + tw_cancel(&frcti->ack_tw); + tw_cancel(&frcti->ka_tw); + tw_cancel(&frcti->tlp_tw); + +#if defined(PROC_FLOW_STATS) && defined(FRCT_DEBUG_STDOUT) + printf("[FRCT teardown] pid=%d fd=%d " + "sdu_snd=%zu sdu_reasm=%zu sdu_sole=%zu " + "frag_snd=%zu frag_rcv=%zu frag_drop=%zu " + "rxm_rto=%zu rxm_sack=%zu rxm_dup=%zu " + "rxm_due=%zu acked=%zu unowned=%zu aged=%zu defer=%zu " + "cancel=%zu arm_fail=%zu inflight=%u " + "nack_snd=%zu nack_rcv=%zu inact_drop=%zu " + "drf_rebase=%zu rq_released=%zu\n", + (int) getpid(), frcti->fd, + frcti->stat.sdu_snd_frag, frcti->stat.sdu_reasm, + frcti->stat.sdu_sole, + frcti->stat.frag_snd, frcti->stat.frag_rcv, + frcti->stat.frag_drop, + frcti->stat.rxm_rto, frcti->stat.rxm_sack, + frcti->stat.rxm_dupthresh, + frcti->stat.rxm_due_count, frcti->stat.rxm_due_acked, + frcti->stat.rxm_due_unowned, frcti->stat.rxm_due_aged, + frcti->stat.rxm_due_defer, + frcti->stat.rxm_cancel, frcti->stat.rxm_arm_fail, + frcti->snd_cr.seqno - frcti->snd_cr.lwe, + frcti->stat.nack_snd, frcti->stat.nack_rcv, + frcti->stat.inact_drop, + frcti->stat.drf_rebase, frcti->stat.rq_released); +#endif + + release_rq(frcti); + free(frcti->rcv_ring); +#ifdef PROC_FLOW_STATS sprintf(frctstr, "%d", frcti->fd); rib_unreg(frctstr); #endif - pthread_cond_destroy(&frcti->cond); - pthread_mutex_destroy(&frcti->mtx); pthread_rwlock_destroy(&frcti->lock); free(frcti); } -static uint16_t frcti_getflags(struct frcti * frcti) +uint16_t frcti_getflags(struct frcti * frcti) { uint16_t ret; @@ -453,89 +1989,91 @@ static uint16_t frcti_getflags(struct frcti * frcti) pthread_rwlock_rdlock(&frcti->lock); - ret = frcti->snd_cr.cflags; + ret = frcti->snd_cr.cflags & FRCTFMASK; pthread_rwlock_unlock(&frcti->lock); return ret; } -static void frcti_setflags(struct frcti * frcti, - uint16_t flags) +void frcti_setflags(struct frcti * frcti, + uint16_t flags) { - flags |= FRCTFRTX; /* Should not be set by command */ - assert(frcti); - 
pthread_rwlock_wrlock(&frcti->lock); + flags &= FRCTFSETMASK; - frcti->snd_cr.cflags &= FRCTFRTX; /* Zero other flags */ + pthread_rwlock_wrlock(&frcti->lock); - frcti->snd_cr.cflags &= flags; + frcti->snd_cr.cflags = (frcti->snd_cr.cflags & ~FRCTFSETMASK) | flags; pthread_rwlock_unlock(&frcti->lock); } -#define frcti_queued_pdu(frcti) \ - (frcti == NULL ? idx : __frcti_queued_pdu(frcti)) +size_t frcti_get_max_rcv_sdu(struct frcti * frcti) +{ + size_t ret; -#define frcti_snd(frcti, spb) \ - (frcti == NULL ? 0 : __frcti_snd(frcti, spb)) + assert(frcti); + + pthread_rwlock_rdlock(&frcti->lock); + ret = frcti->max_rcv_sdu; + pthread_rwlock_unlock(&frcti->lock); -#define frcti_rcv(frcti, spb) \ - (frcti == NULL ? 0 : __frcti_rcv(frcti, spb)) + return ret; +} -#define frcti_dealloc(frcti) \ - (frcti == NULL ? 0 : __frcti_dealloc(frcti)) +int frcti_set_max_rcv_sdu(struct frcti * frcti, + size_t max) +{ + assert(frcti); -#define frcti_is_window_open(frcti) \ - (frcti == NULL ? true : __frcti_is_window_open(frcti)) + if (max == 0) + return -EINVAL; -#define frcti_window_wait(frcti, abstime) \ - (frcti == NULL ? 0 : __frcti_window_wait(frcti, abstime)) + pthread_rwlock_wrlock(&frcti->lock); + frcti->max_rcv_sdu = max; + pthread_rwlock_unlock(&frcti->lock); + return 0; +} -static bool __frcti_is_window_open(struct frcti * frcti) +size_t frcti_get_rcv_ring_sz(struct frcti * frcti) { - struct frct_cr * snd_cr = &frcti->snd_cr; - bool ret = true; + size_t ret; + + assert(frcti); pthread_rwlock_rdlock(&frcti->lock); + ret = frcti->rcv_ring_sz; + pthread_rwlock_unlock(&frcti->lock); - if (snd_cr->cflags & FRCTFRESCNTL) - ret = before(snd_cr->seqno, snd_cr->rwe); + return ret; +} - if (!ret) { - struct timespec now; +/* Set before any stream byte has been delivered; -EBUSY otherwise. 
*/ +int frcti_set_rcv_ring_sz(struct frcti * frcti, + size_t n) +{ + int ret = 0; + size_t per_pkt; - clock_gettime(PTHREAD_COND_CLOCK, &now); + assert(frcti); - pthread_mutex_lock(&frcti->mtx); - if (frcti->open) { - frcti->open = false; - frcti->t_wnd = now; - frcti->t_rdvs = now; - } else { - time_t diff; - diff = ts_diff_ns(&now, &frcti->t_wnd); - if (diff > MAX_RDV) { - pthread_mutex_unlock(&frcti->mtx); - pthread_rwlock_unlock(&frcti->lock); - return false; - } - - diff = ts_diff_ns(&now, &frcti->t_rdvs); - if (diff > frcti->rdv) { - frcti->t_rdvs = now; - __send_rdv(frcti->fd); -#ifdef PROC_FLOW_STATS - frcti->n_rdv++; -#endif + if (!frcti->stream) + return -ENOTSUP; + if (!stream_ring_sz_ok(frcti, n)) + return -EINVAL; - } - } + per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti); + + pthread_rwlock_wrlock(&frcti->lock); - pthread_mutex_unlock(&frcti->mtx); + if (frcti->rcv_ring != NULL) { + ret = -EBUSY; + } else { + frcti->rcv_ring_sz = n; + frcti->ring_seq_cap = (uint32_t) (n / per_pkt); } pthread_rwlock_unlock(&frcti->lock); @@ -543,392 +2081,2130 @@ static bool __frcti_is_window_open(struct frcti * frcti) return ret; } -static int __frcti_window_wait(struct frcti * frcti, - struct timespec * abstime) +time_t frcti_get_rto_min(struct frcti * frcti) { - struct frct_cr * snd_cr = &frcti->snd_cr; - int ret = 0; + time_t v; + + assert(frcti); pthread_rwlock_rdlock(&frcti->lock); + v = frcti->rto_min; + pthread_rwlock_unlock(&frcti->lock); - if (!(snd_cr->cflags & FRCTFRESCNTL)) { - pthread_rwlock_unlock(&frcti->lock); + return v; +} + +/* Floor at the timer-wheel resolution; finer granularity is unrepresentable. 
*/ +int frcti_set_rto_min(struct frcti * frcti, + time_t rto_min) +{ + time_t floor = (time_t) (1ULL << RXMQ_RES); + time_t rto_floor; + time_t rto; + + assert(frcti); + + if (rto_min < floor) + return -EINVAL; + + pthread_rwlock_wrlock(&frcti->lock); + + frcti->rto_min = rto_min; + if (frcti->srtt > 0) { + rto_floor = MAX(rto_min, 2 * frcti->srtt); + rto = MAX(rto_floor, + frcti->srtt + (frcti->mdev << MDEV_MUL)); + STORE_RELEASE(&frcti->rto, rto); + } else if (frcti->rto < rto_min) { + STORE_RELEASE(&frcti->rto, rto_min); + } + + pthread_rwlock_unlock(&frcti->lock); + + return 0; +} + +/* Re-arm a fresh rxm so a lost fast-retx still recovers via RTO. */ +static void sack_rxm_snd(struct frcti * frcti, + void * pkt, + size_t len) +{ + struct ssm_pk_buff * spb; + const struct frct_pci * pci; + struct rxm_entry * rxm; + uint32_t rcv_lwe; + uint32_t seqno; + int ret; + + rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream); + if (spb == NULL) + return; + + pci = (const struct frct_pci *) ssm_pk_buff_head(spb); + seqno = ntoh32(pci->seqno); + + rxm = rxm_alloc(frcti, ssm_pk_buff_len(spb)); + if (rxm == NULL) { + frct_spb_release(spb); + return; + } + rxm_arm(frcti, seqno, rxm, spb); + + STAT_BUMP(frcti, rxm_sack); + ret = frct_tx(frcti, spb); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); +} + +/* Additive HoL emit; original snd_slots[hp].rxm stays armed (NewReno). */ +static int fast_rxm_send(struct frcti * frcti, + void * pkt, + size_t len) +{ + struct ssm_pk_buff * spb; + uint32_t rcv_lwe; + + rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe); + + spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream); + if (spb == NULL) return 0; + + return frct_tx(frcti, spb); +} + +/* PCI bytes survive head_release at receive; just rewind the pointer. 
*/ +static __inline__ uint16_t frag_role_peek(struct ssm_pk_buff * spb) +{ + const struct frct_pci * pci; + + assert(ssm_pk_buff_head(spb) != NULL); + + pci = (const struct frct_pci *) (ssm_pk_buff_head(spb) - FRCT_PCILEN); + + return ntoh16(pci->flags) & FRCT_FR_MASK; +} + +enum frag_state { + FRAG_NOT_READY, /* head missing / FIRST..LAST run incomplete */ + FRAG_DELIVER, /* *count fragments form a deliverable SDU */ + FRAG_DROP, /* *count fragments at lwe are malformed */ +}; + +/* + * On a gap in the run: FRTX waits (NOT_READY); best-effort scans forward + * for the next FIRST/SOLE and returns DROP for the broken prefix. *count + * gets the offset from the trailing edge. NOT_READY if no later run is + * in window. Caller rdlock. + */ +static enum frag_state frag_inspect_gap(struct frcti * frcti, + size_t start, + size_t * count) +{ + const struct rcv_slot * slots = frcti->rcv_slots; + struct ssm_pk_buff * spb; + uint32_t k; + uint16_t role; + size_t m; + + if (frcti->rcv_cr.cflags & FRCTFRTX) + return FRAG_NOT_READY; + + k = frcti->rcv_cr.rwe - RQ_SIZE; + + for (m = start; m < RQ_SIZE; ++m) { + if (slots[RQ_SLOT(k + m)].idx == -1) + continue; + + spb = rq_frag(frcti, k + m); + role = frag_role_peek(spb); + + if (role == FRCT_FR_SOLE || role == FRCT_FR_FIRST) { + if (m == 0) + return FRAG_NOT_READY; + + *count = m; + return FRAG_DROP; + } } - while (snd_cr->seqno == snd_cr->rwe && ret != -ETIMEDOUT) { - struct timespec now; - pthread_rwlock_unlock(&frcti->lock); - pthread_mutex_lock(&frcti->mtx); + return FRAG_NOT_READY; +} + +/* + * Inspect rq[lwe..]; set *count and return DELIVER/DROP/NOT_READY. DROP + * covers broken prefixes (mid/last at HoL, FIRST..[non-LAST]..new-FIRST). + * Non-FRTX flows skip past gaps to the next FIRST/SOLE. Caller rdlock. 
+ */ +static enum frag_state frag_run_inspect(struct frcti * frcti, + size_t * count) +{ + const struct rcv_slot * slots = frcti->rcv_slots; + struct ssm_pk_buff * spb; + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + uint16_t role; + size_t n = 0; + + if (slots[RQ_SLOT(k)].idx == -1) + return frag_inspect_gap(frcti, 0, count); + + spb = rq_frag(frcti, k); + role = frag_role_peek(spb); + + if (role == FRCT_FR_SOLE) { + *count = 1; + return FRAG_DELIVER; + } - if (frcti->open) { - clock_gettime(PTHREAD_COND_CLOCK, &now); + if (role != FRCT_FR_FIRST) { + *count = 1; + return FRAG_DROP; + } + + while (true) { + if (n == RQ_SIZE || slots[RQ_SLOT(k + n)].idx == -1) + return frag_inspect_gap(frcti, n, count); + + spb = rq_frag(frcti, k + n); + role = frag_role_peek(spb); + ++n; + + if (role == FRCT_FR_LAST) { + *count = n; + return FRAG_DELIVER; + } - frcti->t_wnd = now; - frcti->t_rdvs = now; - frcti->open = false; + if (n > 1 && role != FRCT_FR_MID) { + /* SOLE or new FIRST mid-run: drop the prefix. */ + *count = n - 1; + return FRAG_DROP; } + } +} + +/* Caller wrlock. Delivery edge is implicit: rwe - RQ_SIZE. */ +static void frag_drop(struct frcti * frcti, + size_t count) +{ + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + uint32_t edge; + size_t i; + + for (i = 0; i < count; ++i) { + size_t pos = RQ_SLOT(k + i); + + if (frcti->rcv_slots[pos].idx == -1) + continue; + + frct_spb_release_idx(frcti->rcv_slots[pos].idx); + frcti->rcv_slots[pos].idx = -1; + } + + frcti->rcv_cr.rwe += count; + + /* Drop may span a gap; pull lwe up to preserve rwe - RQ_SIZE <= lwe. */ + edge = frcti->rcv_cr.rwe - RQ_SIZE; + if (before(frcti->rcv_cr.lwe, edge)) + STORE_RELEASE(&frcti->rcv_cr.lwe, edge); +} + +/* Copy `count` fragments at rq[lwe..] into buf; release + advance lwe. 
*/ +static size_t frag_gather(struct frcti * frcti, + size_t count, + uint8_t * buf) +{ + struct ssm_pk_buff * frag; + size_t off = 0; + size_t i; + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; + + for (i = 0; i < count; ++i) { + size_t pos = RQ_SLOT(k + i); + size_t flen; + + frag = rq_frag(frcti, k + i); + flen = ssm_pk_buff_len(frag); + memcpy(buf + off, ssm_pk_buff_head(frag), flen); + off += flen; + frct_spb_release_idx(frcti->rcv_slots[pos].idx); + frcti->rcv_slots[pos].idx = -1; + } + + frcti->rcv_cr.rwe += count; + + return off; +} + +/* Caller holds lock. */ +static size_t frag_total_len(struct frcti * frcti, + size_t count, + bool * overflow) +{ + struct ssm_pk_buff * frag; + size_t total = 0; + size_t i; + uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE; - pthread_cleanup_push(__cleanup_mutex_unlock, &frcti->mtx); + *overflow = false; - ret = -__timedwait(&frcti->cond, &frcti->mtx, abstime); + for (i = 0; i < count; ++i) { + size_t flen; - pthread_cleanup_pop(false); + frag = rq_frag(frcti, k + i); + flen = ssm_pk_buff_len(frag); + if (total + flen < total) { + *overflow = true; + return 0; + } + total += flen; + } - if (ret == -ETIMEDOUT) { - time_t diff; + return total; +} - clock_gettime(PTHREAD_COND_CLOCK, &now); +/* + * Process a delivered slot at lwe: latch FIN if acceptable, + * advance byte_high (clamped to byte_fin once latched). + */ +static __inline__ void stream_deliver_slot(struct frcti * frcti, + size_t lp) +{ + uint32_t end; - diff = ts_diff_ns(&now, &frcti->t_wnd); - if (diff > MAX_RDV) { - pthread_mutex_unlock(&frcti->mtx); - return -ECONNRESET; /* write fails! 
*/ - } + end = frcti->rcv_slots[lp].end; - diff = ts_diff_ns(&now, &frcti->t_rdvs); - if (diff > frcti->rdv) { - frcti->t_rdvs = now; - __send_rdv(frcti->fd); - } + if (frcti->rcv_slots[lp].fin) { + if (end == frcti->rcv_byte_high && !frcti->rcv_fin_seen) { + frcti->rcv_fin_seen = true; + frcti->rcv_byte_fin = end; + } else { + STAT_BUMP(frcti, strm_fin_drop); } + } + + if (frcti->rcv_fin_seen && after(end, frcti->rcv_byte_fin)) + end = frcti->rcv_byte_fin; + + frcti->rcv_byte_high = end; +} + +/* Two-segment memcpy from buf into the rx ring at byte offset start. */ +static void stream_ring_write(struct frcti * frcti, + uint32_t start, + buffer_t buf) +{ + size_t mask = frcti->rcv_ring_sz - 1; + size_t off = start & mask; - pthread_mutex_unlock(&frcti->mtx); - pthread_rwlock_rdlock(&frcti->lock); + if (off + buf.len <= frcti->rcv_ring_sz) { + memcpy(frcti->rcv_ring + off, buf.data, buf.len); + } else { + size_t first = frcti->rcv_ring_sz - off; + memcpy(frcti->rcv_ring + off, buf.data, first); + memcpy(frcti->rcv_ring, buf.data + first, buf.len - first); } +} +/* Two-segment memcpy from the rx ring at byte offset start into buf. */ +static void stream_ring_read(struct frcti * frcti, + uint32_t start, + buffer_t buf) +{ + size_t mask = frcti->rcv_ring_sz - 1; + size_t off = start & mask; + + if (off + buf.len <= frcti->rcv_ring_sz) { + memcpy(buf.data, frcti->rcv_ring + off, buf.len); + } else { + size_t first = frcti->rcv_ring_sz - off; + memcpy(buf.data, frcti->rcv_ring + off, first); + memcpy(buf.data + first, frcti->rcv_ring, buf.len - first); + } +} + +/* Deliver-or-drop one stashed slot at lwe; advance lwe/rwe. Caller wrlock. 
*/ +static void stream_advance_lwe(struct frcti * frcti) +{ + size_t lp; + + lp = RQ_SLOT(frcti->rcv_cr.lwe); + + if (frcti->rcv_slots[lp].start != frcti->rcv_byte_high) + STAT_BUMP(frcti, strm_drop); + else + stream_deliver_slot(frcti, lp); + + frcti->rcv_slots[lp].fin = 0; + frcti->rcv_slots[lp].idx = -1; + STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1); + frcti->rcv_cr.rwe++; +} + +/* + * Validate a stream DATA packet before stashing. Returns 0 if the + * packet may be written into rcv_ring + rq[], -1 otherwise. + */ +static __inline__ int stream_stash_check(struct frcti * frcti, + uint32_t start, + uint32_t end, + size_t plen, + uint16_t flags) +{ + if (end - start != (uint32_t) plen) + return -1; + + /* FIN MUST be 0-byte. */ + if ((flags & FRCT_FIN) && plen != 0) + return -1; + + /* Post-EOS: no further FIN once latched. */ + if (frcti->rcv_fin_seen && (flags & FRCT_FIN)) + return -1; + + /* Post-EOS: reject data at or past byte_fin. */ + if (frcti->rcv_fin_seen && !before(start, frcti->rcv_byte_fin)) + return -1; + + /* Stale: peer is behind the delivered edge. */ + if (before(end, frcti->rcv_byte_next)) + return -1; + + /* Exact-edge: only an empty-stream FIN is meaningful. */ + if (end == frcti->rcv_byte_next && !(flags & FRCT_FIN)) + return -1; + + if (end - frcti->rcv_byte_next > frcti->rcv_ring_sz) + return -1; + + return 0; +} + +/* + * Stream-mode DATA receive: validate, stash payload in rcv_ring, mark + * rq[pos], advance lwe through any newly-contiguous run. Returns 0 + * (spb released) or -1 (caller releases). Caller wrlock. 
+ */ +static int frcti_stream_data_rcv(struct frcti * frcti, + struct ssm_pk_buff * spb, + size_t pos, + uint16_t flags) +{ + struct frct_pci_stream * spci; + uint32_t start; + uint32_t end; + buffer_t buf; + size_t skip; + + if (ssm_pk_buff_len(spb) < FRCT_PCI_STREAM_LEN) + return -1; + + if (frcti->rcv_ring == NULL) { + frcti->rcv_ring = calloc(1, frcti->rcv_ring_sz); + if (frcti->rcv_ring == NULL) + return -ENOMEM; + } + + spci = FRCT_HDR_POP(spb, frct_pci_stream); + start = ntoh32(spci->start); + end = ntoh32(spci->end); + + buf.data = ssm_pk_buff_head(spb); + buf.len = ssm_pk_buff_len(spb); + + if (stream_stash_check(frcti, start, end, buf.len, flags) < 0) + return -1; + + /* Trim front-overlap with already-delivered region. */ + if (before(start, frcti->rcv_byte_next)) { + skip = frcti->rcv_byte_next - start; + buf.data += skip; + buf.len -= skip; + start = frcti->rcv_byte_next; + } + + stream_ring_write(frcti, start, buf); + STAT_ADD(frcti, strm_rcv_byte, buf.len); + + frcti->rcv_slots[pos].idx = 1; + frcti->rcv_slots[pos].start = start; + frcti->rcv_slots[pos].end = end; + frcti->rcv_slots[pos].fin = (flags & FRCT_FIN) ? 1 : 0; + + while (frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1) + stream_advance_lwe(frcti); + + frct_spb_release(spb); + + return 0; +} + +/* + * DATA receive: stash idx at rq[pos], advance lwe through any + * contiguous run. Caller wrlock. + */ +static void frcti_data_stash(struct frcti * frcti, + ssize_t idx, + size_t pos, + uint16_t flags) +{ + frcti->rcv_slots[pos].idx = idx; + + if ((flags & FRCT_FR_MASK) != FRCT_FR_SOLE) + STAT_BUMP(frcti, frag_rcv); + + /* lwe = cum-ACK edge; advance per fragment through contiguous run. */ + while (before(frcti->rcv_cr.lwe, frcti->rcv_cr.rwe) + && frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1) + STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1); +} + +/* Stream consume: copy up to `count` contiguous bytes from ring into buf. 
*/ +static ssize_t frcti_consume_stream(struct frcti * frcti, + uint8_t * buf, + size_t count) +{ + size_t avail; + size_t copy; + ssize_t ret; + buffer_t dst; + + assert(frcti); + + pthread_rwlock_wrlock(&frcti->lock); + + avail = (size_t) (frcti->rcv_byte_high - frcti->rcv_byte_next); + if (avail == 0) { + /* EOS drained: signal EOF to the reader. */ + if (frcti->rcv_fin_seen + && frcti->rcv_byte_next == frcti->rcv_byte_fin) + ret = 0; + else + ret = -EAGAIN; + goto unlock; + } + + copy = MIN(avail, count); + + dst.data = buf; + dst.len = copy; + stream_ring_read(frcti, frcti->rcv_byte_next, dst); + + frcti->rcv_byte_next += (uint32_t) copy; + STAT_ADD(frcti, strm_dlv_byte, copy); + + ret = (ssize_t) copy; + + unlock: pthread_rwlock_unlock(&frcti->lock); return ret; } -static ssize_t __frcti_queued_pdu(struct frcti * frcti) +/* + * FRTX consume: copy next ready PDU (full SDU or nothing). Returns bytes, + * -EAGAIN (no PDU), or -EMSGSIZE (oversize: run dropped to unblock flow). + */ +static ssize_t frcti_consume(struct frcti * frcti, + uint8_t * buf, + size_t count) { - ssize_t idx; - size_t pos; + size_t n; + size_t total; + bool overflow; + enum frag_state st; + ssize_t ret; assert(frcti); - /* See if we already have the next PDU. 
*/ pthread_rwlock_wrlock(&frcti->lock); - pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1); - - idx = frcti->rq[pos]; - if (idx != -1) { - ++frcti->rcv_cr.lwe; - ++frcti->rcv_cr.rwe; - frcti->rq[pos] = -1; + while (true) { + st = frag_run_inspect(frcti, &n); + if (st == FRAG_NOT_READY) { + ret = -EAGAIN; + goto unlock; + } + if (st == FRAG_DROP) { + STAT_ADD(frcti, frag_drop, n); + frag_drop(frcti, n); + continue; + } + /* FRAG_DELIVER */ + total = frag_total_len(frcti, n, &overflow); + if (overflow || total > frcti->max_rcv_sdu || total > count) { + STAT_ADD(frcti, frag_drop, n); + frag_drop(frcti, n); + ret = -EMSGSIZE; + goto unlock; + } + ret = (ssize_t) frag_gather(frcti, n, buf); + if (n > 1) + STAT_BUMP(frcti, sdu_reasm); + else + STAT_BUMP(frcti, sdu_sole); + goto unlock; } + unlock: pthread_rwlock_unlock(&frcti->lock); - return idx; + return ret; } -static ssize_t __frcti_pdu_ready(struct frcti * frcti) +static bool frcti_pdu_ready(struct frcti * frcti) { - ssize_t idx; - size_t pos; + size_t pos; + size_t count; + bool ready; assert(frcti); - /* See if we already have the next PDU. */ pthread_rwlock_rdlock(&frcti->lock); - pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1); - idx = frcti->rq[pos]; + if (frcti->stream) { + ready = frcti->rcv_byte_high != frcti->rcv_byte_next; + pthread_rwlock_unlock(&frcti->lock); + return ready; + } + + if (frag_run_inspect(frcti, &count) != FRAG_DELIVER) { + /* Drop case: frcti_consume will handle it; not ready. */ + pthread_rwlock_unlock(&frcti->lock); + return false; + } + + pos = RQ_SLOT(frcti->rcv_cr.rwe - RQ_SIZE); + ready = frcti->rcv_slots[pos].idx != -1; + + pthread_rwlock_unlock(&frcti->lock); + + return ready; +} + +/* No srtt yet: probe at the cold-probe cadence to seed it. */ +#define PROBE_DUE_COLD(frcti, now_ns) \ + ((now_ns) - (frcti)->t_snd_probe > (uint64_t) RTTP_COLD_NS) + +/* Have srtt: probe when peer quiet for > 2*srtt and last probe > srtt. 
*/ +#define PROBE_DUE_WARM(frcti, now_ns) \ + ((now_ns) - (frcti)->t_rcv_rtt > 2u * (uint64_t)(frcti)->srtt \ + && (now_ns) - (frcti)->t_snd_probe > (uint64_t)(frcti)->srtt) + +/* Seeds srtt for receive-only sides so they don't fall back to 1 s RTO. */ +__attribute__((cold)) +static void frcti_rcv_probe(struct frcti * frcti, + uint64_t now_ns) +{ + uint32_t probe_id; + uint8_t nonce[RTTP_NONCE_LEN] = { 0 }; + + pthread_rwlock_wrlock(&frcti->lock); + + if (frcti->srtt == 0 && !PROBE_DUE_COLD(frcti, now_ns)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + if (frcti->srtt != 0 && !PROBE_DUE_WARM(frcti, now_ns)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + probe_id = rttp_alloc_probe(frcti, now_ns, nonce); pthread_rwlock_unlock(&frcti->lock); - return idx; + if (probe_id != 0) + frcti_rttp_snd(frcti, probe_id, 0, nonce); } -#include <timerwheel.c> +/* Echo at slot `pos` matches our probe: id, slot, nonce all intact. */ +static __inline__ bool probe_echo_matches(struct frcti * frcti, + size_t pos, + uint32_t echo_id, + const uint8_t nonce[RTTP_NONCE_LEN]) +{ + if (frcti->probes[pos].id != echo_id) + return false; + + if (frcti->probes[pos].ts == 0) + return false; + + return memcmp(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN) == 0; +} /* - * Send a final ACK for everything that has not been ACK'd. - * If the flow should be kept active for retransmission, - * the returned time will be negative. + * RTT probe (echo_id == 0): bounce the nonce back to peer. + * RTT echo (echo_id != 0): verify nonce + feed sample. 
*/ -static time_t __frcti_dealloc(struct frcti * frcti) +static void frcti_rttp_rcv(struct frcti * frcti, + buffer_t pkt, + uint64_t now_ns) { - struct timespec now; - time_t wait; - int ackno; - int fd = -1; + const struct frct_rttp * rttp; + uint32_t probe_id; + uint32_t echo_id; + uint8_t nonce[RTTP_NONCE_LEN]; + size_t ring_pos; + int64_t elapsed; + uint64_t sample; + + if (pkt.len < RTTP_PAYLOAD) + return; + + rttp = (const struct frct_rttp *) pkt.data; + probe_id = ntoh32(rttp->probe_id); + echo_id = ntoh32(rttp->echo_id); + + /* Forged/malformed: bouncing this would loop on echo_id == 0. */ + if (probe_id == 0 && echo_id == 0) + return; + + memcpy(nonce, rttp->nonce, sizeof(nonce)); + + if (echo_id == 0) { + /* Probe: echo back with same nonce so peer can verify. */ + STAT_BUMP(frcti, rttp_rcv); + frcti_rttp_snd(frcti, 0, probe_id, nonce); + return; + } + + ring_pos = RTTP_POS(echo_id); + + pthread_rwlock_wrlock(&frcti->lock); + + if (!probe_echo_matches(frcti, ring_pos, echo_id, nonce)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + elapsed = ts_age_ns(now_ns, frcti->probes[ring_pos].ts); + frcti->probes[ring_pos].ts = 0; + frcti->t_rcv_rtt = now_ns; + + if (elapsed <= 0) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + sample = (uint64_t) elapsed; + + /* Clamp probe sample to RTT_CLAMP_MUL * srtt to avoid poisoning. */ + if (frcti->srtt > 0) + sample = MIN(sample, (uint64_t) frcti->srtt * RTT_CLAMP_MUL); + + rtt_update(frcti, sample, now_ns); + + pthread_rwlock_unlock(&frcti->lock); +} + +/* Honours piggybacked ACK on the KA. 
*/ +static void frcti_ka_rcv(struct frcti * frcti, + const struct frct_pci * pci, + uint64_t now_ns, + uint16_t flags) +{ + uint32_t ka_ackno; + + STORE_RELEASE(&frcti->t_ka_rcv, now_ns); + STAT_BUMP(frcti, ka_rcv); + + if (!(flags & FRCT_ACK)) + return; + + ka_ackno = ntoh32(pci->ackno); + + pthread_rwlock_wrlock(&frcti->lock); + + if (within(ka_ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno)) + STORE_RELEASE(&frcti->snd_cr.lwe, ka_ackno); + + pthread_rwlock_unlock(&frcti->lock); +} + +/* + * Additive HoL re-emit (carries DRF); runs before rcv_cr->act + * refresh so it doesn't pre-empt peer's first DRF. + */ +__attribute__((cold)) +static void frcti_nack_rcv(struct frcti * frcti) +{ + struct timespec now; + uint64_t now_ns; + size_t hp; + struct rxm_entry * rxm; + void * pkt_copy = NULL; + size_t pkt_len = 0; clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_rwlock_wrlock(&frcti->lock); + + STAT_BUMP(frcti, nack_rcv); + + if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + hp = RQ_SLOT(frcti->snd_cr.lwe); + rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm); + if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) { + pthread_rwlock_unlock(&frcti->lock); + return; + } + + pkt_copy = malloc(rxm->len); + if (pkt_copy != NULL) { + memcpy(pkt_copy, rxm->pkt, rxm->len); + pkt_len = rxm->len; + /* Karn: suppress RTT sample. NACK supersedes pending TLP. 
*/ + frcti->snd_slots[hp].flags = + (frcti->snd_slots[hp].flags & ~SND_TLP) + | SND_RTX | SND_FAST_RXM; + frcti->rtt_lwe = frcti->snd_cr.lwe + 1; + STAT_BUMP(frcti, rxm_nack); + } + + pthread_rwlock_unlock(&frcti->lock); + + if (pkt_copy != NULL) { + int ret = fast_rxm_send(frcti, pkt_copy, pkt_len); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); + free(pkt_copy); + } +} + +__attribute__((cold)) +static void frcti_rdv_rcv(struct frcti * frcti) +{ + uint32_t rwe; pthread_rwlock_rdlock(&frcti->lock); - ackno = frcti->rcv_cr.lwe; - if (frcti->rcv_cr.lwe != frcti->rcv_cr.seqno) - fd = frcti->fd; + rwe = frcti_advert_rwe(frcti); + + pthread_rwlock_unlock(&frcti->lock); + + STAT_BUMP(frcti, rdv_rcv); + + frcti_pkt_snd(frcti, FRCT_FC, 0, rwe); +} + +/* §7.2: PTO = 2*SRTT + max delayed-ACK delay; fallback when unseeded. */ +static __inline__ uint64_t tlp_pto(const struct frcti * frcti) +{ + if (frcti->srtt > 0) + return 2ULL * (uint64_t) frcti->srtt + ACK_DELAY_NS; + + return NACK_COOLDOWN_NS; +} + +/* + * RFC 8985 §7: lazy probe. Re-evaluate on fire — if sender was active + * within PTO, re-post; else probe HoL once and hand off to RTO. + */ +__attribute__((cold)) +static void tlp_due(void * arg) +{ + struct frcti * frcti = arg; + struct timespec now; + uint64_t now_ns; + uint64_t pto; + uint64_t rto_at; + size_t hp; + struct rxm_entry * rxm; + void * pkt_copy = NULL; + size_t pkt_len = 0; + bool re_post = false; + uint64_t deadline = 0; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_rwlock_wrlock(&frcti->lock); + + if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) + goto unlock; - wait = MAX(frcti->rcv_cr.inact - now.tv_sec + frcti->rcv_cr.act.tv_sec, - frcti->snd_cr.inact - now.tv_sec + frcti->snd_cr.act.tv_sec); - wait = MAX(wait, 0); + if (!before(frcti->snd_cr.seqno, frcti->snd_cr.rwe)) + goto unlock; /* FC-blocked: RDV handles it. 
*/ - if (frcti->snd_cr.cflags & FRCTFLINGER - && before(frcti->snd_cr.lwe, frcti->snd_cr.seqno)) - wait = -wait; + /* RFC 8985 §7.3: one outstanding probe, MAX_TLP_PER_EP per ep. */ + if (frcti->tlp_high_seq != 0) + goto unlock; + if (frcti->tlp_count >= MAX_TLP_PER_EP) + goto unlock; + + pto = tlp_pto(frcti); + + /* §7.2: anchor PTO on most recent send; defer if still active. */ + if (now_ns < frcti->snd_cr.act + pto) { + deadline = frcti->snd_cr.act + pto; + re_post = true; + goto unlock; + } + + hp = RQ_SLOT(frcti->snd_cr.lwe); + rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm); + if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) + goto unlock; + + /* Cap: if HoL RTO is due, let rxm_due fire instead. */ + rto_at = rxm->t0 + ((uint64_t) frcti->rto + << LOAD_RELAXED(&frcti->rto_mul)); + if (rto_at <= now_ns) + goto unlock; + + pkt_copy = malloc(rxm->len); + if (pkt_copy != NULL) { + memcpy(pkt_copy, rxm->pkt, rxm->len); + pkt_len = rxm->len; + frcti->snd_slots[hp].time = now_ns; + frcti->snd_slots[hp].flags |= SND_TLP | SND_FAST_RXM; + frcti->rtt_lwe = frcti->snd_cr.lwe + 1; + /* §7.3 outstanding-probe marker; ack_rcv/rxm_snd clear. */ + frcti->tlp_high_seq = frcti->snd_cr.seqno; + frcti->tlp_count++; + STAT_BUMP(frcti, tlp_snd); + } + + unlock: pthread_rwlock_unlock(&frcti->lock); - if (fd != -1) - __send_frct_pkt(fd, FRCT_ACK, ackno, 0); + if (pkt_copy != NULL) { + fast_rxm_send(frcti, pkt_copy, pkt_len); + free(pkt_copy); + } - return wait; + if (re_post) + tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti); + else + __atomic_clear(&frcti->tlp_pending, __ATOMIC_RELAXED); } -static int __frcti_snd(struct frcti * frcti, - struct ssm_pk_buff * spb) +/* §7.2 lazy: post once per quiet period. tlp_due re-evaluates on fire. 
*/ +static int tlp_arm(struct frcti * frcti) { - struct frct_pci * pci; - struct timespec now; - struct frct_cr * snd_cr; - struct frct_cr * rcv_cr; - uint32_t seqno; - bool rtx; + struct timespec now; + uint64_t now_ns; + uint64_t pto; + uint64_t deadline; + + /* §7.3: one outstanding probe, MAX_TLP_PER_EP per recovery ep. */ + if (LOAD_RELAXED(&frcti->tlp_high_seq) != 0) + return 0; + if (LOAD_RELAXED(&frcti->tlp_count) >= MAX_TLP_PER_EP) + return 0; + if (__atomic_test_and_set(&frcti->tlp_pending, __ATOMIC_RELAXED)) + return 0; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pto = tlp_pto(frcti); + + deadline = LOAD_RELAXED(&frcti->snd_cr.act) + pto; + if (deadline <= now_ns) + deadline = now_ns + pto; + + tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti); + + return 0; +} + +/* + * FC window advert from any flag-bearing packet. Caps at lwe + RQ_SIZE, + * rejects backward shrink (forged/stale FC), marks window open. + * Caller wrlock. + */ +static __inline__ void frcti_fc_rcv(struct frcti * frcti, + const struct frct_pci * pci) +{ + struct frct_cr * snd_cr; + uint32_t rwe; + uint32_t rwe_max; + + snd_cr = &frcti->snd_cr; + rwe = ntoh32(pci->window); + rwe_max = snd_cr->lwe + RQ_SIZE; + + if (after(rwe, rwe_max)) + rwe = rwe_max; + + /* Reject backward shrink (forged/stale FC). */ + if (before(rwe, snd_cr->rwe)) + rwe = snd_cr->rwe; + + STORE_RELAXED(&snd_cr->rwe, rwe); + frcti->open = true; +} + +/* Packet copies captured under frcti->lock; emitted after release. */ +struct pending { + buffer_t fast_rxm; + buffer_t sack_rxm[SACK_RXM_MAX]; + size_t sack_rxm_cnt; +}; + +/* RFC 6582 §3.2: seal recovery_high on entry; do not extend on new gaps. */ +static void recovery_enter(struct frcti * frcti) +{ + if (frcti->in_recovery) + return; + + frcti->in_recovery = true; + frcti->recovery_high = frcti->snd_cr.seqno + RTT_QUARANTINE; +} + +/* True when cum-ACK clears recovery_high or all in-flight ACKed. 
*/ +static bool recovery_exit_reached(struct frcti * frcti, + uint32_t ackno) +{ + if (!frcti->in_recovery) + return false; + + if (!before(ackno, frcti->recovery_high)) + return true; + + return ackno == frcti->snd_cr.seqno; +} + +/* RTT sample gate: Karn + SACK-consume + don't-seed. */ +static bool rtt_sample_eligible(struct frcti * frcti, + size_t p, + uint16_t flags, + uint32_t lwe) +{ + if (flags & FRCT_RXM) + return false; + if (frcti->snd_slots[p].flags & (SND_RTX | SND_TLP)) + return false; + if (LOAD_ACQUIRE(&frcti->snd_slots[p].rxm) == NULL) + return false; + if (before(lwe, frcti->rtt_lwe)) + return false; + /* Don't seed srtt from a cum-ACK; let probes seed. */ + if (frcti->srtt == 0) + return false; + return true; +} + +#define RXM_SLOT_EMPTY(rxm) ((rxm) == NULL) +#define FAST_RXM_STAGED(pending) ((pending)->fast_rxm.data != NULL) +#define RXM_FAST_DONE(flags) (((flags) & SND_FAST_RXM) != 0) + +/* RACK fast retransmit on cum-ACK: HoL aged past R, not yet retransmitted. */ +static void fast_rxm_consider(struct frcti * frcti, + uint64_t now_ns, + struct pending * pending) +{ + struct rxm_entry * rxm; + struct snd_slot * slot; + size_t hp; + uint64_t R; + bool rack_ok; + + hp = RQ_SLOT(frcti->snd_cr.lwe); + slot = &frcti->snd_slots[hp]; + rxm = LOAD_ACQUIRE(&slot->rxm); + R = rack_reorder_window(frcti); + + if (RXM_SLOT_EMPTY(rxm)) + return; + + /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */ + rack_ok = (int64_t)(frcti->t_latest_ack - slot->time) > (int64_t) R; + if (!rack_ok && frcti->dup_thresh < DUP_THRESH) + return; + + /* HoL aged past t_r; let rxm_due tear the flow down. */ + if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) + return; + + /* Already on it. 
*/ + if (FAST_RXM_STAGED(pending) || RXM_FAST_DONE(slot->flags)) + return; + + recovery_enter(frcti); + + pending->fast_rxm.data = malloc(rxm->len); + if (pending->fast_rxm.data == NULL) + return; + + pending->fast_rxm.len = rxm->len; + memcpy(pending->fast_rxm.data, rxm->pkt, rxm->len); + slot->flags |= SND_RTX | SND_FAST_RXM; + frcti->rtt_lwe = frcti->snd_cr.lwe + 1; + if (rack_ok) + STAT_BUMP(frcti, rxm_rack); + else + STAT_BUMP(frcti, rxm_dupthresh); +} + +/* Caller holds wrlock; RACK fast retransmit queued in pending. */ +__attribute__((hot)) +static void frcti_ack_rcv(struct frcti * frcti, + const struct frct_pci * pci, + uint16_t flags, + uint64_t now_ns, + struct pending * pending) +{ + uint32_t ackno; + uint32_t lwe; + size_t p; + size_t fresh; + + if (!(flags & FRCT_DATA)) + STAT_BUMP(frcti, ack_rcv); + + ackno = ntoh32(pci->ackno); + if (ackno == frcti->snd_cr.lwe) { + /* RFC 8985 §6.2: only on scoreboard change. */ + if (frcti->snd_cr.lwe != frcti->rack_fired_lwe) { + fast_rxm_consider(frcti, now_ns, pending); + frcti->rack_fired_lwe = frcti->snd_cr.lwe; + } + return; + } + + if (!within(ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno)) + return; + + lwe = frcti->snd_cr.lwe; + p = RQ_SLOT(lwe); + + STORE_RELEASE(&frcti->snd_cr.lwe, ackno); + + /* §7.3: cum-ACK past the probed seqno resolves the TLP. */ + if (frcti->tlp_high_seq != 0 + && !before(ackno, frcti->tlp_high_seq)) + frcti->tlp_high_seq = 0; + + /* §7.3: end the probe episode once inflight drains. */ + if (ackno == frcti->snd_cr.seqno) + frcti->tlp_count = 0; + + /* RFC 8985 §7.2: halve mult per REO_DECAY_PKTS fresh-ACK'd seqnos. */ + fresh = ackno - frcti->dsack_lwe_snap; + if (frcti->reo_wnd_mult > 1 && fresh >= REO_DECAY_PKTS) { + uint8_t half = frcti->reo_wnd_mult >> 1; + frcti->reo_wnd_mult = half < 1 ? 1 : half; + frcti->dsack_lwe_snap = ackno; + } + + /* RFC 8985: latest cum-ACKed send-time (slot of ackno-1). 
*/ + frcti->t_latest_ack = frcti->snd_slots[RQ_SLOT(ackno - 1)].time; + + /* RFC 8985: SACK-above-lwe count is per-recovery-episode. */ + frcti->dup_thresh = 0; + + /* Karn-skip on retx; TLP ACK clears rto_mul (no CC backoff). */ + if ((frcti->snd_slots[p].flags & SND_RTX) == 0 + || (frcti->snd_slots[p].flags & SND_TLP) != 0) + STORE_RELEASE(&frcti->rto_mul, 0); + + if (recovery_exit_reached(frcti, ackno)) + frcti->in_recovery = false; + + if (rtt_sample_eligible(frcti, p, flags, lwe)) { + int64_t mrtt = ts_age_ns(now_ns, frcti->snd_slots[p].time); + if (mrtt > 0) { + if (!(flags & FRCT_DATA)) + STAT_BUMP(frcti, ack_rtt); + rtt_update(frcti, (time_t) mrtt, now_ns); + frcti->t_rcv_rtt = now_ns; + } + } +} + +/* Skip k == lwe under clamp: NULLing HoL from a stale SACK wedges it. */ +static uint32_t sack_mark_blocks(struct frcti * frcti, + const uint8_t * payload, + uint16_t n, + uint32_t * newly_marked) +{ + uint32_t hi_sacked = frcti->snd_cr.lwe; + uint32_t marked = 0; + uint16_t i; + + for (i = 0; i < n; ++i) { + uint32_t s; + uint32_t e; + uint32_t k; + bool clamped; + + sack_block_get(payload, i, &s, &e); + + if (!before(s, e)) + continue; + + clamped = before(s, frcti->snd_cr.lwe); + if (clamped) + s = frcti->snd_cr.lwe; + if (after(e, frcti->snd_cr.seqno)) + e = frcti->snd_cr.seqno; + + for (k = s; before(k, e); ++k) { + size_t kp = RQ_SLOT(k); + uint64_t t_k; + if (clamped && k == frcti->snd_cr.lwe) + continue; + if (LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm) == NULL) + continue; + STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL); + frcti->snd_slots[kp].flags = 0; + marked++; + /* RACK.fack: latest SACK-confirmed send-time. */ + t_k = frcti->snd_slots[kp].time; + if (t_k > frcti->t_latest_ack) + frcti->t_latest_ack = t_k; + } + + if (after(e, hi_sacked)) + hi_sacked = e; + } + + *newly_marked = marked; + return hi_sacked; +} + +/* Queue once per loss event (SND_FAST_RXM gates). Emit after unlock. 
*/ +static void sack_queue_rxm(struct frcti * frcti, + uint32_t hi_sacked, + uint64_t now_ns, + struct pending * pending) +{ + uint64_t R = rack_reorder_window(frcti); + uint32_t k; + bool rack_ok; + + for (k = frcti->snd_cr.lwe; before(k, hi_sacked); ++k) { + struct rxm_entry * rxm; + size_t kp = RQ_SLOT(k); + size_t cnt = pending->sack_rxm_cnt; + size_t rack_age; + + rxm = LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm); + + if (cnt >= SACK_RXM_MAX) + break; + + if (rxm == NULL) + continue; + + if (frcti->snd_slots[kp].flags & SND_FAST_RXM) + continue; + + if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) + continue; + + rack_age = frcti->t_latest_ack - frcti->snd_slots[kp].time; + /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */ + rack_ok = (int64_t) rack_age > (int64_t) R; + if (!rack_ok && frcti->dup_thresh < DUP_THRESH) + continue; + + if (rack_ok) + STAT_BUMP(frcti, rxm_rack); + else + STAT_BUMP(frcti, rxm_dupthresh); + + pending->sack_rxm[cnt].data = malloc(rxm->len); + if (pending->sack_rxm[cnt].data == NULL) + break; + + pending->sack_rxm[cnt].len = rxm->len; + memcpy(pending->sack_rxm[cnt].data, rxm->pkt, rxm->len); + pending->sack_rxm_cnt++; + /* NULL slot so the original timer self-cleans. */ + STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL); + frcti->snd_slots[kp].time = now_ns; + frcti->snd_slots[kp].flags |= SND_RTX | SND_FAST_RXM; + frcti->rtt_lwe = k + 1; + } +} + +/* + * RFC 2883 D-SACK detector. Returns true iff block[0] is a D-SACK + * report: + * case 1: blocks[0].start < pkt_ackno (strictly below cum-ACK). + * case 2: blocks[0] is a strict sub-range of some blocks[i>0]. + * MAX_DSACK_LAG bounds case-1 distance to one rcv window (sanity). 
+ */ +static bool sack_is_dsack(struct frcti * frcti, + const uint8_t * payload, + uint16_t n, + uint32_t pkt_ackno) +{ + uint32_t s0; + uint32_t e0; + uint16_t i; + + if (n == 0) + return false; + + sack_block_get(payload, 0, &s0, &e0); + if (!before(s0, e0)) + return false; + + if (before(s0, pkt_ackno)) { + if ((pkt_ackno - s0) <= (uint32_t) MAX_DSACK_LAG) + return true; + STAT_BUMP(frcti, dsack_drop); + return false; + } + + for (i = 1; i < n; ++i) { + uint32_t si; + uint32_t ei; + + sack_block_get(payload, i, &si, &ei); + if (!before(si, ei)) + continue; + if (!before(s0, si) && !after(e0, ei) + && (s0 != si || e0 != ei)) + return true; + } + + return false; +} + +/* RFC 8985 §7.2: grow reo_wnd_mult on DSACK; at most once per RTT. */ +static __inline__ void reo_wnd_on_dsack(struct frcti * frcti, + uint64_t now_ns) +{ + time_t srtt = frcti->srtt; + + /* Snap is unconditional: feeds the per-D-SACK decay clock. */ + frcti->dsack_lwe_snap = frcti->snd_cr.lwe; + + if (srtt > 0 + && now_ns - frcti->t_last_reo_widen <= (uint64_t) srtt) + return; + + if (frcti->reo_wnd_mult < REO_WND_MULT_MAX) + frcti->reo_wnd_mult++; + + frcti->t_last_reo_widen = now_ns; +} + +/* Caller holds wrlock; retransmits queued for post-unlock emission. */ +static void frcti_sack_rcv(struct frcti * frcti, + buffer_t pkt, + uint32_t pkt_ackno, + uint64_t now_ns, + struct pending * pending) +{ + uint32_t hi_sacked; + uint32_t marked; + uint16_t n; + bool dsack; + uint16_t n_real; + + if (pkt.len < SACK_HDR_SIZE) + return; + + n = ntoh16(*(const uint16_t *) pkt.data); + if (n > SACK_MAX_BLOCKS) + return; + + if (pkt.len < SACK_HDR_SIZE + (size_t) n * SACK_BLOCK_SIZE) + return; + + STAT_BUMP(frcti, sack_rcv); + + dsack = sack_is_dsack(frcti, pkt.data, n, pkt_ackno); + n_real = n - (dsack ? 1 : 0); + + if (dsack) { + STAT_BUMP(frcti, dsack_rcv); + reo_wnd_on_dsack(frcti, now_ns); + } + + /* DSACK-only carries no new gap; don't enter recovery. 
*/ + if (n_real > 0) + recovery_enter(frcti); + + marked = 0; + hi_sacked = sack_mark_blocks(frcti, pkt.data, n, &marked); + frcti->dup_thresh += marked; + + if (after(hi_sacked, frcti->snd_cr.lwe)) + sack_queue_rxm(frcti, hi_sacked, now_ns, pending); +} + +/* Emit and free queued packet copies. */ +static void pending_flush(struct frcti * frcti, + struct pending * pending) +{ + size_t i; + + for (i = 0; i < pending->sack_rxm_cnt; ++i) { + sack_rxm_snd(frcti, pending->sack_rxm[i].data, + pending->sack_rxm[i].len); + free(pending->sack_rxm[i].data); + } + + if (pending->fast_rxm.data != NULL) { + int ret = fast_rxm_send(frcti, pending->fast_rxm.data, + pending->fast_rxm.len); + if (ret == -EFLOWDOWN || ret == -ENOTALLOC) + STAT_BUMP(frcti, rxm_tx_dead); + free(pending->fast_rxm.data); + } +} + +/* Pre-DRF NACK: ask peer to retransmit HoL; seqno is informational. */ +static void frcti_nack_snd(struct frcti * frcti, + uint32_t seqno_unseen) +{ + struct ssm_pk_buff * spb; + struct frct_pci * pci; + + if (frct_ctrl_alloc(&spb, &pci, 0) < 0) + return; + + pci->flags = hton16(FRCT_NACK); + pci->seqno = hton32(seqno_unseen); + + frct_hcs_set(pci, false); + + frct_tx(frcti, spb); +} + +enum frct_act { + FRCT_ACTIVE, + FRCT_INACT_NEED_NACK, + FRCT_INACT_DROP, +}; + +/* On rcv inactivity: rebase on DRF, or arm pre-DRF NACK. Caller wrlock. */ +static enum frct_act rcv_inact_check(struct frcti * frcti, + uint16_t flags, + uint32_t seqno, + uint64_t now_ns) +{ + struct frct_cr * rcv_cr = &frcti->rcv_cr; + uint64_t cd; + + if (!ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact)) + return FRCT_ACTIVE; + + if (flags & FRCT_DRF) { + if (same_epoch_drf(seqno, flags, rcv_cr)) + return FRCT_ACTIVE; + + /* Bootstrap or fresh epoch: rebase. 
*/ + STAT_BUMP(frcti, drf_rebase); + release_rq(frcti); + STORE_RELEASE(&rcv_cr->lwe, seqno); + rcv_cr->rwe = seqno + RQ_SIZE; + rcv_cr->seqno = seqno; + return FRCT_ACTIVE; + } + + if (!(flags & FRCT_DATA)) + return FRCT_ACTIVE; + + /* Pre-DRF: nudge sender with NACK (rate-limited). */ + cd = frcti->srtt > 0 ? (uint64_t) frcti->srtt : NACK_COOLDOWN_NS; + if (!ts_aged_ns(now_ns, frcti->t_nack, cd)) + return FRCT_INACT_DROP; + + frcti->t_nack = now_ns; + STAT_BUMP(frcti, nack_snd); + + return FRCT_INACT_NEED_NACK; +} + +/* Both modes: bounded accept into rq[seqno]. Caller wrlock. */ +__attribute__((hot)) +static bool rq_accept(struct frcti * frcti, + uint32_t seqno, + size_t pos, + uint16_t flags) +{ + struct frct_cr * rcv_cr = &frcti->rcv_cr; + + if (!before(seqno, rcv_cr->rwe)) { + STAT_BUMP(frcti, out_rcv); + return false; + } + + if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) { + STAT_BUMP(frcti, rqo_rcv); + return false; + } + + if (frcti->rcv_slots[pos].idx != -1) { + if (flags & FRCT_RXM) + STAT_BUMP(frcti, rxm_dup_rcv); + else + STAT_BUMP(frcti, dup_rcv); + /* RFC 2883 §4 case 2: in-window dup; sub-range marker. */ + frcti->dsack_seqno = seqno; + frcti->dsack_valid = true; + return false; + } + + return true; +} + +/* OOO arrival; throttle by min_gap + scoreboard dedup. */ +static bool sack_check(struct frcti * frcti, + uint32_t seqno, + uint64_t now_ns, + struct sack_args * out) +{ + struct frct_cr * rcv_cr = &frcti->rcv_cr; + uint64_t min_gap; + uint16_t n; + + if (!after(seqno, rcv_cr->lwe)) + return false; + + STAT_BUMP(frcti, ooo_rcv); + + /* SACK carries cum-ACK; bound by t_a like any other ACK. */ + if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a)) + return false; + + /* srtt/8 gate starved recovery under burst loss; floor to save CPU. 
*/ + min_gap = (uint64_t) SACK_MIN_GAP_NS; + + if (!ts_aged_ns(now_ns, frcti->t_snd_sack, min_gap)) + return false; + + out->dsack = false; + n = dsack_consume(frcti, out->blocks); + if (n == 1) + out->dsack = true; + n += sack_blocks_build(frcti, out->blocks + n, + frcti->sack_n_max - n); + + if (!out->dsack + && rcv_cr->lwe == frcti->sack_lwe && n == frcti->sack_n) + return false; + + out->n = n; + out->ack = rcv_cr->lwe; + out->rwe = frcti_advert_rwe(frcti); + frcti->t_snd_sack = now_ns; + frcti->sack_lwe = rcv_cr->lwe; + frcti->sack_n = n; + + return true; +} + +/* Wire-dup of fresh DATA at an already-ACKed seqno. */ +static __inline__ bool is_dup_data(uint16_t flags, + uint32_t seqno, + uint32_t lwe) +{ + if (!(flags & FRCT_DATA)) + return false; + + if (flags & FRCT_RXM) + return false; + + return before(seqno, lwe); +} + +/* + * Wire-dup ACK packet: same seqno as the previous emission. Updates + * the dedup ackno on a fresh ACK; caller drops on true. + */ +static __inline__ bool is_dup_ack(struct frcti * frcti, + uint16_t flags, + uint32_t seqno) +{ + if (flags & FRCT_DATA) + return false; + + if (!(flags & FRCT_ACK)) + return false; + + if (seqno == frcti->rcv_cr.ackno) + return true; + + frcti->rcv_cr.ackno = seqno; + + return false; +} + +/* Caller wrlock. */ +__attribute__((cold)) +static void seqno_rotate(struct frcti * frcti, + uint64_t now_ns) +{ + struct frct_cr * snd_cr = &frcti->snd_cr; + + if (!ts_aged_ns(now_ns, snd_cr->act, snd_cr->inact)) + return; + /* Idle-on-wire ≠idle e2e: don't orphan in-flight rxm. */ + if (snd_cr->seqno != snd_cr->lwe) + return; + + /* Avoid colliding with peer's current rcv window. 
*/ + do { + random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno)); + } while (in_window(snd_cr->seqno, snd_cr)); + STORE_RELEASE(&snd_cr->lwe, snd_cr->seqno); + STORE_RELAXED(&snd_cr->rwe, snd_cr->lwe + START_WINDOW); + frcti->rtt_lwe = snd_cr->seqno; + frcti->in_recovery = false; + frcti->recovery_high = snd_cr->seqno; +} + +__attribute__((hot)) +static int frcti_snd(struct frcti * frcti, + struct ssm_pk_buff * spb, + uint16_t flags) +{ + struct frct_pci * pci; + struct frct_pci_stream * spci = NULL; + struct timespec now; + struct frct_cr * snd_cr; + struct frct_cr * rcv_cr; + struct rxm_entry * rxm = NULL; + uint32_t seqno; + uint16_t pci_flags = 0; + bool rtx; + uint64_t now_ns; + int64_t rcv_idle; + uint32_t probe_id = 0; + uint8_t probe_nonce[RTTP_NONCE_LEN] = { 0 }; + bool probe; + size_t payload_len = 0; assert(frcti); - assert(ssm_pk_buff_len(spb) != 0); + /* Stream mode permits 0-byte sends for the EOS marker. */ + assert(ssm_pk_buff_len(spb) != 0 || frcti->stream); snd_cr = &frcti->snd_cr; rcv_cr = &frcti->rcv_cr; - timerwheel_move(); + tw_move_safe(); + + if (frcti->stream) + payload_len = ssm_pk_buff_len(spb); - pci = (struct frct_pci *) ssm_pk_buff_head_alloc(spb, FRCT_PCILEN); + pci = FRCT_HDR_PUSH(spb, frcti); if (pci == NULL) return -ENOMEM; - memset(pci, 0, sizeof(*pci)); + /* Pre-allocate rxm so alloc fail can't orphan a seqno. */ + if (snd_cr->cflags & FRCTFRTX) { + rxm = rxm_alloc(frcti, ssm_pk_buff_len(spb)); + if (rxm == NULL) { + ssm_pk_buff_pop(spb, frcti_data_hdr_len(frcti)); + return -ENOMEM; + } + } + + memset(pci, 0, FRCT_PCILEN); + + if (frcti->stream) + spci = FRCT_SPCI(pci); clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); pthread_rwlock_wrlock(&frcti->lock); rtx = snd_cr->cflags & FRCTFRTX; - pci->flags |= FRCT_DATA; + pci_flags |= FRCT_DATA; + if (!frcti->stream) + pci_flags |= (flags & FRCT_FR_MASK); - /* Set DRF if there are no unacknowledged packets. 
*/ - if (snd_cr->seqno == snd_cr->lwe) - pci->flags |= FRCT_DRF; + if (!frcti->stream && (flags & FRCT_FR_MASK) != FRCT_FR_SOLE) + STAT_BUMP(frcti, frag_snd); - /* Choose a new sequence number if sender inactivity expired. */ - if (now.tv_sec - snd_cr->act.tv_sec > snd_cr->inact) { - /* There are no unacknowledged packets. */ - assert(snd_cr->seqno == snd_cr->lwe); - random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno)); - snd_cr->lwe = snd_cr->seqno; - snd_cr->rwe = snd_cr->lwe + START_WINDOW; + if (frcti->stream) { + if (flags & FRCT_FIN) + pci_flags |= FRCT_FIN; + + spci->start = hton32(frcti->snd_byte_next); + frcti->snd_byte_next += (uint32_t) payload_len; + spci->end = hton32(frcti->snd_byte_next); + STAT_ADD(frcti, strm_snd_byte, payload_len); } + if (snd_cr->seqno == snd_cr->lwe) + pci_flags |= FRCT_DRF; + + seqno_rotate(frcti, now_ns); + seqno = snd_cr->seqno; pci->seqno = hton32(seqno); - if (now.tv_sec - rcv_cr->act.tv_sec < rcv_cr->inact) { - pci->flags |= FRCT_FC; - *((uint32_t *) pci) |= hton32(rcv_cr->rwe & 0x00FFFFFF); + rcv_idle = ts_age_ns(now_ns, rcv_cr->act); + + if (rcv_idle < (int64_t) rcv_cr->inact) { + pci_flags |= FRCT_FC; + pci->window = hton32(frcti_advert_rwe(frcti)); } if (!rtx) { - snd_cr->lwe++; + STORE_RELEASE(&snd_cr->lwe, snd_cr->lwe + 1); + STORE_RELEASE(&snd_cr->rwe, snd_cr->lwe + RQ_SIZE); } else { - if (!frcti->probe) { - frcti->rttseq = snd_cr->seqno; - frcti->t_probe = now; - frcti->probe = true; -#ifdef PROC_FLOW_STATS - frcti->n_prb++; -#endif - } - if ((now.tv_sec - rcv_cr->act.tv_sec) * BILLION <= frcti->a) { - pci->flags |= FRCT_ACK; + size_t p = RQ_SLOT(seqno); + frcti->snd_slots[p].time = now_ns; + /* Fresh send clears RTX bits. 
*/ + frcti->snd_slots[p].flags = 0; + if (rcv_idle <= (int64_t) frcti->t_a) { + pci_flags |= FRCT_ACK; pci->ackno = hton32(rcv_cr->lwe); rcv_cr->seqno = rcv_cr->lwe; } } + pci->flags = hton16(pci_flags); + + frct_hcs_set(pci, frcti->stream); + snd_cr->seqno++; - snd_cr->act = now; + STORE_RELEASE(&snd_cr->act, now_ns); + + probe = rtt_probe_arm(frcti, now_ns, &probe_id, probe_nonce); pthread_rwlock_unlock(&frcti->lock); - if (rtx) - timerwheel_rxm(frcti, seqno, spb); + if (probe) + frcti_rttp_snd(frcti, probe_id, 0, probe_nonce); + + if (rtx) { + assert(rxm != NULL); + rxm_arm(frcti, seqno, rxm, spb); + tlp_arm(frcti); + } return 0; } -static void rtt_estimator(struct frcti * frcti, - time_t mrtt) +/* Stream FIN is armed for rxm; needs to be in window. */ +static __inline__ bool stream_fin_blocked(struct frcti * frcti) { - time_t srtt = frcti->srtt; - time_t rttvar = frcti->mdev; + if (!frcti->stream) + return false; - if (srtt == 0) { /* first measurement */ - srtt = mrtt; - rttvar = mrtt >> 1; - } else { - time_t delta = mrtt - srtt; - srtt += (delta >> 3); - delta = (ABS(delta) - rttvar) >> 2; -#ifdef FRCT_LINUX_RTT_ESTIMATOR - if (delta < 0) - delta >>= 3; -#endif - rttvar += delta; + return !before(frcti->snd_cr.seqno, frcti->snd_cr.lwe + RQ_SIZE); +} + +/* + * Stream: 0-byte FRCT_FIN DATA so peer's flow_read returns 0 at this + * byte. Msg: control packet with FRCT_FIN flag, snd_cr.seqno carried + * in pci->ackno (sender packs via frcti_pkt_snd's ackno parameter). + */ +static void frcti_fin_snd(struct frcti * frcti) +{ + struct ssm_pk_buff * spb; + bool already; + uint32_t fin_seqno; + + if (!(frcti->snd_cr.cflags & FRCTFLINGER)) + return; + + pthread_rwlock_wrlock(&frcti->lock); + + already = frcti->snd_fin_sent; + + /* Defer before committing snd_fin_sent; linger loop retries. 
*/ + if (!already && stream_fin_blocked(frcti)) { + pthread_rwlock_unlock(&frcti->lock); + return; } -#ifdef PROC_FLOW_STATS - frcti->n_rtt++; -#endif - frcti->srtt = MAX(1000L, srtt); - frcti->mdev = MAX(100L, rttvar); - frcti->rto = MAX(RTO_MIN, frcti->srtt + (frcti->mdev << MDEV_MUL)); -} - -/* Always queues the next application packet on the RQ. */ -static void __frcti_rcv(struct frcti * frcti, - struct ssm_pk_buff * spb) -{ - ssize_t idx; - size_t pos; - struct frct_pci * pci; - struct timespec now; - struct frct_cr * rcv_cr; - struct frct_cr * snd_cr; - uint32_t seqno; - uint32_t ackno; - uint32_t rwe; - int fd = -1; - assert(frcti); + frcti->snd_fin_sent = true; + fin_seqno = frcti->snd_cr.seqno; + + if (!already && !frcti->stream) + frcti->snd_fin_seqno = fin_seqno; + pthread_rwlock_unlock(&frcti->lock); + + if (already) + return; + + if (!frcti->stream) { + frcti_pkt_snd(frcti, FRCT_FIN, fin_seqno, 0); + return; + } + + if (frct_spb_reserve(frcti_data_hdr_len(frcti), &spb) < 0) + return; + + /* Reset spb to 0-len so frcti_snd's head_alloc populates PCI. */ + ssm_pk_buff_truncate(spb, 0); + + if (frcti_snd(frcti, spb, FRCT_FIN) < 0) { + frct_spb_release(spb); + return; + } + + if (frct_tx(frcti, spb) < 0) + return; + + pthread_rwlock_wrlock(&frcti->lock); + + frcti->snd_fin_seqno = frcti->snd_cr.seqno - 1; + + pthread_rwlock_unlock(&frcti->lock); +} + +static bool final_ack_due(struct frcti * frcti, + struct frct_cr * rcv_cr, + uint64_t now_ns) +{ + if (rcv_cr->lwe == rcv_cr->seqno) + return false; + + if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a)) + return false; + + return true; +} + +/* Snd-side has FLINGER cflag and unACK'd data below the FIN/seqno. */ +static __inline__ bool snd_drain_pending(struct frct_cr * snd_cr, + uint32_t edge) +{ + if (!(snd_cr->cflags & FRCTFLINGER)) + return false; + + return before(snd_cr->lwe, edge); +} + +/* Peer is still active and we haven't seen their FIN yet. 
*/ +static __inline__ bool rcv_drain_pending(struct frcti * frcti, + struct frct_cr * rcv_cr, + uint64_t now_ns) +{ + if (frcti->rcv_fin_seen) + return false; + + return !ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact); +} + +/* Drain-loop predicate: snd-side unACK'd data OR peer still active. */ +static bool frcti_lingering(struct frcti * frcti) +{ + struct timespec now; + struct frct_cr * snd_cr; + struct frct_cr * rcv_cr; + uint32_t edge; + uint64_t now_ns; + bool snd_linger; + bool rcv_linger; + + /* Idempotent; emits FIN once per side, both stream and msg. */ + frcti_fin_snd(frcti); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_rwlock_rdlock(&frcti->lock); + + snd_cr = &frcti->snd_cr; rcv_cr = &frcti->rcv_cr; + + if (frcti->snd_fin_sent) + edge = frcti->snd_fin_seqno; + else + edge = snd_cr->seqno; + + snd_linger = snd_drain_pending(snd_cr, edge); + rcv_linger = rcv_drain_pending(frcti, rcv_cr, now_ns); + + pthread_rwlock_unlock(&frcti->lock); + + return snd_linger || rcv_linger; +} + +static time_t frcti_dealloc(struct frcti * frcti) +{ + struct timespec now; + struct frct_cr * snd_cr; + struct frct_cr * rcv_cr; + int ackno; + bool due; + int64_t now_ns; + int64_t rcv; + int64_t snd; + snd_cr = &frcti->snd_cr; + rcv_cr = &frcti->rcv_cr; + + /* Idempotent; usually already sent by frcti_lingering. 
*/ + frcti_fin_snd(frcti); clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); - pci = (struct frct_pci *) ssm_pk_buff_head_release(spb, FRCT_PCILEN); + pthread_rwlock_rdlock(&frcti->lock); - idx = ssm_pk_buff_get_idx(spb); - seqno = ntoh32(pci->seqno); - pos = seqno & (RQ_SIZE - 1); + ackno = rcv_cr->lwe; + rcv = (int64_t)(rcv_cr->act + rcv_cr->inact) - now_ns; + snd = (int64_t)(snd_cr->act + snd_cr->inact) - now_ns; + due = final_ack_due(frcti, rcv_cr, now_ns); - pthread_rwlock_wrlock(&frcti->lock); + pthread_rwlock_unlock(&frcti->lock); - if (now.tv_sec - rcv_cr->act.tv_sec > rcv_cr->inact) { - if (pci->flags & FRCT_DRF) { /* New run. */ - rcv_cr->lwe = seqno; - rcv_cr->rwe = seqno + RQ_SIZE; - rcv_cr->seqno = seqno; - } else if (pci->flags & FRCT_DATA) { - goto drop_packet; - } - } + if (due) + frcti_pkt_snd(frcti, FRCT_ACK, ackno, 0); - rcv_cr->act = now; + return (time_t) MAX((MAX(rcv, snd) / BILLION), 0); +} - /* For now, just send an immediate window update. */ - if (pci->flags & FRCT_RDVS) { - fd = frcti->fd; - rwe = rcv_cr->rwe; - pthread_rwlock_unlock(&frcti->lock); +__attribute__((hot)) +static void frcti_rcv(struct frcti * frcti, + struct ssm_pk_buff * spb) +{ + ssize_t idx; + size_t pos; + struct frct_pci * pci; + struct timespec now; + uint64_t now_ns; + struct frct_cr * rcv_cr; + uint32_t seqno; + uint16_t flags; + buffer_t pkt; + struct pending pending = { 0 }; + bool in_order; + struct sack_args * sa = NULL; + bool send_sack = false; + + assert(frcti); - __send_frct_pkt(fd, FRCT_FC, 0, rwe); + rcv_cr = &frcti->rcv_cr; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); - ssm_pool_remove(proc.pool, idx); + if (ssm_pk_buff_len(spb) < FRCT_PCILEN) { + frct_spb_release(spb); return; } - if (pci->flags & FRCT_ACK) { - ackno = ntoh32(pci->ackno); - if (after(ackno, frcti->snd_cr.lwe)) - frcti->snd_cr.lwe = ackno; + pci = FRCT_HDR_POP(spb, frct_pci); - if (frcti->probe && after(ackno, frcti->rttseq)) { -#ifdef 
PROC_FLOW_STATS - if (!(pci->flags & FRCT_DATA)) - frcti->n_dak++; -#endif - rtt_estimator(frcti, ts_diff_ns(&now, &frcti->t_probe)); - frcti->probe = false; - } + idx = ssm_pk_buff_get_off(spb); + seqno = ntoh32(pci->seqno); + pos = RQ_SLOT(seqno); + + flags = ntoh16(pci->flags); + + pkt.data = ssm_pk_buff_head(spb); + pkt.len = ssm_pk_buff_len(spb); + + if (flags & FRCT_RXM) + STAT_BUMP(frcti, rxm_rcv); + + /* Stateless / lock-free dispatches. spb released via ctrl_done. */ + if (flags & FRCT_KA) { + frcti_ka_rcv(frcti, pci, now_ns, flags); + goto ctrl_done; } - if (pci->flags & FRCT_FC) { - uint32_t rwe; + if (flags & FRCT_RTTP) { + frcti_rttp_rcv(frcti, pkt, now_ns); + goto ctrl_done; + } - rwe = ntoh32(*((uint32_t *)pci) & hton32(0x00FFFFFF)); - rwe |= snd_cr->rwe & 0xFF000000; + if (flags & FRCT_NACK) { + frcti_nack_rcv(frcti); + goto ctrl_done; + } + + if (flags & FRCT_RDVS) { + frcti_rdv_rcv(frcti); + goto ctrl_done; + } - /* Rollover for 24 bit */ - if (before(rwe, snd_cr->rwe) && snd_cr->rwe - rwe > 0x007FFFFF) - rwe += 0x01000000; + /* Msg-mode FIN: control packet, FIN seqno carried in pci->ackno. */ + if ((flags & FRCT_FIN) && !(flags & FRCT_DATA)) { + pthread_rwlock_wrlock(&frcti->lock); + if (!frcti->rcv_fin_seen) { + frcti->rcv_fin_seen = true; + frcti->rcv_byte_fin = ntoh32(pci->ackno); + } + pthread_rwlock_unlock(&frcti->lock); + goto ctrl_done; + } - snd_cr->rwe = rwe; + pthread_rwlock_wrlock(&frcti->lock); - pthread_mutex_lock(&frcti->mtx); - if (!frcti->open) { - frcti->open = true; - pthread_cond_broadcast(&frcti->cond); + /* rcv_inact_check is a no-op for non-DATA non-DRF packets. 
*/ + if (flags & (FRCT_DATA | FRCT_DRF)) { + switch (rcv_inact_check(frcti, flags, seqno, now_ns)) { + case FRCT_INACT_NEED_NACK: + pthread_rwlock_unlock(&frcti->lock); + frcti_nack_snd(frcti, seqno - 1); + frct_spb_release(spb); + return; + case FRCT_INACT_DROP: + STAT_BUMP(frcti, inact_drop); + goto drop_packet; + case FRCT_ACTIVE: + /* FALLTHRU */ + default: + break; } - pthread_mutex_unlock(&frcti->mtx); } - if (!(pci->flags & FRCT_DATA)) + /* DATA-only act refresh: non-DATA would lock out DRF rebase. */ + if (flags & FRCT_DATA) + STORE_RELEASE(&rcv_cr->act, now_ns); + + /* Wire-dup ACK packet: same seqno as the previous emission. */ + if (is_dup_ack(frcti, flags, seqno)) { + STAT_BUMP(frcti, ack_dup_rcv); + goto drop_packet; + } + + /* Wire-dup of DATA: piggybacked ACK info already processed. */ + if (is_dup_data(flags, seqno, rcv_cr->lwe)) { + rcv_cr->seqno = seqno; + STAT_BUMP(frcti, dup_rcv); + /* RFC 2883 §4 case 1: dup below cum-ACK. */ + frcti->dsack_seqno = seqno; + frcti->dsack_valid = true; + goto drop_packet; + } + + if (flags & FRCT_ACK) + frcti_ack_rcv(frcti, pci, flags, now_ns, &pending); + + if (flags & FRCT_SACK) + frcti_sack_rcv(frcti, pkt, ntoh32(pci->ackno), + now_ns, &pending); + + if (flags & FRCT_FC) + frcti_fc_rcv(frcti, pci); + + if (!(flags & FRCT_DATA)) goto drop_packet; if (before(seqno, rcv_cr->lwe)) { - rcv_cr->seqno = seqno; /* Ensures we send a new ACK. */ -#ifdef PROC_FLOW_STATS - frcti->n_dup++; -#endif + /* Bump rcv_cr.seqno to force ack_snd to fire on the dup. */ + rcv_cr->seqno = seqno; + if (flags & FRCT_RXM) + STAT_BUMP(frcti, rxm_dup_rcv); + else + STAT_BUMP(frcti, dup_rcv); + /* RFC 2883 §4 case 1: dup below cum-ACK. */ + frcti->dsack_seqno = seqno; + frcti->dsack_valid = true; goto drop_packet; } - if (rcv_cr->cflags & FRCTFRTX) { + if (!rq_accept(frcti, seqno, pos, flags)) + goto drop_packet; - if (!before(seqno, rcv_cr->rwe)) { /* Out of window. 
*/ -#ifdef PROC_FLOW_STATS - frcti->n_out++; -#endif + if (frcti->stream) { + if (frcti_stream_data_rcv(frcti, spb, pos, flags) < 0) { + STAT_BUMP(frcti, strm_drop); goto drop_packet; } - - if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) { -#ifdef PROC_FLOW_STATS - frcti->n_rqo++; -#endif - goto drop_packet; /* Out of rq. */ - } - if (frcti->rq[pos] != -1) { -#ifdef PROC_FLOW_STATS - frcti->n_dup++; -#endif - goto drop_packet; /* Duplicate in rq. */ - } - fd = frcti->fd; + /* spb consumed by stash; do not release in drop path. */ + spb = NULL; } else { - rcv_cr->lwe = seqno; + frcti_data_stash(frcti, idx, pos, flags); + } + + /* Lazy alloc: only OOO arrivals can trigger a SACK send. */ + if (after(seqno, rcv_cr->lwe) && frcti->sack_n_max > 0) { + size_t sa_sz = sizeof(*sa) + + frcti->sack_n_max * sizeof(sa->blocks[0]); + sa = malloc(sa_sz); + /* If alloc fails, sack_check sees NULL and we skip SACK. */ } - frcti->rq[pos] = idx; + send_sack = sa != NULL && sack_check(frcti, seqno, now_ns, sa); + in_order = !after(seqno, rcv_cr->lwe); pthread_rwlock_unlock(&frcti->lock); - if (fd != -1) - timerwheel_delayed_ack(fd, frcti); + if (send_sack) { + STAT_BUMP(frcti, sack_snd); + if (sa->dsack) + STAT_BUMP(frcti, dsack_snd); + frcti_sack_snd(frcti, sa); + } else if (in_order) { + ack_arm(frcti); + } + + if ((flags & FRCT_ACK) && frcti->snd_cr.seqno != frcti->snd_cr.lwe) + tlp_arm(frcti); + + pending_flush(frcti, &pending); + + frcti_rcv_probe(frcti, now_ns); + free(sa); + return; + + ctrl_done: + frct_spb_release(spb); return; drop_packet: pthread_rwlock_unlock(&frcti->lock); - ssm_pool_remove(proc.pool, idx); - send_frct_pkt(frcti); - return; + frct_spb_release(spb); + /* with_sack=true: ack_snd no-ops if neither dsack nor SACK is due. */ + ack_snd(frcti, true); + + pending_flush(frcti, &pending); + free(sa); } + +/* NULL-shim macros for the no-FRCT case. */ + +#define FRCTI_SND(frcti, spb, flags) \ + ((frcti) == NULL ? 
0 : frcti_snd((frcti), (spb), (flags))) + +#define FRCTI_RCV(frcti, spb) \ + do { \ + if ((frcti) != NULL) \ + frcti_rcv((frcti), (spb)); \ + } while (0) + +#define FRCTI_PDU_READY(frcti) \ + ((frcti) != NULL && frcti_pdu_ready(frcti)) + +#define FRCTI_CONSUME(frcti, buf, count) \ + ((frcti) == NULL ? (ssize_t) -EAGAIN \ + : (frcti)->stream \ + ? frcti_consume_stream((frcti), (buf), (count)) \ + : frcti_consume((frcti), (buf), (count))) + +#define FRCTI_IS_FRTX(frcti) \ + ((frcti) != NULL && ((frcti)->rcv_cr.cflags & FRCTFRTX)) + +#define FRCTI_IS_STREAM(frcti) ((frcti) != NULL && (frcti)->stream) + +#define FRCTI_PAYLOAD_CAP(frcti) \ + ((frcti)->frag_mtu - frcti_data_hdr_len(frcti)) + +#define FRCTI_NEEDS_FRAG(frcti, count) \ + ((frcti) != NULL && (count) > FRCTI_PAYLOAD_CAP(frcti)) + +#define FRCTI_IS_WINDOW_OPEN(frcti) \ + ((frcti) == NULL ? true : frcti_is_window_open(frcti)) + +#define FRCTI_IS_WINDOW_OPEN_N(frcti, n) \ + ((frcti) == NULL ? true : frcti_is_window_open_n((frcti), (n))) + +#define FRCTI_LINGERING(frcti) \ + ((frcti) == NULL ? false : frcti_lingering(frcti)) + +#define FRCTI_DEALLOC(frcti) \ + ((frcti) == NULL ? 
(time_t) 0 : frcti_dealloc(frcti)) + diff --git a/src/lib/hash.c b/src/lib/hash.c index 7adee968..903474df 100644 --- a/src/lib/hash.c +++ b/src/lib/hash.c @@ -39,6 +39,9 @@ #include <ouroboros/md5.h> #include <ouroboros/sha3.h> #endif +#include <ouroboros/crc8.h> +#include <ouroboros/crc16.h> +#include <ouroboros/crc64.h> #include <string.h> #include <assert.h> #include <stdbool.h> @@ -69,6 +72,14 @@ int hash_len_tbl [] = { uint16_t hash_len(enum hash_algo algo) { + if (algo == HASH_CRC8) + return CRC8_HASH_LEN; + + if (algo == HASH_CRC16) + return CRC16_HASH_LEN; + + if (algo == HASH_CRC64) + return CRC64_HASH_LEN; #ifdef HAVE_LIBGCRYPT return (uint16_t) gcry_md_get_algo_dlen(gcry_algo_tbl[algo]); #else @@ -81,12 +92,36 @@ void mem_hash(enum hash_algo algo, const uint8_t * buf, size_t len) { -#ifdef HAVE_LIBGCRYPT - gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len); -#else +#ifndef HAVE_LIBGCRYPT struct sha3_ctx sha3_ctx; struct md5_ctx md5_ctx; +#endif + if (algo == HASH_CRC8) { + uint8_t crc = 0; + + crc8_autosar(&crc, buf, len); + *(uint8_t *) dst = crc; + return; + } + if (algo == HASH_CRC16) { + uint16_t crc = 0; + + crc16_ccitt_false(&crc, buf, len); + *(uint16_t *) dst = htobe16(crc); + return; + } + + if (algo == HASH_CRC64) { + uint64_t crc = 0; + + crc64_nvme(&crc, buf, len); + *(uint64_t *) dst = htobe64(crc); + return; + } +#ifdef HAVE_LIBGCRYPT + gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len); +#else switch (algo) { case HASH_CRC32: memset(dst, 0, CRC32_HASH_LEN); @@ -131,3 +166,14 @@ void str_hash(enum hash_algo algo, { return mem_hash(algo, dst, (const uint8_t *) str, strlen(str)); } + +uint64_t hash_mix64(uint64_t key) +{ + key ^= key >> 33; + key *= 0xff51afd7ed558ccdULL; + key ^= key >> 33; + key *= 0xc4ceb9fe1a85ec53ULL; + key ^= key >> 33; + + return key; +} diff --git a/src/lib/pb/ipcp.proto b/src/lib/pb/ipcp.proto index 9dc402f5..afee4f91 100644 --- a/src/lib/pb/ipcp.proto +++ b/src/lib/pb/ipcp.proto @@ -39,6 +39,7 @@ enum 
ipcp_msg_code { IPCP_CONNECT = 10; IPCP_DISCONNECT = 11; IPCP_REPLY = 12; + IPCP_FLOW_UPDATE = 13; } message ipcp_msg { @@ -54,7 +55,7 @@ message ipcp_msg { optional int32 response = 10; optional string comp = 11; optional uint32 timeo_sec = 12; - optional sint32 mpl = 13; + optional sint32 mpl = 13; /* MPL in ms. */ optional int32 result = 14; optional uint32 uid = 15; /* 0 = GSPP, >0 = PUP uid */ } diff --git a/src/lib/pb/irm.proto b/src/lib/pb/irm.proto index 9ed0a29b..f54bc9ea 100644 --- a/src/lib/pb/irm.proto +++ b/src/lib/pb/irm.proto @@ -53,6 +53,8 @@ enum irm_msg_code { IPCP_FLOW_REQ_ARR = 25; IPCP_FLOW_ALLOC_REPLY = 26; IRM_REPLY = 27; + IRM_FLOW_UPDATE = 28; + IPCP_FLOW_UPDATE_ARR = 29; } message timespec_msg { @@ -88,12 +90,15 @@ message irm_msg { repeated ipcp_list_msg ipcps = 17; repeated name_info_msg names = 18; optional timespec_msg timeo = 19; - optional sint32 mpl = 20; + optional sint32 mpl = 20; /* MPL in ms. */ optional string comp = 21; optional bytes pk = 22; /* piggyback */ optional uint32 timeo_sec = 23; optional uint32 timeo_nsec = 24; optional sint32 result = 25; - optional bytes sym_key = 26; /* symmetric encryption key */ - optional sint32 cipher_nid = 27; /* cipher NID */ + optional bytes sym_key = 26; /* symmetric encryption key */ + optional sint32 cipher_nid = 27; /* cipher NID */ + optional uint32 generation = 28; /* re-key batch generation */ + optional bool rekey = 29; /* re-key watermark trigger */ + optional bool rk_initiator = 30; /* re-key proof-holder side */ } diff --git a/src/lib/pb/model.proto b/src/lib/pb/model.proto index f1382f3d..4c1564a5 100644 --- a/src/lib/pb/model.proto +++ b/src/lib/pb/model.proto @@ -28,7 +28,7 @@ message qosspec_msg { required uint32 availability = 3; /* Class of 9s. */ required uint32 loss = 4; /* Packet loss. */ required uint32 ber = 5; /* Bit error rate, ppb. */ - required uint32 in_order = 6; /* In-order delivery. */ + required uint32 service = 6; /* enum qos_service. 
*/ required uint32 max_gap = 7; /* In ms. */ required uint32 timeout = 8; /* Timeout in ms. */ } @@ -37,10 +37,11 @@ message flow_info_msg { required uint32 id = 1; required uint32 n_pid = 2; required uint32 n_1_pid = 3; - required uint32 mpl = 4; + required uint32 mpl = 4; /* MPL in ms. */ required uint32 state = 5; required qosspec_msg qos = 6; required uint32 uid = 7; + required uint32 mtu = 8; /* Layer MTU (bytes). */ } message name_info_msg { diff --git a/src/lib/protobuf.c b/src/lib/protobuf.c index 28b3aab2..a824d357 100644 --- a/src/lib/protobuf.c +++ b/src/lib/protobuf.c @@ -81,6 +81,7 @@ flow_info_msg_t * flow_info_s_to_msg(const struct flow_info * s) msg->mpl = s->mpl; msg->state = s->state; msg->uid = s->uid; + msg->mtu = s->mtu; msg->qos = qos_spec_s_to_msg(&s->qs); if (msg->qos == NULL) goto fail_msg; @@ -107,6 +108,7 @@ struct flow_info flow_info_msg_to_s(const flow_info_msg_t * msg) s.mpl = msg->mpl; s.state = msg->state; s.uid = msg->uid; + s.mtu = msg->mtu; s.qs = qos_spec_msg_to_s(msg->qos); return s; @@ -757,7 +759,7 @@ qosspec_msg_t * qos_spec_s_to_msg(const struct qos_spec * s) msg->availability = s->availability; msg->loss = s->loss; msg->ber = s->ber; - msg->in_order = s->in_order; + msg->service = s->service; msg->max_gap = s->max_gap; msg->timeout = s->timeout; @@ -775,7 +777,7 @@ struct qos_spec qos_spec_msg_to_s(const qosspec_msg_t * msg) s.availability = msg->availability; s.loss = msg->loss; s.ber = msg->ber; - s.in_order = msg->in_order; + s.service = msg->service; s.max_gap = msg->max_gap; s.timeout = msg->timeout; diff --git a/src/lib/qoscube.c b/src/lib/qoscube.c index 1eaa0d7c..5d7ae17d 100644 --- a/src/lib/qoscube.c +++ b/src/lib/qoscube.c @@ -29,15 +29,11 @@ qoscube_t qos_spec_to_cube(qosspec_t qs) { - if (qs.delay <= qos_voice.delay && - qs.bandwidth <= qos_voice.bandwidth && - qs.availability >= qos_voice.availability && - qs.max_gap <= qos_voice.max_gap) + if (qs.delay <= 50 && qs.bandwidth <= 100000 + && qs.availability >= 5 
&& qs.max_gap <= 50) return QOS_CUBE_VOICE; - else if (qs.delay <= qos_video.delay && - qs.bandwidth <= qos_video.bandwidth && - qs.availability >= qos_video.availability && - qs.max_gap <= qos_video.max_gap) + else if (qs.delay <= 100 && qs.availability >= 3 + && qs.max_gap <= 100) return QOS_CUBE_VIDEO; else return QOS_CUBE_BE; diff --git a/src/lib/random.c b/src/lib/random.c index 96315132..a132f470 100644 --- a/src/lib/random.c +++ b/src/lib/random.c @@ -28,6 +28,8 @@ #include <stdlib.h> #elif defined(HAVE_SYS_RANDOM) #include <sys/random.h> +#include <errno.h> +#include <stdint.h> #elif defined(HAVE_LIBGCRYPT) #include <gcrypt.h> #elif defined(HAVE_OPENSSL_RNG) @@ -42,13 +44,28 @@ int random_buffer(void * buf, arc4random_buf(buf, len); return 0; #elif defined(HAVE_SYS_RANDOM) - return getrandom(buf, len, GRND_NONBLOCK); + size_t off = 0; + ssize_t ret; + + while (off < len) { + ret = getrandom((uint8_t *) buf + off, len - off, + GRND_NONBLOCK); + if (ret < 0) { + if (errno == EINTR) + continue; + return -1; + } + off += (size_t) ret; + } + + return 0; #elif defined(HAVE_LIBGCRYPT) gcry_randomize(buf, len, GCRY_STRONG_RANDOM); return 0; #elif defined(HAVE_OPENSSL_RNG) - if (len > 0 && len < INT_MAX) - return RAND_bytes((unsigned char *) buf, (int) len); - return -1; + if (len == 0 || len >= INT_MAX) + return -1; + + return RAND_bytes((unsigned char *) buf, (int) len) == 1 ? 
0 : -1; #endif } diff --git a/src/lib/rib.c b/src/lib/rib.c index a8d535c9..6e421397 100644 --- a/src/lib/rib.c +++ b/src/lib/rib.c @@ -112,14 +112,14 @@ static int rib_read(const char * path, (void) info; (void) offset; - pthread_rwlock_wrlock(&rib.lock); + pthread_rwlock_rdlock(&rib.lock); list_for_each(p, &rib.reg_comps) { struct reg_comp * r = list_entry(p, struct reg_comp, next); if (strcmp(comp, r->path) == 0) { - int ret = r->ops->read(path + 1, buf, size); + struct rib_ops * ops = r->ops; pthread_rwlock_unlock(&rib.lock); - return ret; + return ops->read(path + 1, buf, size); } } @@ -160,19 +160,25 @@ static int rib_readdir(const char * path, ssize_t len; ssize_t i; struct reg_comp * c; + struct rib_ops * ops; c = list_entry(p, struct reg_comp, next); if (strcmp(path + 1, c->path) != 0) continue; - assert(c->ops->readdir != NULL); + ops = c->ops; + + assert(ops->readdir != NULL); + + pthread_rwlock_unlock(&rib.lock); - len = c->ops->readdir(&dir_entries); + len = ops->readdir(&dir_entries); if (len < 0) - break; + return 0; for (i = 0; i < len; ++i) filler(buf, dir_entries[i], NULL, 0); freepp(char, dir_entries, len); + return 0; } } diff --git a/src/lib/serdes-irm.c b/src/lib/serdes-irm.c index 65f2c02d..1d9b4dec 100644 --- a/src/lib/serdes-irm.c +++ b/src/lib/serdes-irm.c @@ -174,6 +174,54 @@ int flow__irm_result_des(buffer_t * buf, else memset(sk->key, 0, SYMMKEYSZ); + sk->epoch = msg->has_generation ? 
(uint8_t) msg->generation : 0; + + if (msg->sym_key.len == SYMMKEYSZ) + crypt_secure_clear(msg->sym_key.data, msg->sym_key.len); + + irm_msg__free_unpacked(msg, NULL); + + return 0; + fail: + irm_msg__free_unpacked(msg, NULL); + fail_msg: + return err; +} + +int flow_rekey__irm_result_des(buffer_t * buf, + struct crypt_sk * sk, + bool * has_key, + bool * initiator) +{ + irm_msg_t * msg; + int err; + + msg = irm_msg__unpack(NULL, buf->len, buf->data); + if (msg == NULL) { + err = -EIRMD; + goto fail_msg; + } + + if (!msg->has_result) { + err = -EIRMD; + goto fail; + } + + if (msg->result < 0) { + err = msg->result; + goto fail; + } + + *has_key = msg->has_sym_key && msg->sym_key.len == SYMMKEYSZ; + if (*has_key) { + memcpy(sk->key, msg->sym_key.data, SYMMKEYSZ); + sk->nid = NID_undef; + sk->epoch = msg->has_generation ? + (uint8_t) msg->generation : 0; + *initiator = msg->has_rk_initiator && msg->rk_initiator; + crypt_secure_clear(msg->sym_key.data, msg->sym_key.len); + } + irm_msg__free_unpacked(msg, NULL); return 0; @@ -222,6 +270,44 @@ int flow_dealloc__irm_req_ser(buffer_t * buf, return -ENOMEM; } +int flow_update__irm_req_ser(buffer_t * buf, + const struct flow_info * flow, + bool rekey) +{ + irm_msg_t * msg; + size_t len; + + msg = malloc(sizeof(*msg)); + if (msg == NULL) + goto fail_malloc; + + irm_msg__init(msg); + + msg->code = IRM_MSG_CODE__IRM_FLOW_UPDATE; + msg->flow_info = flow_info_s_to_msg(flow); + if (msg->flow_info == NULL) + goto fail_msg; + + msg->has_rekey = true; + msg->rekey = rekey; + + len = irm_msg__get_packed_size(msg); + if (len == 0 || len > buf->len) + goto fail_msg; + + buf->len = len; + + irm_msg__pack(msg, buf->data); + irm_msg__free_unpacked(msg, NULL); + + return 0; + + fail_msg: + irm_msg__free_unpacked(msg, NULL); + fail_malloc: + return -ENOMEM; +} + int ipcp_flow_dealloc__irm_req_ser(buffer_t * buf, const struct flow_info * flow) { @@ -398,6 +484,56 @@ int ipcp_flow_req_arr__irm_req_ser(buffer_t * buf, return 0; fail_msg: + /* 
hash/pk are borrowed from the caller; detach before free. */ + msg->hash.len = 0; + msg->hash.data = NULL; + msg->pk.len = 0; + msg->pk.data = NULL; + irm_msg__free_unpacked(msg, NULL); + fail_malloc: + return -ENOMEM; +} + +int ipcp_flow_update_arr__irm_req_ser(buffer_t * buf, + const struct flow_info * flow, + const buffer_t * data) +{ + irm_msg_t * msg; + size_t len; + + msg = malloc(sizeof(*msg)); + if (msg == NULL) + goto fail_malloc; + + irm_msg__init(msg); + + msg->code = IRM_MSG_CODE__IPCP_FLOW_UPDATE_ARR; + msg->flow_info = flow_info_s_to_msg(flow); + if (msg->flow_info == NULL) + goto fail_msg; + + msg->has_pk = true; + msg->pk.len = data->len; + msg->pk.data = data->data; + + len = irm_msg__get_packed_size(msg); + if (len == 0 || len > buf->len) + goto fail_msg; + + buf->len = len; + + irm_msg__pack(msg, buf->data); + + /* Don't free data! */ + msg->pk.len = 0; + msg->pk.data = NULL; + irm_msg__free_unpacked(msg, NULL); + + return 0; + fail_msg: + /* pk.data is borrowed from the caller; detach before free. 
*/ + msg->pk.len = 0; + msg->pk.data = NULL; irm_msg__free_unpacked(msg, NULL); fail_malloc: return -ENOMEM; diff --git a/src/lib/ssm/flow_set.c b/src/lib/ssm/flow_set.c index 73d0db55..2e33b408 100644 --- a/src/lib/ssm/flow_set.c +++ b/src/lib/ssm/flow_set.c @@ -58,9 +58,9 @@ #define QUEUESIZE ((SSM_RBUFF_SIZE) * sizeof(struct flowevent)) #define SSM_FSET_FILE_SIZE (SYS_MAX_FLOWS * sizeof(ssize_t) \ - + PROG_MAX_FQUEUES * sizeof(size_t) \ - + PROG_MAX_FQUEUES * sizeof(pthread_cond_t) \ - + PROG_MAX_FQUEUES * QUEUESIZE \ + + PROC_MAX_FQUEUES * sizeof(size_t) \ + + PROC_MAX_FQUEUES * sizeof(pthread_cond_t) \ + + PROC_MAX_FQUEUES * QUEUESIZE \ + sizeof(pthread_mutex_t)) #define fqueue_ptr(fs, idx) (fs->fqueues + (SSM_RBUFF_SIZE) * idx) @@ -104,10 +104,10 @@ static struct ssm_flow_set * flow_set_create(pid_t pid, set->mtable = shm_base; set->heads = (size_t *) (set->mtable + SYS_MAX_FLOWS); - set->conds = (pthread_cond_t *)(set->heads + PROG_MAX_FQUEUES); - set->fqueues = (struct flowevent *) (set->conds + PROG_MAX_FQUEUES); + set->conds = (pthread_cond_t *)(set->heads + PROC_MAX_FQUEUES); + set->fqueues = (struct flowevent *) (set->conds + PROC_MAX_FQUEUES); set->lock = (pthread_mutex_t *) - (set->fqueues + PROG_MAX_FQUEUES * (SSM_RBUFF_SIZE)); + (set->fqueues + PROC_MAX_FQUEUES * (SSM_RBUFF_SIZE)); return set; @@ -164,7 +164,7 @@ struct ssm_flow_set * ssm_flow_set_create(pid_t pid) if (pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK)) goto fail_condattr_set; #endif - for (i = 0; i < PROG_MAX_FQUEUES; ++i) { + for (i = 0; i < PROC_MAX_FQUEUES; ++i) { set->heads[i] = 0; if (pthread_cond_init(&set->conds[i], &cattr)) goto fail_init; @@ -222,7 +222,7 @@ void ssm_flow_set_zero(struct ssm_flow_set * set, ssize_t i = 0; assert(set); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -242,7 +242,7 @@ int ssm_flow_set_add(struct ssm_flow_set * set, { assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); - 
assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -264,7 +264,7 @@ void ssm_flow_set_del(struct ssm_flow_set * set, { assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -282,7 +282,7 @@ int ssm_flow_set_has(struct ssm_flow_set * set, assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); pthread_mutex_lock(set->lock); @@ -299,26 +299,34 @@ void ssm_flow_set_notify(struct ssm_flow_set * set, int event) { struct flowevent * e; + ssize_t idx; assert(set); assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS); pthread_mutex_lock(set->lock); - if (set->mtable[flow_id] == -1) { + idx = set->mtable[flow_id]; + if (idx == -1) { pthread_mutex_unlock(set->lock); return; } - e = fqueue_ptr(set, set->mtable[flow_id]) + - set->heads[set->mtable[flow_id]]; + /* Ring full: drop redundant FLOW_PKT, reserve a slot for ctrl. 
*/ + if (set->heads[idx] >= SSM_RBUFF_SIZE + || (event == FLOW_PKT && set->heads[idx] >= SSM_RBUFF_SIZE - 1)) { + pthread_mutex_unlock(set->lock); + return; + } + + e = fqueue_ptr(set, idx) + set->heads[idx]; e->flow_id = flow_id; e->event = event; - ++set->heads[set->mtable[flow_id]]; + ++set->heads[idx]; - pthread_cond_signal(&set->conds[set->mtable[flow_id]]); + pthread_cond_signal(&set->conds[idx]); pthread_mutex_unlock(set->lock); } @@ -332,7 +340,7 @@ ssize_t ssm_flow_set_wait(const struct ssm_flow_set * set, ssize_t ret = 0; assert(set); - assert(idx < PROG_MAX_FQUEUES); + assert(idx < PROC_MAX_FQUEUES); assert(fqueue); #ifndef HAVE_ROBUST_MUTEX diff --git a/src/lib/ssm/pool.c b/src/lib/ssm/pool.c index f17a6e65..705de147 100644 --- a/src/lib/ssm/pool.c +++ b/src/lib/ssm/pool.c @@ -24,6 +24,7 @@ #include "config.h" +#include <ouroboros/atomics.h> #include <ouroboros/errno.h> #include <ouroboros/pthread.h> #include <ouroboros/ssm_pool.h> @@ -37,10 +38,20 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <time.h> #include <unistd.h> #include <sys/mman.h> #include <sys/stat.h> +static __inline__ uint64_t pool_now_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +} + /* Global Shared Packet Pool (GSPP) configuration */ static const struct ssm_size_class_cfg ssm_gspp_cfg[SSM_POOL_MAX_CLASSES] = { { (1 << 8), SSM_GSPP_256_BLOCKS }, @@ -75,26 +86,6 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = { #define GET_SHARD_FOR_PID(pid) ((int)((pid) % SSM_POOL_SHARDS)) -#define LOAD_RELAXED(ptr) \ - (__atomic_load_n(ptr, __ATOMIC_RELAXED)) - -#define LOAD_ACQUIRE(ptr) \ - (__atomic_load_n(ptr, __ATOMIC_ACQUIRE)) - -#define STORE_RELEASE(ptr, val) \ - (__atomic_store_n(ptr, val, __ATOMIC_RELEASE)) - -#define LOAD(ptr) \ - (__atomic_load_n(ptr, __ATOMIC_SEQ_CST)) - -#define STORE(ptr, val) \ - (__atomic_store_n(ptr, val, 
__ATOMIC_SEQ_CST)) - -#define FETCH_ADD(ptr, val) \ - (__atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST)) - -#define FETCH_SUB(ptr, val) \ - (__atomic_fetch_sub(ptr, val, __ATOMIC_SEQ_CST)) #define SSM_FILE_SIZE (SSM_POOL_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr)) #define SSM_GSPP_FILE_SIZE (SSM_GSPP_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr)) @@ -165,29 +156,6 @@ static __inline__ void list_add_head(struct _ssm_list_head * head, STORE(&head->count, LOAD(&head->count) + 1); } -static __inline__ int select_size_class(struct ssm_pool * pool, - size_t len) -{ - size_t sz; - int i; - - assert(pool != NULL); - - /* Total space needed: header + headspace + data + tailspace */ - sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_HEADSPACE + len - + SSM_PK_BUFF_TAILSPACE; - - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - struct _ssm_size_class * sc; - - sc = &pool->hdr->size_classes[i]; - if (sc->object_size > 0 && sz <= sc->object_size) - return i; - } - - return -1; -} - static __inline__ int find_size_class_for_offset(struct ssm_pool * pool, size_t offset) { @@ -278,6 +246,7 @@ static void init_size_classes(struct ssm_pool * pool) STORE(&blk->refcount, 0); blk->allocator_pid = 0; + blk->alloc_ts = 0; STORE(&blk->next_offset, 0); list_add_head(&sc->shards[0].free_list, blk, @@ -308,19 +277,31 @@ static size_t reclaim_pid_from_sc(struct _ssm_size_class * sc, size_t i; size_t recovered = 0; struct ssm_pk_buff * blk; + uint64_t now; + uint64_t min_age_ns; - region = (uint8_t *) pool_base + sc->pool_start; + region = (uint8_t *) pool_base + sc->pool_start; + now = pool_now_ns(); + min_age_ns = (uint64_t) SSM_POOL_RECLAIM_AGE_S * 1000000000ULL; for (i = 0; i < sc->object_count; ++i) { blk = (struct ssm_pk_buff *)(region + i * sc->object_size); - if (blk->allocator_pid == pid && LOAD(&blk->refcount) > 0) { - STORE(&blk->refcount, 0); - blk->allocator_pid = 0; - list_add_head(&shard->free_list, blk, pool_base); - FETCH_ADD(&shard->free_count, 1); - recovered++; - } + if 
(blk->allocator_pid != pid) + continue; + + if (LOAD(&blk->refcount) == 0) + continue; + + /* Recent: a live consumer may still hold the handoff. */ + if (now - blk->alloc_ts < min_age_ns) + continue; + + STORE(&blk->refcount, 0); + blk->allocator_pid = 0; + list_add_head(&shard->free_list, blk, pool_base); + FETCH_ADD(&shard->free_count, 1); + recovered++; } return recovered; @@ -381,6 +362,7 @@ static __inline__ ssize_t init_block(struct ssm_pool * pool, { STORE(&blk->refcount, 1); blk->allocator_pid = getpid(); + blk->alloc_ts = pool_now_ns(); blk->size = (uint32_t) (sc->object_size - sizeof(struct ssm_pk_buff)); blk->pk_head = SSM_PK_BUFF_HEADSPACE; @@ -702,7 +684,7 @@ ssize_t ssm_pool_alloc(struct ssm_pool * pool, assert(pool != NULL); assert(spb != NULL); - idx = select_size_class(pool, count); + idx = select_size_class(pool->hdr, count); if (idx >= 0) return alloc_from_sc(pool, idx, count, ptr, spb); @@ -720,7 +702,7 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool, assert(pool != NULL); assert(spb != NULL); - idx = select_size_class(pool, count); + idx = select_size_class(pool->hdr, count); if (idx >= 0) return alloc_from_sc_b(pool, idx, count, ptr, spb, abstime); @@ -746,7 +728,7 @@ ssize_t ssm_pool_read(uint8_t ** dst, } struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool, - size_t off) + size_t off) { struct ssm_pk_buff * blk; @@ -825,36 +807,36 @@ int ssm_pool_remove(struct ssm_pool * pool, return 0; } -size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb) +size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb) { assert(spb != NULL); return spb->off; } -uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb) +uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb) { assert(spb != NULL); - return spb->data + spb->pk_head; + return (uint8_t *) spb->data + spb->pk_head; } -uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb) +uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb) { assert(spb != NULL); - return spb->data + spb->pk_tail; + 
return (uint8_t *) spb->data + spb->pk_tail; } -size_t ssm_pk_buff_len(struct ssm_pk_buff * spb) +size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb) { assert(spb != NULL); return spb->pk_tail - spb->pk_head; } -uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb, + size_t size) { assert(spb != NULL); @@ -866,8 +848,8 @@ uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb, return spb->data + spb->pk_head; } -uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb, + size_t size) { uint8_t * buf; @@ -883,8 +865,8 @@ uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb, return buf; } -uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb, + size_t size) { uint8_t * buf; @@ -898,8 +880,8 @@ uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb, return buf; } -uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb, - size_t size) +uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb, + size_t size) { assert(spb != NULL); assert(!(size > spb->pk_tail - spb->pk_head)); diff --git a/src/lib/ssm/rbuff.c b/src/lib/ssm/rbuff.c index e4558c31..0121af89 100644 --- a/src/lib/ssm/rbuff.c +++ b/src/lib/ssm/rbuff.c @@ -74,12 +74,13 @@ struct ssm_rbuff { ssize_t * shm_base; /* start of shared memory */ size_t * head; /* start of ringbuffer */ size_t * tail; - size_t * acl; /* access control */ + size_t * flags; /* out-of-band flags (RB_*) */ pthread_mutex_t * mtx; /* lock for cond vars only */ pthread_cond_t * add; /* signal when new data */ pthread_cond_t * del; /* signal when data removed */ pid_t pid; /* pid of the owner */ int flow_id; /* flow_id of the flow */ + size_t n_users; /* in-flight users */ }; #define MM_FLAGS (PROT_READ | PROT_WRITE) @@ -113,12 +114,13 @@ static struct ssm_rbuff * rbuff_create(pid_t pid, rb->shm_base = 
shm_base; rb->head = (size_t *) (rb->shm_base + (SSM_RBUFF_SIZE)); rb->tail = (size_t *) (rb->head + 1); - rb->acl = (size_t *) (rb->tail + 1); - rb->mtx = (pthread_mutex_t *) (rb->acl + 1); + rb->flags = (size_t *) (rb->tail + 1); + rb->mtx = (pthread_mutex_t *) (rb->flags + 1); rb->add = (pthread_cond_t *) (rb->mtx + 1); rb->del = rb->add + 1; rb->pid = pid; rb->flow_id = flow_id; + rb->n_users = 0; return rb; @@ -179,7 +181,7 @@ struct ssm_rbuff * ssm_rbuff_create(pid_t pid, if (pthread_cond_init(rb->del, &cattr)) goto fail_del; - *rb->acl = ACL_RDWR; + *rb->flags = RB_RDWR; *rb->head = 0; *rb->tail = 0; @@ -228,27 +230,38 @@ void ssm_rbuff_close(struct ssm_rbuff * rb) { assert(rb); + /* + * Caller must set RB_FLOWDOWN first; if a user becomes + * cancellable, push a cleanup that decrements n_users. + */ + while (__atomic_load_n(&rb->n_users, __ATOMIC_SEQ_CST) > 0) { + struct timespec tic = { 0, 100000 }; + nanosleep(&tic, NULL); + } + rbuff_destroy(rb); } int ssm_rbuff_write(struct ssm_rbuff * rb, - size_t idx) + size_t off) { - size_t acl; + size_t flags; bool was_empty; int ret = 0; assert(rb != NULL); - acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); - if (acl != ACL_RDWR) { - if (acl & ACL_FLOWDOWN) { + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + + flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST); + if (flags != RB_RDWR) { + if (flags & RB_FLOWDOWN) { ret = -EFLOWDOWN; - goto fail_acl; + goto fail_flags; } - if (acl & ACL_RDONLY) { + if (!(flags & RB_WR)) { ret = -ENOTALLOC; - goto fail_acl; + goto fail_flags; } } @@ -261,7 +274,7 @@ int ssm_rbuff_write(struct ssm_rbuff * rb, was_empty = IS_EMPTY(rb); - HEAD(rb) = (ssize_t) idx; + HEAD(rb) = (ssize_t) off; ADVANCE_HEAD(rb); if (was_empty) @@ -269,33 +282,37 @@ int ssm_rbuff_write(struct ssm_rbuff * rb, pthread_mutex_unlock(rb->mtx); + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return 0; fail_mutex: pthread_mutex_unlock(rb->mtx); - fail_acl: + fail_flags: + 
__atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return ret; } int ssm_rbuff_write_b(struct ssm_rbuff * rb, - size_t idx, + size_t off, const struct timespec * abstime) { - size_t acl; + size_t flags; int ret = 0; bool was_empty; assert(rb != NULL); - acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); - if (acl != ACL_RDWR) { - if (acl & ACL_FLOWDOWN) { + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + + flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST); + if (flags != RB_RDWR) { + if (flags & RB_FLOWDOWN) { ret = -EFLOWDOWN; - goto fail_acl; + goto fail_flags; } - if (acl & ACL_RDONLY) { + if (!(flags & RB_WR)) { ret = -ENOTALLOC; - goto fail_acl; + goto fail_flags; } } @@ -304,8 +321,8 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb, pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx); while (IS_FULL(rb) && ret != -ETIMEDOUT) { - acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); - if (acl & ACL_FLOWDOWN) { + flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST); + if (flags & RB_FLOWDOWN) { ret = -EFLOWDOWN; break; } @@ -316,7 +333,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb, if (ret != -ETIMEDOUT && ret != -EFLOWDOWN) { was_empty = IS_EMPTY(rb); - HEAD(rb) = (ssize_t) idx; + HEAD(rb) = (ssize_t) off; ADVANCE_HEAD(rb); if (was_empty) pthread_cond_broadcast(rb->add); @@ -324,24 +341,28 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb, pthread_mutex_unlock(rb->mtx); - fail_acl: + fail_flags: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return ret; } -static int check_rb_acl(struct ssm_rbuff * rb) +static int check_rb_flags(struct ssm_rbuff * rb) { - size_t acl; + size_t flags; assert(rb != NULL); - acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); + flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST); - if (acl & ACL_FLOWDOWN) + if (flags & RB_FLOWDOWN) return -EFLOWDOWN; - if (acl & ACL_FLOWPEER) + if (flags & RB_FLOWPEER) return -EFLOWPEER; + if (!(flags & RB_RD)) + return -ENOTALLOC; + return -EAGAIN; } @@ -351,11 +372,21 @@ 
ssize_t ssm_rbuff_read(struct ssm_rbuff * rb) assert(rb != NULL); - if (IS_EMPTY(rb)) - return check_rb_acl(rb); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + + if (IS_EMPTY(rb)) { + ret = check_rb_flags(rb); + goto out; + } robust_mutex_lock(rb->mtx); + if (IS_EMPTY(rb)) { + pthread_mutex_unlock(rb->mtx); + ret = check_rb_flags(rb); + goto out; + } + ret = TAIL(rb); ADVANCE_TAIL(rb); @@ -363,6 +394,8 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb) pthread_mutex_unlock(rb->mtx); + out: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return ret; } @@ -370,13 +403,17 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb, const struct timespec * abstime) { ssize_t idx = -1; - size_t acl; + size_t flags; assert(rb != NULL); - acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); - if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN)) - return -EFLOWDOWN; + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + + flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST); + if (IS_EMPTY(rb) && (flags & RB_FLOWDOWN)) { + idx = -EFLOWDOWN; + goto out; + } robust_mutex_lock(rb->mtx); @@ -384,7 +421,7 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb, while (IS_EMPTY(rb) && idx != -ETIMEDOUT && - check_rb_acl(rb) == -EAGAIN) { + check_rb_flags(rb) == -EAGAIN) { idx = -robust_wait(rb->add, rb->mtx, abstime); } @@ -395,35 +432,55 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb, ADVANCE_TAIL(rb); pthread_cond_broadcast(rb->del); } else if (idx != -ETIMEDOUT) { - idx = check_rb_acl(rb); + idx = check_rb_flags(rb); } pthread_mutex_unlock(rb->mtx); assert(idx != -EAGAIN); + out: + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); return idx; } -void ssm_rbuff_set_acl(struct ssm_rbuff * rb, - uint32_t flags) +void ssm_rbuff_set_bits(struct ssm_rbuff * rb, + uint32_t bits) { assert(rb != NULL); - __atomic_store_n(rb->acl, (size_t) flags, __ATOMIC_SEQ_CST); + robust_mutex_lock(rb->mtx); + __atomic_fetch_or(rb->flags, (size_t) bits, __ATOMIC_SEQ_CST); + 
pthread_cond_broadcast(rb->add); + pthread_cond_broadcast(rb->del); + pthread_mutex_unlock(rb->mtx); +} + +void ssm_rbuff_clr_bits(struct ssm_rbuff * rb, + uint32_t bits) +{ + assert(rb != NULL); + + robust_mutex_lock(rb->mtx); + __atomic_fetch_and(rb->flags, ~(size_t) bits, __ATOMIC_SEQ_CST); + pthread_cond_broadcast(rb->add); + pthread_cond_broadcast(rb->del); + pthread_mutex_unlock(rb->mtx); } -uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb) +uint32_t ssm_rbuff_get_flags(struct ssm_rbuff * rb) { assert(rb != NULL); - return (uint32_t) __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST); + return (uint32_t) __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST); } void ssm_rbuff_fini(struct ssm_rbuff * rb) { assert(rb != NULL); + __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST); + robust_mutex_lock(rb->mtx); pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx); @@ -432,6 +489,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb) robust_wait(rb->del, rb->mtx, NULL); pthread_cleanup_pop(true); + + __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST); } size_t ssm_rbuff_queued(struct ssm_rbuff * rb) diff --git a/src/lib/ssm/ssm.h.in b/src/lib/ssm/ssm.h.in index b9246c8b..57febae4 100644 --- a/src/lib/ssm/ssm.h.in +++ b/src/lib/ssm/ssm.h.in @@ -38,7 +38,6 @@ #define SSM_RBUFF_PREFIX "@SSM_RBUFF_PREFIX@" #define SSM_FLOW_SET_PREFIX "@SSM_FLOW_SET_PREFIX@" #define SSM_POOL_NAME "@SSM_POOL_NAME@" -#define SSM_POOL_BLOCKS @SSM_POOL_BLOCKS@ #define SSM_RBUFF_SIZE @SSM_RBUFF_SIZE@ /* Packet buffer space reservation */ @@ -84,6 +83,7 @@ /* Size class configuration */ #define SSM_POOL_MAX_CLASSES 9 #define SSM_POOL_SHARDS @SSM_POOL_SHARDS@ +#define SSM_POOL_RECLAIM_AGE_S @SSM_POOL_RECLAIM_AGE_S@ /* Internal structures - exposed for testing */ #ifdef __cplusplus @@ -126,6 +126,7 @@ struct ssm_pk_buff { uint32_t pk_head; /* Head offset into data */ uint32_t pk_tail; /* Tail offset into data */ uint32_t off; /* Block offset in pool */ + uint64_t alloc_ts; /* CLOCK_MONOTONIC ns at alloc */ 
uint8_t data[]; /* Packet data */ }; @@ -164,6 +165,24 @@ struct _ssm_pool_hdr { struct _ssm_size_class size_classes[SSM_POOL_MAX_CLASSES]; }; +#define SSM_PK_BUFF_TOTALSPACE (SSM_PK_BUFF_HEADSPACE + SSM_PK_BUFF_TAILSPACE) +static __inline__ int select_size_class(struct _ssm_pool_hdr * hdr, + size_t len) +{ + size_t sz; + int i; + + sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_TOTALSPACE + len; + + for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { + struct _ssm_size_class * sc = &hdr->size_classes[i]; + if (sc->object_size > 0 && sz <= sc->object_size) + return i; + } + + return -1; +} + #ifdef __cplusplus } #endif diff --git a/src/lib/ssm/tests/pool_sharding_test.c b/src/lib/ssm/tests/pool_sharding_test.c index c53105e3..ec464a92 100644 --- a/src/lib/ssm/tests/pool_sharding_test.c +++ b/src/lib/ssm/tests/pool_sharding_test.c @@ -80,19 +80,13 @@ static int test_lazy_distribution(void) goto fail_pool; } - /* Find the first size class with blocks */ - sc_idx = -1; - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - if (hdr->size_classes[i].object_count > 0) { - sc_idx = i; - break; - } - } - + /* Inspect the class that TEST_SIZE allocations will use */ + sc_idx = select_size_class(hdr, TEST_SIZE); if (sc_idx < 0) { - printf("No size classes configured.\n"); + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - printf(" Class %d: count=%zu\n", i, + printf(" Class %d: object_size=%zu count=%zu\n", i, + hdr->size_classes[i].object_size, hdr->size_classes[i].object_count); } goto fail_pool; @@ -137,7 +131,6 @@ static int test_shard_migration(void) ssize_t off; int shard_idx; int sc_idx; - int i; TEST_START(); @@ -149,18 +142,11 @@ static int test_shard_migration(void) hdr = get_pool_hdr(pool); - /* Find the first size class with blocks */ - sc_idx = -1; - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - if (hdr->size_classes[i].object_count > 0) { - sc_idx = i; - break; - } - } - + /* Inspect the class that TEST_SIZE allocations 
will use */ + sc_idx = select_size_class(hdr, TEST_SIZE); if (sc_idx < 0) { - printf("No size classes configured.\n"); - goto fail; + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); + goto fail_pool; } sc = &hdr->size_classes[sc_idx]; @@ -209,7 +195,6 @@ static int test_fallback_stealing(void) size_t total_free; size_t i; int sc_idx; - int c; TEST_START(); @@ -221,18 +206,11 @@ static int test_fallback_stealing(void) hdr = get_pool_hdr(pool); - /* Find the first size class with blocks */ - sc_idx = -1; - for (c = 0; c < SSM_POOL_MAX_CLASSES; c++) { - if (hdr->size_classes[c].object_count > 0) { - sc_idx = c; - break; - } - } - + /* Inspect the class that TEST_SIZE allocations will use */ + sc_idx = select_size_class(hdr, TEST_SIZE); if (sc_idx < 0) { - printf("No size classes configured.\n"); - goto fail; + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); + goto fail_pool; } sc = &hdr->size_classes[sc_idx]; @@ -261,7 +239,7 @@ static int test_fallback_stealing(void) /* Free them all - they go to local_shard */ for (i = 0; i < total_blocks / 2; i++) { - size_t off = ssm_pk_buff_get_idx(spbs[i]); + size_t off = ssm_pk_buff_get_off(spbs[i]); if (ssm_pool_remove(pool, off) != 0) { printf("Remove %zu failed.\n", i); free(spbs); @@ -299,7 +277,7 @@ static int test_fallback_stealing(void) /* Now all allocated blocks are in use again */ /* Cleanup - free all allocated blocks */ for (i = 0; i < total_blocks / 2; i++) { - size_t off = ssm_pk_buff_get_idx(spbs[i]); + size_t off = ssm_pk_buff_get_off(spbs[i]); ssm_pool_remove(pool, off); } @@ -396,20 +374,15 @@ static int test_multiprocess_sharding(void) /* Verify blocks distributed across shards */ hdr = get_pool_hdr(pool); - /* Find the first size class with blocks */ - sc = NULL; - for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) { - if (hdr->size_classes[i].object_count > 0) { - sc = &hdr->size_classes[i]; - break; - } - } - - if (sc == NULL) { - printf("No size classes configured.\n"); + /* Inspect the class 
that TEST_SIZE allocations used */ + i = select_size_class(hdr, TEST_SIZE); + if (i < 0) { + printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE); goto fail_pool; } + sc = &hdr->size_classes[i]; + /* After children allocate and free, blocks should be in shards * (though exact distribution depends on PID values) */ diff --git a/src/lib/ssm/tests/pool_test.c b/src/lib/ssm/tests/pool_test.c index 3fc19cd5..f86fbd9e 100644 --- a/src/lib/ssm/tests/pool_test.c +++ b/src/lib/ssm/tests/pool_test.c @@ -741,14 +741,14 @@ static int test_ssm_pk_buff_operations(void) memcpy(head, data, dlen); - tail = ssm_pk_buff_tail_alloc(spb, 32); + tail = ssm_pk_buff_push_tail(spb, 32); if (tail == NULL) { - printf("Tail_alloc failed.\n"); + printf("push_tail failed.\n"); goto fail_ops; } if (ssm_pk_buff_len(spb) != POOL_256 + 32) { - printf("Length after tail_alloc: %zu.\n", + printf("Length after push_tail: %zu.\n", ssm_pk_buff_len(spb)); goto fail_ops; } @@ -758,14 +758,14 @@ static int test_ssm_pk_buff_operations(void) goto fail_ops; } - tail = ssm_pk_buff_tail_release(spb, 32); + tail = ssm_pk_buff_pop_tail(spb, 32); if (tail == NULL) { - printf("Tail_release failed.\n"); + printf("pop_tail failed.\n"); goto fail_ops; } if (ssm_pk_buff_len(spb) != POOL_256) { - printf("Length after tail_release: %zu.\n", + printf("Length after pop_tail: %zu.\n", ssm_pk_buff_len(spb)); goto fail_ops; } @@ -956,6 +956,8 @@ static int test_ssm_pool_reclaim_orphans(void) ssize_t ret3; pid_t my_pid; pid_t fake_pid = 99999; + struct timespec now; + uint64_t old_ts; TEST_START(); @@ -976,9 +978,15 @@ static int test_ssm_pool_reclaim_orphans(void) goto fail_alloc; } - /* Simulate blocks from another process by changing allocator_pid */ + /* Simulate blocks leaked by a dead process: foreign pid, aged out. 
*/ + clock_gettime(CLOCK_MONOTONIC, &now); + old_ts = ((uint64_t) now.tv_sec - (SSM_POOL_RECLAIM_AGE_S + 1)) + * 1000000000ULL + (uint64_t) now.tv_nsec; + spb1->allocator_pid = fake_pid; spb2->allocator_pid = fake_pid; + spb1->alloc_ts = old_ts; + spb2->alloc_ts = old_ts; /* Keep spb3 with our pid */ /* Reclaim orphans from fake_pid */ diff --git a/src/lib/ssm/tests/rbuff_test.c b/src/lib/ssm/tests/rbuff_test.c index 58cb39c3..48e5a714 100644 --- a/src/lib/ssm/tests/rbuff_test.c +++ b/src/lib/ssm/tests/rbuff_test.c @@ -206,10 +206,10 @@ static int test_ssm_rbuff_fill_drain(void) return TEST_RC_FAIL; } -static int test_ssm_rbuff_acl(void) +static int test_ssm_rbuff_flags(void) { struct ssm_rbuff * rb; - uint32_t acl; + uint32_t flags; TEST_START(); @@ -219,16 +219,16 @@ static int test_ssm_rbuff_acl(void) goto fail; } - acl = ssm_rbuff_get_acl(rb); - if (acl != ACL_RDWR) { - printf("Expected ACL_RDWR, got %u.\n", acl); + flags = ssm_rbuff_get_flags(rb); + if (flags != RB_RDWR) { + printf("Expected RB_RDWR, got %u.\n", flags); goto fail_rb; } - ssm_rbuff_set_acl(rb, ACL_RDONLY); - acl = ssm_rbuff_get_acl(rb); - if (acl != ACL_RDONLY) { - printf("Expected ACL_RDONLY, got %u.\n", acl); + ssm_rbuff_clr_bits(rb, RB_WR); + flags = ssm_rbuff_get_flags(rb); + if (flags != RB_RD) { + printf("Expected RB_RD, got %u.\n", flags); goto fail_rb; } @@ -237,7 +237,7 @@ static int test_ssm_rbuff_acl(void) goto fail_rb; } - ssm_rbuff_set_acl(rb, ACL_FLOWDOWN); + ssm_rbuff_set_bits(rb, RB_FLOWDOWN); if (ssm_rbuff_write(rb, 1) != -EFLOWDOWN) { printf("Expected -EFLOWDOWN on FLOWDOWN.\n"); goto fail_rb; @@ -553,7 +553,7 @@ static int test_ssm_rbuff_blocking_flowdown(void) clock_gettime(PTHREAD_COND_CLOCK, &now); ts_add(&now, &interval, &abs_timeout); - ssm_rbuff_set_acl(rb, ACL_FLOWDOWN); + ssm_rbuff_set_bits(rb, RB_FLOWDOWN); ret = ssm_rbuff_read_b(rb, &abs_timeout); if (ret != -EFLOWDOWN) { @@ -561,7 +561,7 @@ static int test_ssm_rbuff_blocking_flowdown(void) goto fail_rb; } - 
ssm_rbuff_set_acl(rb, ACL_RDWR); + ssm_rbuff_clr_bits(rb, RB_FLOWDOWN); for (i = 0; i < SSM_RBUFF_SIZE - 1; ++i) { if (ssm_rbuff_write(rb, i) < 0) { @@ -573,7 +573,7 @@ static int test_ssm_rbuff_blocking_flowdown(void) clock_gettime(PTHREAD_COND_CLOCK, &now); ts_add(&now, &interval, &abs_timeout); - ssm_rbuff_set_acl(rb, ACL_FLOWDOWN); + ssm_rbuff_set_bits(rb, RB_FLOWDOWN); ret = ssm_rbuff_write_b(rb, 999, &abs_timeout); if (ret != -EFLOWDOWN) { @@ -581,7 +581,7 @@ static int test_ssm_rbuff_blocking_flowdown(void) goto fail_rb; } - ssm_rbuff_set_acl(rb, ACL_RDWR); + ssm_rbuff_clr_bits(rb, RB_FLOWDOWN); while (ssm_rbuff_read(rb) >= 0) ; @@ -664,7 +664,7 @@ int rbuff_test(int argc, ret |= test_ssm_rbuff_write_read(); ret |= test_ssm_rbuff_read_empty(); ret |= test_ssm_rbuff_fill_drain(); - ret |= test_ssm_rbuff_acl(); + ret |= test_ssm_rbuff_flags(); ret |= test_ssm_rbuff_open_close(); ret |= test_ssm_rbuff_threaded(); ret |= test_ssm_rbuff_blocking(); diff --git a/src/lib/tests/CMakeLists.txt b/src/lib/tests/CMakeLists.txt index 5a2f2c52..002d94af 100644 --- a/src/lib/tests/CMakeLists.txt +++ b/src/lib/tests/CMakeLists.txt @@ -10,20 +10,24 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c auth_test_slh_dsa.c bitmap_test.c btree_test.c - crc32_test.c crypt_test.c hash_test.c kex_test.c kex_test_ml_kem.c + keyrot_test.c md5_test.c sha3_test.c sockets_test.c time_test.c tpm_test.c + tw_test.c ) add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests}) +target_include_directories(${PARENT_DIR}_test PRIVATE + ${CMAKE_SOURCE_DIR}/src/lib) + disable_test_logging_for_target(${PARENT_DIR}_test) target_link_libraries(${PARENT_DIR}_test ouroboros-common) diff --git a/src/lib/tests/auth_test.c b/src/lib/tests/auth_test.c index 0f3ef715..af7cf81c 100644 --- a/src/lib/tests/auth_test.c +++ b/src/lib/tests/auth_test.c @@ -24,11 +24,14 @@ #include <test/test.h> #include <ouroboros/crypt.h> +#include <ouroboros/name.h> #include <ouroboros/random.h> #include 
<ouroboros/utils.h> #include <test/certs/ecdsa.h> +#include <string.h> + #define TEST_MSG_SIZE 1500 static int test_auth_create_destroy_ctx(void) @@ -138,6 +141,47 @@ static int test_check_crt_name(void) return TEST_RC_FAIL; } +static int test_crt_name_confusion(void) +{ + char name[NAME_SIZE + 1]; + void * crt; + + TEST_START(); + + if (crypt_load_crt_str(confused_crt_ec, &crt) < 0) { + printf("Failed to load name-confusion certificate.\n"); + goto fail_load; + } + + /* Must extract the real CN, not the "CN=" decoy in the O field. */ + if (crypt_get_crt_name(crt, name) < 0) { + printf("Failed to extract name from certificate.\n"); + goto fail_check; + } + + if (strcmp(name, "attacker.unittest.o7s") != 0) { + printf("Extracted '%s', expected real CN.\n", name); + goto fail_check; + } + + /* The decoy name in the O field must never authenticate. */ + if (crypt_check_crt_name(crt, "victim.unittest.o7s") == 0) { + printf("Accepted spoofed name from O field.\n"); + goto fail_check; + } + + crypt_free_crt(crt); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_check: + crypt_free_crt(crt); + fail_load: + TEST_FAIL(); + return TEST_RC_FAIL; +} + static int test_load_free_privkey(void) { void * key; @@ -400,6 +444,98 @@ static int test_verify_crt_missing_root_ca(void) return TEST_RC_FAIL; } +/* auth_verify_crt_pin: pin must lie in the verified chain (NULL: any) */ +static int test_verify_crt_pin(void) +{ + struct auth_ctx * auth; + void * _root_ca_crt; + void * _im_ca_crt; + void * _signed_server_crt; + void * _other_ca_crt; + + TEST_START(); + + auth = auth_create_ctx(); + if (auth == NULL) { + printf("Failed to create auth context.\n"); + goto fail_create_ctx; + } + + if (crypt_load_crt_str(root_ca_crt_ec, &_root_ca_crt) < 0) { + printf("Failed to load root crt from string.\n"); + goto fail_load_root_ca; + } + + if (crypt_load_crt_str(im_ca_crt_ec, &_im_ca_crt) < 0) { + printf("Failed to load intermediate crt from string.\n"); + goto fail_load_im_ca; + } + + if 
(crypt_load_crt_str(signed_server_crt_ec, &_signed_server_crt) < 0) { + printf("Failed to load signed crt from string.\n"); + goto fail_load_signed; + } + + if (crypt_load_crt_str(other_ca_crt_ec, &_other_ca_crt) < 0) { + printf("Failed to load out-of-chain crt from string.\n"); + goto fail_load_other; + } + + if (auth_add_crt_to_store(auth, _root_ca_crt) < 0) { + printf("Failed to add root ca crt to auth store.\n"); + goto fail_verify; + } + + if (auth_add_crt_to_store(auth, _im_ca_crt) < 0) { + printf("Failed to add intermediate ca crt to auth store.\n"); + goto fail_verify; + } + + if (auth_verify_crt_pin(auth, _signed_server_crt, _im_ca_crt) < 0) { + printf("Failed to accept pin on intermediate CA.\n"); + goto fail_verify; + } + + if (auth_verify_crt_pin(auth, _signed_server_crt, _root_ca_crt) < 0) { + printf("Failed to accept pin on root CA.\n"); + goto fail_verify; + } + + if (auth_verify_crt_pin(auth, _signed_server_crt, _other_ca_crt) == 0) { + printf("Failed to reject out-of-chain pin.\n"); + goto fail_verify; + } + + if (auth_verify_crt_pin(auth, _signed_server_crt, NULL) < 0) { + printf("Failed to accept NULL (any) pin.\n"); + goto fail_verify; + } + + crypt_free_crt(_other_ca_crt); + crypt_free_crt(_signed_server_crt); + crypt_free_crt(_im_ca_crt); + crypt_free_crt(_root_ca_crt); + + auth_destroy_ctx(auth); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_verify: + crypt_free_crt(_other_ca_crt); + fail_load_other: + crypt_free_crt(_signed_server_crt); + fail_load_signed: + crypt_free_crt(_im_ca_crt); + fail_load_im_ca: + crypt_free_crt(_root_ca_crt); + fail_load_root_ca: + auth_destroy_ctx(auth); + fail_create_ctx: + TEST_FAIL(); + return TEST_RC_FAIL; +} + int test_auth_sign(void) { uint8_t buf[TEST_MSG_SIZE]; @@ -573,6 +709,7 @@ int auth_test(int argc, #ifdef HAVE_OPENSSL ret |= test_load_free_crt(); ret |= test_check_crt_name(); + ret |= test_crt_name_confusion(); ret |= test_crypt_get_pubkey_crt(); ret |= test_load_free_privkey(); ret |= 
test_load_free_pubkey(); @@ -580,12 +717,14 @@ int auth_test(int argc, ret |= test_store_add(); ret |= test_verify_crt(); ret |= test_verify_crt_missing_root_ca(); + ret |= test_verify_crt_pin(); ret |= test_auth_sign(); ret |= test_auth_bad_signature(); ret |= test_crt_str(); #else (void) test_load_free_crt; (void) test_check_crt_name; + (void) test_crt_name_confusion; (void) test_crypt_get_pubkey_crt; (void) test_load_free_privkey; (void) test_load_free_pubkey; @@ -593,6 +732,7 @@ int auth_test(int argc, (void) test_store_add; (void) test_verify_crt; (void) test_verify_crt_missing_root_ca; + (void) test_verify_crt_pin; (void) test_auth_sign; (void) test_auth_bad_signature; (void) test_crt_str; diff --git a/src/lib/tests/crypt_test.c b/src/lib/tests/crypt_test.c index 028c4eb5..2d752238 100644 --- a/src/lib/tests/crypt_test.c +++ b/src/lib/tests/crypt_test.c @@ -30,6 +30,7 @@ #include <stdio.h> #define TEST_PACKET_SIZE 1500 +#define TEST_N_PACKETS 1000 extern const uint16_t crypt_supported_nids[]; extern const uint16_t md_supported_nids[]; @@ -39,9 +40,10 @@ static int test_crypt_create_destroy(void) struct crypt_ctx * ctx; uint8_t key[SYMMKEYSZ]; struct crypt_sk sk = { - .nid = NID_aes_256_gcm, - .key = key, - .rot_bit = KEY_ROTATION_BIT + .nid = NID_aes_256_gcm, + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_INIT }; TEST_START(); @@ -67,18 +69,27 @@ static int test_crypt_create_destroy(void) static int test_crypt_encrypt_decrypt(int nid) { uint8_t pkt[TEST_PACKET_SIZE]; - struct crypt_ctx * ctx; + struct crypt_ctx * tx; + struct crypt_ctx * rx; uint8_t key[SYMMKEYSZ]; - struct crypt_sk sk = { - .nid = NID_aes_256_gcm, - .key = key, - .rot_bit = KEY_ROTATION_BIT + struct crypt_sk sk_tx = { + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_INIT + }; + struct crypt_sk sk_rx = { + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_RESP }; buffer_t in; buffer_t out; buffer_t out2; const char * cipher; + sk_tx.nid = nid; + sk_rx.nid = nid; + cipher = 
crypt_nid_to_str(nid); TEST_START("(%s)", cipher); @@ -92,53 +103,63 @@ static int test_crypt_encrypt_decrypt(int nid) goto fail_init; } - ctx = crypt_create_ctx(&sk); - if (ctx == NULL) { - printf("Failed to initialize cryptography.\n"); + tx = crypt_create_ctx(&sk_tx); + if (tx == NULL) { + printf("Failed to initialize TX cryptography.\n"); goto fail_init; } + rx = crypt_create_ctx(&sk_rx); + if (rx == NULL) { + printf("Failed to initialize RX cryptography.\n"); + goto fail_tx; + } + in.len = sizeof(pkt); in.data = pkt; - if (crypt_encrypt(ctx, in, &out) < 0) { + if (crypt_encrypt(tx, in, &out) < 0) { printf("Encryption failed.\n"); goto fail_encrypt; } if (out.len < in.len) { printf("Encryption returned too little data.\n"); - goto fail_encrypt; + goto fail_chk; } - if (crypt_decrypt(ctx, out, &out2) < 0) { + if (crypt_decrypt(rx, out, &out2) < 0) { printf("Decryption failed.\n"); goto fail_decrypt; } if (out2.len != in.len) { printf("Decrypted data length does not match original.\n"); - goto fail_chk; + goto fail_chk2; } if (memcmp(in.data, out2.data, in.len) != 0) { printf("Decrypted data does not match original.\n"); - goto fail_chk; + goto fail_chk2; } - crypt_destroy_ctx(ctx); freebuf(out2); freebuf(out); + crypt_destroy_ctx(rx); + crypt_destroy_ctx(tx); TEST_SUCCESS("(%s)", cipher); return TEST_RC_SUCCESS; - fail_chk: + fail_chk2: freebuf(out2); fail_decrypt: + fail_chk: freebuf(out); fail_encrypt: - crypt_destroy_ctx(ctx); + crypt_destroy_ctx(rx); + fail_tx: + crypt_destroy_ctx(tx); fail_init: TEST_FAIL("(%s)", cipher); return TEST_RC_FAIL; @@ -155,6 +176,214 @@ static int test_encrypt_decrypt_all(void) return ret; } +static int test_crypt_multi_packet(int nid) +{ + uint8_t pkt[TEST_PACKET_SIZE]; + struct crypt_ctx * tx; + struct crypt_ctx * rx; + uint8_t key[SYMMKEYSZ]; + struct crypt_sk sk_tx = { + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_INIT + }; + struct crypt_sk sk_rx = { + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_RESP + }; + buffer_t 
in; + buffer_t enc; + buffer_t dec; + const char * cipher; + int i; + + sk_tx.nid = nid; + sk_rx.nid = nid; + + cipher = crypt_nid_to_str(nid); + TEST_START("(%s)", cipher); + + if (random_buffer(key, sizeof(key)) < 0) { + printf("Failed to generate random key.\n"); + goto fail_init; + } + + if (random_buffer(pkt, sizeof(pkt)) < 0) { + printf("Failed to generate random data.\n"); + goto fail_init; + } + + tx = crypt_create_ctx(&sk_tx); + if (tx == NULL) { + printf("Failed to create TX context.\n"); + goto fail_init; + } + + rx = crypt_create_ctx(&sk_rx); + if (rx == NULL) { + printf("Failed to create RX context.\n"); + goto fail_tx; + } + + in.len = sizeof(pkt); + in.data = pkt; + + for (i = 0; i < TEST_N_PACKETS; i++) { + if (crypt_encrypt(tx, in, &enc) < 0) { + printf("Encryption failed at packet %d.\n", i); + goto fail_rx; + } + + if (crypt_decrypt(rx, enc, &dec) < 0) { + printf("Decryption failed at packet %d.\n", i); + freebuf(enc); + goto fail_rx; + } + + if (dec.len != in.len || + memcmp(in.data, dec.data, in.len) != 0) { + printf("Data mismatch at packet %d.\n", i); + freebuf(dec); + freebuf(enc); + goto fail_rx; + } + + freebuf(dec); + freebuf(enc); + } + + crypt_destroy_ctx(rx); + crypt_destroy_ctx(tx); + + TEST_SUCCESS("(%s)", cipher); + + return TEST_RC_SUCCESS; + fail_rx: + crypt_destroy_ctx(rx); + fail_tx: + crypt_destroy_ctx(tx); + fail_init: + TEST_FAIL("(%s)", cipher); + return TEST_RC_FAIL; +} + +static int test_multi_packet_all(void) +{ + int ret = 0; + int i; + + for (i = 0; crypt_supported_nids[i] != NID_undef; i++) + ret |= test_crypt_multi_packet(crypt_supported_nids[i]); + + return ret; +} + +static int test_crypt_aad_tamper(int nid) +{ + uint8_t pkt[TEST_PACKET_SIZE]; + struct crypt_ctx * tx; + struct crypt_ctx * rx; + uint8_t key[SYMMKEYSZ]; + struct crypt_sk sk_tx = { + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_INIT + }; + struct crypt_sk sk_rx = { + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_RESP + }; + buffer_t in; + buffer_t 
enc; + buffer_t dec; + const char * cipher; + + sk_tx.nid = nid; + sk_rx.nid = nid; + + cipher = crypt_nid_to_str(nid); + TEST_START("(%s)", cipher); + + if (random_buffer(key, sizeof(key)) < 0) { + printf("Failed to generate random key.\n"); + goto fail_init; + } + + if (random_buffer(pkt, sizeof(pkt)) < 0) { + printf("Failed to generate random data.\n"); + goto fail_init; + } + + tx = crypt_create_ctx(&sk_tx); + if (tx == NULL) { + printf("Failed to create TX context.\n"); + goto fail_init; + } + + rx = crypt_create_ctx(&sk_rx); + if (rx == NULL) { + printf("Failed to create RX context.\n"); + goto fail_tx; + } + + /* Only AEAD ciphers bind the selector as AAD. */ + if (crypt_get_tagsz(tx) == 0) { + crypt_destroy_ctx(rx); + crypt_destroy_ctx(tx); + + TEST_SUCCESS("(%s)", cipher); + + return TEST_RC_SUCCESS; + } + + in.len = sizeof(pkt); + in.data = pkt; + + if (crypt_encrypt(tx, in, &enc) < 0) { + printf("Encryption failed.\n"); + goto fail_rx; + } + + /* Flip a seq byte: epoch/node stay valid so the AEAD tag rejects. 
*/ + enc.data[5] ^= 0x01; + + if (crypt_decrypt(rx, enc, &dec) == 0) { + printf("Decryption accepted a tampered selector.\n"); + freebuf(dec); + freebuf(enc); + goto fail_rx; + } + + freebuf(enc); + + crypt_destroy_ctx(rx); + crypt_destroy_ctx(tx); + + TEST_SUCCESS("(%s)", cipher); + + return TEST_RC_SUCCESS; + fail_rx: + crypt_destroy_ctx(rx); + fail_tx: + crypt_destroy_ctx(tx); + fail_init: + TEST_FAIL("(%s)", cipher); + return TEST_RC_FAIL; +} + +static int test_aad_tamper_all(void) +{ + int ret = 0; + int i; + + for (i = 0; crypt_supported_nids[i] != NID_undef; i++) + ret |= test_crypt_aad_tamper(crypt_supported_nids[i]); + + return ret; +} + #ifdef HAVE_OPENSSL #include <openssl/evp.h> #include <openssl/obj_mac.h> @@ -256,109 +485,17 @@ static int test_md_nid_values(void) } #endif -static int test_key_rotation(void) +static int test_crypt_headsz(void) { - uint8_t pkt[TEST_PACKET_SIZE]; - struct crypt_ctx * tx_ctx; - struct crypt_ctx * rx_ctx; - uint8_t key[SYMMKEYSZ]; - struct crypt_sk sk = { - .nid = NID_aes_256_gcm, - .key = key, - .rot_bit = 7 - }; - buffer_t in; - buffer_t enc; - buffer_t dec; - uint32_t i; - uint32_t threshold; - - TEST_START(); - - if (random_buffer(key, sizeof(key)) < 0) { - printf("Failed to generate random key.\n"); - goto fail; - } - - if (random_buffer(pkt, sizeof(pkt)) < 0) { - printf("Failed to generate random data.\n"); - goto fail; - } - - tx_ctx = crypt_create_ctx(&sk); - if (tx_ctx == NULL) { - printf("Failed to create TX context.\n"); - goto fail; - } - - rx_ctx = crypt_create_ctx(&sk); - if (rx_ctx == NULL) { - printf("Failed to create RX context.\n"); - goto fail_tx; - } - - in.len = sizeof(pkt); - in.data = pkt; - - threshold = (1U << sk.rot_bit); - - /* Encrypt and decrypt across multiple rotations */ - for (i = 0; i < threshold * 3; i++) { - if (crypt_encrypt(tx_ctx, in, &enc) < 0) { - printf("Encryption failed at packet %u.\n", i); - goto fail_rx; - } - - if (crypt_decrypt(rx_ctx, enc, &dec) < 0) { - printf("Decryption 
failed at packet %u.\n", i); - freebuf(enc); - goto fail_rx; - } - - if (dec.len != in.len || - memcmp(in.data, dec.data, in.len) != 0) { - printf("Data mismatch at packet %u.\n", i); - freebuf(dec); - freebuf(enc); - goto fail_rx; - } - - freebuf(dec); - freebuf(enc); - } - - crypt_destroy_ctx(rx_ctx); - crypt_destroy_ctx(tx_ctx); - - TEST_SUCCESS(); - - return TEST_RC_SUCCESS; - fail_rx: - crypt_destroy_ctx(rx_ctx); - fail_tx: - crypt_destroy_ctx(tx_ctx); - fail: - TEST_FAIL(); - return TEST_RC_FAIL; -} - -static int test_key_phase_bit(void) -{ - uint8_t pkt[TEST_PACKET_SIZE]; struct crypt_ctx * ctx; uint8_t key[SYMMKEYSZ]; struct crypt_sk sk = { - .nid = NID_aes_256_gcm, - .key = key, - .rot_bit = 7 + .nid = NID_aes_256_gcm, + .key = key, + .epoch = 0, + .role = CRYPT_ROLE_INIT }; - buffer_t in; - buffer_t out; - uint32_t count; - uint32_t threshold; - uint8_t phase_before; - uint8_t phase_after; - int ivsz; + int headsz; TEST_START(); @@ -367,58 +504,15 @@ static int test_key_phase_bit(void) goto fail; } - if (random_buffer(pkt, sizeof(pkt)) < 0) { - printf("Failed to generate random data.\n"); - goto fail; - } - ctx = crypt_create_ctx(&sk); if (ctx == NULL) { printf("Failed to initialize cryptography.\n"); goto fail; } - ivsz = crypt_get_ivsz(ctx); - if (ivsz <= 0) { - printf("Invalid IV size.\n"); - goto fail_ctx; - } - - in.len = sizeof(pkt); - in.data = pkt; - - /* Encrypt packets up to just before rotation threshold */ - threshold = (1U << sk.rot_bit); - - /* Encrypt threshold - 1 packets (indices 0 to threshold-2) */ - for (count = 0; count < threshold - 1; count++) { - if (crypt_encrypt(ctx, in, &out) < 0) { - printf("Encryption failed at count %u.\n", count); - goto fail_ctx; - } - freebuf(out); - } - - /* Packet at index threshold-1: phase should still be initial */ - if (crypt_encrypt(ctx, in, &out) < 0) { - printf("Encryption failed before rotation.\n"); - goto fail_ctx; - } - phase_before = (out.data[0] & 0x80) ? 
1 : 0; - freebuf(out); - - /* Packet at index threshold: phase should have toggled */ - if (crypt_encrypt(ctx, in, &out) < 0) { - printf("Encryption failed at rotation threshold.\n"); - goto fail_ctx; - } - phase_after = (out.data[0] & 0x80) ? 1 : 0; - freebuf(out); - - /* Phase bit should have toggled */ - if (phase_before == phase_after) { - printf("Phase bit did not toggle: before=%u, after=%u.\n", - phase_before, phase_after); + headsz = crypt_get_headsz(ctx); + if (headsz != 6) { + printf("Unexpected header size: %d (expected 6).\n", headsz); goto fail_ctx; } @@ -447,11 +541,13 @@ int crypt_test(int argc, #ifdef HAVE_OPENSSL ret |= test_cipher_nid_values(); ret |= test_md_nid_values(); - ret |= test_key_rotation(); - ret |= test_key_phase_bit(); + ret |= test_multi_packet_all(); + ret |= test_aad_tamper_all(); + ret |= test_crypt_headsz(); #else - (void) test_key_rotation; - (void) test_key_phase_bit; + (void) test_multi_packet_all; + (void) test_aad_tamper_all; + (void) test_crypt_headsz; return TEST_RC_SKIP; #endif diff --git a/src/lib/tests/hash_test.c b/src/lib/tests/hash_test.c index e43847e1..a2ba62cc 100644 --- a/src/lib/tests/hash_test.c +++ b/src/lib/tests/hash_test.c @@ -39,6 +39,79 @@ struct vec_entry { char * out; }; +struct mix_entry { + uint64_t in; + uint64_t out; +}; + +static int test_crc8(void) +{ + int ret = 0; + + struct vec_entry vec [] = { + { "", "00" }, + { "123456789", "df" }, + { NULL, NULL } + }; + + struct vec_entry * cur = vec; + + TEST_START(); + + while (cur->in != NULL) { + uint8_t crc; + char res[3]; + + str_hash(HASH_CRC8, &crc, cur->in); + + sprintf(res, "%02x", crc); + if (strcmp(res, cur->out) != 0) { + printf("Hash failed %s != %s.\n", res, cur->out); + ret |= -1; + } + + ++cur; + } + + TEST_END(ret); + + return ret; +} + +static int test_crc16(void) +{ + int ret = 0; + + struct vec_entry vec [] = { + { "", "ffff" }, + { "123456789", "29b1" }, + { NULL, NULL } + }; + + struct vec_entry * cur = vec; + + TEST_START(); + + 
while (cur->in != NULL) { + uint8_t crc[2]; + char res[5]; + + str_hash(HASH_CRC16, crc, cur->in); + + sprintf(res, "%02x%02x", crc[0], crc[1]); + if (strcmp(res, cur->out) != 0) { + printf("Hash failed %s != %s.\n", res, cur->out); + ret |= -1; + } + + ++cur; + } + + TEST_END(ret); + + return ret; +} + static int test_crc32(void) { int ret = 0; @@ -74,6 +147,42 @@ static int test_crc32(void) return ret; } +static int test_crc64(void) +{ + int ret = 0; + + struct vec_entry vec [] = { + { "", "0000000000000000" }, + { "123456789", "ae8b14860a799888" }, + { "0123456789abcdef", + "091485ca7018730e" }, + { NULL, NULL } + }; + + struct vec_entry * cur = vec; + + TEST_START(); + + while (cur->in != NULL) { + uint8_t crc[8]; + char res[17]; + + str_hash(HASH_CRC64, crc, cur->in); + + sprintf(res, HASH_FMT64, HASH_VAL64(crc)); + if (strcmp(res, cur->out) != 0) { + printf("Hash failed %s != %s.\n", res, cur->out); + ret |= -1; + } + + ++cur; + } + + TEST_END(ret); + + return ret; +} + static int test_md5(void) { int ret = 0; @@ -184,6 +293,36 @@ static int test_sha3(void) return ret; } +static int test_mix64(void) +{ + int ret = 0; + + struct mix_entry vec [] = { + { 0x0000000000000000ULL, 0x0000000000000000ULL }, + { 0x123456789abcdefeULL, 0xb1943cfea4f78f08ULL } + }; + + size_t n = sizeof(vec) / sizeof(vec[0]); + size_t i; + + TEST_START(); + + for (i = 0; i < n; i++) { + uint64_t res = hash_mix64(vec[i].in); + + if (res != vec[i].out) { + printf("Mix failed %016llx != %016llx.\n", + (unsigned long long) res, + (unsigned long long) vec[i].out); + ret |= -1; + } + } + + TEST_END(ret); + + return ret; +} + int hash_test(int argc, char ** argv) { @@ -192,11 +331,19 @@ int hash_test(int argc, (void) argc; (void) argv; + ret |= test_crc8(); + + ret |= test_crc16(); + ret |= test_crc32(); + ret |= test_crc64(); + ret |= test_md5(); ret |= test_sha3(); + ret |= test_mix64(); + return ret; } diff --git a/src/lib/tests/kex_test.c b/src/lib/tests/kex_test.c index 6a4f802e..0a00ccab 
100644 --- a/src/lib/tests/kex_test.c +++ b/src/lib/tests/kex_test.c @@ -44,6 +44,9 @@ #define KEX_CONFIG_NONE \ "none\n" +#define KEX_CONFIG_NO_ENC \ + "encryption=none\n" + #define KEX_CONFIG_WHITESPACE \ "# Comment line\n" \ "kex = X448" \ @@ -58,6 +61,31 @@ "kex=X25519\n" \ "digest=sha384\n" +#define KEX_CONFIG_AUTH \ + "auth=required\n" + +#define KEX_CONFIG_AUTH_INVALID \ + "auth=mandatory\n" + +#define KEX_CONFIG_AUTH_OPTIONAL \ + "auth=optional\n" + +#define KEX_CONFIG_AUTH_THEN_NO_ENC \ + "auth=required\n" \ + "digest=sha512\n" \ + "encryption=none\n" + +#define KEX_CONFIG_NO_ENC_THEN_AUTH \ + "encryption=none\n" \ + "auth=required\n" \ + "digest=sha512\n" + +#define KEX_CONFIG_CACERT \ + "cacert=/etc/ouroboros/security/cacert/ca.crt\n" + +#define KEX_CONFIG_UNKNOWN_KEY \ + "autth=required\n" + /* Test key material for key loading tests */ #define X25519_PRIVKEY_PEM \ "-----BEGIN PRIVATE KEY-----\n" \ @@ -213,6 +241,7 @@ static int test_kex_dhe_derive(const char * algo) memset(&kex, 0, sizeof(kex)); SET_KEX_ALGO(&kex, algo); + SET_KEX_KDF_NID(&kex, NID_sha256); len = kex_pkp_create(&kex, &pkp1, buf1); if (len < 0) { @@ -324,6 +353,7 @@ static int test_kex_dhe_corrupted_pubkey(const char * algo) memset(&kex, 0, sizeof(kex)); SET_KEX_ALGO(&kex, algo); + SET_KEX_KDF_NID(&kex, NID_sha256); len = kex_pkp_create(&kex, &pkp, buf); if (len < 0) { @@ -375,6 +405,8 @@ static int test_kex_dhe_wrong_algo(void) memset(&kex2, 0, sizeof(kex2)); SET_KEX_ALGO(&kex1, algo1); SET_KEX_ALGO(&kex2, algo2); + SET_KEX_KDF_NID(&kex1, NID_sha256); + SET_KEX_KDF_NID(&kex2, NID_sha256); if (kex_pkp_create(&kex1, &pkp1, buf1) < 0) { printf("Failed to create first key pair.\n"); @@ -639,7 +671,8 @@ static int test_kex_parse_config_custom(void) return TEST_RC_FAIL; } -static int test_kex_parse_config_none(void) +/* The old bare 'none' keyword must be rejected loudly */ +static int test_kex_parse_config_none_rejected(void) { struct sec_config kex; FILE * fp; @@ -654,14 +687,51 @@ static 
int test_kex_parse_config_none(void) goto fail; } + if (parse_sec_config(&kex, fp) == 0) { + printf("Bare 'none' keyword should be rejected.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_kex_parse_config_no_enc(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + + fp = FMEMOPEN_STR(KEX_CONFIG_NO_ENC); + if (fp == NULL) { + printf("Failed to open memory stream.\n"); + goto fail; + } + if (parse_sec_config(&kex, fp) < 0) { - printf("Failed to parse 'none' config.\n"); + printf("Failed to parse encryption=none config.\n"); fclose(fp); goto fail; } - if (kex.x.nid != NID_undef) { - printf("'none' keyword should disable encryption.\n"); + if (kex.x.nid != NID_undef || kex.c.nid != NID_undef) { + printf("encryption=none should disable encryption.\n"); + fclose(fp); + goto fail; + } + + if (kex.d.nid != NID_sha256) { + printf("encryption=none should keep the digest.\n"); fclose(fp); goto fail; } @@ -799,6 +869,277 @@ static int test_kex_parse_config_digest(void) return TEST_RC_FAIL; } +static int test_kex_parse_config_auth(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + + fp = FMEMOPEN_STR(KEX_CONFIG_AUTH); + if (fp == NULL) { + printf("Failed to open memory stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) < 0) { + printf("Failed to parse auth config.\n"); + fclose(fp); + goto fail; + } + + if (!kex.a.req) { + printf("auth=required not parsed correctly.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_kex_parse_config_auth_invalid(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + + fp = FMEMOPEN_STR(KEX_CONFIG_AUTH_INVALID); + if (fp == NULL) { + printf("Failed to open 
memory stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) == 0) { + printf("Invalid auth value should be rejected.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* A caller-seeded req_auth survives parsing when no auth= line is set */ +static int test_kex_parse_config_auth_seed(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + kex.a.req = true; + + fp = FMEMOPEN_STR(KEX_CONFIG_NO_ENC); + if (fp == NULL) { + printf("Failed to open memory stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) < 0) { + printf("Failed to parse config.\n"); + fclose(fp); + goto fail; + } + + if (!kex.a.req) { + printf("Seeded req_auth should survive parsing.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* An explicit auth=optional clears a caller-seeded req_auth */ +static int test_kex_parse_config_auth_optional(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + kex.a.req = true; + + fp = FMEMOPEN_STR(KEX_CONFIG_AUTH_OPTIONAL); + if (fp == NULL) { + printf("Failed to open memory stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) < 0) { + printf("Failed to parse auth=optional config.\n"); + fclose(fp); + goto fail; + } + + if (kex.a.req) { + printf("auth=optional should clear req_auth.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* encryption=none must not drop auth=required or the digest */ +static int test_kex_parse_config_auth_no_enc(const char * config) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + + fp = FMEMOPEN_STR(config); + if (fp == NULL) { + printf("Failed to open memory 
stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) < 0) { + printf("Failed to parse auth + encryption=none.\n"); + fclose(fp); + goto fail; + } + + if (!kex.a.req) { + printf("encryption=none should not drop required auth.\n"); + fclose(fp); + goto fail; + } + + if (kex.x.nid != NID_undef) { + printf("encryption=none should disable encryption.\n"); + fclose(fp); + goto fail; + } + + if (kex.d.nid != NID_sha512) { + printf("encryption=none should keep the digest.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_kex_parse_config_cacert(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + + fp = FMEMOPEN_STR(KEX_CONFIG_CACERT); + if (fp == NULL) { + printf("Failed to open memory stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) < 0) { + printf("Failed to parse cacert config.\n"); + fclose(fp); + goto fail; + } + + if (strcmp(kex.a.cacert, + "/etc/ouroboros/security/cacert/ca.crt") != 0) { + printf("cacert not parsed correctly.\n"); + fclose(fp); + goto fail; + } + + if (kex.a.req) { + printf("cacert must not imply req_auth.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_kex_parse_config_unknown_key(void) +{ + struct sec_config kex; + FILE * fp; + + TEST_START(); + + memset(&kex, 0, sizeof(kex)); + + fp = FMEMOPEN_STR(KEX_CONFIG_UNKNOWN_KEY); + if (fp == NULL) { + printf("Failed to open memory stream.\n"); + goto fail; + } + + if (parse_sec_config(&kex, fp) == 0) { + printf("Unknown key should be rejected.\n"); + fclose(fp); + goto fail; + } + + fclose(fp); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + int kex_test(int argc, char ** argv) { @@ -809,7 +1150,16 @@ int kex_test(int argc, ret |= 
test_kex_create_destroy(); ret |= test_kex_parse_config_empty(); - ret |= test_kex_parse_config_none(); + ret |= test_kex_parse_config_none_rejected(); + ret |= test_kex_parse_config_no_enc(); + ret |= test_kex_parse_config_auth(); + ret |= test_kex_parse_config_auth_invalid(); + ret |= test_kex_parse_config_auth_seed(); + ret |= test_kex_parse_config_auth_optional(); + ret |= test_kex_parse_config_auth_no_enc(KEX_CONFIG_AUTH_THEN_NO_ENC); + ret |= test_kex_parse_config_auth_no_enc(KEX_CONFIG_NO_ENC_THEN_AUTH); + ret |= test_kex_parse_config_cacert(); + ret |= test_kex_parse_config_unknown_key(); #ifdef HAVE_OPENSSL ret |= test_kex_parse_config_custom(); ret |= test_kex_parse_config_whitespace(); diff --git a/src/lib/tests/keyrot_test.c b/src/lib/tests/keyrot_test.c new file mode 100644 index 00000000..1c9f741b --- /dev/null +++ b/src/lib/tests/keyrot_test.c @@ -0,0 +1,1083 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Test of the key-rotation schedule + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. 
+ */ + +#define _POSIX_C_SOURCE 200809L + +#include "config.h" + +#include <test/test.h> + +#ifdef HAVE_OPENSSL +#include <ouroboros/crypt.h> +#include <ouroboros/pthread.h> + +#include "crypt/keyrot.h" + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +static const uint8_t SEED_A[SYMMKEYSZ] = { + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20 +}; + +static int test_create_destroy(void) +{ + struct keyrot * kr; + + TEST_START(); + + kr = keyrot_create(SEED_A, 0, 0); + if (kr == NULL) + goto fail; + + keyrot_destroy(kr); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_epoch_range(void) +{ + struct keyrot * a; + + TEST_START(); + + /* epoch is a 4-bit wire field; 16 and up must be refused. */ + if (keyrot_create(SEED_A, 16, 0) != NULL) + goto fail; + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + if (keyrot_rekey(a, SEED_A, 16) == 0) + goto fail_a; + + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tx_deterministic(void) +{ + struct keyrot * a; + struct keyrot * b; + uint8_t sela[KR_SELECTOR_LEN]; + uint8_t selb[KR_SELECTOR_LEN]; + uint8_t na[KR_NONCE_LEN]; + uint8_t nb[KR_NONCE_LEN]; + const uint8_t * ka; + const uint8_t * kb; + + TEST_START(); + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + b = keyrot_create(SEED_A, 0, 0); + if (b == NULL) + goto fail_a; + + if (keyrot_tx_next(a, sela, &ka, na) != 0) + goto fail_b; + + if (keyrot_tx_next(b, selb, &kb, nb) != 0) + goto fail_b; + + if (memcmp(sela, selb, KR_SELECTOR_LEN) != 0) + goto fail_b; + + if (memcmp(ka, kb, SYMMKEYSZ) != 0) + goto fail_b; + + if (memcmp(na, nb, KR_NONCE_LEN) != 
0) + goto fail_b; + + keyrot_destroy(b); + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_selector_layout(void) +{ + struct keyrot * a; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t nonce[KR_NONCE_LEN]; + const uint8_t * k; + + TEST_START(); + + a = keyrot_create(SEED_A, 3, 0); + if (a == NULL) + goto fail; + + /* First packet: epoch 3, node 0, seq 0 */ + if (keyrot_tx_next(a, sel, &k, nonce) != 0) + goto fail_a; + + if ((sel[0] >> 4) != 3) /* epoch */ + goto fail_a; + + if ((((sel[0] & 0x0F) << 8) | sel[1]) != 0) /* node */ + goto fail_a; + + if (sel[2] != 0 || sel[3] != 0 || sel[4] != 0 || sel[5] != 0) + goto fail_a; + + /* Second packet: seq advances to 1 */ + if (keyrot_tx_next(a, sel, &k, nonce) != 0) + goto fail_a; + + if (sel[5] != 1) + goto fail_a; + + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_nodes_left_initial(void) +{ + struct keyrot * a; + + TEST_START(); + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + if (keyrot_tx_nodes_left(a) != KEY_NODE_COUNT) + goto fail_a; + + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_roundtrip(void) +{ + struct keyrot * a; /* role 0 */ + struct keyrot * b; /* role 1 */ + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t ntx[KR_NONCE_LEN]; + uint8_t nrx[KR_NONCE_LEN]; + uint8_t ktx[SYMMKEYSZ]; + const uint8_t * ptx; + const uint8_t * prx; + struct kr_rx rx; + int i; + + TEST_START(); + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail_a; + + for (i = 0; i < 256; i++) { + if (keyrot_tx_next(a, sel, &ptx, ntx) != 0) + goto fail_b; + memcpy(ktx, ptx, 
SYMMKEYSZ); + if (keyrot_rx_lookup(b, sel, &prx, nrx, &rx) != 0) + goto fail_b; + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + if (memcmp(ktx, prx, SYMMKEYSZ) != 0) + goto fail_b; + if (memcmp(ntx, nrx, KR_NONCE_LEN) != 0) + goto fail_b; + } + + keyrot_destroy(b); + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_direction_separation(void) +{ + struct keyrot * a; /* role 0 */ + struct keyrot * b; /* role 1 */ + uint8_t sela[KR_SELECTOR_LEN]; + uint8_t selb[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + uint8_t ka[SYMMKEYSZ]; + const uint8_t * pa; + const uint8_t * pb; + + TEST_START(); + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail_a; + + if (keyrot_tx_next(a, sela, &pa, n) != 0) + goto fail_b; + + memcpy(ka, pa, SYMMKEYSZ); + if (keyrot_tx_next(b, selb, &pb, n) != 0) + goto fail_b; + + /* Same position, different role -> different leaf key */ + if (memcmp(ka, pb, SYMMKEYSZ) == 0) + goto fail_b; + + keyrot_destroy(b); + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Build a selector by hand (test knows the wire format). 
*/ +static void mk_sel(uint8_t epoch, + uint16_t node, + uint32_t seq, + uint8_t sel[KR_SELECTOR_LEN]) +{ + sel[0] = (uint8_t) ((epoch << 4) | ((node >> 8) & 0x0F)); + sel[1] = (uint8_t) (node & 0xFF); + sel[2] = (uint8_t) (seq >> 24); + sel[3] = (uint8_t) (seq >> 16); + sel[4] = (uint8_t) (seq >> 8); + sel[5] = (uint8_t) (seq); +} + +static int test_random_access(void) +{ + struct keyrot * b; + uint8_t s0[KR_SELECTOR_LEN]; + uint8_t s5[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + uint8_t k_first[SYMMKEYSZ]; + uint8_t k_node5[SYMMKEYSZ]; + const uint8_t * p; + struct kr_rx rx; + + TEST_START(); + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail; + + mk_sel(0, 0, 0, s0); + mk_sel(0, 5, 12345, s5); /* a far-ahead node, mid-span */ + + /* Jump straight to node 0 */ + if (keyrot_rx_lookup(b, s0, &p, n, &rx) != 0) + goto fail_b; + + memcpy(k_first, p, SYMMKEYSZ); + + /* Jump forward to node 5 (simulates a burst skip) */ + if (keyrot_rx_lookup(b, s5, &p, n, &rx) != 0) + goto fail_b; + + memcpy(k_node5, p, SYMMKEYSZ); + + /* Different nodes must yield different keys */ + if (memcmp(k_first, k_node5, SYMMKEYSZ) == 0) + goto fail_b; + + /* Jump back to node 0: still works, identical (no wedge) */ + if (keyrot_rx_lookup(b, s0, &p, n, &rx) != 0) + goto fail_b; + + if (memcmp(k_first, p, SYMMKEYSZ) != 0) + goto fail_b; + + /* Out-of-range node must be rejected */ + mk_sel(0, KEY_NODE_COUNT, 0, s0); + if (keyrot_rx_lookup(b, s0, &p, n, &rx) == 0) + goto fail_b; + + keyrot_destroy(b); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static const uint8_t SEED_B[SYMMKEYSZ] = { + 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, + 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, + 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, + 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0 +}; + +/* + * Look up and commit one within-node counter on epoch 0. 
Returns 0 on + * accept, 1 on a rejected commit (replay or too old), and -1 if the + * lookup itself failed - kept distinct so a reject assertion can never + * pass on an unrelated lookup miss. + */ +static int commit_ctr(struct keyrot * kr, + uint32_t ctr) +{ + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + const uint8_t * k; + struct kr_rx rx; + + mk_sel(0, 0, ctr, sel); + + if (keyrot_rx_lookup(kr, sel, &k, n, &rx) != 0) + return -1; + + return keyrot_rx_commit(kr, &rx) == 0 ? 0 : 1; +} + +static int test_replay_window(void) +{ + struct keyrot * b; + struct keyrot * c; + uint32_t base; + uint32_t jump; + + TEST_START(); + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail; + + /* Fresh counters accepted; an immediate replay is rejected. */ + if (commit_ctr(b, 100) != 0) + goto fail_b; + + if (commit_ctr(b, 100) != 1) + goto fail_b; + + /* In-window reorder: accepted once, rejected on replay. */ + if (commit_ctr(b, 105) != 0) + goto fail_b; + + if (commit_ctr(b, 102) != 0) + goto fail_b; + + if (commit_ctr(b, 102) != 1) + goto fail_b; + + /* Too-old boundary: the window edge is rejected, just inside is not. */ + base = 4 * KEY_REPLAY_WINDOW; + if (commit_ctr(b, base) != 0) + goto fail_b; + + if (commit_ctr(b, base - (KEY_REPLAY_WINDOW - 64)) != 1) + goto fail_b; + + if (commit_ctr(b, base - (KEY_REPLAY_WINDOW - 64) + 1) != 0) + goto fail_b; + + /* + * RFC 6479 slack-word regression: two low counters, then a + * forward jump of a full bitmap that aliases their slot, then a + * replay of a low counter. Without the reserved slack word this + * replay is wrongly accepted. 
+ */ + c = keyrot_create(SEED_A, 0, 1); + if (c == NULL) + goto fail_b; + + if (commit_ctr(c, 70) != 0) + goto fail_c; + + if (commit_ctr(c, 74) != 0) + goto fail_c; + + jump = KEY_REPLAY_WINDOW + 63; + if (commit_ctr(c, jump) != 0) + goto fail_c; + + if (commit_ctr(c, 74) != 1) + goto fail_c; + + keyrot_destroy(c); + keyrot_destroy(b); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_c: + keyrot_destroy(c); + fail_b: + keyrot_destroy(b); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_lookup_no_commit(void) +{ + struct keyrot * b; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + const uint8_t * k; + struct kr_rx rx; + int i; + + TEST_START(); + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail; + + mk_sel(0, 0, 100, sel); + + /* Repeated lookups are pre-AEAD and must not consume the slot. */ + for (i = 0; i < 4; i++) { + if (keyrot_rx_lookup(b, sel, &k, n, &rx) != 0) + goto fail_b; + } + + /* The slot is still fresh, so the first commit accepts ... */ + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + + /* ... and only the commit advanced it, so the next is a replay. */ + if (keyrot_rx_commit(b, &rx) == 0) + goto fail_b; + + keyrot_destroy(b); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_commit_prev_batch(void) +{ + struct keyrot * b; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + const uint8_t * k; + struct kr_rx rx; + + TEST_START(); + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail; + + /* Capture a packet under cur (epoch 0). */ + mk_sel(0, 0, 7, sel); + if (keyrot_rx_lookup(b, sel, &k, n, &rx) != 0) + goto fail_b; + + /* Re-key: the captured batch becomes prev and the flag clears. */ + if (keyrot_rekey(b, SEED_B, 1) != 0) + goto fail_b; + + /* The straggler commits under prev without claiming a switch. 
*/ + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + + if (keyrot_peer_switched(b)) + goto fail_b; + + /* prev still holds a replay window: its replay is rejected. */ + if (keyrot_rx_commit(b, &rx) == 0) + goto fail_b; + + keyrot_destroy(b); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_replay_forward_clear(void) +{ + struct keyrot * d; + uint32_t low; + uint32_t alias; + uint32_t jump; + + TEST_START(); + + d = keyrot_create(SEED_A, 0, 1); + if (d == NULL) + goto fail; + + /* alias shares low's slot a window away; the jump must clear it. */ + low = 10; + alias = low + KEY_REPLAY_WINDOW; + jump = alias + KEY_REPLAY_WINDOW / 2; + + if (commit_ctr(d, low) != 0) + goto fail_d; + + if (commit_ctr(d, jump) != 0) + goto fail_d; + + if (commit_ctr(d, alias) != 0) + goto fail_d; + + if (commit_ctr(d, alias) != 1) + goto fail_d; + + keyrot_destroy(d); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_d: + keyrot_destroy(d); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_rekey_overlap(void) +{ + struct keyrot * a; /* role 0 */ + struct keyrot * b; /* role 1 */ + uint8_t old_sel[KR_SELECTOR_LEN]; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t ntx[KR_NONCE_LEN]; + uint8_t nrx[KR_NONCE_LEN]; + uint8_t ktx[SYMMKEYSZ]; + const uint8_t * ptx; + const uint8_t * prx; + struct kr_rx rx; + + TEST_START(); + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail_a; + + /* Send one gen-0 packet; keep its selector for the overlap. */ + if (keyrot_tx_next(a, old_sel, &ptx, ntx) != 0) + goto fail_b; + + memcpy(ktx, ptx, SYMMKEYSZ); + if (keyrot_rx_lookup(b, old_sel, &prx, nrx, &rx) != 0) + goto fail_b; + + if (memcmp(ktx, prx, SYMMKEYSZ) != 0) + goto fail_b; + + /* Both ends re-key to epoch 1 with a fresh seed. 
*/ + if (keyrot_rekey(a, SEED_B, 1) != 0) + goto fail_b; + + if (keyrot_rekey(b, SEED_B, 1) != 0) + goto fail_b; + + /* TX is gated until promotion; promote a to emit the new epoch. */ + keyrot_tx_promote(a); + + /* New gen-1 traffic works. */ + if (keyrot_tx_next(a, sel, &ptx, ntx) != 0) + goto fail_b; + + memcpy(ktx, ptx, SYMMKEYSZ); + if (keyrot_rx_lookup(b, sel, &prx, nrx, &rx) != 0) + goto fail_b; + + if (memcmp(ktx, prx, SYMMKEYSZ) != 0) + goto fail_b; + + /* A straggling gen-0 packet still decrypts (overlap window). */ + if (keyrot_rx_lookup(b, old_sel, &prx, nrx, &rx) != 0) + goto fail_b; + + /* An unknown epoch is rejected. */ + mk_sel(7, 0, 0, sel); + if (keyrot_rx_lookup(b, sel, &prx, nrx, &rx) == 0) + goto fail_b; + + keyrot_destroy(b); + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tx_gate(void) +{ + struct keyrot * a; /* role 0 */ + struct keyrot * b; /* role 1 */ + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + const uint8_t * p; + struct kr_rx rx; + + TEST_START(); + + a = keyrot_create(SEED_A, 0, 0); + if (a == NULL) + goto fail; + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail_a; + + /* Both re-key to epoch 1; TX must stay on epoch 0 until promoted. */ + if (keyrot_rekey(a, SEED_B, 1) != 0) + goto fail_b; + + if (keyrot_rekey(b, SEED_B, 1) != 0) + goto fail_b; + + /* a's TX still stamps the old epoch (0). */ + if (keyrot_tx_next(a, sel, &p, n) != 0) + goto fail_b; + + if ((sel[0] >> 4) != 0) + goto fail_b; + + /* b decrypts the old-epoch packet via its prev batch. */ + if (keyrot_rx_lookup(b, sel, &p, n, &rx) != 0) + goto fail_b; + + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + + /* b has not yet seen the new epoch from a. */ + if (keyrot_peer_switched(b)) + goto fail_b; + + /* a promotes; its TX now stamps the new epoch (1). 
*/ + keyrot_tx_promote(a); + if (keyrot_tx_next(a, sel, &p, n) != 0) + goto fail_b; + + if ((sel[0] >> 4) != 1) + goto fail_b; + + /* b sees the new epoch and reports the peer switched. */ + if (keyrot_rx_lookup(b, sel, &p, n, &rx) != 0) + goto fail_b; + + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + + if (!keyrot_peer_switched(b)) + goto fail_b; + + keyrot_destroy(b); + keyrot_destroy(a); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail_a: + keyrot_destroy(a); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_peer_switched_commit_only(void) +{ + struct keyrot * b; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + const uint8_t * k; + struct kr_rx rx; + + TEST_START(); + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail; + + /* A re-key clears the flag until a packet is seen on cur. */ + if (keyrot_rekey(b, SEED_B, 1) != 0) + goto fail_b; + + if (keyrot_peer_switched(b)) + goto fail_b; + + mk_sel(1, 0, 0, sel); + + /* Lookup is pre-AEAD: selecting a key must not flip the flag. */ + if (keyrot_rx_lookup(b, sel, &k, n, &rx) != 0) + goto fail_b; + + if (keyrot_peer_switched(b)) + goto fail_b; + + /* Commit runs post-AEAD and is what records the peer switched. */ + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + + if (!keyrot_peer_switched(b)) + goto fail_b; + + keyrot_destroy(b); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_commit_evicted(void) +{ + struct keyrot * b; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t n[KR_NONCE_LEN]; + const uint8_t * k; + struct kr_rx rx; + + TEST_START(); + + b = keyrot_create(SEED_A, 0, 1); + if (b == NULL) + goto fail; + + mk_sel(0, 0, 3, sel); + if (keyrot_rx_lookup(b, sel, &k, n, &rx) != 0) + goto fail_b; + + /* Two re-keys drop the captured batch from both cur and prev. 
*/ + if (keyrot_rekey(b, SEED_B, 1) != 0) + goto fail_b; + + if (keyrot_rekey(b, SEED_A, 2) != 0) + goto fail_b; + + /* Commit on an evicted batch is a silent no-op, not a fault. */ + if (keyrot_rx_commit(b, &rx) != 0) + goto fail_b; + + keyrot_destroy(b); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_b: + keyrot_destroy(b); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* + * Concurrency: many TX threads + RX + re-key share one keyrot. The + * (epoch, counter) the TX side stamps must be globally unique (no AEAD + * nonce reuse). Capped below 16 re-keys so epoch maps 1:1 to a batch and + * the wire epoch never wraps (a wrapped epoch under a fresh key is not + * reuse but would false-trip the uniqueness check). Run under TSan to + * catch data races the static reviews can't. + */ +#define CT_THREADS 4 +#define CT_PKTS 2000 +#define CT_REKEYS 8 + +struct ct_rec { + uint8_t epoch; + uint64_t ctr; +}; + +struct ct_arg { + struct keyrot * kr; + struct ct_rec * recs; + size_t n; +}; + +static void * ct_tx_thread(void * a) +{ + struct ct_arg * arg = a; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t nonce[KR_NONCE_LEN]; + const uint8_t * k; + uint64_t ctr; + size_t i; + size_t j; + + for (i = 0; i < CT_PKTS; i++) { + if (keyrot_tx_next(arg->kr, sel, &k, nonce) != 0) + continue; + + ctr = 0; + for (j = 0; j < 8; j++) + ctr = (ctr << 8) | nonce[j]; + + arg->recs[arg->n].epoch = (uint8_t) (sel[0] >> 4); + arg->recs[arg->n].ctr = ctr; + arg->n++; + } + + return NULL; +} + +static void * ct_rx_thread(void * a) +{ + struct keyrot * kr = a; + uint8_t sel[KR_SELECTOR_LEN]; + uint8_t nonce[KR_NONCE_LEN]; + const uint8_t * k; + struct kr_rx rx; + size_t i; + + /* Exercise rx_lookup against re-key reclaim; results ignored. 
*/ + for (i = 0; i < CT_PKTS; i++) { + mk_sel((uint8_t) (i % 16), 0, (uint32_t) i, sel); + if (keyrot_rx_lookup(kr, sel, &k, nonce, &rx) == 0) + (void) keyrot_rx_commit(kr, &rx); + } + + return NULL; +} + +static void * ct_rekey_thread(void * a) +{ + struct keyrot * kr = a; + struct timespec t; + int e; + + t.tv_sec = 0; + t.tv_nsec = 2 * 1000 * 1000; /* 2 ms */ + + for (e = 1; e <= CT_REKEYS; e++) { + nanosleep(&t, NULL); + if (keyrot_rekey(kr, (e & 1) ? SEED_B : SEED_A, + (uint8_t) e) != 0) + break; + keyrot_tx_promote(kr); + } + + return NULL; +} + +static int ct_cmp(const void * x, + const void * y) +{ + const struct ct_rec * a = x; + const struct ct_rec * b = y; + + if (a->epoch != b->epoch) + return a->epoch < b->epoch ? -1 : 1; + + if (a->ctr != b->ctr) + return a->ctr < b->ctr ? -1 : 1; + + return 0; +} + +static int test_concurrent_nonce_unique(void) +{ + struct keyrot * kr; + struct ct_arg arg[CT_THREADS]; + pthread_t tx[CT_THREADS]; + pthread_t rx; + pthread_t rk; + struct ct_rec * all; + size_t total; + size_t i; + bool reuse = false; + + TEST_START(); + + kr = keyrot_create(SEED_A, 0, 0); + if (kr == NULL) + goto fail; + + all = malloc(sizeof(*all) * CT_THREADS * CT_PKTS); + if (all == NULL) + goto fail_kr; + + for (i = 0; i < CT_THREADS; i++) { + arg[i].kr = kr; + arg[i].n = 0; + arg[i].recs = all + i * CT_PKTS; + } + + for (i = 0; i < CT_THREADS; i++) + pthread_create(&tx[i], NULL, ct_tx_thread, &arg[i]); + + pthread_create(&rx, NULL, ct_rx_thread, kr); + pthread_create(&rk, NULL, ct_rekey_thread, kr); + + for (i = 0; i < CT_THREADS; i++) + pthread_join(tx[i], NULL); + + pthread_join(rx, NULL); + pthread_join(rk, NULL); + + total = 0; + for (i = 0; i < CT_THREADS; i++) { + memmove(all + total, all + i * CT_PKTS, + arg[i].n * sizeof(*all)); + total += arg[i].n; + } + + qsort(all, total, sizeof(*all), ct_cmp); + + for (i = 1; i < total; i++) + if (ct_cmp(&all[i - 1], &all[i]) == 0) { + printf("(epoch %u, ctr %llu) reused\n", + all[i].epoch, + (unsigned 
long long) all[i].ctr); + reuse = true; + break; + } + + free(all); + + if (reuse) + goto fail_kr; + + keyrot_destroy(kr); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_kr: + keyrot_destroy(kr); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} +#endif /* HAVE_OPENSSL */ + +int keyrot_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + +#ifdef HAVE_OPENSSL + ret |= test_create_destroy(); + ret |= test_epoch_range(); + ret |= test_tx_deterministic(); + ret |= test_selector_layout(); + ret |= test_nodes_left_initial(); + ret |= test_roundtrip(); + ret |= test_direction_separation(); + ret |= test_random_access(); + ret |= test_peer_switched_commit_only(); + ret |= test_commit_evicted(); + ret |= test_replay_window(); + ret |= test_lookup_no_commit(); + ret |= test_commit_prev_batch(); + ret |= test_replay_forward_clear(); + ret |= test_rekey_overlap(); + ret |= test_tx_gate(); + ret |= test_concurrent_nonce_unique(); +#endif + return ret; +} diff --git a/src/lib/tests/tpm_test.c b/src/lib/tests/tpm_test.c index df1d8850..7cc049cd 100644 --- a/src/lib/tests/tpm_test.c +++ b/src/lib/tests/tpm_test.c @@ -21,7 +21,7 @@ */ -#include "tpm.c" +#include <ouroboros/tpm.h> #include <test/test.h> diff --git a/src/lib/tests/tw_test.c b/src/lib/tests/tw_test.c new file mode 100644 index 00000000..32c302c4 --- /dev/null +++ b/src/lib/tests/tw_test.c @@ -0,0 +1,663 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Generic timing-wheel tests + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#if defined(__linux__) || defined(__CYGWIN__) +#define _DEFAULT_SOURCE +#else +#define _POSIX_C_SOURCE 200809L +#endif + +#include "config.h" + +#include <test/test.h> + +#include <ouroboros/time.h> +#include <ouroboros/tw.h> + +#include <stdint.h> +#include <stdio.h> +#include <time.h> + +struct payload { + struct tw_entry tw; + int fired; +}; + +struct cancel_payload { + struct tw_entry tw; + int fired; + struct tw_entry * sibling; +}; + +struct repost_payload { + struct tw_entry tw; + int fired; + struct payload * sibling; + uint64_t repost_at; +}; + +static void cb_count(void * arg) +{ + struct payload * p = arg; + p->fired++; +} + +static void cb_cancel_sibling(void * arg) +{ + struct cancel_payload * p = arg; + p->fired++; + tw_cancel(p->sibling); +} + +static void cb_repost_sibling(void * arg) +{ + struct repost_payload * p = arg; + p->fired++; + tw_post(&p->sibling->tw, p->repost_at, cb_count, p->sibling); +} + +static uint64_t now_ns(void) +{ + struct timespec ts; + clock_gettime(PTHREAD_COND_CLOCK, &ts); + return TS_TO_UINT64(ts); +} + +static void sleep_ns(uint64_t ns) +{ + struct timespec ts; + UINT64_TO_TS(ns, &ts); + nanosleep(&ts, NULL); +} + +static int test_tw_init_fini(void) +{ + TEST_START(); + + if (tw_init() < 0) { + printf("tw_init failed.\n"); + goto fail; + } + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_post_fires_after_deadline(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p); + + sleep_ns(20 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("expected 1 fire, 
got %d\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_no_fire_before_deadline(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 100 * MILLION, cb_count, &p); + + sleep_ns(2 * MILLION); + tw_move(); + + if (p.fired != 0) { + printf("expected 0 fires, got %d\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_cancel_prevents_fire(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p); + tw_cancel(&p.tw); + + sleep_ns(20 * MILLION); + tw_move(); + + if (p.fired != 0) { + printf("cancelled entry fired %d times\n", p.fired); + goto fail_init; + } + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_init: + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_cancel_unposted_is_noop(void) +{ + struct tw_entry e; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&e); + tw_cancel(&e); + tw_cancel(&e); + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_fire_only_once(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + + sleep_ns(20 * MILLION); + tw_move(); + tw_move(); + tw_move(); + + if (p.fired != 1) { + printf("expected 1 fire, got %d after 3 moves\n", p.fired); + goto fail_post; + } + + 
tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Multi-level: post a level-1 (>= 256ms) deadline; should still fire. */ +static int test_tw_post_level1_fires(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 300 * MILLION, cb_count, &p); + + if (p.tw.lvl != 1) { + printf("expected level 1 placement, got %zu\n", p.tw.lvl); + goto fail_post; + } + + sleep_ns(320 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("level-1 entry didn't fire (got %d)\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +static int test_tw_many_entries_all_fire(void) +{ + struct payload pl[16]; + size_t i; + size_t total = 0; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + for (i = 0; i < 16; ++i) { + tw_init_entry(&pl[i].tw); + pl[i].fired = 0; + tw_post(&pl[i].tw, now_ns() + (1 + i) * MILLION, + cb_count, &pl[i]); + } + + sleep_ns(40 * MILLION); + tw_move(); + + for (i = 0; i < 16; ++i) + total += pl[i].fired; + + if (total != 16) { + printf("expected 16 fires, got %zu\n", total); + goto fail_post; + } + + for (i = 0; i < 16; ++i) + tw_cancel(&pl[i].tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + for (i = 0; i < 16; ++i) + tw_cancel(&pl[i].tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* tw_next_expiry signals empty wheel via tv_nsec == -1. 
*/ +static int test_tw_next_expiry_empty(void) +{ + struct timespec out; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_next_expiry(&out); + if (out.tv_nsec != -1) { + printf("expected tv_nsec=-1, got %ld\n", (long) out.tv_nsec); + goto fail_init; + } + + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_init: + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* tw_next_expiry returns a deadline within the right ballpark. */ +static int test_tw_next_expiry_returns_deadline(void) +{ + struct payload p; + struct timespec out; + uint64_t target; + uint64_t out_ns; + int64_t skew; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + target = now_ns() + 50 * MILLION; + tw_post(&p.tw, target, cb_count, &p); + + tw_next_expiry(&out); + out_ns = TS_TO_UINT64(out); + + /* Level-0 quantization gives ±1 slot of skew. */ + skew = (int64_t)(out_ns) - (int64_t)(target); + if (skew < -2 * MILLION || skew > 4 * MILLION) { + printf("deadline not in -2..+4 ms, skew=%ld ns\n", (long) skew); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Repost: fire, then post again. 
*/ +static int test_tw_repost_after_fire(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + sleep_ns(20 * MILLION); + tw_move(); + if (p.fired != 1) { + printf("first fire missed\n"); + goto fail_post; + } + + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + sleep_ns(20 * MILLION); + tw_move(); + if (p.fired != 2) { + printf("second fire missed (fired=%d)\n", p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Double-post replaces the schedule; only the second fires. */ +static int test_tw_double_post_replaces(void) +{ + struct payload p; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&p.tw); + p.fired = 0; + + tw_post(&p.tw, now_ns() + 30 * MILLION, cb_count, &p); + tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p); + + sleep_ns(20 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("expected 1 fire after replace, got %d\n", p.fired); + goto fail_post; + } + + sleep_ns(40 * MILLION); + tw_move(); + + if (p.fired != 1) { + printf("first schedule fired after replace (got %d)\n", + p.fired); + goto fail_post; + } + + tw_cancel(&p.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&p.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Fire callback may safely cancel a sibling in the same slot. 
*/ +static int test_tw_fire_cancels_sibling(void) +{ + struct cancel_payload a; + struct payload b; + uint64_t deadline; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&a.tw); + tw_init_entry(&b.tw); + a.fired = 0; + a.sibling = &b.tw; + b.fired = 0; + + deadline = now_ns() + 3 * MILLION; + tw_post(&a.tw, deadline, cb_cancel_sibling, &a); + tw_post(&b.tw, deadline, cb_count, &b); + + sleep_ns(20 * MILLION); + tw_move(); + + if (a.fired != 1) { + printf("a expected 1 fire, got %d\n", a.fired); + goto fail_post; + } + if (b.fired != 0) { + printf("b should not have fired (got %d)\n", b.fired); + goto fail_post; + } + + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +/* Fire callback may safely repost a sibling to a future slot. */ +static int test_tw_fire_posts_sibling(void) +{ + struct repost_payload a; + struct payload b; + uint64_t deadline; + + TEST_START(); + + if (tw_init() < 0) + goto fail; + + tw_init_entry(&a.tw); + tw_init_entry(&b.tw); + a.fired = 0; + a.sibling = &b; + a.repost_at = now_ns() + 30 * MILLION; + b.fired = 0; + + deadline = now_ns() + 3 * MILLION; + tw_post(&a.tw, deadline, cb_repost_sibling, &a); + tw_post(&b.tw, deadline, cb_count, &b); + + sleep_ns(20 * MILLION); + tw_move(); + + if (a.fired != 1) { + printf("a expected 1 fire, got %d\n", a.fired); + goto fail_post; + } + if (b.fired != 0) { + printf("b fired before reposted deadline (got %d)\n", + b.fired); + goto fail_post; + } + + sleep_ns(25 * MILLION); + tw_move(); + + if (b.fired != 1) { + printf("b expected 1 fire after repost, got %d\n", + b.fired); + goto fail_post; + } + + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + + TEST_SUCCESS(); + + return TEST_RC_SUCCESS; + fail_post: + tw_cancel(&a.tw); + tw_cancel(&b.tw); + tw_fini(); + fail: + TEST_FAIL(); + return TEST_RC_FAIL; +} + +int 
tw_test(int argc, + char ** argv) +{ + int ret = 0; + + (void) argc; + (void) argv; + + ret |= test_tw_init_fini(); + ret |= test_tw_post_fires_after_deadline(); + ret |= test_tw_no_fire_before_deadline(); + ret |= test_tw_cancel_prevents_fire(); + ret |= test_tw_cancel_unposted_is_noop(); + ret |= test_tw_fire_only_once(); + ret |= test_tw_post_level1_fires(); + ret |= test_tw_many_entries_all_fire(); + ret |= test_tw_next_expiry_empty(); + ret |= test_tw_next_expiry_returns_deadline(); + ret |= test_tw_repost_after_fire(); + ret |= test_tw_double_post_replaces(); + ret |= test_tw_fire_cancels_sibling(); + ret |= test_tw_fire_posts_sibling(); + + return ret; +} diff --git a/src/lib/timerwheel.c b/src/lib/timerwheel.c deleted file mode 100644 index 2c796c96..00000000 --- a/src/lib/timerwheel.c +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Ouroboros - Copyright (C) 2016 - 2026 - * - * Timerwheel - * - * Dimitri Staessens <dimitri@ouroboros.rocks> - * Sander Vrijders <sander@ouroboros.rocks> - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * version 2.1 as published by the Free Software Foundation. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., http://www.fsf.org/about/contact/. - */ - -#include <ouroboros/list.h> - -/* Overflow limits range to about 6 hours. 
*/ -#define ts_to_ns(ts) (ts.tv_sec * BILLION + ts.tv_nsec) -#define ts_to_rxm_slot(ts) (ts_to_ns(ts) >> RXMQ_RES) -#define ts_to_ack_slot(ts) (ts_to_ns(ts) >> ACKQ_RES) - -struct rxm { - struct list_head next; - uint32_t seqno; -#ifndef RXM_BUFFER_ON_HEAP - struct ssm_pk_buff * spb; -#endif - struct frct_pci * pkt; - size_t len; - time_t t0; /* Time when original was sent (us). */ - struct frcti * frcti; - int fd; - int flow_id; /* Prevent rtx when fd reused. */ -}; - -struct ack { - struct list_head next; - struct frcti * frcti; - int fd; - int flow_id; -}; - -struct { - /* - * At a 1 ms min resolution, every level bumps the - * resolution by a factor of 16. - */ - struct list_head rxms[RXMQ_LVLS][RXMQ_SLOTS]; - - struct list_head acks[ACKQ_SLOTS]; - bool map[ACKQ_SLOTS][PROG_MAX_FLOWS]; - - size_t prv_rxm[RXMQ_LVLS]; /* Last processed rxm slots. */ - size_t prv_ack; /* Last processed ack slot. */ - pthread_mutex_t lock; -} rw; - -static void timerwheel_fini(void) -{ - size_t i; - size_t j; - struct list_head * p; - struct list_head * h; - - pthread_mutex_lock(&rw.lock); - - for (i = 0; i < RXMQ_LVLS; ++i) { - for (j = 0; j < RXMQ_SLOTS; j++) { - list_for_each_safe(p, h, &rw.rxms[i][j]) { - struct rxm * rxm; - rxm = list_entry(p, struct rxm, next); - list_del(&rxm->next); -#ifdef RXM_BUFFER_ON_HEAP - free(rxm->pkt); -#else - ssm_pk_buff_ack(rxm->spb); - ipcp_spb_release(rxm->spb); -#endif - free(rxm); - } - } - } - - for (i = 0; i < ACKQ_SLOTS; ++i) { - list_for_each_safe(p, h, &rw.acks[i]) { - struct ack * a = list_entry(p, struct ack, next); - list_del(&a->next); - free(a); - } - } - - pthread_mutex_unlock(&rw.lock); - - pthread_mutex_destroy(&rw.lock); -} - -static int timerwheel_init(void) -{ - struct timespec now; - size_t i; - size_t j; - - if (pthread_mutex_init(&rw.lock, NULL)) - return -1; - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - for (i = 0; i < RXMQ_LVLS; ++i) { - rw.prv_rxm[i] = (ts_to_rxm_slot(now) - 1); - rw.prv_rxm[i] >>= (RXMQ_BUMP * i); - 
rw.prv_rxm[i] &= (RXMQ_SLOTS - 1); - for (j = 0; j < RXMQ_SLOTS; ++j) - list_head_init(&rw.rxms[i][j]); - } - - rw.prv_ack = (ts_to_ack_slot(now) - 1) & (ACKQ_SLOTS - 1); - for (i = 0; i < ACKQ_SLOTS; ++i) - list_head_init(&rw.acks[i]); - - return 0; -} - -static void timerwheel_move(void) -{ - struct timespec now; - struct list_head * p; - struct list_head * h; - size_t rxm_slot; - size_t ack_slot; - size_t i; - size_t j; - - pthread_mutex_lock(&rw.lock); - - pthread_cleanup_push(__cleanup_mutex_unlock, &rw.lock); - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - rxm_slot = ts_to_rxm_slot(now); - - for (i = 0; i < RXMQ_LVLS; ++i) { - size_t j_max_slot = rxm_slot & (RXMQ_SLOTS - 1); - j = rw.prv_rxm[i]; - if (j_max_slot < j) - j_max_slot += RXMQ_SLOTS; - while (j++ < j_max_slot) { - list_for_each_safe(p, h, - &rw.rxms[i][j & (RXMQ_SLOTS - 1)]) { - struct rxm * r; - struct frct_cr * snd_cr; - struct frct_cr * rcv_cr; - size_t slot; - size_t rslot; - ssize_t idx; - struct ssm_pk_buff * spb; - struct frct_pci * pci; - struct flow * f; - uint32_t snd_lwe; - uint32_t rcv_lwe; - size_t lvl = 0; - - r = list_entry(p, struct rxm, next); - - list_del(&r->next); - - snd_cr = &r->frcti->snd_cr; - rcv_cr = &r->frcti->rcv_cr; - f = &proc.flows[r->fd]; -#ifndef RXM_BUFFER_ON_HEAP - ssm_pk_buff_ack(r->spb); -#endif - if (f->frcti == NULL - || f->info.id != r->flow_id) - goto cleanup; - - pthread_rwlock_rdlock(&r->frcti->lock); - - snd_lwe = snd_cr->lwe; - rcv_lwe = rcv_cr->lwe; - - pthread_rwlock_unlock(&r->frcti->lock); - - /* Has been ack'd, remove. */ - if (before(r->seqno, snd_lwe)) - goto cleanup; - - /* Check for r-timer expiry. 
*/ - if (ts_to_ns(now) - r->t0 > r->frcti->r) - goto flow_down; - - pthread_rwlock_wrlock(&r->frcti->lock); - - if (r->seqno == r->frcti->rttseq) { - r->frcti->rto += - r->frcti->rto >> RTO_DIV; - r->frcti->probe = false; - } -#ifdef PROC_FLOW_STATS - r->frcti->n_rtx++; -#endif - rslot = r->frcti->rto >> RXMQ_RES; - - pthread_rwlock_unlock(&r->frcti->lock); - - /* Schedule at least in the next time slot. */ - slot = ts_to_ns(now) >> RXMQ_RES; - - while (rslot >= RXMQ_SLOTS) { - ++lvl; - rslot >>= RXMQ_BUMP; - slot >>= RXMQ_BUMP; - } - - if (lvl >= RXMQ_LVLS) /* Can't reschedule */ - goto flow_down; - - rslot = (rslot + slot + 1) & (RXMQ_SLOTS - 1); -#ifdef RXM_BLOCKING - if (ipcp_spb_reserve(&spb, r->len) < 0) -#else - if (ssm_pool_alloc(proc.pool, r->len, NULL, - &spb) < 0) -#endif - goto reschedule; /* rdrbuff full */ - - pci = (struct frct_pci *) ssm_pk_buff_head(spb); - memcpy(pci, r->pkt, r->len); -#ifndef RXM_BUFFER_ON_HEAP - ipcp_spb_release(r->spb); - r->spb = spb; - r->pkt = pci; - ssm_pk_buff_wait_ack(spb); -#endif - idx = ssm_pk_buff_get_idx(spb); - - /* Retransmit the copy. */ - pci->ackno = hton32(rcv_lwe); -#ifdef RXM_BLOCKING - if (ssm_rbuff_write_b(f->tx_rb, idx, NULL) < 0) -#else - if (ssm_rbuff_write(f->tx_rb, idx) < 0) -#endif - goto flow_down; - ssm_flow_set_notify(f->set, f->info.id, - FLOW_PKT); - reschedule: - list_add(&r->next, &rw.rxms[lvl][rslot]); - continue; - - flow_down: - ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN); - ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN); - cleanup: -#ifdef RXM_BUFFER_ON_HEAP - free(r->pkt); -#else - ipcp_spb_release(r->spb); -#endif - free(r); - } - } - rw.prv_rxm[i] = rxm_slot & (RXMQ_SLOTS - 1); - /* Move up a level in the wheel. 
*/ - rxm_slot >>= RXMQ_BUMP; - } - - ack_slot = ts_to_ack_slot(now) & (ACKQ_SLOTS - 1) ; - - j = rw.prv_ack; - - if (ack_slot < j) - ack_slot += ACKQ_SLOTS; - - while (j++ < ack_slot) { - list_for_each_safe(p, h, &rw.acks[j & (ACKQ_SLOTS - 1)]) { - struct ack * a; - struct flow * f; - - a = list_entry(p, struct ack, next); - - list_del(&a->next); - - f = &proc.flows[a->fd]; - - rw.map[j & (ACKQ_SLOTS - 1)][a->fd] = false; - - if (f->info.id == a->flow_id && f->frcti != NULL) - send_frct_pkt(a->frcti); - - free(a); - } - } - - rw.prv_ack = ack_slot & (ACKQ_SLOTS - 1); - - pthread_cleanup_pop(true); -} - -static int timerwheel_rxm(struct frcti * frcti, - uint32_t seqno, - struct ssm_pk_buff * spb) -{ - struct timespec now; - struct rxm * r; - size_t slot; - size_t lvl = 0; - time_t rto_slot; - - r = malloc(sizeof(*r)); - if (r == NULL) - return -ENOMEM; - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - r->t0 = ts_to_ns(now); - r->seqno = seqno; - r->frcti = frcti; - r->len = ssm_pk_buff_len(spb); -#ifdef RXM_BUFFER_ON_HEAP - r->pkt = malloc(r->len); - if (r->pkt == NULL) { - free(r); - return -ENOMEM; - } - memcpy(r->pkt, ssm_pk_buff_head(spb), r->len); -#else - r->spb = spb; - r->pkt = (struct frct_pci *) ssm_pk_buff_head(spb); -#endif - pthread_rwlock_rdlock(&r->frcti->lock); - - rto_slot = frcti->rto >> RXMQ_RES; - slot = r->t0 >> RXMQ_RES; - - r->fd = frcti->fd; - r->flow_id = proc.flows[r->fd].info.id; - - pthread_rwlock_unlock(&r->frcti->lock); - - while (rto_slot >= RXMQ_SLOTS) { - ++lvl; - rto_slot >>= RXMQ_BUMP; - slot >>= RXMQ_BUMP; - } - - if (lvl >= RXMQ_LVLS) { /* Out of timerwheel range. 
*/ -#ifdef RXM_BUFFER_ON_HEAP - free(r->pkt); -#endif - free(r); - return -EPERM; - } - - slot = (slot + rto_slot + 1) & (RXMQ_SLOTS - 1); - - pthread_mutex_lock(&rw.lock); - - list_add_tail(&r->next, &rw.rxms[lvl][slot]); -#ifndef RXM_BUFFER_ON_HEAP - ssm_pk_buff_wait_ack(spb); -#endif - pthread_mutex_unlock(&rw.lock); - - return 0; -} - -static int timerwheel_delayed_ack(int fd, - struct frcti * frcti) -{ - struct timespec now; - struct ack * a; - size_t slot; - - a = malloc(sizeof(*a)); - if (a == NULL) - return -ENOMEM; - - clock_gettime(PTHREAD_COND_CLOCK, &now); - - pthread_rwlock_rdlock(&frcti->lock); - - slot = (((ts_to_ns(now) + (TICTIME << 1)) >> ACKQ_RES) + 1) - & (ACKQ_SLOTS - 1); - - pthread_rwlock_unlock(&frcti->lock); - - a->fd = fd; - a->frcti = frcti; - a->flow_id = proc.flows[fd].info.id; - - pthread_mutex_lock(&rw.lock); - - if (rw.map[slot][fd]) { - pthread_mutex_unlock(&rw.lock); - free(a); - return 0; - } - - rw.map[slot][fd] = true; - - list_add_tail(&a->next, &rw.acks[slot]); - - pthread_mutex_unlock(&rw.lock); - - return 0; -} diff --git a/src/lib/tw.c b/src/lib/tw.c new file mode 100644 index 00000000..ccde7dd1 --- /dev/null +++ b/src/lib/tw.c @@ -0,0 +1,307 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * Generic deadline-ordered callback queue (timing wheel) + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * Sander Vrijders <sander@ouroboros.rocks> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * version 2.1 as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., http://www.fsf.org/about/contact/. + */ + +#if defined(__linux__) || defined(__CYGWIN__) +#define _DEFAULT_SOURCE +#else +#define _POSIX_C_SOURCE 200809L +#endif + +#include "config.h" + +#include <ouroboros/list.h> +#include <ouroboros/pthread.h> +#include <ouroboros/time.h> +#include <ouroboros/tw.h> + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> + +/* 3 levels × 256 slots, 1 ms / 16 ms / 256 ms per-slot resolution. */ +#define TW_LVLS 3 +#define TW_SLOTS 256 +#define TW_BUMP 4 +#define TW_RES 20 /* 2^20 ns ≈ 1 ms per slot at level 0. */ + +#define TW_SLOT(x) ((x) & (TW_SLOTS - 1)) + +static struct { + struct list_head levels[TW_LVLS][TW_SLOTS]; + size_t prv[TW_LVLS]; + pthread_mutex_t mtx; + pthread_mutex_t move_mtx; + bool initialised; +} tw; + +static size_t tw_lvl_res(size_t lvl) +{ + return TW_RES + TW_BUMP * lvl; +} + +/* Smallest level whose slot range covers the deadline. */ +static size_t tw_pick_lvl(uint64_t now_ns, + uint64_t deadline_ns) +{ + uint64_t delta; + size_t lvl; + + delta = deadline_ns > now_ns ? 
deadline_ns - now_ns : 0; + lvl = 0; + + while (lvl < TW_LVLS - 1 && (delta >> tw_lvl_res(lvl)) >= TW_SLOTS) + ++lvl; + + return lvl; +} + +static size_t tw_slot(uint64_t ns, + size_t lvl) +{ + return TW_SLOT(ns >> tw_lvl_res(lvl)); +} + +int tw_init(void) +{ + struct timespec now; + size_t i; + size_t j; + + assert(!tw.initialised); + + if (pthread_mutex_init(&tw.mtx, NULL)) + goto fail_mtx; + + if (pthread_mutex_init(&tw.move_mtx, NULL)) + goto fail_move_mtx; + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + for (i = 0; i < TW_LVLS; ++i) { + tw.prv[i] = TW_SLOT(tw_slot(TS_TO_UINT64(now), i) - 1); + for (j = 0; j < TW_SLOTS; ++j) + list_head_init(&tw.levels[i][j]); + } + + tw.initialised = true; + + return 0; + + fail_move_mtx: + pthread_mutex_destroy(&tw.mtx); + fail_mtx: + return -1; +} + +void tw_fini(void) +{ + size_t i; + size_t j; + + assert(tw.initialised); + + for (i = 0; i < TW_LVLS; ++i) { + for (j = 0; j < TW_SLOTS; ++j) + assert(list_is_empty(&tw.levels[i][j])); + } + + pthread_mutex_destroy(&tw.move_mtx); + pthread_mutex_destroy(&tw.mtx); + + tw.initialised = false; +} + +void tw_init_entry(struct tw_entry * e) +{ + list_head_init(&e->next); + + e->deadline_ns = 0; + e->fire = NULL; + e->arg = NULL; + e->lvl = 0; +} + +void tw_post(struct tw_entry * e, + uint64_t deadline_ns, + tw_fire_fn_t fire, + void * arg) +{ + struct timespec now; + size_t lvl; + size_t slot; + + assert(tw.initialised); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + + lvl = tw_pick_lvl(TS_TO_UINT64(now), deadline_ns); + /* +1 so deadline <= slot_start; lands later in slot. 
*/ + slot = TW_SLOT(tw_slot(deadline_ns, lvl) + 1); + + e->deadline_ns = deadline_ns; + e->fire = fire; + e->arg = arg; + e->lvl = lvl; + + pthread_mutex_lock(&tw.mtx); + + if (!list_is_empty(&e->next)) + list_del(&e->next); + + list_add_tail(&e->next, &tw.levels[lvl][slot]); + + pthread_mutex_unlock(&tw.mtx); +} + +void tw_cancel(struct tw_entry * e) +{ + if (e == NULL) + return; + + assert(tw.initialised); + + pthread_mutex_lock(&tw.mtx); + + if (!list_is_empty(&e->next)) { + list_del(&e->next); + list_head_init(&e->next); + } + + pthread_mutex_unlock(&tw.mtx); +} + +void tw_move(void) +{ + struct timespec now; + struct list_head deferred; + struct list_head * p; + uint64_t now_ns; + size_t i; + size_t j; + size_t cur; + + assert(tw.initialised); + + if (pthread_mutex_trylock(&tw.move_mtx) != 0) + return; + + pthread_cleanup_push(__cleanup_mutex_unlock, &tw.move_mtx); + + pthread_mutex_lock(&tw.mtx); + + pthread_cleanup_push(__cleanup_mutex_unlock, &tw.mtx); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + for (i = 0; i < TW_LVLS; ++i) { + cur = tw_slot(now_ns, i); + + j = tw.prv[i]; + if (cur < j) + cur += TW_SLOTS; + + while (j++ < cur) { + size_t s = TW_SLOT(j); + + /* Pop-front so fire may mutate any entry. */ + list_head_init(&deferred); + + while (!list_is_empty(&tw.levels[i][s])) { + struct tw_entry * e; + p = tw.levels[i][s].nxt; + e = list_entry(p, struct tw_entry, next); + list_del(&e->next); + + if (e->deadline_ns > now_ns) { + list_add_tail(&e->next, &deferred); + continue; + } + + pthread_mutex_unlock(&tw.mtx); + e->fire(e->arg); + pthread_mutex_lock(&tw.mtx); + } + + while (!list_is_empty(&deferred)) { + p = deferred.nxt; + list_del(p); + list_add_tail(p, &tw.levels[i][s]); + } + } + + tw.prv[i] = TW_SLOT(cur); + } + + pthread_cleanup_pop(true); /* tw.mtx */ + pthread_cleanup_pop(true); /* tw.move_mtx */ +} + +/* Earliest pending deadline at level lvl, INT64_MAX if level is empty. 
*/ +static int64_t tw_lvl_earliest(size_t lvl, + uint64_t now_ns) +{ + size_t cur = tw_slot(now_ns, lvl); + size_t j; + + for (j = 1; j <= TW_SLOTS; ++j) { + size_t s = TW_SLOT(cur + j); + + if (list_is_empty(&tw.levels[lvl][s])) + continue; + + return (int64_t)(now_ns + ((uint64_t) j << tw_lvl_res(lvl))); + } + + return INT64_MAX; +} + +void tw_next_expiry(struct timespec * out) +{ + struct timespec now; + uint64_t now_ns; + int64_t earliest = INT64_MAX; + size_t i; + + assert(tw.initialised); + + clock_gettime(PTHREAD_COND_CLOCK, &now); + now_ns = TS_TO_UINT64(now); + + pthread_mutex_lock(&tw.mtx); + + for (i = 0; i < TW_LVLS; ++i) { + int64_t dl = tw_lvl_earliest(i, now_ns); + if (dl < earliest) + earliest = dl; + } + + pthread_mutex_unlock(&tw.mtx); + + if (earliest == INT64_MAX) { + /* Empty wheel: tv_nsec=-1 is an invalid normalised value. */ + out->tv_sec = 0; + out->tv_nsec = -1; + } else { + UINT64_TO_TS((uint64_t) earliest, out); + } +} diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 3cec8172..6b418838 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -63,6 +63,11 @@ target_include_directories(operf PRIVATE ${TOOLS_INCLUDE_DIRS}) target_link_libraries(operf PRIVATE ouroboros-dev) install(TARGETS operf RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +add_executable(oftp oftp/oftp.c) +target_include_directories(oftp PRIVATE ${TOOLS_INCLUDE_DIRS}) +target_link_libraries(oftp PRIVATE ouroboros-dev) +install(TARGETS oftp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") add_executable(ovpn ovpn/ovpn.c) target_include_directories(ovpn PRIVATE ${TOOLS_INCLUDE_DIRS}) diff --git a/src/tools/irm/irm_ipcp_connect.c b/src/tools/irm/irm_ipcp_connect.c index f88c36dc..fb21faec 100644 --- a/src/tools/irm/irm_ipcp_connect.c +++ b/src/tools/irm/irm_ipcp_connect.c @@ -100,16 +100,18 @@ int do_connect_ipcp(int argc, } if (qos != NULL) { - if (strcmp(qos, "best") == 0) - qs = qos_best_effort; - 
else if (strcmp(qos, "raw") == 0) + if (strcmp(qos, "raw") == 0) qs = qos_raw; - else if (strcmp(qos, "video") == 0) - qs = qos_video; - else if (strcmp(qos, "voice") == 0) - qs = qos_voice; - else if (strcmp(qos, "data") == 0) - qs = qos_data; + else if (strcmp(qos, "safe") == 0) + qs = qos_raw_safe; + else if (strcmp(qos, "rt") == 0) + qs = qos_rt; + else if (strcmp(qos, "rt-safe") == 0) + qs = qos_rt_safe; + else if (strcmp(qos, "msg") == 0) + qs = qos_msg; + else if (strcmp(qos, "stream") == 0) + qs = qos_stream; else printf("Unknown QoS cube, defaulting to raw.\n"); } @@ -126,7 +128,7 @@ int do_connect_ipcp(int argc, if (wildcard_match(comp, MGMT) == 0) { component = MGMT_COMP; - /* FIXME: move to qos_data when stable */ + /* FIXME: move to qos_msg when stable */ if (irm_connect_ipcp(pid, dst, component, qos_raw)) return -1; } diff --git a/src/tools/irm/irm_name_create.c b/src/tools/irm/irm_name_create.c index 1055700c..40a51193 100644 --- a/src/tools/irm/irm_name_create.c +++ b/src/tools/irm/irm_name_create.c @@ -51,10 +51,10 @@ #define RR "round-robin" #define SPILL "spillover" -#define SENC "<security_dir>/server/<name>/enc.conf" +#define SSEC "<security_dir>/server/<name>/sec.conf" #define SCRT "<security_dir>/server/<name>/crt.pem" #define SKEY "<security_dir>/server/<name>/key.pem" -#define CENC "<security_dir>/client/<name>/enc.conf" +#define CSEC "<security_dir>/client/<name>/sec.conf" #define CCRT "<security_dir>/client/<name>/crt.pem" #define CKEY "<security_dir>/client/<name>/key.pem" @@ -63,10 +63,10 @@ static void usage(void) printf("Usage: irm name create\n" " <name>. 
max %d chars.\n" " [lb LB_POLICY], default: %s\n" - " [sencpath <path>, default: " SENC "]\n" + " [ssecpath <path>, default: " SSEC "]\n" " [scrtpath <path>, default: " SCRT "]\n" " [skeypath <path>, default: " SKEY "]\n" - " [cencpath <path>, default: " CENC "]\n" + " [csecpath <path>, default: " CSEC "]\n" " [ccrtpath <path>, default: " CCRT "]\n" " [ckeypath <path>, default: " CKEY "]\n" "\n" @@ -105,10 +105,10 @@ int do_create_name(int argc, { struct name_info info = {}; char * name = NULL; - char * sencpath = NULL; + char * ssecpath = NULL; char * scrtpath = NULL; char * skeypath = NULL; - char * cencpath = NULL; + char * csecpath = NULL; char * ccrtpath = NULL; char * ckeypath = NULL; char * lb_pol = RR; @@ -119,14 +119,14 @@ int do_create_name(int argc, while (argc > 0) { if (matches(*argv, "lb") == 0) { lb_pol = *(argv + 1); - } else if (matches(*argv, "sencpath") == 0) { - sencpath = *(argv + 1); + } else if (matches(*argv, "ssecpath") == 0) { + ssecpath = *(argv + 1); } else if (matches(*argv, "scrtpath") == 0) { scrtpath = *(argv + 1); } else if (matches(*argv, "skeypath") == 0) { skeypath = *(argv + 1); - } else if (matches(*argv, "cencpath") == 0) { - cencpath = *(argv + 1); + } else if (matches(*argv, "csecpath") == 0) { + csecpath = *(argv + 1); } else if (matches(*argv, "ccrtpath") == 0) { ccrtpath = *(argv + 1); } else if (matches(*argv, "ckeypath") == 0) { @@ -151,7 +151,7 @@ int do_create_name(int argc, strcpy(info.name, name); - if (sencpath != NULL && cp_chk_path(info.s.enc, sencpath) < 0) + if (ssecpath != NULL && cp_chk_path(info.s.sec, ssecpath) < 0) goto fail; if (scrtpath != NULL && cp_chk_path(info.s.crt, scrtpath) < 0) @@ -160,7 +160,7 @@ int do_create_name(int argc, if (skeypath != NULL && cp_chk_path(info.s.key, skeypath) < 0) goto fail; - if (cencpath != NULL && cp_chk_path(info.c.enc, cencpath) < 0) + if (csecpath != NULL && cp_chk_path(info.c.sec, csecpath) < 0) goto fail; if (ccrtpath != NULL && cp_chk_path(info.c.crt, ccrtpath) < 
0) diff --git a/src/tools/ocbr/ocbr_client.c b/src/tools/ocbr/ocbr_client.c index 9dd9904c..36c07d43 100644 --- a/src/tools/ocbr/ocbr_client.c +++ b/src/tools/ocbr/ocbr_client.c @@ -37,8 +37,11 @@ */ #include <ouroboros/dev.h> +#include <ouroboros/qos.h> #include <signal.h> +#include <stdlib.h> +#include <string.h> volatile bool stop; @@ -86,6 +89,11 @@ int client_main(char * server, struct timespec end; struct timespec intv = {(gap / BILLION), gap % BILLION}; int ms; + const char * qenv; + qosspec_t qs; + qosspec_t * qsp; + + qsp = NULL; stop = false; @@ -98,16 +106,38 @@ int client_main(char * server, sigaction(SIGHUP, &sig_act, NULL) || sigaction(SIGPIPE, &sig_act, NULL)) { printf("Failed to install sighandler.\n"); - return -1; + return 2; } printf("Client started, duration %d, rate %lu b/s, size %d B.\n", duration, rate, size); - fd = flow_alloc(server, NULL, NULL); + qenv = getenv("OCBR_QOS"); + if (qenv != NULL) { + if (strcmp(qenv, "raw") == 0) + qs = qos_raw; + else if (strcmp(qenv, "safe") == 0) + qs = qos_raw_safe; + else if (strcmp(qenv, "rt") == 0) + qs = qos_rt; + else if (strcmp(qenv, "rt_safe") == 0) + qs = qos_rt_safe; + else if (strcmp(qenv, "msg") == 0) + qs = qos_msg; + else if (strcmp(qenv, "stream") == 0) + qs = qos_stream; + else { + fprintf(stderr, + "Unknown OCBR_QOS='%s', using raw.\n", qenv); + qs = qos_raw; + } + qsp = &qs; + printf("OCBR_QOS=%s\n", qenv); + } + fd = flow_alloc(server, qsp, NULL); if (fd < 0) { printf("Failed to allocate flow.\n"); - return -1; + return 2; } clock_gettime(CLOCK_REALTIME, &start); diff --git a/src/tools/oecho/oecho.c b/src/tools/oecho/oecho.c index 14caab53..ef0a168f 100644 --- a/src/tools/oecho/oecho.c +++ b/src/tools/oecho/oecho.c @@ -101,20 +101,20 @@ static int client_main(void) fd = flow_alloc("oecho", NULL, NULL); if (fd < 0) { printf("Failed to allocate flow.\n"); - return -1; + return 2; } if (flow_write(fd, message, strlen(message) + 1) < 0) { printf("Failed to write packet.\n"); 
flow_dealloc(fd); - return -1; + return 1; } count = flow_read(fd, buf, BUF_SIZE); if (count < 0) { printf("Failed to read packet.\n"); flow_dealloc(fd); - return -1; + return 1; } printf("Server replied with %.*s\n", (int) count, buf); @@ -126,7 +126,7 @@ static int client_main(void) int main(int argc, char ** argv) { - int ret = -1; + int ret = 0; bool server = false; argc--; diff --git a/src/tools/oftp/oftp.c b/src/tools/oftp/oftp.c new file mode 100644 index 00000000..1ae99403 --- /dev/null +++ b/src/tools/oftp/oftp.c @@ -0,0 +1,441 @@ +/* + * Ouroboros - Copyright (C) 2016 - 2026 + * + * A minimal file-transfer tool over an FRCT stream flow + * + * Dimitri Staessens <dimitri@ouroboros.rocks> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#define _POSIX_C_SOURCE 200809L + +#include <ouroboros/crc64.h> +#include <ouroboros/dev.h> +#include <ouroboros/errno.h> +#include <ouroboros/fccntl.h> +#include <ouroboros/qos.h> + +#include <fcntl.h> +#include <inttypes.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#define BUF_SIZE 16384 + +static volatile sig_atomic_t stop = 0; + +static void apply_rto_min_env(int fd) +{ + const char * env; + long v; + + env = getenv("OFTP_FRCT_RTO_MIN"); + if (env == NULL) + return; + v = strtol(env, NULL, 10); + if (v <= 0) + return; + if (fccntl(fd, FRCTSRTOMIN, (time_t) v) < 0) + fprintf(stderr, + "oftp: failed to set RTO_MIN=%ld ns\n", v); +} + +static void apply_stream_ring_sz_env(int fd) +{ + const char * env; + long v; + + env = getenv("OFTP_FRCT_STREAM_RING_SZ"); + if (env == NULL) + return; + v = strtol(env, NULL, 10); + if (v <= 0) + return; + if (fccntl(fd, FRCTSRRINGSZ, (size_t) v) < 0) + fprintf(stderr, + "oftp: failed to set STREAM_RING_SZ=%ld\n", v); +} + +static void on_signal(int signo) +{ + (void) signo; + stop = 1; +} + +static void usage(void) +{ + printf("Usage: oftp [OPTION]...\n" + "Stream-mode file transfer over an Ouroboros flow.\n\n" + " -l, --listen Run as the receiver (server)\n" + " -n, --name NAME Destination service name (client)\n" + " -i, --in FILE Read input from FILE (default stdin)\n" + " -o, 
--out FILE Write output to FILE (default stdout)\n" + " -N, --bytes SIZE Stop after SIZE bytes " + "(K/M/G suffix; client only)\n" + " --help Display this help text and exit\n"); +} + +static int parse_size(const char * s, size_t * out) +{ + char * end; + unsigned long v; + size_t mul; + + v = strtoul(s, &end, 0); + if (end == s) + return -1; + + mul = 1; + if (*end == 'k' || *end == 'K') + mul = 1024UL; + else if (*end == 'm' || *end == 'M') + mul = 1024UL * 1024UL; + else if (*end == 'g' || *end == 'G') + mul = 1024UL * 1024UL * 1024UL; + else if (*end != '\0') + return -1; + + *out = (size_t) v * mul; + return 0; +} + +static void report_xfer(const char * tag, + size_t total, + uint64_t crc, + const struct timespec * t0, + const struct timespec * t1) +{ + double elapsed_s; + double mib_per_s; + + elapsed_s = (t1->tv_sec - t0->tv_sec) + + (t1->tv_nsec - t0->tv_nsec) / 1e9; + if (elapsed_s <= 0.0) + elapsed_s = 1e-9; + + mib_per_s = ((double) total / (1024.0 * 1024.0)) / elapsed_s; + + fprintf(stderr, + "oftp: %s %zu bytes in %.3f s (%.2f MiB/s) " + "crc64=%016" PRIx64 "\n", + tag, total, elapsed_s, mib_per_s, crc); +} + +static int xfer_to_flow(int fd, FILE * in, size_t max_bytes) +{ + char buf[BUF_SIZE]; + size_t n; + size_t total; + size_t want; + size_t off; + ssize_t w; + uint64_t crc; + struct timespec t0; + struct timespec t1; + + total = 0; + crc = 0; + + clock_gettime(CLOCK_MONOTONIC, &t0); + + while (!stop) { + want = sizeof(buf); + if (max_bytes > 0 && max_bytes - total < want) + want = max_bytes - total; + if (want == 0) + break; + + n = fread(buf, 1, want, in); + if (n == 0) + break; + + crc64_nvme(&crc, buf, n); + + off = 0; + while (off < n) { + w = flow_write(fd, buf + off, n - off); + if (w < 0) { + fprintf(stderr, + "flow_write failed: %zd\n", w); + return 1; + } + off += (size_t) w; + total += (size_t) w; + } + } + + clock_gettime(CLOCK_MONOTONIC, &t1); + + if (ferror(in)) { + fprintf(stderr, "Input read error.\n"); + return 1; + } + + 
report_xfer("sent", total, crc, &t0, &t1); + return 0; +} + +static int xfer_from_flow(int fd, FILE * out) +{ + char buf[BUF_SIZE]; + size_t total; + ssize_t n; + uint64_t crc; + struct timespec timeout; + struct timespec t0; + struct timespec t1; + bool started; + + total = 0; + crc = 0; + started = false; + timeout.tv_sec = 1; + timeout.tv_nsec = 0; + + /* Short timeout so SIGTERM/SIGINT 'stop' is observed promptly. */ + fccntl(fd, FLOWSRCVTIMEO, &timeout); + + while (!stop) { + n = flow_read(fd, buf, sizeof(buf)); + if (n == 0) { + /* Clean EOF: peer sent EOS and we drained it. */ + clock_gettime(CLOCK_MONOTONIC, &t1); + fflush(out); + if (!started) + t0 = t1; + report_xfer("received", total, crc, &t0, &t1); + return 0; + } + if (n == -ETIMEDOUT) + continue; + if (n < 0) { + /* Peer aborted before EOS: partial transfer. */ + if (n == -EFLOWDOWN || n == -EFLOWPEER) { + fprintf(stderr, + "oftp: peer aborted at %zu B\n", + total); + return 2; + } + fprintf(stderr, + "flow_read failed: %zd\n", n); + return 1; + } + if (!started) { + clock_gettime(CLOCK_MONOTONIC, &t0); + started = true; + } + crc64_nvme(&crc, buf, (size_t) n); + if (fwrite(buf, 1, (size_t) n, out) != (size_t) n) { + fprintf(stderr, "Output write error.\n"); + return 1; + } + total += (size_t) n; + } + + /* Receiver was signalled (SIGINT/SIGTERM) before EOF. 
*/ + fflush(out); + fprintf(stderr, "oftp: interrupted at %zu B\n", total); + return 2; +} + +static int server_main(const char * outpath) +{ + FILE * out = stdout; + int fd; + int ofd; + int rc; + qosspec_t qs; + + if (outpath != NULL) { + ofd = open(outpath, + O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW, + 0600); + if (ofd < 0) { + perror("open"); + return 1; + } + out = fdopen(ofd, "wb"); + if (out == NULL) { + perror("fdopen"); + close(ofd); + unlink(outpath); + return 1; + } + } + + fprintf(stderr, "oftp: listening...\n"); + + fd = flow_accept(&qs, NULL); + if (fd < 0) { + fprintf(stderr, "flow_accept failed: %d\n", fd); + if (out != stdout) + fclose(out); + return 1; + } + + if (qs.service != SVC_STREAM) { + fprintf(stderr, + "oftp: rejecting non-stream flow (service=%u)\n", + qs.service); + flow_dealloc(fd); + if (out != stdout) { + fclose(out); + unlink(outpath); + } + return 1; + } + + apply_rto_min_env(fd); + apply_stream_ring_sz_env(fd); + + rc = xfer_from_flow(fd, out); + + flow_dealloc(fd); + + if (out != stdout) { + fclose(out); + /* Drop the half-written file on abort/interrupt. 
*/ + if (rc != 0) + unlink(outpath); + } + + return rc; +} + +static int client_main(const char * name, + const char * inpath, + size_t max_bytes) +{ + FILE * in; + int fd; + int rc; + qosspec_t qs; + + in = stdin; + qs = qos_stream; + + if (inpath != NULL) { + in = fopen(inpath, "rb"); + if (in == NULL) { + perror("fopen"); + return 1; + } + } + + fd = flow_alloc(name, &qs, NULL); + if (fd < 0) { + fprintf(stderr, "flow_alloc failed: %d\n", fd); + if (in != stdin) + fclose(in); + return 2; + } + + apply_rto_min_env(fd); + apply_stream_ring_sz_env(fd); + + rc = xfer_to_flow(fd, in, max_bytes); + + flow_dealloc(fd); + + if (in != stdin) + fclose(in); + + return rc; +} + +int main(int argc, char ** argv) +{ + bool server; + const char * name; + const char * inpath; + const char * outpath; + size_t max_bytes; + struct sigaction sa; + + server = false; + name = NULL; + inpath = NULL; + outpath = NULL; + max_bytes = 0; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = on_signal; + sigaction(SIGINT, &sa, NULL); + sigaction(SIGTERM, &sa, NULL); + signal(SIGPIPE, SIG_IGN); + + argc--; argv++; + while (argc > 0) { + if (strcmp(*argv, "-l") == 0 || + strcmp(*argv, "--listen") == 0) { + server = true; + } else if ((strcmp(*argv, "-n") == 0 || + strcmp(*argv, "--name") == 0) && argc > 1) { + name = *(++argv); argc--; + } else if ((strcmp(*argv, "-i") == 0 || + strcmp(*argv, "--in") == 0) && argc > 1) { + inpath = *(++argv); argc--; + } else if ((strcmp(*argv, "-o") == 0 || + strcmp(*argv, "--out") == 0) && argc > 1) { + outpath = *(++argv); argc--; + } else if ((strcmp(*argv, "-N") == 0 || + strcmp(*argv, "--bytes") == 0) && argc > 1) { + if (parse_size(*(++argv), &max_bytes) < 0) { + fprintf(stderr, + "oftp: bad size '%s'\n", *argv); + return 1; + } + argc--; + } else if (strcmp(*argv, "--help") == 0) { + usage(); + return 0; + } else { + usage(); + return 1; + } + argc--; argv++; + } + + if (server) + return server_main(outpath); + + if (name == NULL) { + usage(); + return 
1; + } + + return client_main(name, inpath, max_bytes); +} diff --git a/src/tools/operf/operf.c b/src/tools/operf/operf.c index 1872b351..0198e871 100644 --- a/src/tools/operf/operf.c +++ b/src/tools/operf/operf.c @@ -248,5 +248,5 @@ int main(int argc, char ** argv) if (ret < 0) exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); + exit(ret); } diff --git a/src/tools/operf/operf_client.c b/src/tools/operf/operf_client.c index 7e8f1a9b..e478aeff 100644 --- a/src/tools/operf/operf_client.c +++ b/src/tools/operf/operf_client.c @@ -185,7 +185,7 @@ int client_main(void) sigaction(SIGHUP, &sig_act, NULL) || sigaction(SIGPIPE, &sig_act, NULL)) { printf("Failed to install sighandler.\n"); - return -1; + return 2; } client.sent = 0; @@ -196,7 +196,7 @@ int client_main(void) fd = flow_alloc(client.server_name, NULL, NULL); if (fd < 0) { printf("Failed to allocate flow.\n"); - return -1; + return 2; } if (client.conf.test_type == TEST_TYPE_BI) @@ -207,7 +207,7 @@ int client_main(void) if (flow_write(fd, &client.conf, sizeof(client.conf)) < 0) { printf("Failed to send configuration.\n"); flow_dealloc(fd); - return -1; + return 1; } sleep(1); diff --git a/src/tools/oping/oping.c b/src/tools/oping/oping.c index 763c0d62..10e1e23c 100644 --- a/src/tools/oping/oping.c +++ b/src/tools/oping/oping.c @@ -60,7 +60,7 @@ #include <errno.h> #include <float.h> -#define OPING_BUF_SIZE 1500 +#define OPING_BUF_SIZE 16384 #define ECHO_REQUEST 0 #define ECHO_REPLY 1 #define OPING_MAX_FLOWS 256 @@ -81,8 +81,9 @@ " -F, --flood-busy Flood with busy-polling (lower latency)\n" \ " -i, --interval Interval (default 1000ms)\n" \ " -n, --server-name Name of the oping server\n" \ -" -q, --qos QoS (raw, best, video, voice, data)\n" \ +" -q, --qos QoS (raw, safe, rt, rt-safe, msg)\n" \ " -s, --size Payload size (B, default 64)\n" \ +" -W, --timeout Per-packet recv timeout, ms (default 2000)\n" \ " -Q, --quiet Only print final statistics\n" \ " -D, --timeofday Print time of day before each line\n" \ "\n" \ @@ -93,9 
+94,11 @@ struct { int interval; uint32_t count; int size; + int timeout; /* per-packet recv timeout, ms */ bool timestamp; bool flood; bool flood_busy; + long duration; qosspec_t qs; /* stats */ @@ -175,18 +178,20 @@ int main(int argc, argc--; argv++; - client.s_apn = NULL; - client.interval = 1000; - client.size = 64; - client.count = INT_MAX; - client.timestamp = false; - client.flood = false; + client.s_apn = NULL; + client.interval = 1000; + client.size = 64; + client.count = INT_MAX; + client.timeout = 2000; + client.timestamp = false; + client.flood = false; client.flood_busy = false; - client.qs = qos_raw; - client.quiet = false; - server.quiet = false; - server.poll = false; - server.busy = false; + client.duration = 0; + client.qs = qos_raw; + client.quiet = false; + server.quiet = false; + server.poll = false; + server.busy = false; while (argc > 0) { if ((strcmp(*argv, "-i") == 0 || @@ -216,6 +221,12 @@ int main(int argc, argc > 1) { client.size = strtol(*(++argv), &rem, 10); --argc; + } else if ((strcmp(*argv, "-W") == 0 || + strcmp(*argv, "--timeout") == 0) && + argc > 1) { + client.timeout = strtol(*(++argv), &rem, 10); + client.timeout *= time_mul(rem); + --argc; } else if ((strcmp(*argv, "-q") == 0 || strcmp(*argv, "--qos") == 0) && argc > 1) { @@ -249,23 +260,25 @@ int main(int argc, } if (duration > 0) { - if (client.interval == 0) + if (client.flood || client.flood_busy) + client.duration = duration; + else if (client.interval == 0) client.count = duration * 10; else client.count = duration / client.interval; } if (qos != NULL) { - if (strcmp(qos, "best") == 0) - client.qs = qos_best_effort; - else if (strcmp(qos, "raw") == 0) + if (strcmp(qos, "raw") == 0) client.qs = qos_raw; - else if (strcmp(qos, "video") == 0) - client.qs = qos_video; - else if (strcmp(qos, "voice") == 0) - client.qs = qos_voice; - else if (strcmp(qos, "data") == 0) - client.qs = qos_data; + else if (strcmp(qos, "safe") == 0) + client.qs = qos_raw_safe; + else if 
(strcmp(qos, "rt") == 0) + client.qs = qos_rt; + else if (strcmp(qos, "rt-safe") == 0) + client.qs = qos_rt_safe; + else if (strcmp(qos, "msg") == 0) + client.qs = qos_msg; else printf("Unknown QoS cube, defaulting to raw.\n"); } @@ -298,7 +311,7 @@ int main(int argc, if (ret < 0) exit(EXIT_FAILURE); - exit(EXIT_SUCCESS); + exit(ret); fail: usage(); diff --git a/src/tools/oping/oping_client.c b/src/tools/oping/oping_client.c index 23807f65..4b01315d 100644 --- a/src/tools/oping/oping_client.c +++ b/src/tools/oping/oping_client.c @@ -47,6 +47,7 @@ void shutdown_client(int signo, siginfo_t * info, void * c) case SIGINT: case SIGTERM: case SIGHUP: + case SIGALRM: stop = true; default: return; @@ -89,7 +90,7 @@ static void print_rtt(int len, int seq, void * reader(void * o) { - struct timespec timeout = {client.interval / 1000 + 2, 0}; + struct timespec timeout; struct timespec now = {0, 0}; struct timespec sent; @@ -100,6 +101,9 @@ void * reader(void * o) double ms = 0; uint32_t exp_id = 0; + timeout.tv_sec = client.timeout / 1000; + timeout.tv_nsec = (client.timeout % 1000) * MILLION; + fccntl(fd, FLOWSRCVTIMEO, &timeout); while (!stop && client.rcvd != client.count) { @@ -284,18 +288,15 @@ static int flood_busy_ping(int fd) msg->tv_sec = sent.tv_sec; msg->tv_nsec = sent.tv_nsec; - if (flow_write(fd, buf, - client.size) < 0) { - printf("Failed to send " - "packet.\n"); + if (flow_write(fd, buf, client.size) < 0) { + printf("Failed to send packet.\n"); break; } ++client.sent; do { - n = flow_read(fd, buf, - OPING_BUF_SIZE); + n = flow_read(fd, buf, OPING_BUF_SIZE); } while (n == -EAGAIN && !stop); if (n < 0) @@ -315,9 +316,7 @@ static int flood_busy_ping(int fd) update_rtt_stats(ms); if (!client.quiet) - print_rtt(client.size, - ntohl(msg->id), ms, - NULL); + print_rtt(client.size, ntohl(msg->id), ms, NULL); } return 0; @@ -371,9 +370,7 @@ static int flood_ping(int fd) update_rtt_stats(ms); if (!client.quiet) - print_rtt(client.size, - ntohl(msg->id), ms, - NULL); + 
print_rtt(client.size, ntohl(msg->id), ms, NULL); } return 0; @@ -404,25 +401,34 @@ static int client_main(void) if (sigaction(SIGINT, &sig_act, NULL) || sigaction(SIGTERM, &sig_act, NULL) || sigaction(SIGHUP, &sig_act, NULL) || - sigaction(SIGPIPE, &sig_act, NULL)) { + sigaction(SIGPIPE, &sig_act, NULL) || + sigaction(SIGALRM, &sig_act, NULL)) { printf("Failed to install sighandler.\n"); - return -1; + return 2; } if (client_init()) { printf("Failed to initialize client.\n"); - return -1; + return 2; } fd = flow_alloc(client.s_apn, &client.qs, NULL); if (fd < 0) { printf("Failed to allocate flow: %d.\n", fd); client_fini(); - return -1; + return 2; } fccntl(fd, FLOWSFLAGS, FLOWFRDWR | FLOWFRNOPART); + if (client.duration > 0) { + struct itimerval it; + memset(&it, 0, sizeof(it)); + it.it_value.tv_sec = client.duration / 1000; + it.it_value.tv_usec = (client.duration % 1000) * 1000; + setitimer(ITIMER_REAL, &it, NULL); + } + clock_gettime(CLOCK_REALTIME, &tic); if (client.flood_busy) @@ -439,5 +445,5 @@ static int client_main(void) flow_dealloc(fd); client_fini(); - return 0; + return client.rcvd == client.sent ? 0 : 1; } diff --git a/src/tools/oping/oping_server.c b/src/tools/oping/oping_server.c index 33af28c4..e98ca040 100644 --- a/src/tools/oping/oping_server.c +++ b/src/tools/oping/oping_server.c @@ -237,6 +237,14 @@ int server_main(void) return -1; } + if (pthread_mutex_init(&server.lock, NULL)) { + fqueue_destroy(server.fq); + fset_destroy(server.flows); + return -1; + } + + memset(server.times, 0, sizeof(server.times)); + pthread_create(&server.cleaner_pt, NULL, cleaner_thread, NULL); if (server.busy) { @@ -255,11 +263,13 @@ int server_main(void) pthread_cancel(server.cleaner_pt); - fset_destroy(server.flows); - fqueue_destroy(server.fq); - + /* Join cancellable threads before tearing down their fset. 
*/ pthread_join(server.server_pt, NULL); pthread_join(server.cleaner_pt, NULL); + pthread_mutex_destroy(&server.lock); + fset_destroy(server.flows); + fqueue_destroy(server.fq); + return 0; } |
