summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.ci/woodpecker/10-build.yaml9
-rw-r--r--.gitignore1
-rw-r--r--CMakeLists.txt1
-rw-r--r--cmake/config/global.cmake12
-rw-r--r--cmake/config/ipcp/broadcast.cmake3
-rw-r--r--cmake/config/ipcp/common.cmake15
-rw-r--r--cmake/config/ipcp/eth.cmake4
-rw-r--r--cmake/config/ipcp/local.cmake32
-rw-r--r--cmake/config/ipcp/udp.cmake4
-rw-r--r--cmake/config/ipcp/unicast.cmake2
-rw-r--r--cmake/config/irmd.cmake10
-rw-r--r--cmake/config/lib.cmake83
-rw-r--r--cmake/config/ssm.cmake53
-rw-r--r--cmake/dependencies.cmake1
-rw-r--r--cmake/dependencies/system/liburcu.cmake45
-rw-r--r--cmake/tags.cmake21
-rw-r--r--cmake/utils/CPUUtils.cmake82
-rw-r--r--doc/man/flow_alloc.388
-rw-r--r--doc/man/flow_read.346
-rw-r--r--doc/man/fqueue.321
-rw-r--r--include/ouroboros/atomics.h39
-rw-r--r--include/ouroboros/crc16.h43
-rw-r--r--include/ouroboros/crc64.h44
-rw-r--r--include/ouroboros/crc8.h43
-rw-r--r--include/ouroboros/crypt.h76
-rw-r--r--include/ouroboros/errno.h1
-rw-r--r--include/ouroboros/fccntl.h13
-rw-r--r--include/ouroboros/flow.h3
-rw-r--r--include/ouroboros/fqueue.h3
-rw-r--r--include/ouroboros/hash.h6
-rw-r--r--include/ouroboros/ipcp-dev.h7
-rw-r--r--include/ouroboros/logs.h7
-rw-r--r--include/ouroboros/name.h6
-rw-r--r--include/ouroboros/np1_flow.h6
-rw-r--r--include/ouroboros/pthread.h6
-rw-r--r--include/ouroboros/qos.h57
-rw-r--r--include/ouroboros/rcu.h110
-rw-r--r--include/ouroboros/serdes-irm.h14
-rw-r--r--include/ouroboros/ssm_pk_buff.h24
-rw-r--r--include/ouroboros/ssm_pool.h12
-rw-r--r--include/ouroboros/ssm_rbuff.h23
-rw-r--r--include/ouroboros/time.h6
-rw-r--r--include/ouroboros/tpm.h1
-rw-r--r--include/ouroboros/tw.h77
-rw-r--r--include/test/certs/ecdsa.h37
-rw-r--r--irmd.conf.in4
-rw-r--r--sec.conf.in (renamed from enc.conf.in)53
-rw-r--r--src/ipcpd/broadcast/dt.c2
-rw-r--r--src/ipcpd/broadcast/main.c7
-rw-r--r--src/ipcpd/config.h.in13
-rw-r--r--src/ipcpd/eth/eth.c658
-rw-r--r--src/ipcpd/ipcp.c37
-rw-r--r--src/ipcpd/ipcp.h4
-rw-r--r--src/ipcpd/local/main.c45
-rw-r--r--src/ipcpd/udp/udp.c155
-rw-r--r--src/ipcpd/unicast/dt.c216
-rw-r--r--src/ipcpd/unicast/fa.c141
-rw-r--r--src/ipcpd/unicast/fa.h3
-rw-r--r--src/ipcpd/unicast/main.c5
-rw-r--r--src/ipcpd/unicast/pff/alternate.c2
-rw-r--r--src/ipcpd/unicast/pff/multipath.c2
-rw-r--r--src/ipcpd/unicast/pff/pft.c19
-rw-r--r--src/ipcpd/unicast/pff/pft.h4
-rw-r--r--src/ipcpd/unicast/pff/simple.c2
-rw-r--r--src/ipcpd/unicast/pff/tests/pft_test.c10
-rw-r--r--src/ipcpd/unicast/routing/graph.c8
-rw-r--r--src/ipcpd/unicast/routing/link-state.c2
-rw-r--r--src/irmd/CMakeLists.txt6
-rw-r--r--src/irmd/config.h.in3
-rw-r--r--src/irmd/configfile.c12
-rw-r--r--src/irmd/ipcp.c32
-rw-r--r--src/irmd/ipcp.h3
-rw-r--r--src/irmd/main.c1079
-rw-r--r--src/irmd/oap.c130
-rw-r--r--src/irmd/oap.h25
-rw-r--r--src/irmd/oap/auth.c440
-rw-r--r--src/irmd/oap/auth.h41
-rw-r--r--src/irmd/oap/cli.c224
-rw-r--r--src/irmd/oap/hdr.c468
-rw-r--r--src/irmd/oap/hdr.h50
-rw-r--r--src/irmd/oap/internal.h25
-rw-r--r--src/irmd/oap/io.c34
-rw-r--r--src/irmd/oap/io.h2
-rw-r--r--src/irmd/oap/srv.c218
-rw-r--r--src/irmd/oap/tests/common.c188
-rw-r--r--src/irmd/oap/tests/common.h25
-rw-r--r--src/irmd/oap/tests/oap_test.c685
-rw-r--r--src/irmd/oap/tests/oap_test_ml_dsa.c43
-rw-r--r--src/irmd/reg/flow.c11
-rw-r--r--src/irmd/reg/flow.h16
-rw-r--r--src/irmd/reg/reg.c528
-rw-r--r--src/irmd/reg/reg.h67
-rw-r--r--src/irmd/reg/tests/flow_test.c20
-rw-r--r--src/irmd/reg/tests/reg_test.c274
-rw-r--r--src/lib/CMakeLists.txt15
-rw-r--r--src/lib/config.h.in36
-rw-r--r--src/lib/crc/crc16.c61
-rw-r--r--src/lib/crc/crc32.c (renamed from src/lib/crc32.c)0
-rw-r--r--src/lib/crc/crc64.c363
-rw-r--r--src/lib/crc/crc8.c62
-rw-r--r--src/lib/crc/tests/CMakeLists.txt21
-rw-r--r--src/lib/crc/tests/crc16_test.c67
-rw-r--r--src/lib/crc/tests/crc32_test.c (renamed from src/lib/tests/crc32_test.c)0
-rw-r--r--src/lib/crc/tests/crc64_test.c126
-rw-r--r--src/lib/crc/tests/crc8_test.c67
-rw-r--r--src/lib/crypt.c457
-rw-r--r--src/lib/crypt/keyrot.c741
-rw-r--r--src/lib/crypt/keyrot.h74
-rw-r--r--src/lib/crypt/openssl.c825
-rw-r--r--src/lib/crypt/openssl.h57
-rw-r--r--src/lib/dev.c1623
-rw-r--r--src/lib/frct.c4270
-rw-r--r--src/lib/hash.c52
-rw-r--r--src/lib/pb/ipcp.proto3
-rw-r--r--src/lib/pb/irm.proto11
-rw-r--r--src/lib/pb/model.proto5
-rw-r--r--src/lib/protobuf.c6
-rw-r--r--src/lib/qoscube.c12
-rw-r--r--src/lib/random.c25
-rw-r--r--src/lib/rib.c18
-rw-r--r--src/lib/serdes-irm.c136
-rw-r--r--src/lib/ssm/flow_set.c42
-rw-r--r--src/lib/ssm/pool.c118
-rw-r--r--src/lib/ssm/rbuff.c147
-rw-r--r--src/lib/ssm/ssm.h.in21
-rw-r--r--src/lib/ssm/tests/pool_sharding_test.c69
-rw-r--r--src/lib/ssm/tests/pool_test.c22
-rw-r--r--src/lib/ssm/tests/rbuff_test.c30
-rw-r--r--src/lib/tests/CMakeLists.txt6
-rw-r--r--src/lib/tests/auth_test.c140
-rw-r--r--src/lib/tests/crypt_test.c430
-rw-r--r--src/lib/tests/hash_test.c147
-rw-r--r--src/lib/tests/kex_test.c360
-rw-r--r--src/lib/tests/keyrot_test.c1083
-rw-r--r--src/lib/tests/tpm_test.c2
-rw-r--r--src/lib/tests/tw_test.c663
-rw-r--r--src/lib/timerwheel.c414
-rw-r--r--src/lib/tw.c307
-rw-r--r--src/tools/CMakeLists.txt5
-rw-r--r--src/tools/irm/irm_ipcp_connect.c22
-rw-r--r--src/tools/irm/irm_name_create.c24
-rw-r--r--src/tools/ocbr/ocbr_client.c36
-rw-r--r--src/tools/oecho/oecho.c8
-rw-r--r--src/tools/oftp/oftp.c441
-rw-r--r--src/tools/operf/operf.c2
-rw-r--r--src/tools/operf/operf_client.c6
-rw-r--r--src/tools/oping/oping.c61
-rw-r--r--src/tools/oping/oping_client.c42
-rw-r--r--src/tools/oping/oping_server.c16
149 files changed, 17703 insertions, 3332 deletions
diff --git a/.ci/woodpecker/10-build.yaml b/.ci/woodpecker/10-build.yaml
index 0a82c469..31b9b9b4 100644
--- a/.ci/woodpecker/10-build.yaml
+++ b/.ci/woodpecker/10-build.yaml
@@ -88,13 +88,6 @@ steps:
done
done
- for rxm_heap in TRUE FALSE; do
- for rxm_block in TRUE FALSE; do
- echo "--- HEAP=$rxm_heap BLOCKING=$rxm_block ---"
- run_build \
- -DRXM_BUFFER_ON_HEAP=$rxm_heap \
- -DRXM_BLOCKING=$rxm_block
- done
- done
+ run_build
diff --git a/.gitignore b/.gitignore
index 43f47a46..b10e8173 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
*~
*#
build/
+/tags
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c886146d..bfabd711 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,5 +69,6 @@ add_subdirectory(src/ipcpd)
add_subdirectory(src/tools)
setup_coverage_target()
include(doc)
+include(tags)
include(install)
diff --git a/cmake/config/global.cmake b/cmake/config/global.cmake
index 0ac256bb..1e172724 100644
--- a/cmake/config/global.cmake
+++ b/cmake/config/global.cmake
@@ -25,8 +25,16 @@ set(SHM_LOCKFILE_NAME "/${SHM_PREFIX}.lockfile" CACHE INTERNAL
# Secure memory configuration
set(IRMD_SECMEM_MAX 1048576 CACHE STRING "IRMd secure heap size")
-set(PROC_SECMEM_MAX 1048576 CACHE STRING "Process secure heap size")
-set(SECMEM_GUARD 32 CACHE STRING "Secure heap min size")
+# ~8 KiB secure heap per encrypted flow (cur+prev node slabs); the total
+# is rounded up to a power of two for the OpenSSL secure-heap allocator.
+set(PROC_SECMEM_FLOWS 512 CACHE STRING
+ "Max concurrent encrypted flows the per-process secure heap is sized for")
+math(EXPR PROC_SECMEM_NEED "${PROC_SECMEM_FLOWS} * 8192")
+set(PROC_SECMEM_MAX 4096)
+while(PROC_SECMEM_MAX LESS PROC_SECMEM_NEED)
+ math(EXPR PROC_SECMEM_MAX "${PROC_SECMEM_MAX} * 2")
+endwhile()
+set(SECMEM_MINSIZE 32 CACHE STRING "Secure heap min alloc size")
# Container/deployment options
set(BUILD_CONTAINER FALSE CACHE BOOL
diff --git a/cmake/config/ipcp/broadcast.cmake b/cmake/config/ipcp/broadcast.cmake
index 79f41d10..f521ed8e 100644
--- a/cmake/config/ipcp/broadcast.cmake
+++ b/cmake/config/ipcp/broadcast.cmake
@@ -4,3 +4,6 @@ set(IPCP_BROADCAST_TARGET ipcpd-broadcast)
set(IPCP_BROADCAST_MPL 100 CACHE STRING
"Default maximum packet lifetime for the Broadcast IPCP, in ms")
+
+set(IPCP_BROADCAST_MTU 1400 CACHE STRING
+ "Layer MTU advertised by the Broadcast IPCP, in bytes")
diff --git a/cmake/config/ipcp/common.cmake b/cmake/config/ipcp/common.cmake
index ffd5dc32..7dbc252b 100644
--- a/cmake/config/ipcp/common.cmake
+++ b/cmake/config/ipcp/common.cmake
@@ -41,3 +41,18 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(IPCP_LINUX_TIMERSLACK_NS 100 CACHE STRING
"Slack value for high resolution timers on Linux systems.")
endif()
+
+# Per-flow statistics exposed via the RIB (requires FUSE).
+if(HAVE_FUSE)
+ set(IPCP_FLOW_STATS TRUE CACHE BOOL
+ "Enable per-flow statistics via the RIB")
+ if(IPCP_FLOW_STATS)
+ message(STATUS "IPCP flow statistics enabled")
+ endif()
+
+ set(IPCP_ETH_FLOW_STATS FALSE CACHE BOOL
+ "Enable ipcpd-eth flow statistics via RIB")
+ if(IPCP_ETH_FLOW_STATS)
+ message(STATUS "ipcpd-eth flow statistics enabled")
+ endif()
+endif()
diff --git a/cmake/config/ipcp/eth.cmake b/cmake/config/ipcp/eth.cmake
index 4b9007d2..d336d647 100644
--- a/cmake/config/ipcp/eth.cmake
+++ b/cmake/config/ipcp/eth.cmake
@@ -10,6 +10,10 @@ set(IPCP_ETH_WR_THR 1 CACHE STRING
"Number of writer threads in Ethernet IPCP")
set(IPCP_ETH_QDISC_BYPASS false CACHE BOOL
"Bypass the Qdisc in the kernel when using raw sockets")
+set(IPCP_ETH_SNDBUF 0 CACHE STRING
+ "Raw socket SO_SNDBUF in bytes; 0 = leave kernel default (wmem_default)")
+set(IPCP_ETH_RCVBUF 0 CACHE STRING
+ "Raw socket SO_RCVBUF in bytes; 0 = leave kernel default (rmem_default)")
set(IPCP_ETH_LO_MTU 9000 CACHE STRING
"Restrict Ethernet MTU over loopback interfaces")
set(IPCP_ETH_MGMT_FRAME_SIZE 9000 CACHE STRING
diff --git a/cmake/config/ipcp/local.cmake b/cmake/config/ipcp/local.cmake
index 88ee8998..70423cd1 100644
--- a/cmake/config/ipcp/local.cmake
+++ b/cmake/config/ipcp/local.cmake
@@ -2,8 +2,38 @@
set(IPCP_LOCAL_TARGET ipcpd-local)
-set(IPCP_LOCAL_MPL 100 CACHE STRING
+set(IPCP_LOCAL_MPL 50 CACHE STRING
"Default maximum packet lifetime for the Local IPCP, in ms")
+set(IPCP_LOCAL_MTU 65000 CACHE STRING
+ "Layer MTU advertised by the Local IPCP, in bytes")
+
set(IPCP_LOCAL_POLLING FALSE CACHE BOOL
"Enable active polling in the Local IPCP for low-latency mode")
+
+# IPCP_LOCAL_MTU must fit in the largest enabled GSPP and PUP class
+# (sender-side allocation: daemons use GSPP, apps use PUP). Reserve a
+# margin for sizeof(struct ssm_pk_buff) + HEADSPACE + TAILSPACE.
+math(EXPR _ssm_pk_overhead
+ "${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE} + 64")
+
+foreach(_pool GSPP PUP)
+ set(_largest 0)
+ foreach(_pair "256;256" "512;512" "1K;1024" "2K;2048" "4K;4096"
+ "16K;16384" "64K;65536" "256K;262144" "1M;1048576")
+ list(GET _pair 0 _name)
+ list(GET _pair 1 _bytes)
+ if(SSM_${_pool}_${_name}_BLOCKS GREATER 0
+ AND _bytes GREATER _largest)
+ set(_largest ${_bytes})
+ endif()
+ endforeach()
+ math(EXPR _avail "${_largest} - ${_ssm_pk_overhead}")
+ if(IPCP_LOCAL_MTU GREATER _avail)
+ message(FATAL_ERROR
+ "IPCP_LOCAL_MTU (${IPCP_LOCAL_MTU}) exceeds largest enabled "
+ "SSM_${_pool} class minus per-block overhead "
+ "(${_largest} - ${_ssm_pk_overhead} = ${_avail} bytes). "
+ "Lower IPCP_LOCAL_MTU or enable a larger SSM_${_pool}_*_BLOCKS.")
+ endif()
+endforeach()
diff --git a/cmake/config/ipcp/udp.cmake b/cmake/config/ipcp/udp.cmake
index 0124c261..af84a844 100644
--- a/cmake/config/ipcp/udp.cmake
+++ b/cmake/config/ipcp/udp.cmake
@@ -10,3 +10,7 @@ set(IPCP_UDP_WR_THR 3 CACHE STRING
"Number of writer threads in UDP IPCPs")
set(IPCP_UDP_MPL 5000 CACHE STRING
"Default maximum packet lifetime for the UDP IPCPs, in ms")
+set(IPCP_UDP4_MTU 1472 CACHE STRING
+ "Fallback UDP4 layer MTU when getsockopt(IP_MTU) is unavailable, in bytes")
+set(IPCP_UDP6_MTU 1452 CACHE STRING
+ "Fallback UDP6 layer MTU when getsockopt(IPV6_MTU) is unavailable, in bytes")
diff --git a/cmake/config/ipcp/unicast.cmake b/cmake/config/ipcp/unicast.cmake
index 3b5b0ce7..b8d4d516 100644
--- a/cmake/config/ipcp/unicast.cmake
+++ b/cmake/config/ipcp/unicast.cmake
@@ -4,6 +4,8 @@ set(IPCP_UNICAST_TARGET ipcpd-unicast)
set(IPCP_UNICAST_MPL 100 CACHE STRING
"Default maximum packet lifetime for the Unicast IPCP, in ms")
+set(IPCP_UNICAST_MTU 1400 CACHE STRING
+ "Layer MTU advertised by the Unicast IPCP, in bytes (TODO: derive per-flow from n-1 path MTU minus DT PCI)")
set(PFT_SIZE 256 CACHE STRING
"Prefix forwarding table size for the Unicast IPCP")
diff --git a/cmake/config/irmd.cmake b/cmake/config/irmd.cmake
index b86a40c5..79e24bae 100644
--- a/cmake/config/irmd.cmake
+++ b/cmake/config/irmd.cmake
@@ -10,8 +10,8 @@ set(ENROLL_TIMEOUT 20000 CACHE STRING
"Timeout for an IPCP to enroll (ms)")
set(REG_TIMEOUT 20000 CACHE STRING
"Timeout for registering a name (ms)")
-set(QUERY_TIMEOUT 200 CACHE STRING
- "Timeout to query a name with an IPCP (ms)")
+set(QUERY_TIMEOUT 2000 CACHE STRING
+ "Timeout to query a name with an IPCP (ms); must exceed shim retry budget")
set(CONNECT_TIMEOUT 20000 CACHE STRING
"Timeout to connect an IPCP to another IPCP (ms)")
set(FLOW_ALLOC_TIMEOUT 20000 CACHE STRING
@@ -20,6 +20,12 @@ set(FLOW_ALLOC_TIMEOUT 20000 CACHE STRING
# OAP (Ouroboros Authentication Protocol)
set(OAP_REPLAY_TIMER 20 CACHE STRING
"OAP replay protection window (s)")
+set(OAP_REPLAY_MAX 4096 CACHE STRING
+ "Maximum entries in the OAP replay cache (bounds memory/CPU under flood)")
+set(OAP_REKEY_TIMER 120 CACHE STRING
+ "Tier-2 re-key interval (s); bounds key age / PCS healing, 0 disables")
+set(OAP_CLIENT_AUTH_DEFAULT TRUE CACHE BOOL
+ "Client requires the server to authenticate by default")
set(DEBUG_PROTO_OAP FALSE CACHE BOOL
"Add Flow allocation protocol message output to IRMd debug logging")
diff --git a/cmake/config/lib.cmake b/cmake/config/lib.cmake
index 287f30dc..81a7d6ba 100644
--- a/cmake/config/lib.cmake
+++ b/cmake/config/lib.cmake
@@ -4,11 +4,11 @@
# Flow limits
set(SYS_MAX_FLOWS 10240 CACHE STRING
"Maximum number of total flows for this system")
-set(PROG_MAX_FLOWS 4096 CACHE STRING
+set(PROC_MAX_FLOWS 4096 CACHE STRING
"Maximum number of flows in an application")
-set(PROG_RES_FDS 64 CACHE STRING
+set(PROC_RES_FDS 64 CACHE STRING
"Number of reserved flow descriptors per application")
-set(PROG_MAX_FQUEUES 32 CACHE STRING
+set(PROC_MAX_FQUEUES 32 CACHE STRING
"Maximum number of flow sets per application")
# Threading
@@ -28,18 +28,28 @@ set(SOCKET_TIMEOUT 500 CACHE STRING
set(QOS_DISABLE_CRC TRUE CACHE BOOL
"Ignores ber setting on all QoS cubes")
-# Delta-t protocol timers
-set(DELTA_T_MPL 60 CACHE STRING
- "Maximum packet lifetime (s)")
-set(DELTA_T_ACK 10 CACHE STRING
- "Maximum time to acknowledge a packet (s)")
-set(DELTA_T_RTX 120 CACHE STRING
- "Maximum time to retransmit a packet (s)")
+include(utils/CPUUtils)
+detect_pclmul()
+detect_pmull()
+if(HAVE_PCLMUL)
+ message(STATUS "CRC-64/NVMe backend: PCLMUL (x86 SSE4.1+PCLMUL)")
+elseif(HAVE_PMULL)
+ message(STATUS "CRC-64/NVMe backend: PMULL (aarch64 crypto)")
+else()
+ message(STATUS "CRC-64/NVMe backend: byte table (no acceleration)")
+endif()
+
+# Delta-t protocol timers (Watson bound: 3*MPL + A + R).
+# MPL is reported per IPCP (IPCP_*_MPL); A and R are FRCT-wide.
+set(DELTA_T_ACK 1000 CACHE STRING
+ "Maximum time to acknowledge a packet (ms)")
+set(DELTA_T_RTX 30000 CACHE STRING
+ "Maximum time to retransmit a packet (ms)")
# FRCT configuration
-set(FRCT_REORDER_QUEUE_SIZE 256 CACHE STRING
+set(FRCT_REORDER_QUEUE_SIZE 128 CACHE STRING
"Size of the reordering queue, must be a power of 2")
-set(FRCT_START_WINDOW 64 CACHE STRING
+set(FRCT_START_WINDOW 128 CACHE STRING
"Start window, must be a power of 2")
set(FRCT_LINUX_RTT_ESTIMATOR TRUE CACHE BOOL
"Use Linux RTT estimator formula instead of the TCP RFC formula")
@@ -48,15 +58,13 @@ set(FRCT_RTO_MDEV_MULTIPLIER 2 CACHE STRING
set(FRCT_RTO_INC_FACTOR 0 CACHE STRING
"Divisor for RTO increase after timeout: RTO += RTX >> X, 0: Karn/Partridge")
set(FRCT_RTO_MIN 250 CACHE STRING
- "Minimum Retransmission Timeout (RTO) for FRCT (us)")
+ "Hard floor for Retransmission Timeout (RTO) for FRCT (us)")
set(FRCT_TICK_TIME 5000 CACHE STRING
"Tick time for FRCT activity (retransmission, acknowledgments) (us)")
+set(FRCT_DEBUG_STDOUT FALSE CACHE BOOL
+ "Print FRCT final counters to stdout at flow teardown")
# Retransmission (RXM) configuration
-set(RXM_BUFFER_ON_HEAP FALSE CACHE BOOL
- "Store packets for retransmission on the heap instead of in packet buffer")
-set(RXM_BLOCKING TRUE CACHE BOOL
- "Use blocking writes for retransmission")
set(RXM_MIN_RESOLUTION 20 CACHE STRING
"Minimum retransmission delay (ns), as a power to 2")
set(RXM_WHEEL_MULTIPLIER 4 CACHE STRING
@@ -79,8 +87,44 @@ set(TPM_DEBUG_ABORT_TIMEOUT 0 CACHE STRING
"TPM abort process after a thread reaches this timeout (s), 0 disables")
# Encryption
-set(KEY_ROTATION_BIT 20 CACHE STRING
- "Bit position in packet counter that triggers key rotation (default 20 = every 2^20 packets)")
+set(KEY_LEAF_BITS 20 CACHE STRING
+ "Packets per leaf key as a power of two (2^20 = AEAD-safe default)")
+set(KEY_NODE_BITS 6 CACHE STRING
+ "Leaf keys per node key, power of two (2^6 = 64; leak compartment)")
+set(KEY_NODE_COUNT 128 CACHE STRING
+ "Node keys per batch (N); <= 4096, the 12-bit on-wire node index")
+set(KEY_REKEY_WATERMARK 4 CACHE STRING
+ "Re-key when this many node keys remain; 0 disables the count trigger")
+set(KEY_REPLAY_WINDOW 2048 CACHE STRING
+ "RX replay window in packets; power of two, >= 128")
+set(KEY_REKEY_WM_CHECK_BITS 16 CACHE STRING
+ "Re-key watermark is consulted once per 2^n flow writes")
+if(NOT KEY_REPLAY_WINDOW MATCHES "^[0-9]+$")
+ message(FATAL_ERROR "KEY_REPLAY_WINDOW must be a positive integer")
+endif()
+math(EXPR _krw_p2 "${KEY_REPLAY_WINDOW} & (${KEY_REPLAY_WINDOW} - 1)")
+if(KEY_REPLAY_WINDOW LESS 128 OR NOT _krw_p2 EQUAL 0)
+ message(FATAL_ERROR "KEY_REPLAY_WINDOW must be a power of two >= 128")
+endif()
+
+# Re-key must finish within its lead window - KEY_REKEY_WATERMARK node keys
+# worth of packets - before the batch exhausts and TX fails closed. dev.c only
+# evaluates the watermark once per FLOW_WM_CHECK writes, so a lead below ~2x
+# that leaves a high-rate flow no room to complete the exchange. Production
+# defaults are vast; this guards under-sized (test) geometries.
+if(KEY_REKEY_WATERMARK GREATER 0)
+ math(EXPR _rk_wm_check "1 << ${KEY_REKEY_WM_CHECK_BITS}")
+ math(EXPR _rk_lead
+ "${KEY_REKEY_WATERMARK} << (${KEY_LEAF_BITS} + ${KEY_NODE_BITS})")
+ math(EXPR _rk_min "2 * ${_rk_wm_check}")
+ if(_rk_lead LESS _rk_min)
+ message(WARNING
+ "Re-key lead is ${_rk_lead} packets vs the watermark check interval "
+ "${_rk_wm_check}; a high-rate flow may exhaust its key batch before the "
+ "re-key completes (TX fails closed until it does). Raise KEY_LEAF_BITS, "
+ "KEY_NODE_BITS, or KEY_REKEY_WATERMARK.")
+ endif()
+endif()
# Flow statistics (requires FUSE)
if(HAVE_FUSE)
@@ -92,3 +136,4 @@ if(HAVE_FUSE)
message(STATUS "Application flow statistics disabled")
endif()
endif()
+
diff --git a/cmake/config/ssm.cmake b/cmake/config/ssm.cmake
index c1f34655..589171ea 100644
--- a/cmake/config/ssm.cmake
+++ b/cmake/config/ssm.cmake
@@ -15,14 +15,12 @@ set(SSM_PUP_NAME_FMT "/${SSM_PREFIX}.pup.%d" CACHE INTERNAL
# Packet buffer configuration
set(SSM_POOL_NAME "/${SHM_PREFIX}.pool" CACHE INTERNAL
"Name for the main POSIX shared memory pool")
-set(SSM_POOL_BLOCKS 16384 CACHE STRING
- "Number of blocks in SSM packet pool, must be a power of 2")
set(SSM_PK_BUFF_HEADSPACE 256 CACHE STRING
"Bytes of headspace to reserve for future headers")
set(SSM_PK_BUFF_TAILSPACE 32 CACHE STRING
"Bytes of tailspace to reserve for future tails")
set(SSM_RBUFF_SIZE 1024 CACHE STRING
- "Number of blocks in rbuff buffer, must be a power of 2")
+ "Number of slots in a flow's rbuff ring; must be a power of 2")
set(SSM_RBUFF_PREFIX "/${SHM_PREFIX}.rbuff." CACHE INTERNAL
"Prefix for rbuff POSIX shared memory filenames")
set(SSM_FLOW_SET_PREFIX "/${SHM_PREFIX}.set." CACHE INTERNAL
@@ -31,12 +29,14 @@ set(SSM_FLOW_SET_PREFIX "/${SHM_PREFIX}.set." CACHE INTERNAL
# Number of shards per size class for reducing contention
set(SSM_POOL_SHARDS 4 CACHE STRING
"Number of allocator shards per size class")
+set(SSM_POOL_RECLAIM_AGE_S 60 CACHE STRING
+ "Minimum age in seconds before a block is presumed stale and reclaimed")
# Global Shared Packet Pool (GSPP) - for privileged processes
# Shared by all processes in 'ouroboros' group (~60 MB total)
set(SSM_GSPP_256_BLOCKS 1024 CACHE STRING
"GSPP: Number of 256B blocks")
-set(SSM_GSPP_512_BLOCKS 768 CACHE STRING
+set(SSM_GSPP_512_BLOCKS 2048 CACHE STRING
"GSPP: Number of 512B blocks")
set(SSM_GSPP_1K_BLOCKS 512 CACHE STRING
"GSPP: Number of 1KB blocks")
@@ -55,13 +55,13 @@ set(SSM_GSPP_1M_BLOCKS 16 CACHE STRING
# Per-User Pool (PUP) - for unprivileged applications
# Each unprivileged app gets its own smaller pool (~7.5 MB total)
-set(SSM_PUP_256_BLOCKS 128 CACHE STRING
+set(SSM_PUP_256_BLOCKS 512 CACHE STRING
"PUP: Number of 256B blocks")
-set(SSM_PUP_512_BLOCKS 96 CACHE STRING
+set(SSM_PUP_512_BLOCKS 512 CACHE STRING
"PUP: Number of 512B blocks")
-set(SSM_PUP_1K_BLOCKS 64 CACHE STRING
+set(SSM_PUP_1K_BLOCKS 512 CACHE STRING
"PUP: Number of 1KB blocks")
-set(SSM_PUP_2K_BLOCKS 48 CACHE STRING
+set(SSM_PUP_2K_BLOCKS 512 CACHE STRING
"PUP: Number of 2KB blocks")
set(SSM_PUP_4K_BLOCKS 32 CACHE STRING
"PUP: Number of 4KB blocks")
@@ -74,6 +74,23 @@ set(SSM_PUP_256K_BLOCKS 2 CACHE STRING
set(SSM_PUP_1M_BLOCKS 0 CACHE STRING
"PUP: Number of 1MB blocks")
+# Zero classes that cannot hold spb header + HEADSPACE + TAILSPACE + 1 B.
+math(EXPR _SSM_MIN_USEFUL_CLASS # overhead plus one payload byte
+ "32 + ${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE} + 1")
+foreach(_pair "256:256" "512:512" "1K:1024" "2K:2048") # "<suffix>:<bytes>"
+ string(REPLACE ":" ";" _p "${_pair}")
+ list(GET _p 0 _suffix)
+ list(GET _p 1 _size)
+ if(_size LESS _SSM_MIN_USEFUL_CLASS) # class == overhead carries 0 B: zero it
+ set(SSM_GSPP_${_suffix}_BLOCKS 0)
+ set(SSM_PUP_${_suffix}_BLOCKS 0)
+ endif()
+endforeach()
+unset(_SSM_MIN_USEFUL_CLASS)
+unset(_p)
+unset(_suffix)
+unset(_size)
+
# SSM pool size calculations
include(utils/HumanReadable)
@@ -129,3 +146,23 @@ message(STATUS " Blocks: ${SSM_PUP_256_BLOCKS}, ${SSM_PUP_512_BLOCKS}, "
"${SSM_PUP_1K_BLOCKS}, ${SSM_PUP_2K_BLOCKS}, ${SSM_PUP_4K_BLOCKS}, "
"${SSM_PUP_16K_BLOCKS}, ${SSM_PUP_64K_BLOCKS}, ${SSM_PUP_256K_BLOCKS}, "
"${SSM_PUP_1M_BLOCKS}")
+
+# FRCT reorder queue must be strictly smaller than each enabled small
+# class (256 B to 2 KiB; larger classes are not checked here). Otherwise
+# the receiver advertises a window the pool cannot back; np1_flow_write fails
+# under load and one dropped fragment wedges the flow. Zeroed classes skip.
+foreach(_class 256 512 1K 2K)
+ if(SSM_PUP_${_class}_BLOCKS GREATER 0
+ AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_PUP_${_class}_BLOCKS)
+ message(FATAL_ERROR
+ "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be "
+ "< SSM_PUP_${_class}_BLOCKS (${SSM_PUP_${_class}_BLOCKS}): "
+ "the FC window cannot exceed the pool that backs OOO stashing.")
+ endif()
+ if(SSM_GSPP_${_class}_BLOCKS GREATER 0
+ AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_GSPP_${_class}_BLOCKS)
+ message(FATAL_ERROR
+ "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be "
+ "< SSM_GSPP_${_class}_BLOCKS (${SSM_GSPP_${_class}_BLOCKS}).")
+ endif()
+endforeach()
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
index 109fe1d6..ff44ad68 100644
--- a/cmake/dependencies.cmake
+++ b/cmake/dependencies.cmake
@@ -7,6 +7,7 @@ include(dependencies/system/libraries)
include(dependencies/system/explicit_bzero)
include(dependencies/system/robustmutex)
include(dependencies/system/fuse)
+include(dependencies/system/liburcu)
include(dependencies/system/sysrandom)
# Cryptography
diff --git a/cmake/dependencies/system/liburcu.cmake b/cmake/dependencies/system/liburcu.cmake
new file mode 100644
index 00000000..89a7ab12
--- /dev/null
+++ b/cmake/dependencies/system/liburcu.cmake
@@ -0,0 +1,45 @@
+# Userspace RCU (liburcu) - optional. Enables lock-free data-plane key
+# rotation; absent => per-flow rwlock fallback. The "bulletproof" flavour
+# (urcu-bp) auto-registers reader threads, so application threads need no
+# RCU lifecycle plumbing.
+if(PkgConfig_FOUND)
+ pkg_check_modules(URCU_PKG QUIET IMPORTED_TARGET liburcu-bp)
+ if(URCU_PKG_FOUND AND NOT TARGET Urcu::Urcu)
+ add_library(Urcu::Urcu ALIAS PkgConfig::URCU_PKG)
+ endif()
+endif()
+
+if(NOT URCU_PKG_FOUND) # no pkg-config hit: probe for urcu-bp by hand
+ find_library(URCU_BP_LIBRARY urcu-bp) # find_*() take no QUIET keyword;
+ find_library(URCU_COMMON_LIBRARY urcu-common) # a stray word after the name
+ find_path(URCU_INCLUDE_DIR urcu-bp.h) # is treated as a search path
+ if(URCU_BP_LIBRARY AND URCU_COMMON_LIBRARY AND URCU_INCLUDE_DIR)
+ set(URCU_PKG_FOUND TRUE)
+ if(NOT TARGET Urcu::Urcu)
+ add_library(Urcu::Urcu INTERFACE IMPORTED)
+ set_target_properties(Urcu::Urcu PROPERTIES
+ INTERFACE_LINK_LIBRARIES "${URCU_BP_LIBRARY};${URCU_COMMON_LIBRARY}"
+ INTERFACE_INCLUDE_DIRECTORIES "${URCU_INCLUDE_DIR}")
+ endif()
+ endif()
+endif()
+
+if(URCU_PKG_FOUND)
+ set(DISABLE_LIBURCU FALSE CACHE BOOL "Disable liburcu (RCU) support")
+ if(NOT DISABLE_LIBURCU)
+ if(URCU_PKG_VERSION)
+ message(STATUS "liburcu (RCU) support enabled (version ${URCU_PKG_VERSION})")
+ else()
+ message(STATUS "liburcu (RCU) support enabled")
+ endif()
+ set(HAVE_LIBURCU TRUE CACHE INTERNAL "Userspace RCU (liburcu) available")
+ else()
+ message(STATUS "liburcu (RCU) support disabled by user")
+ unset(HAVE_LIBURCU CACHE)
+ endif()
+else()
+ message(STATUS "Install liburcu (urcu-bp) for lock-free data-plane re-keying")
+ unset(HAVE_LIBURCU CACHE)
+endif()
+
+mark_as_advanced(URCU_BP_LIBRARY URCU_COMMON_LIBRARY URCU_INCLUDE_DIR)
diff --git a/cmake/tags.cmake b/cmake/tags.cmake
new file mode 100644
index 00000000..00e6f0d6
--- /dev/null
+++ b/cmake/tags.cmake
@@ -0,0 +1,21 @@
+find_program(CTAGS_EXECUTABLE
+ NAMES ctags-universal universal-ctags ctags
+ DOC "Generate a ctags index for source navigation: make tags")
+mark_as_advanced(CTAGS_EXECUTABLE)
+
+if(CTAGS_EXECUTABLE)
+ add_custom_target(tags
+ COMMAND ${CTAGS_EXECUTABLE}
+ -R
+ --languages=C
+ --c-kinds=+p
+ --fields=+S
+ --exclude=build
+ --exclude=build-claude
+ --exclude=build_tmp
+ --exclude=.git
+ -f ${CMAKE_SOURCE_DIR}/tags
+ ${CMAKE_SOURCE_DIR}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ COMMENT "Generating ctags index at ${CMAKE_SOURCE_DIR}/tags")
+endif()
diff --git a/cmake/utils/CPUUtils.cmake b/cmake/utils/CPUUtils.cmake
new file mode 100644
index 00000000..8ca7683a
--- /dev/null
+++ b/cmake/utils/CPUUtils.cmake
@@ -0,0 +1,82 @@
+include(CheckCSourceRuns)
+
+# Compile + run a probe so we only enable a feature the host CPU
+# actually implements (toolchains accept flags the silicon may lack).
+# Cross-compile without an emulator: feature off.
+function(detect_cpu_feature _result_var _flags _source)
+ set(_save_flags "${CMAKE_REQUIRED_FLAGS}")
+ set(_save_quiet "${CMAKE_REQUIRED_QUIET}")
+ set(CMAKE_REQUIRED_FLAGS "${_save_flags} ${_flags}")
+ set(CMAKE_REQUIRED_QUIET TRUE)
+ if(CMAKE_CROSSCOMPILING AND NOT CMAKE_CROSSCOMPILING_EMULATOR)
+ set(${_result_var} FALSE CACHE INTERNAL
+ "${_result_var} (cross-compile without emulator: off)")
+ else()
+ check_c_source_runs("${_source}" ${_result_var})
+ endif()
+ set(CMAKE_REQUIRED_FLAGS "${_save_flags}")
+ set(CMAKE_REQUIRED_QUIET "${_save_quiet}")
+endfunction()
+
+# x86 PCLMULQDQ + SSE4.1. argc-derived input defeats constant folding;
+# SIGILL handler exits cleanly so the kernel skips the core dump.
+function(detect_pclmul)
+ detect_cpu_feature(_HAVE_PCLMUL "-mpclmul"
+"#include <wmmintrin.h>
+#include <signal.h>
+#include <unistd.h>
+static void on_sigill(int sig) { (void) sig; _exit(1); }
+int main(int argc, char ** argv) {
+ __m128i a;
+ __m128i b;
+ (void) argv;
+ signal(SIGILL, on_sigill);
+ a = _mm_set1_epi32(argc);
+ b = _mm_clmulepi64_si128(a, a, 0);
+ return _mm_cvtsi128_si32(b) & 0;
+}")
+ detect_cpu_feature(_HAVE_SSE41 "-msse4.1"
+"#include <smmintrin.h>
+#include <signal.h>
+#include <unistd.h>
+static void on_sigill(int sig) { (void) sig; _exit(1); }
+int main(int argc, char ** argv) {
+ __m128i a;
+ (void) argv;
+ signal(SIGILL, on_sigill);
+ a = _mm_set1_epi32(argc);
+ return _mm_extract_epi32(a, 0) & 0;
+}")
+ if(_HAVE_PCLMUL AND _HAVE_SSE41)
+ set(HAVE_PCLMUL TRUE CACHE INTERNAL
+ "x86 PCLMUL + SSE4.1 intrinsics available")
+ else()
+ unset(HAVE_PCLMUL CACHE)
+ endif()
+endfunction()
+
+# aarch64 FEAT_PMULL (vmull_p64). Pi 4's BCM2711 accepts +crypto at
+# compile time but lacks the hardware — the runtime probe catches that.
+function(detect_pmull)
+ detect_cpu_feature(_HAVE_PMULL "-march=armv8-a+crypto"
+"#include <arm_neon.h>
+#include <signal.h>
+#include <stdint.h>
+#include <unistd.h>
+static void on_sigill(int sig) { (void) sig; _exit(1); }
+int main(int argc, char ** argv) {
+ poly64_t a;
+ poly128_t c;
+ (void) argv;
+ signal(SIGILL, on_sigill);
+ a = (poly64_t) (uint64_t) argc;
+ c = vmull_p64(a, a);
+ return (int) (vgetq_lane_u64((uint64x2_t) c, 0) & 0);
+}")
+ if(_HAVE_PMULL)
+ set(HAVE_PMULL TRUE CACHE INTERNAL
+ "aarch64 PMULL intrinsics available")
+ else()
+ unset(HAVE_PMULL CACHE)
+ endif()
+endfunction()
diff --git a/doc/man/flow_alloc.3 b/doc/man/flow_alloc.3
index dbe5323c..8a9b5f5b 100644
--- a/doc/man/flow_alloc.3
+++ b/doc/man/flow_alloc.3
@@ -62,10 +62,60 @@ The \fBflow_dealloc\fR() function will release any resources
associated with the flow. This call may block and keep reliable flows
active until all packets are acknowledged.
-A \fBqosspec_t\fR specifies the following QoS characteristics of a
-flow:
-
-TODO: specify a qosspec_t
+A \fBqosspec_t\fR specifies the QoS characteristics of a flow.
+The fields are:
+
+.TP
+\fBdelay\fR (ms)
+Maximum one-way delay.
+.TP
+\fBbandwidth\fR (bits/s)
+Minimum bandwidth.
+.TP
+\fBavailability\fR
+Class of 9s (e.g. 5 = 99.999%).
+.TP
+\fBloss\fR
+Tolerated packet loss; 0 selects reliable delivery.
+.TP
+\fBber\fR
+Tolerated bit error rate (errors per billion bits); 0 enables an
+end-to-end integrity check (corrupted packets are dropped).
+.TP
+\fBservice\fR
+Framing / reliability class: \fBSVC_RAW\fR (0) disables FRCT;
+\fBSVC_MESSAGE\fR (1) preserves SDU boundaries; \fBSVC_STREAM\fR (2) is
+a byte stream with no SDU boundaries. \fBSVC_STREAM\fR requires
+\fIloss\fR = 0; otherwise
+\fBflow_alloc\fR()/\fBflow_accept\fR() returns \fB-EINVAL\fR.
+.TP
+\fBmax_gap\fR (ms)
+Maximum tolerated inter-packet gap. Packets exceeding the gap
+budget are dropped under the real-time cubes.
+.TP
+\fBtimeout\fR (ms)
+Peer-liveness timeout; 0 disables. Only applies when FRCT is
+enabled (service > 0).
+
+.PP
+The library provides predefined cubes:
+
+.TP
+\fBqos_raw\fR
+No guarantees, no integrity check.
+.TP
+\fBqos_raw_safe\fR
+Best-effort with end-to-end integrity (ber = 0).
+.TP
+\fBqos_rt\fR / \fBqos_rt_safe\fR
+Real-time messages, optimised for latency over reliability;
+\fBqos_rt_safe\fR adds an end-to-end integrity check.
+.TP
+\fBqos_msg\fR
+Reliable, SDU-preserving delivery.
+.TP
+\fBqos_stream\fR
+Reliable byte stream; no SDU boundaries are preserved.
.SH RETURN VALUE
@@ -117,13 +167,39 @@ _
\fBflow_dealloc\fR() & Thread safety & MT-Safe
.TE
+.SH NOTES
+The returned file descriptor is subject to a single-reader and
+single-writer discipline \(em at most one thread may call
+.BR flow_read ()
+(or monitor the fd via
+.BR fevent ())
+and at most one thread may call
+.BR flow_write ()
+concurrently. See
+.BR flow_read (3),
+.BR flow_write (3),
+and
+.BR fevent (3)
+for details.
+.PP
+.BR flow_dealloc ()
+must not be called concurrently with any thread that is inside
+.BR flow_read (),
+.BR flow_write (),
+.BR fevent (),
+or any other Ouroboros library call on the same fd; the result is
+undefined behaviour. Applications must serialise teardown with
+in-flight use, e.g. by signalling worker threads to drop the fd
+before calling
+.BR flow_dealloc ().
+
.SH TERMINOLOGY
Please see \fBouroboros-glossary\fR(7).
.SH SEE ALSO
-.BR fccntl "(3), " flow_read "(3), " fqueue "(3), " fset "(3), " \
-ouroboros (8)
+.BR fccntl "(3), " fevent "(3), " flow_read "(3), " flow_write "(3), " \
+fqueue "(3), " fset "(3), " ouroboros (8)
.SH COLOPHON
This page is part of the Ouroboros project, found at
diff --git a/doc/man/flow_read.3 b/doc/man/flow_read.3
index acc1f61e..d4a5e883 100644
--- a/doc/man/flow_read.3
+++ b/doc/man/flow_read.3
@@ -39,8 +39,7 @@ end of the datagram.
On success, \fBflow_write\fR() returns the number of bytes written. On
failure, a negative value indicating the error will be returned.
-Partial writes needs to be explicitly enabled. Passing a
-NULL pointer for \fIbuf\fR returns 0 with no other effects.
+Passing a NULL pointer for \fIbuf\fR returns 0 with no other effects.
.SH ERRORS
.B -EINVAL
@@ -62,7 +61,8 @@ The flow has been reported down.
The flow's peer is unresponsive (flow timed out).
.B -EMSGSIZE
-The buffer was too large to be written.
+The received packet does not fit in the caller's buffer and partial
+reads are disabled (see \fBfccntl\fR(3), \fBFLOWFRNOPART\fR).
.SH ATTRIBUTES
@@ -74,11 +74,47 @@ LB|LB|LB
L|L|L.
Interface & Attribute & Value
_
-\fBflow_read\fR() & Thread safety & MT-Safe
+\fBflow_read\fR() & Thread safety & MT-Safe race:fd
_
-\fBflow_write\fR() & Thread safety & MT-Safe
+\fBflow_write\fR() & Thread safety & MT-Safe race:fd
.TE
+.SH THREAD SAFETY
+Only one thread may call
+.BR flow_read ()
+on a given file descriptor at any time. Partial-read state kept
+across calls assumes a single logical reader; two threads racing
+.BR flow_read ()
+on the same fd is undefined behaviour. Likewise, only one thread
+may call
+.BR flow_write ()
+on a given fd at a time; two writer threads on the same fd is
+undefined behaviour.
+.PP
+Combining a writer thread with a reader thread (one thread calling
+.BR flow_write (),
+another calling
+.BR flow_read ()
+or
+.BR fevent ())
+is permitted and safe. The writer does not need a dedicated reader
+thread \(em when the FRCT send window fills,
+.BR flow_write ()
+drives its own inbound rx draining internally to process incoming
+ACKs and reopen the window, clamped by the caller's
+.BR fccntl (3)
+send-timeout if any.
+.PP
+Monitoring the same fd via
+.BR fevent ()
+from a different thread is well-defined but races: events reported
+by
+.BR fevent ()
+may already have been consumed by the racing
+.BR flow_read (),
+so a read issued on such an event may then block. See
+.BR fevent (3).
+
.SH TERMINOLOGY
Please see \fBouroboros-glossary\fR(7).
diff --git a/doc/man/fqueue.3 b/doc/man/fqueue.3
index 72a0bc25..f2fb8c9f 100644
--- a/doc/man/fqueue.3
+++ b/doc/man/fqueue.3
@@ -116,6 +116,27 @@ _
\fBfevent\fR() & Thread safety & MT-Safe
.TE
+.SH THREAD SAFETY
+.BR fevent ()
+and
+.BR flow_read ()
+on the same fd from distinct threads is well-defined but races:
+events reported by
+.BR fevent ()
+may already have been consumed by the racing
+.BR flow_read (),
+so the reader may then block. This has the same shape as
+.BR select (2)
++
+.BR read (2)
+from distinct threads. The intended pattern is that the thread
+invoking
+.BR fevent ()
+is the same thread that calls
+.BR flow_read ()
+on the fds returned by
+.BR fqueue_next ().
+
.SH TERMINOLOGY
Please see \fBouroboros-glossary\fR(7).
diff --git a/include/ouroboros/atomics.h b/include/ouroboros/atomics.h
new file mode 100644
index 00000000..8e667522
--- /dev/null
+++ b/include/ouroboros/atomics.h
@@ -0,0 +1,39 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Atomic helpers
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#ifndef OUROBOROS_LIB_ATOMICS_H
+#define OUROBOROS_LIB_ATOMICS_H
+
+#define LOAD_RELAXED(p) (__atomic_load_n(p, __ATOMIC_RELAXED))
+#define LOAD_ACQUIRE(p) (__atomic_load_n(p, __ATOMIC_ACQUIRE))
+#define LOAD(p) (__atomic_load_n(p, __ATOMIC_SEQ_CST))
+
+#define STORE_RELAXED(p, v) (__atomic_store_n(p, v, __ATOMIC_RELAXED))
+#define STORE_RELEASE(p, v) (__atomic_store_n(p, v, __ATOMIC_RELEASE))
+#define STORE(p, v) (__atomic_store_n(p, v, __ATOMIC_SEQ_CST))
+
+#define FETCH_ADD_RELAXED(p, v) (__atomic_fetch_add(p, v, __ATOMIC_RELAXED))
+#define FETCH_SUB_RELAXED(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_RELAXED))
+#define FETCH_ADD(p, v) (__atomic_fetch_add(p, v, __ATOMIC_SEQ_CST))
+#define FETCH_SUB(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_SEQ_CST))
+
+#endif /* OUROBOROS_LIB_ATOMICS_H */
diff --git a/include/ouroboros/crc16.h b/include/ouroboros/crc16.h
new file mode 100644
index 00000000..df4d4f57
--- /dev/null
+++ b/include/ouroboros/crc16.h
@@ -0,0 +1,43 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * Polynomial: ITU-T V.41 / CCITT-FALSE, CRC-16/IBM-3740.
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue
+ *
+ * Intended for medium-size header check sequences (typ. <= 4 KiB).
+ * Hamming distance HD=4 up to 32751 message bits.
+ */
+
+#ifndef OUROBOROS_LIB_CRC16_H
+#define OUROBOROS_LIB_CRC16_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC16_HASH_LEN 2
+
+void crc16_ccitt_false(uint16_t * crc,
+ const void * buf,
+ size_t len);
+
+#endif /* OUROBOROS_LIB_CRC16_H */
diff --git a/include/ouroboros/crc64.h b/include/ouroboros/crc64.h
new file mode 100644
index 00000000..f6e407a0
--- /dev/null
+++ b/include/ouroboros/crc64.h
@@ -0,0 +1,44 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 64-bit Cyclic Redundancy Check (NVMe variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * Polynomial: NVM Express Base Spec, CRC-64/NVMe.
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue
+ *
+ * Fold-by-N (PCLMUL/PMULL) algorithm:
+ * V. Gopal et al., "Fast CRC Computation for Generic Polynomials
+ * Using PCLMULQDQ", Intel white paper, 2009.
+ */
+
+#ifndef OUROBOROS_LIB_CRC64_H
+#define OUROBOROS_LIB_CRC64_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC64_HASH_LEN 8
+
+void crc64_nvme(uint64_t * crc,
+ const void * buf,
+ size_t len);
+
+#endif /* OUROBOROS_LIB_CRC64_H */
diff --git a/include/ouroboros/crc8.h b/include/ouroboros/crc8.h
new file mode 100644
index 00000000..97502a25
--- /dev/null
+++ b/include/ouroboros/crc8.h
@@ -0,0 +1,43 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 8-bit Cyclic Redundancy Check (AUTOSAR variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * Polynomial: AUTOSAR SWS_CRC, CRC-8/AUTOSAR.
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue
+ *
+ * Intended for short header check sequences (typ. <= 32 bytes).
+ * Hamming distance HD=4 up to 119 message bits, HD=3 up to 247.
+ */
+
+#ifndef OUROBOROS_LIB_CRC8_H
+#define OUROBOROS_LIB_CRC8_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC8_HASH_LEN 1
+
+void crc8_autosar(uint8_t * crc,
+ const void * buf,
+ size_t len);
+
+#endif /* OUROBOROS_LIB_CRC8_H */
diff --git a/include/ouroboros/crypt.h b/include/ouroboros/crypt.h
index 5e082bb9..9feaa610 100644
--- a/include/ouroboros/crypt.h
+++ b/include/ouroboros/crypt.h
@@ -28,18 +28,19 @@
#include <assert.h>
-#define IVSZ 16
+#define NONCESZ 16
#define SYMMKEYSZ 32
#define MAX_HASH_SIZE 64 /* SHA-512/BLAKE2b max */
#define KEX_ALGO_BUFSZ 32
#define KEX_CIPHER_BUFSZ 32
+#define CACERT_PATH_BUFSZ 256
/*
* On OSX the OpenSSL NIDs are automatically loaded with evp.h.
* Some have a different spelling. This header avoids the double definitions.
*/
- #define NID_undef 0
+#define NID_undef 0
/* Cipher NIDs (match OpenSSL values) */
#define NID_aes_128_gcm 895
@@ -50,7 +51,7 @@
#define NID_aes_256_ctr 906
#define NID_chacha20_poly1305 1018
- #if !defined (__APPLE__) || !defined ( HAVE_OPENSSL )
+#if !defined (__APPLE__) || !defined ( HAVE_OPENSSL )
/* KEX algorithm NIDs (match OpenSSL values) */
#define NID_X9_62_prime256v1 415
#define NID_secp384r1 715
@@ -101,11 +102,15 @@
#define IS_KEX_ALGO_SET(cfg) ((cfg)->x.nid != NID_undef)
#define IS_KEX_CIPHER_SET(cfg) ((cfg)->c.nid != NID_undef)
+/* Flow role: forks the per-direction keys so each end's TX = peer's RX. */
+#define CRYPT_ROLE_INIT 0 /* flow allocator / OAP client */
+#define CRYPT_ROLE_RESP 1 /* flow acceptor / OAP server */
struct crypt_sk {
int nid;
uint8_t * key;
- uint8_t rot_bit; /* Rotation bit to control epoch */
+ uint8_t epoch; /* installed batch epoch */
+ uint8_t role; /* CRYPT_ROLE_INIT / _RESP */
};
struct sec_config {
@@ -114,18 +119,26 @@ struct sec_config {
int nid;
int mode;
} x; /* key exchange */
+
struct {
const char * str;
int nid;
} k; /* kdf */
+
struct {
const char * str;
int nid;
} c; /* cipher */
+
struct {
const char * str;
int nid;
} d; /* digest */
+
+ struct {
+ bool req; /* require peer auth */
+ char cacert[CACERT_PATH_BUFSZ]; /* pinned CA, "" = any */
+ } a; /* authentication */
};
/* Helper macros to set sec_config fields consistently */
@@ -211,9 +224,21 @@ void auth_destroy_ctx(struct auth_ctx * ctx);
int auth_add_crt_to_store(struct auth_ctx * ctx,
void * crt);
+/* Untrusted intermediates: used to build a path, never as trust anchors */
+int auth_add_crt_to_chain(struct auth_ctx * ctx,
+ void * crt);
+
int auth_verify_crt(struct auth_ctx * ctx,
void * crt);
+/* As auth_verify_crt, but pin must be in the verified chain (NULL: any) */
+int auth_verify_crt_pin(struct auth_ctx * ctx,
+ void * crt,
+ void * pin);
+
+/* False for PQC keys: their signature digest is intrinsic */
+bool crypt_pk_requires_md(const void * pk);
+
int auth_sign(void * pkp,
int md_nid,
buffer_t msg,
@@ -289,12 +314,16 @@ const char * md_nid_to_str(uint16_t nid);
uint16_t md_str_to_nid(const char * kdf);
-ssize_t md_digest(int md_nid,
- buffer_t in,
- uint8_t * out);
+ssize_t md_digest(int md_nid,
+ buffer_t in,
+ uint8_t * out);
ssize_t md_len(int md_nid);
+int crypt_hkdf_expand(buffer_t key,
+ buffer_t info,
+ buffer_t out);
+
int crypt_encrypt(struct crypt_ctx * ctx,
buffer_t in,
buffer_t * out);
@@ -303,10 +332,37 @@ int crypt_decrypt(struct crypt_ctx * ctx,
buffer_t in,
buffer_t * out);
-int crypt_get_ivsz(struct crypt_ctx * ctx);
+/* One-shot AEAD over an explicit key/nonce. out = ciphertext ‖ tag. */
+int crypt_oneshot_seal(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out);
+
+int crypt_oneshot_open(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out);
+
+int crypt_get_headsz(struct crypt_ctx * ctx);
int crypt_get_tagsz(struct crypt_ctx * ctx);
+int crypt_rekey(struct crypt_ctx * ctx,
+ struct crypt_sk * sk);
+
+/* Nodes remaining in the TX batch (re-key watermark). */
+int crypt_nodes_left(struct crypt_ctx * ctx);
+
+/* 1 once the peer has been observed on the current generation. */
+int crypt_peer_synced(struct crypt_ctx * ctx);
+
+/* Switch TX to the installed (new) batch (after peer synced/grace). */
+void crypt_tx_promote(struct crypt_ctx * ctx);
+
int crypt_load_crt_file(const char * path,
void ** crt);
@@ -342,6 +398,10 @@ int crypt_load_pubkey_raw_file(const char * path,
int crypt_load_privkey_raw_file(const char * path,
void ** key);
+int crypt_ct_cmp(const void * a,
+ const void * b,
+ size_t len);
+
int crypt_cmp_key(const void * key1,
const void * key2);
diff --git a/include/ouroboros/errno.h b/include/ouroboros/errno.h
index 9d84df88..eedd978f 100644
--- a/include/ouroboros/errno.h
+++ b/include/ouroboros/errno.h
@@ -37,5 +37,6 @@
#ifndef EAUTH /* Exists on BSD */
#define EAUTH 1009 /* Authentication error */
#endif
+#define EREPLAY 1010 /* OAP replay detected */
#endif /* OUROBOROS_ERRNO_H */
diff --git a/include/ouroboros/fccntl.h b/include/ouroboros/fccntl.h
index d3baea8f..e91e91dd 100644
--- a/include/ouroboros/fccntl.h
+++ b/include/ouroboros/fccntl.h
@@ -50,6 +50,12 @@
#define FRCTFRESCNTL 00000002 /* Feedback from receiver */
#define FRCTFLINGER 00000004 /* Send unsent data */
+/* All user-visible bits (readable via FRCTGFLAGS). */
+#define FRCTFMASK (FRCTFRTX | FRCTFRESCNTL | FRCTFLINGER)
+
+/* Subset writable via FRCTSFLAGS; FRCTFRTX is fixed at flow_alloc. */
+#define FRCTFSETMASK (FRCTFRESCNTL | FRCTFLINGER)
+
/* Flow operations */
#define FLOWSRCVTIMEO 00000001 /* Set read timeout */
#define FLOWGRCVTIMEO 00000002 /* Get read timeout */
@@ -60,10 +66,17 @@
#define FLOWGFLAGS 00000007 /* Get flags for flow */
#define FLOWGRXQLEN 00000010 /* Get queue length on rx */
#define FLOWGTXQLEN 00000011 /* Get queue length on tx */
+#define FLOWGMTU 00000012 /* Get per-packet MTU */
/* FRCT operations */
#define FRCTSFLAGS 00001000 /* Set flags for FRCT */
#define FRCTGFLAGS 00002000 /* Get flags for FRCT */
+#define FRCTSMAXSDU 00003000 /* Set max recv SDU size */
+#define FRCTGMAXSDU 00004000 /* Get max recv SDU size */
+#define FRCTSRRINGSZ 00005000 /* Set stream rcv ring sz */
+#define FRCTGRRINGSZ 00006000 /* Get stream rcv ring sz */
+#define FRCTSRTOMIN 00007000 /* Set RTO floor (ns) */
+#define FRCTGRTOMIN 00010000 /* Get RTO floor (ns) */
__BEGIN_DECLS
diff --git a/include/ouroboros/flow.h b/include/ouroboros/flow.h
index fe4582e7..8b096410 100644
--- a/include/ouroboros/flow.h
+++ b/include/ouroboros/flow.h
@@ -25,6 +25,7 @@
#include <ouroboros/qos.h>
+#include <stdint.h>
#include <sys/types.h>
#define SYMMKEYSZ 32
@@ -50,6 +51,8 @@ struct flow_info {
time_t mpl;
+ uint32_t mtu; /* n-1 layer MTU in bytes, 0 = unknown */
+
struct qos_spec qs;
enum flow_state state;
diff --git a/include/ouroboros/fqueue.h b/include/ouroboros/fqueue.h
index 2546c79d..322da3ea 100644
--- a/include/ouroboros/fqueue.h
+++ b/include/ouroboros/fqueue.h
@@ -34,7 +34,8 @@ enum fqtype {
FLOW_UP = (1 << 2),
FLOW_ALLOC = (1 << 3),
FLOW_DEALLOC = (1 << 4),
- FLOW_PEER = (1 << 5)
+ FLOW_PEER = (1 << 5),
+ FLOW_UPD = (1 << 6)
};
struct flow_set;
diff --git a/include/ouroboros/hash.h b/include/ouroboros/hash.h
index 0838df97..c6609ffc 100644
--- a/include/ouroboros/hash.h
+++ b/include/ouroboros/hash.h
@@ -38,6 +38,9 @@ enum hash_algo {
HASH_SHA3_512 = DIR_HASH_SHA3_512,
HASH_CRC32,
HASH_MD5,
+ HASH_CRC64,
+ HASH_CRC8,
+ HASH_CRC16,
};
#define HASH_FMT32 "%02x%02x%02x%02x"
@@ -86,4 +89,7 @@ void str_hash(enum hash_algo algo,
void * dst,
const char * str);
+/* Non-cryptographic finalizer for hashing an integer key to a table index. */
+uint64_t hash_mix64(uint64_t key);
+
#endif /* OUROBOROS_LIB_HASH_H */
diff --git a/include/ouroboros/ipcp-dev.h b/include/ouroboros/ipcp-dev.h
index 93236271..d00d6f08 100644
--- a/include/ouroboros/ipcp-dev.h
+++ b/include/ouroboros/ipcp-dev.h
@@ -28,16 +28,23 @@
#include <ouroboros/ssm_pool.h>
#include <ouroboros/utils.h>
+#include <stdint.h>
+
int ipcp_create_r(const struct ipcp_info * info);
int ipcp_flow_req_arr(const buffer_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data);
+int ipcp_flow_update_arr(int flow_id,
+ const buffer_t * data);
+
int ipcp_flow_alloc_reply(int fd,
int response,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data);
int ipcp_flow_read(int fd,
diff --git a/include/ouroboros/logs.h b/include/ouroboros/logs.h
index 58494531..1ae77673 100644
--- a/include/ouroboros/logs.h
+++ b/include/ouroboros/logs.h
@@ -29,6 +29,7 @@
#include <ouroboros/hash.h>
+#include <pthread.h>
#include <unistd.h>
#include <stdio.h>
#include <stdbool.h>
@@ -55,6 +56,8 @@ void log_fini(void);
#define __olog(CLR, LVL, SYSLVL, ...) \
do { \
+ int __cs; \
+ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &__cs); \
if (log_syslog) { \
syslog(SYSLVL, __VA_ARGS__); \
} else { \
@@ -64,10 +67,13 @@ void log_fini(void);
printf(CLR_RESET "\n"); \
fflush(stdout); \
} \
+ pthread_setcancelstate(__cs, NULL); \
} while (0)
#define __olog_id(CLR, LVL, SYSLVL, id, fmt, ...) \
do { \
+ int __cs; \
+ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &__cs); \
if (log_syslog) { \
syslog(SYSLVL, "[" HASH_FMT64 "] " fmt, \
HASH_VAL64(id), ## __VA_ARGS__); \
@@ -79,6 +85,7 @@ void log_fini(void);
printf(CLR_RESET "\n"); \
fflush(stdout); \
} \
+ pthread_setcancelstate(__cs, NULL); \
} while (0)
#ifndef OUROBOROS_DISABLE_LOGGING
diff --git a/include/ouroboros/name.h b/include/ouroboros/name.h
index a9393820..a3aac8c4 100644
--- a/include/ouroboros/name.h
+++ b/include/ouroboros/name.h
@@ -34,9 +34,9 @@ enum pol_balance {
};
struct name_sec_paths {
- char enc[NAME_PATH_SIZE + 1]; /* path to crypt for this name */
- char key[NAME_PATH_SIZE + 1]; /* path to key for this name */
- char crt[NAME_PATH_SIZE + 1]; /* path to crt for this name */
+ char sec[NAME_PATH_SIZE + 1]; /* path to sec.conf for this name */
+ char key[NAME_PATH_SIZE + 1]; /* path to key for this name */
+ char crt[NAME_PATH_SIZE + 1]; /* path to crt for this name */
};
struct name_info {
diff --git a/include/ouroboros/np1_flow.h b/include/ouroboros/np1_flow.h
index 6f341cfc..758b6db8 100644
--- a/include/ouroboros/np1_flow.h
+++ b/include/ouroboros/np1_flow.h
@@ -36,13 +36,17 @@ int np1_flow_resp(int flow_id,
int np1_flow_dealloc(int flow_id,
time_t timeo);
+int np1_flow_fd(int flow_id);
+
+int np1_flow_id(int fd);
+
static const qosspec_t qos_np1 = {
+ .service = SVC_RAW,
.delay = UINT32_MAX,
.bandwidth = 0,
.availability = 0,
.loss = UINT32_MAX,
.ber = UINT32_MAX,
- .in_order = 0,
.max_gap = UINT32_MAX,
.timeout = 0
};
diff --git a/include/ouroboros/pthread.h b/include/ouroboros/pthread.h
index cd500795..3ca79d10 100644
--- a/include/ouroboros/pthread.h
+++ b/include/ouroboros/pthread.h
@@ -24,6 +24,7 @@
#define OUROBOROS_LIB_PTHREAD_H
#include <pthread.h>
+#include <stdio.h>
static int __attribute__((unused)) __timedwait(pthread_cond_t * cond,
pthread_mutex_t * mtx,
@@ -48,4 +49,9 @@ static void __attribute__((unused)) __cleanup_mutex_unlock(void * mutex)
pthread_mutex_unlock((pthread_mutex_t *) mutex);
}
+static void __attribute__((unused)) __cleanup_fclose(void * fp)
+{
+ fclose((FILE *) fp);
+}
+
#endif /* OUROBOROS_LIB_PTHREAD_H */
diff --git a/include/ouroboros/qos.h b/include/ouroboros/qos.h
index 6b0bbc17..7980ad00 100644
--- a/include/ouroboros/qos.h
+++ b/include/ouroboros/qos.h
@@ -28,79 +28,88 @@
#define DEFAULT_PEER_TIMEOUT 120000
+/* qos_spec.service: framing / reliability class. */
+enum qos_service {
+ SVC_RAW = 0, /* No FRCT; best-effort raw messages */
+ SVC_MESSAGE = 1, /* FRCT, reliable ordered messages */
+ SVC_STREAM = 2, /* FRCT, reliable ordered byte stream */
+};
+
typedef struct qos_spec {
+ uint8_t service; /* enum qos_service; gates FRCT (>0). */
uint32_t delay; /* In ms. */
uint64_t bandwidth; /* In bits/s. */
uint8_t availability; /* Class of 9s. */
uint32_t loss; /* Packet loss. */
uint32_t ber; /* Bit error rate, errors per billion bits. */
- uint8_t in_order; /* In-order delivery, enables FRCT. */
uint32_t max_gap; /* In ms. */
uint32_t timeout; /* Peer timeout time, in ms, 0 = no timeout. */
} qosspec_t;
+/* "_safe" = integrity check (ber=0). "rt" = latency over reliability. */
+
static const qosspec_t qos_raw = {
+ .service = SVC_RAW,
.delay = UINT32_MAX,
.bandwidth = 0,
.availability = 0,
.loss = 1,
.ber = 1,
- .in_order = 0,
.max_gap = UINT32_MAX,
- .timeout = DEFAULT_PEER_TIMEOUT
+ .timeout = 0
};
-static const qosspec_t qos_raw_no_errors = {
+static const qosspec_t qos_raw_safe = {
+ .service = SVC_RAW,
.delay = UINT32_MAX,
.bandwidth = 0,
.availability = 0,
.loss = 1,
.ber = 0,
- .in_order = 0,
.max_gap = UINT32_MAX,
- .timeout = DEFAULT_PEER_TIMEOUT
+ .timeout = 0
};
-static const qosspec_t qos_best_effort = {
- .delay = UINT32_MAX,
- .bandwidth = 0,
- .availability = 0,
+static const qosspec_t qos_rt = {
+ .service = SVC_MESSAGE,
+ .delay = 100,
+ .bandwidth = UINT64_MAX,
+ .availability = 3,
.loss = 1,
- .ber = 0,
- .in_order = 1,
- .max_gap = UINT32_MAX,
+ .ber = 1,
+ .max_gap = 100,
.timeout = DEFAULT_PEER_TIMEOUT
};
-static const qosspec_t qos_video = {
+static const qosspec_t qos_rt_safe = {
+ .service = SVC_MESSAGE,
.delay = 100,
.bandwidth = UINT64_MAX,
.availability = 3,
.loss = 1,
.ber = 0,
- .in_order = 1,
.max_gap = 100,
.timeout = DEFAULT_PEER_TIMEOUT
};
-static const qosspec_t qos_voice = {
- .delay = 50,
- .bandwidth = 100000,
- .availability = 5,
- .loss = 1,
+static const qosspec_t qos_msg = {
+ .service = SVC_MESSAGE,
+ .delay = 1000,
+ .bandwidth = 0,
+ .availability = 0,
+ .loss = 0,
.ber = 0,
- .in_order = 1,
- .max_gap = 50,
+ .max_gap = 2000,
.timeout = DEFAULT_PEER_TIMEOUT
};
-static const qosspec_t qos_data = {
+static const qosspec_t qos_stream = {
+ .service = SVC_STREAM,
.delay = 1000,
.bandwidth = 0,
.availability = 0,
.loss = 0,
.ber = 0,
- .in_order = 1,
.max_gap = 2000,
.timeout = DEFAULT_PEER_TIMEOUT
};
diff --git a/include/ouroboros/rcu.h b/include/ouroboros/rcu.h
new file mode 100644
index 00000000..b4e7d27c
--- /dev/null
+++ b/include/ouroboros/rcu.h
@@ -0,0 +1,110 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Read-mostly pointer publication (RCU, with a locked fallback)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#ifndef OUROBOROS_LIB_RCU_H
+#define OUROBOROS_LIB_RCU_H
+
+/*
+ * Lock-free reads of published pointers via liburcu (urcu-bp) when
+ * available; a per-object rwlock fallback otherwise.
+ * Include config.h before this header so HAVE_LIBURCU is defined.
+ *
+ * Embed a struct rcu_guard in the object. A reader brackets its access
+ * with rcu_rdlock/rcu_rdunlock and reads published pointers via rcu_deref.
+ * A writer serialises with rcu_wrlock/rcu_wrunlock and publishes via
+ * rcu_assign; after unlock it reclaims a now-unreachable object with
+ * rcu_reclaim (waits out live readers) before freeing it. rcu_drain waits
+ * out all readers at teardown.
+ */
+
+#include <ouroboros/pthread.h>
+
+#ifdef HAVE_LIBURCU
+
+#include <urcu-bp.h>
+
+struct rcu_guard {
+ pthread_mutex_t w; /* serialises writers; readers use RCU */
+};
+
+#define rcu_guard_init(g) pthread_mutex_init(&(g)->w, NULL)
+#define rcu_guard_fini(g) pthread_mutex_destroy(&(g)->w)
+#define rcu_rdlock(g) ((void) (g), rcu_read_lock())
+#define rcu_rdunlock(g) ((void) (g), rcu_read_unlock())
+#define rcu_wrlock(g) pthread_mutex_lock(&(g)->w)
+#define rcu_wrunlock(g) pthread_mutex_unlock(&(g)->w)
+#define rcu_deref(p) rcu_dereference(p)
+#define rcu_assign(p, v) rcu_assign_pointer(p, v)
+#define rcu_reclaim(g) ((void) (g), synchronize_rcu())
+#define rcu_drain(g) ((void) (g), synchronize_rcu())
+
+/* TSan can miss the publish/consume barrier under urcu. */
+#if defined(__SANITIZE_THREAD__)
+#define RCU_TSAN_ANNOTATE
+#endif
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define RCU_TSAN_ANNOTATE
+#endif
+#endif
+
+/*
+ * Publish/consume annotations re-expose liburcu's rcu_assign/rcu_deref edge to
+ * TSan, which cannot see liburcu's barriers. Call rcu_publish(p) before
+ * publishing p with rcu_assign, and rcu_consume(p) after reading it with
+ * rcu_deref. No-op without liburcu (the rwlock fallback already gives TSan the
+ * edge) or without TSan.
+ */
+#ifdef RCU_TSAN_ANNOTATE
+#include <sanitizer/tsan_interface.h>
+#define rcu_publish(p) __tsan_release(p)
+#define rcu_consume(p) __tsan_acquire(p)
+#else
+#define rcu_publish(p) ((void) (p))
+#define rcu_consume(p) ((void) (p))
+#endif
+
+#else /* !HAVE_LIBURCU : per-object rwlock fallback */
+
+struct rcu_guard {
+ pthread_rwlock_t rw; /* readers rd, writers wr */
+};
+
+#define rcu_guard_init(g) pthread_rwlock_init(&(g)->rw, NULL)
+#define rcu_guard_fini(g) pthread_rwlock_destroy(&(g)->rw)
+#define rcu_rdlock(g) pthread_rwlock_rdlock(&(g)->rw)
+#define rcu_rdunlock(g) pthread_rwlock_unlock(&(g)->rw)
+#define rcu_wrlock(g) pthread_rwlock_wrlock(&(g)->rw)
+#define rcu_wrunlock(g) pthread_rwlock_unlock(&(g)->rw)
+#define rcu_deref(p) (p)
+#define rcu_assign(p, v) ((p) = (v))
+#define rcu_reclaim(g) ((void) (g)) /* wrlock already excluded readers */
+#define rcu_drain(g) (pthread_rwlock_wrlock(&(g)->rw), \
+ pthread_rwlock_unlock(&(g)->rw))
+
+/* rwlock already gives TSan the publish/consume edge; no annotation. */
+#define rcu_publish(p) ((void) (p))
+#define rcu_consume(p) ((void) (p))
+
+#endif /* HAVE_LIBURCU */
+
+#endif /* OUROBOROS_LIB_RCU_H */
diff --git a/include/ouroboros/serdes-irm.h b/include/ouroboros/serdes-irm.h
index 1dfff4d9..a5854d5b 100644
--- a/include/ouroboros/serdes-irm.h
+++ b/include/ouroboros/serdes-irm.h
@@ -31,6 +31,7 @@
#include <ouroboros/utils.h>
#include <inttypes.h>
+#include <stdbool.h>
int flow_alloc__irm_req_ser(buffer_t * buf,
const struct flow_info * flow,
@@ -51,6 +52,10 @@ int ipcp_flow_req_arr__irm_req_ser(buffer_t * buf,
const struct flow_info * flow,
const buffer_t * data);
+int ipcp_flow_update_arr__irm_req_ser(buffer_t * buf,
+ const struct flow_info * flow,
+ const buffer_t * data);
+
int ipcp_flow_alloc_reply__irm_msg_ser(buffer_t * buf,
const struct flow_info * flow,
int response,
@@ -64,6 +69,15 @@ int flow_dealloc__irm_req_ser(buffer_t * buf,
const struct flow_info * flow,
const struct timespec * timeo);
+int flow_update__irm_req_ser(buffer_t * buf,
+ const struct flow_info * flow,
+ bool rekey);
+
+int flow_rekey__irm_result_des(buffer_t * buf,
+ struct crypt_sk * sk,
+ bool * has_key,
+ bool * initiator);
+
int ipcp_flow_dealloc__irm_req_ser(buffer_t * buf,
const struct flow_info * info);
diff --git a/include/ouroboros/ssm_pk_buff.h b/include/ouroboros/ssm_pk_buff.h
index 1b779ad1..1d5597c7 100644
--- a/include/ouroboros/ssm_pk_buff.h
+++ b/include/ouroboros/ssm_pk_buff.h
@@ -28,25 +28,25 @@
struct ssm_pk_buff;
-size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb);
+size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb);
-uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb);
+uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb);
-uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb);
+uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb);
-size_t ssm_pk_buff_len(struct ssm_pk_buff * spb);
+size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb);
-uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb,
+ size_t size);
-uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb,
+ size_t size);
-uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb,
+ size_t size);
-uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb,
+ size_t size);
void ssm_pk_buff_truncate(struct ssm_pk_buff * spb,
size_t len);
diff --git a/include/ouroboros/ssm_pool.h b/include/ouroboros/ssm_pool.h
index 89eff8eb..bba76798 100644
--- a/include/ouroboros/ssm_pool.h
+++ b/include/ouroboros/ssm_pool.h
@@ -32,7 +32,7 @@
struct ssm_pool;
-/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user) */
+/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user). */
struct ssm_pool * ssm_pool_create(uid_t uid,
gid_t gid);
@@ -46,13 +46,13 @@ int ssm_pool_mlock(struct ssm_pool * pool);
void ssm_pool_gspp_purge(void);
-/* Alloc count bytes, returns block index, a ptr and pk_buff. */
+/* Alloc count bytes, returns block offset, a ptr and pk_buff. */
ssize_t ssm_pool_alloc(struct ssm_pool * pool,
size_t count,
uint8_t ** ptr,
struct ssm_pk_buff ** spb);
-ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
+ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
size_t count,
uint8_t ** ptr,
struct ssm_pk_buff ** spb,
@@ -60,13 +60,13 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
ssize_t ssm_pool_read(uint8_t ** dst,
struct ssm_pool * pool,
- size_t idx);
+ size_t off);
struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool,
- size_t idx);
+ size_t off);
int ssm_pool_remove(struct ssm_pool * pool,
- size_t idx);
+ size_t off);
void ssm_pool_reclaim_orphans(struct ssm_pool * pool,
pid_t pid);
diff --git a/include/ouroboros/ssm_rbuff.h b/include/ouroboros/ssm_rbuff.h
index ffa10b8e..e77eec09 100644
--- a/include/ouroboros/ssm_rbuff.h
+++ b/include/ouroboros/ssm_rbuff.h
@@ -28,10 +28,12 @@
#include <stdint.h>
-#define ACL_RDWR 0000
-#define ACL_RDONLY 0001
-#define ACL_FLOWDOWN 0002
-#define ACL_FLOWPEER 0004
+#define RB_RD 0001 /* read permitted (0 = no access) */
+#define RB_WR 0002 /* write permitted (0 = no access) */
+#define RB_RDWR (RB_RD | RB_WR)
+#define RB_FLOWDOWN 0004
+#define RB_FLOWPEER 0010
+#define RB_REKEY 0020 /* re-key seed parked (out-of-band signal) */
struct ssm_rbuff;
@@ -45,20 +47,23 @@ struct ssm_rbuff * ssm_rbuff_open(pid_t pid,
void ssm_rbuff_close(struct ssm_rbuff * rb);
-void ssm_rbuff_set_acl(struct ssm_rbuff * rb,
- uint32_t flags);
+void ssm_rbuff_set_bits(struct ssm_rbuff * rb,
+ uint32_t bits);
-uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb);
+void ssm_rbuff_clr_bits(struct ssm_rbuff * rb,
+ uint32_t bits);
+
+uint32_t ssm_rbuff_get_flags(struct ssm_rbuff * rb);
void ssm_rbuff_fini(struct ssm_rbuff * rb);
int ssm_rbuff_mlock(struct ssm_rbuff * rb);
int ssm_rbuff_write(struct ssm_rbuff * rb,
- size_t idx);
+ size_t off);
int ssm_rbuff_write_b(struct ssm_rbuff * rb,
- size_t idx,
+ size_t off,
const struct timespec * abstime);
ssize_t ssm_rbuff_read(struct ssm_rbuff * rb);
diff --git a/include/ouroboros/time.h b/include/ouroboros/time.h
index 3d037a3c..a4136e8e 100644
--- a/include/ouroboros/time.h
+++ b/include/ouroboros/time.h
@@ -46,6 +46,12 @@
#define TS_TO_UINT64(ts) \
((uint64_t)(ts).tv_sec * BILLION + (uint64_t)(ts).tv_nsec)
+#define UINT64_TO_TS(ns, ts) /* split ns into (ts)->tv_sec / (ts)->tv_nsec */ \
+ do { /* note: ns and ts are each expanded twice, pass side-effect free args */ \
+ (ts)->tv_sec = (time_t)((ns) / BILLION); \
+ (ts)->tv_nsec = (long)((ns) % BILLION); \
+ } while (0)
+
#define TIMEVAL_INIT_S(s) {(s), 0}
#define TIMEVAL_INIT_MS(ms) {(ms) / 1000, ((ms) % 1000) * 1000}
#define TIMEVAL_INIT_US(us) {(us) / MILLION, ((us) % MILLION)}
diff --git a/include/ouroboros/tpm.h b/include/ouroboros/tpm.h
index c01a235c..56c04701 100644
--- a/include/ouroboros/tpm.h
+++ b/include/ouroboros/tpm.h
@@ -24,6 +24,7 @@
#define OUROBOROS_LIB_TPM_H
#include <stdbool.h>
+#include <sys/types.h>
struct tpm;
diff --git a/include/ouroboros/tw.h b/include/ouroboros/tw.h
new file mode 100644
index 00000000..156f99db
--- /dev/null
+++ b/include/ouroboros/tw.h
@@ -0,0 +1,77 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic deadline-ordered callback queue (timing wheel)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#ifndef OUROBOROS_TW_H
+#define OUROBOROS_TW_H
+
+#include <ouroboros/cdefs.h>
+#include <ouroboros/list.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <time.h>
+
+typedef void (*tw_fire_fn_t)(void * arg);
+
+struct tw_entry { /* one callback entry, the object handed to tw_post / tw_cancel */
+ struct list_head next; /* list node */
+ uint64_t deadline_ns; /* deadline in ns (see tw_post) */
+ tw_fire_fn_t fire; /* callback, invoked as fire(arg) per tw_fire_fn_t */
+ void * arg; /* opaque argument for fire */
+ size_t lvl; /* NOTE(review): presumably the wheel level holding the entry - confirm */
+};
+
+__BEGIN_DECLS
+
+int tw_init(void);
+
+void tw_fini(void);
+
+void tw_init_entry(struct tw_entry * e);
+
+/*
+ * Schedule e to fire at deadline_ns. If e is already posted,
+ * the previous schedule is cancelled and replaced.
+ */
+void tw_post(struct tw_entry * e,
+ uint64_t deadline_ns,
+ tw_fire_fn_t fire,
+ void * arg);
+
+void tw_cancel(struct tw_entry * e);
+
+/*
+ * Advance the wheel and fire due callbacks. Callbacks run with the wheel
+ * unlocked and may call tw_post / tw_cancel on any entry, including the one
+ * currently firing. Concurrent tw_move from a second thread is a no-op.
+ */
+void tw_move(void);
+
+/*
+ * Write the absolute deadline of the earliest pending entry to *out.
+ * Empty wheel is signalled by out->tv_nsec == -1.
+ */
+void tw_next_expiry(struct timespec * out);
+
+__END_DECLS
+
+#endif /* OUROBOROS_TW_H */
diff --git a/include/test/certs/ecdsa.h b/include/test/certs/ecdsa.h
index 1d61a3f8..cbc4ed06 100644
--- a/include/test/certs/ecdsa.h
+++ b/include/test/certs/ecdsa.h
@@ -107,6 +107,23 @@ static const char * signed_server_crt_ec = \
"ktkxoHAFbjQEPQIhAMInHI7lvRmS0IMw1wBF/WlUZWKvhyU/TeMIZfk/JGCS\n"
"-----END CERTIFICATE-----\n";
+/* Valid CA outside the test chain, for cacert= pin mismatch */
+static __attribute__((unused)) const char * other_ca_crt_ec = \
+"-----BEGIN CERTIFICATE-----\n"
+"MIICNjCCAdugAwIBAgIUTZcZ9hKXyCT/VgTw8TD1TB2mzrgwCgYIKoZIzj0EAwIw\n"
+"cDELMAkGA1UEBhMCQkUxDDAKBgNVBAgMA09WTDEOMAwGA1UEBwwFR2hlbnQxDDAK\n"
+"BgNVBAoMA283czEVMBMGA1UECwwMdW5pdHRlc3QubzdzMR4wHAYDVQQDDBVvdGhl\n"
+"ci1jYS51bml0dGVzdC5vN3MwHhcNMjYwNjEyMTU1MjAzWhcNNDYwNjA3MTU1MjAz\n"
+"WjBwMQswCQYDVQQGEwJCRTEMMAoGA1UECAwDT1ZMMQ4wDAYDVQQHDAVHaGVudDEM\n"
+"MAoGA1UECgwDbzdzMRUwEwYDVQQLDAx1bml0dGVzdC5vN3MxHjAcBgNVBAMMFW90\n"
+"aGVyLWNhLnVuaXR0ZXN0Lm83czBZMBMGByqGSM49AgEGCCqGSM49AwEHA0IABNtu\n"
+"FghMww2kQ6a+Coe6VPzfBRUZlm7y6/RfbRFPvErowOqKLQP+wCs8Rq46VmHCYTbB\n"
+"OlRwzJKcNoSeJ4MNWUqjUzBRMB0GA1UdDgQWBBTmEP8W6fgViKIjw8CpTuQwyuOi\n"
+"kTAfBgNVHSMEGDAWgBTmEP8W6fgViKIjw8CpTuQwyuOikTAPBgNVHRMBAf8EBTAD\n"
+"AQH/MAoGCCqGSM49BAMCA0kAMEYCIQDQOCfFcOJm49R975RBPfVMy0pXGx/YeQcy\n"
+"6WKAeLuTowIhAISdVZ6KxsgkwuswMtDWAkCBujep0XSBGXtXmi4959DH\n"
+"-----END CERTIFICATE-----\n";
+
/* Self-signed by server test-1.unittest.o7s using its key */
static __attribute__((unused)) const char * server_crt_ec = \
"-----BEGIN CERTIFICATE-----\n"
@@ -121,5 +138,25 @@ static __attribute__((unused)) const char * server_crt_ec = \
"gRo=\n"
"-----END CERTIFICATE-----\n";
+/*
+ * Name-confusion fixture: real CN is "attacker.unittest.o7s", but the
+ * O field value is "CN=victim.unittest.o7s" so the oneline subject is
+ * "/O=CN=victim.unittest.o7s/CN=attacker.unittest.o7s". A strstr("CN=")
+ * scan latches onto the decoy. The real CN must win.
+ */
+static __attribute__((unused)) const char * confused_crt_ec = \
+"-----BEGIN CERTIFICATE-----\n"
+"MIIB1jCCAX2gAwIBAgIUCfXJzDQ3Sx5qcyVB9Rb4/FdZ+QowCgYIKoZIzj0EAwIw\n"
+"QTEfMB0GA1UECgwWQ049dmljdGltLnVuaXR0ZXN0Lm83czEeMBwGA1UEAwwVYXR0\n"
+"YWNrZXIudW5pdHRlc3QubzdzMB4XDTI2MDYxNDE5MDcwMVoXDTQ2MDYwOTE5MDcw\n"
+"MVowQTEfMB0GA1UECgwWQ049dmljdGltLnVuaXR0ZXN0Lm83czEeMBwGA1UEAwwV\n"
+"YXR0YWNrZXIudW5pdHRlc3QubzdzMFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAE\n"
+"oLwrbLs3diGcjyY2ErvO/U6CoyyKfl/8e1nxBKXHSOkO5xVmFu+EobEQVFvabxE/\n"
+"x4RttKcGJqUe8vlyQexQq6NTMFEwHQYDVR0OBBYEFGBaOBzTsCakjBN61x0ZnHSk\n"
+"04T3MB8GA1UdIwQYMBaAFGBaOBzTsCakjBN61x0ZnHSk04T3MA8GA1UdEwEB/wQF\n"
+"MAMBAf8wCgYIKoZIzj0EAwIDRwAwRAIgFtBeVxlRuI7y9Bo/Dh97ajTbHJXYMkc6\n"
+"ZqflSN3Q/uACIHWoCVn6u6+JjF+Kj9zubFJ49RIQJthSeP8xj7yTeV17\n"
+"-----END CERTIFICATE-----\n";
+
#endif /* TEST_CERTS_H */
diff --git a/irmd.conf.in b/irmd.conf.in
index dee88392..b9b79782 100644
--- a/irmd.conf.in
+++ b/irmd.conf.in
@@ -56,10 +56,10 @@ prog=["@INSTALL_DIR@/ovpn"] # Defaults to [].
prog=["@INSTALL_DIR@/oping"] # Defaults to [].
args=["--listen"] # Defaults to disabled. Autostart server with these args.
lb="round-robin" # Defaults to spill (load-balancing options: spill, round-robin).
-# server_enc_file=/path/to/enc.conf Default: @OUROBOROS_SRV_CRT_DIR@/<name>/enc.conf
+# server_sec_file=/path/to/sec.conf Default: @OUROBOROS_SRV_CRT_DIR@/<name>/sec.conf
# server_crt_file=/path/to/crt.pem Default: @OUROBOROS_SRV_CRT_DIR@/<name>/crt.pem
# server_key_file=/path/to/key.pem Default: @OUROBOROS_SRV_CRT_DIR@/<name>/key.pem
-# client_enc_file=/path/to/enc.conf Default: @OUROBOROS_CLI_CRT_DIR@/<name>/enc.conf
+# client_sec_file=/path/to/sec.conf Default: @OUROBOROS_CLI_CRT_DIR@/<name>/sec.conf
# client_crt_file=/path/to/crt.pem Default: @OUROBOROS_CLI_CRT_DIR@/<name>/crt.pem
# client_key_file=/path/to/key.pem Default: @OUROBOROS_CLI_CRT_DIR@/<name>/key.pem
diff --git a/enc.conf.in b/sec.conf.in
index 8f91d717..4796b72d 100644
--- a/enc.conf.in
+++ b/sec.conf.in
@@ -1,19 +1,19 @@
-### Example Ouroboros encryption configuration file
+### Example Ouroboros security configuration file
#
-# This file specifies the key exchange (KEX) algorithm and cipher to use
-# for encrypted flows.
+# This file specifies the security parameters for a service: the key
+# exchange (KEX) algorithm, cipher, key derivation, and peer authentication.
#
# File Locations:
# ---------------
#
# This file should be placed at one of:
-# @OUROBOROS_CONFIG_DIR@/security/server/<name>/enc.conf (server-side config)
-# @OUROBOROS_CONFIG_DIR@/security/client/<name>/enc.conf (client-side config)
+# @OUROBOROS_CONFIG_DIR@/security/server/<name>/sec.conf (server-side config)
+# @OUROBOROS_CONFIG_DIR@/security/client/<name>/sec.conf (client-side config)
#
# Where <name> is the service name registered with 'irm name create'.
#
# You can override the default paths using:
-# irm name create <name> sencpath <server-enc-path> cencpath <client-enc-path>
+# irm name create <name> ssecpath <server-sec-path> csecpath <client-sec-path>
#
# Configuration Options:
# ----------------------
@@ -22,7 +22,8 @@
# cipher=<cipher> Symmetric cipher algorithm
# kdf=<hash> Key derivation function hash algorithm
# kem_mode=<mode> KEM encapsulation mode (server or client)
-# none Explicitly disable encryption
+# auth=<policy> Peer authentication policy (required or optional)
+# encryption=none Explicitly disable encryption
#
# Supported KEX algorithms (kex=):
# --------------------------------
@@ -57,11 +58,6 @@
# aes-256-gcm AES-256 in GCM mode (default)
# chacha20-poly1305 ChaCha20-Poly1305
#
-# Stream ciphers (not recommended):
-# aes-128-ctr AES-128 in CTR mode
-# aes-192-ctr AES-192 in CTR mode
-# aes-256-ctr AES-256 in CTR mode
-#
# Key Derivation Functions (kdf=):
# ---------------------------------
#
@@ -76,6 +72,33 @@
# blake2b512 BLAKE2b-512
# blake2s256 BLAKE2s-256
#
+# Peer Authentication (auth=):
+# ----------------------------
+#
+# optional Accept unauthenticated peers
+# required Reject peers that do not present a valid certificate
+#
+# This setting applies to the *peer*: in a client config it requires
+# the server to authenticate; in a server config it requires the
+# client. The defaults mirror the web: a client config defaults to
+# required (the server must authenticate), a server config defaults
+# to optional (client authentication is opt-in). Set auth=required on
+# the server too for mutual authentication. Combine encryption=none
+# with auth=required for authenticated but unencrypted flows.
+#
+# Issuer Pinning (cacert=):
+# -------------------------
+#
+# cacert=<path> Path to a CA certificate that must be part of the
+# peer certificate's verified chain
+#
+# The peer certificate is always validated against the trusted CA
+# store; cacert= further restricts which CA must have issued it: a
+# certificate, if presented, must chain through the pinned CA. Whether
+# a certificate is mandatory is controlled by auth= alone: under
+# auth=optional a peer may still connect without one. The pinned CA
+# must load when the config is read, otherwise flow allocation fails.
+#
# KEM Mode (kem_mode=):
# ---------------------
#
@@ -147,4 +170,8 @@ kdf=sha256
# kdf=sha512
#
# Disable encryption:
-# none
+# encryption=none
+#
+# Authentication required, no encryption:
+# encryption=none
+# auth=required
diff --git a/src/ipcpd/broadcast/dt.c b/src/ipcpd/broadcast/dt.c
index 30e89a4f..95483e33 100644
--- a/src/ipcpd/broadcast/dt.c
+++ b/src/ipcpd/broadcast/dt.c
@@ -28,7 +28,7 @@
#include "config.h"
-#define BROADCAST_MTU 1400 /* FIXME: avoid packet copy. */
+#define BROADCAST_MTU IPCP_BROADCAST_MTU /* FIXME: avoid packet copy. */
#define DT "dt"
#define OUROBOROS_PREFIX DT
diff --git a/src/ipcpd/broadcast/main.c b/src/ipcpd/broadcast/main.c
index b3cbdc79..d18cac82 100644
--- a/src/ipcpd/broadcast/main.c
+++ b/src/ipcpd/broadcast/main.c
@@ -242,7 +242,7 @@ static int broadcast_ipcp_join(int fd,
notifier_event(NOTIFY_DT_CONN_ADD, &conn);
- ipcp_flow_alloc_reply(fd, 0, mpl, &data);
+ ipcp_flow_alloc_reply(fd, 0, mpl, IPCP_BROADCAST_MTU, &data);
return 0;
}
@@ -307,12 +307,13 @@ int main(int argc,
ipcp_sigwait();
if (ipcp_get_state() == IPCP_SHUTDOWN) {
+ ipcp_stop();
stop_components();
finalize_components();
+ } else {
+ ipcp_stop();
}
- ipcp_stop();
-
enroll_fini();
connmgr_fini();
diff --git a/src/ipcpd/config.h.in b/src/ipcpd/config.h.in
index 0b4252e5..7edec526 100644
--- a/src/ipcpd/config.h.in
+++ b/src/ipcpd/config.h.in
@@ -23,8 +23,8 @@
#define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@
#define SYS_MAX_FLOWS @SYS_MAX_FLOWS@
-#define PROG_RES_FDS @PROG_RES_FDS@
-#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@
+#define PROC_RES_FDS @PROC_RES_FDS@
+#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@
#define SOCKET_TIMEOUT @SOCKET_TIMEOUT@
#define CONNECT_TIMEOUT @CONNECT_TIMEOUT@
@@ -46,11 +46,13 @@
#define IPCP_SCHED_THR_MUL @IPCP_SCHED_THR_MUL@
#define PFT_SIZE @PFT_SIZE@
#define IPCP_UNICAST_MPL @IPCP_UNICAST_MPL@
+#define IPCP_UNICAST_MTU @IPCP_UNICAST_MTU@
#define CONNMGR_RCV_TIMEOUT @CONNMGR_RCV_TIMEOUT@
#cmakedefine DISABLE_CORE_LOCK
#cmakedefine BUILD_CONTAINER
#cmakedefine IPCP_FLOW_STATS
+#cmakedefine IPCP_ETH_FLOW_STATS
#cmakedefine IPCP_DEBUG_LOCAL
#ifdef CONFIG_OUROBOROS_DEBUG
#cmakedefine DEBUG_PROTO_DHT
@@ -65,6 +67,8 @@
#define IPCP_UDP_RD_THR @IPCP_UDP_RD_THR@
#define IPCP_UDP_WR_THR @IPCP_UDP_WR_THR@
#define IPCP_UDP_MPL @IPCP_UDP_MPL@
+#define IPCP_UDP4_MTU @IPCP_UDP4_MTU@
+#define IPCP_UDP6_MTU @IPCP_UDP6_MTU@
/* eth */
#cmakedefine HAVE_NETMAP
@@ -76,10 +80,13 @@
#define IPCP_ETH_LO_MTU @IPCP_ETH_LO_MTU@
#define IPCP_ETH_MGMT_FRAME_SIZE @IPCP_ETH_MGMT_FRAME_SIZE@
#define IPCP_ETH_MPL @IPCP_ETH_MPL@
+#define IPCP_ETH_SNDBUF @IPCP_ETH_SNDBUF@
+#define IPCP_ETH_RCVBUF @IPCP_ETH_RCVBUF@
/* local */
#define IPCP_LOCAL_MPL @IPCP_LOCAL_MPL@
+#define IPCP_LOCAL_MTU @IPCP_LOCAL_MTU@
/* broadcast */
-/* local */
#define IPCP_BROADCAST_MPL @IPCP_BROADCAST_MPL@
+#define IPCP_BROADCAST_MTU @IPCP_BROADCAST_MTU@
diff --git a/src/ipcpd/eth/eth.c b/src/ipcpd/eth/eth.c
index 4be7775e..7e038a03 100644
--- a/src/ipcpd/eth/eth.c
+++ b/src/ipcpd/eth/eth.c
@@ -37,19 +37,30 @@
#include "config.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/endian.h>
#include <ouroboros/hash.h>
#include <ouroboros/errno.h>
#include <ouroboros/list.h>
#include <ouroboros/utils.h>
#include <ouroboros/bitmap.h>
+#include <ouroboros/crc8.h>
#include <ouroboros/dev.h>
#include <ouroboros/ipcp-dev.h>
#include <ouroboros/fqueue.h>
#include <ouroboros/logs.h>
+#include <ouroboros/np1_flow.h>
#include <ouroboros/time.h>
#include <ouroboros/fccntl.h>
#include <ouroboros/pthread.h>
+#include <ouroboros/rib.h>
+
+#ifndef IPCP_ETH_FLOW_STATS
+#undef FETCH_ADD_RELAXED
+#define FETCH_ADD_RELAXED(p, v) ((void) 0)
+#undef FETCH_SUB_RELAXED
+#define FETCH_SUB_RELAXED(p, v) ((void) 0)
+#endif
#include "ipcp.h"
#include "np1.h"
@@ -122,7 +133,8 @@
#define MGMT_EID 0
#define DIX_EID_SIZE sizeof(uint16_t)
#define DIX_LENGTH_SIZE sizeof(uint16_t)
-#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE)
+#define DIX_HCS_SIZE CRC8_HASH_LEN
+#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE + DIX_HCS_SIZE)
#define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + DIX_HEADER_SIZE)
#define MAX_EIDS (1 << (8 * DIX_EID_SIZE))
#define ETH_MAX_PACKET_SIZE (ETH_MTU - DIX_HEADER_SIZE)
@@ -130,21 +142,26 @@
#elif defined(BUILD_ETH_LLC)
#define THIS_TYPE IPCP_ETH_LLC
#define MGMT_SAP 0x01
-#define LLC_HEADER_SIZE 3
+#define LLC_FIELDS_SIZE 3
+#define LLC_HCS_SIZE CRC8_HASH_LEN
+#define LLC_HEADER_SIZE (LLC_FIELDS_SIZE + LLC_HCS_SIZE)
#define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + LLC_HEADER_SIZE)
#define MAX_SAPS 64
#define ETH_MAX_PACKET_SIZE (ETH_MTU - LLC_HEADER_SIZE)
#define ETH_FRAME_SIZE (ETH_HEADER_SIZE + ETH_MTU_MAX)
#endif
-#define NAME_QUERY_TIMEO 2000 /* ms */
-#define MGMT_TIMEO 100 /* ms */
+#define NAME_QUERY_TIMEO 1900 /* ms total budget */
+#define NAME_QUERY_RETRIES 3 /* retransmits, 4 attempts total */
+#define MGMT_TIMEO 100 /* ms */
#define MGMT_FRAME_SIZE IPCP_ETH_MGMT_FRAME_SIZE
+#define ETH_RIB_PATH "eth"
#define FLOW_REQ 0
#define FLOW_REPLY 1
#define NAME_QUERY_REQ 2
#define NAME_QUERY_REPLY 3
+#define FLOW_IRM_UPDATE 4
struct mgmt_msg {
#if defined(BUILD_ETH_DIX)
@@ -165,7 +182,7 @@ struct mgmt_msg {
uint32_t delay;
uint32_t timeout;
int32_t response;
- uint8_t in_order;
+ uint8_t service;
#if defined (BUILD_ETH_DIX)
uint8_t code;
uint8_t availability;
@@ -185,6 +202,7 @@ struct eth_frame {
uint8_t ssap;
uint8_t cf;
#endif
+ uint8_t hcs;
uint8_t payload;
} __attribute__((packed));
@@ -196,6 +214,17 @@ struct ef {
int8_t r_sap;
#endif
uint8_t r_addr[MAC_SIZE];
+#ifdef IPCP_ETH_FLOW_STATS
+ struct {
+ time_t stamp;
+ size_t p_rcv;
+ size_t b_rcv;
+ size_t p_dlv_f;
+ size_t p_snd;
+ size_t b_snd;
+ size_t p_snd_f;
+ } stat;
+#endif
};
struct mgmt_frame {
@@ -233,6 +262,22 @@ struct {
struct ef * fd_to_ef;
fset_t * np1_flows;
pthread_rwlock_t flows_lock;
+#ifdef IPCP_ETH_FLOW_STATS
+ struct {
+ size_t n_flows;
+ size_t n_rcv;
+ size_t n_snd;
+ size_t n_mgmt_rcv;
+ size_t n_mgmt_snd;
+ size_t n_bad_id;
+ size_t n_dlv_f;
+ size_t n_buf_f;
+ size_t n_rcv_f;
+ size_t n_snd_f;
+ size_t kern_rcv;
+ size_t kern_drp;
+ } stat;
+#endif
pthread_t packet_writer[IPCP_ETH_WR_THR];
pthread_t packet_reader[IPCP_ETH_RD_THR];
@@ -284,7 +329,14 @@ static int eth_data_init(void)
eth_data.fd_to_ef[i].r_sap = -1;
#endif
memset(&eth_data.fd_to_ef[i].r_addr, 0, MAC_SIZE);
+#ifdef IPCP_ETH_FLOW_STATS
+ memset(&eth_data.fd_to_ef[i].stat, 0,
+ sizeof(eth_data.fd_to_ef[i].stat));
+#endif
}
+#ifdef IPCP_ETH_FLOW_STATS
+ memset(&eth_data.stat, 0, sizeof(eth_data.stat));
+#endif
eth_data.shim_data = shim_data_create();
if (eth_data.shim_data == NULL)
@@ -357,6 +409,227 @@ static void eth_data_fini(void)
free(eth_data.fd_to_ef);
}
+#ifdef IPCP_ETH_FLOW_STATS
+/*
+ * RIB read callback: render the statistics of one entry as text in buf.
+ *
+ * path: RIB path; the entry name is whatever follows the first
+ *       RIB_SEPARATOR, either "summary" or a decimal fd.
+ * buf:  output buffer.
+ * len:  size of buf; anything below 2048 bytes is refused.
+ *
+ * Returns the number of bytes written, 0 if buf is too small or the
+ * flow has no statistics (stamp == 0), and -1 if the path has no
+ * separator, the fd is out of range or the stamp cannot be converted.
+ */
+static int eth_rib_read(const char * path,
+                        char *       buf,
+                        size_t       len)
+{
+        struct ef * flow;
+        int         fd;
+        char        tmstr[RIB_TM_STRLEN];
+        struct tm   tm;
+        time_t      stamp;
+        char *      entry;
+
+        /*
+         * Validate the strstr() result before stepping over the
+         * separator: NULL + 1 is never NULL, so an assert placed after
+         * the increment cannot catch a path without separator.
+         */
+        entry = strstr(path, RIB_SEPARATOR);
+        assert(entry != NULL);
+        if (entry == NULL)
+                return -1;
+
+        ++entry;
+
+        if (len < 2048)
+                return 0;
+
+        buf[0] = '\0';
+
+        if (strcmp(entry, "summary") == 0) {
+                int       n;
+#if defined(HAVE_RAW_SOCKETS)
+                /* Zero defaults are reported if a query below fails. */
+                int       rcvbuf = 0;
+                int       sndbuf = 0;
+                int       queued = 0;
+                socklen_t optlen = sizeof(rcvbuf);
+# if defined(__linux__)
+                struct tpacket_stats tp_stats;
+                socklen_t            tp_len = sizeof(tp_stats);
+# endif
+
+                getsockopt(eth_data.s_fd, SOL_SOCKET,
+                           SO_RCVBUF, &rcvbuf, &optlen);
+                optlen = sizeof(sndbuf);
+                getsockopt(eth_data.s_fd, SOL_SOCKET,
+                           SO_SNDBUF, &sndbuf, &optlen);
+                ioctl(eth_data.s_fd, FIONREAD, &queued);
+# if defined(__linux__)
+                /*
+                 * The kernel counters are added to the running totals
+                 * rather than stored; packet(7) documents that reading
+                 * PACKET_STATISTICS resets them.
+                 */
+                if (getsockopt(eth_data.s_fd, SOL_PACKET,
+                               PACKET_STATISTICS,
+                               &tp_stats, &tp_len) == 0) {
+                        FETCH_ADD_RELAXED(&eth_data.stat.kern_rcv,
+                                          tp_stats.tp_packets);
+                        FETCH_ADD_RELAXED(&eth_data.stat.kern_drp,
+                                          tp_stats.tp_drops);
+                }
+# endif
+#endif
+                n = sprintf(buf,
+                            "Active flows: %20zu\n"
+                            "Total frames received: %20zu\n"
+                            "Total frames sent: %20zu\n"
+                            "Management frames received: %20zu\n"
+                            "Management frames sent: %20zu\n"
+                            "Bad EID/SAP frames: %20zu\n"
+                            "Delivery (N+1) failures: %20zu\n"
+                            "Buffer alloc failures: %20zu\n"
+                            "Frame read failures: %20zu\n"
+                            "Frame send failures: %20zu\n",
+                            LOAD_RELAXED(&eth_data.stat.n_flows),
+                            LOAD_RELAXED(&eth_data.stat.n_rcv),
+                            LOAD_RELAXED(&eth_data.stat.n_snd),
+                            LOAD_RELAXED(&eth_data.stat.n_mgmt_rcv),
+                            LOAD_RELAXED(&eth_data.stat.n_mgmt_snd),
+                            LOAD_RELAXED(&eth_data.stat.n_bad_id),
+                            LOAD_RELAXED(&eth_data.stat.n_dlv_f),
+                            LOAD_RELAXED(&eth_data.stat.n_buf_f),
+                            LOAD_RELAXED(&eth_data.stat.n_rcv_f),
+                            LOAD_RELAXED(&eth_data.stat.n_snd_f));
+#if defined(HAVE_RAW_SOCKETS)
+                n += sprintf(buf + n,
+                             "Socket rcvbuf (bytes): %20d\n"
+                             "Socket sndbuf (bytes): %20d\n"
+                             "Socket queued (bytes): %20d\n",
+                             rcvbuf, sndbuf, queued);
+# if defined(__linux__)
+                n += sprintf(buf + n,
+                             "Kernel frames received: %20zu\n"
+                             "Kernel frames dropped: %20zu\n",
+                             LOAD_RELAXED(&eth_data.stat.kern_rcv),
+                             LOAD_RELAXED(&eth_data.stat.kern_drp));
+# endif
+#endif
+                return n;
+        }
+
+        fd = atoi(entry);
+
+        if (fd < 0 || fd >= SYS_MAX_FLOWS)
+                return -1;
+
+        flow = &eth_data.fd_to_ef[fd];
+
+        pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+        stamp = flow->stat.stamp;
+
+        pthread_rwlock_unlock(&eth_data.flows_lock);
+
+        /* A zero stamp marks a slot without statistics. */
+        if (stamp == 0)
+                return 0;
+
+        /*
+         * gmtime_r: no shared static result buffer, and a failed
+         * conversion is reported instead of handing NULL to strftime.
+         */
+        if (gmtime_r(&stamp, &tm) == NULL)
+                return -1;
+
+        strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, &tm);
+
+        return sprintf(buf,
+                       "Flow established at: %20s\n"
+                       "Sent (packets): %20zu\n"
+                       "Sent (bytes): %20zu\n"
+                       "Send failed (packets): %20zu\n"
+                       "Received (packets): %20zu\n"
+                       "Received (bytes): %20zu\n"
+                       "Delivery (N+1) failures: %20zu\n",
+                       tmstr,
+                       LOAD_RELAXED(&flow->stat.p_snd),
+                       LOAD_RELAXED(&flow->stat.b_snd),
+                       LOAD_RELAXED(&flow->stat.p_snd_f),
+                       LOAD_RELAXED(&flow->stat.p_rcv),
+                       LOAD_RELAXED(&flow->stat.b_rcv),
+                       LOAD_RELAXED(&flow->stat.p_dlv_f));
+}
+
+/*
+ * RIB readdir callback: build the list of entry names, "summary"
+ * followed by the index of every flow slot with a non-zero stamp.
+ *
+ * On success *buf points to a malloc'ed array of malloc'ed strings and
+ * the number of strings is returned. On allocation failure everything
+ * allocated here is freed and -ENOMEM is returned.
+ */
+static int eth_rib_readdir(char *** buf)
+{
+        static const char * summary = "summary";
+        char                name[RIB_PATH_LEN + 1];
+        char **             list;
+        size_t              slot;
+        int                 cnt = 0;
+        int                 max;
+
+        pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+        /* One name per active flow plus the summary entry. */
+        max = (int) LOAD_RELAXED(&eth_data.stat.n_flows) + 1;
+
+        list = malloc(sizeof(*list) * max);
+        *buf = list;
+        if (list == NULL)
+                goto fail_entries;
+
+        list[cnt] = malloc(strlen(summary) + 1);
+        if (list[cnt] == NULL)
+                goto fail_entry;
+
+        strcpy(list[cnt], summary);
+        ++cnt;
+
+        for (slot = 0; slot < SYS_MAX_FLOWS && cnt < max; ++slot) {
+                if (eth_data.fd_to_ef[slot].stat.stamp != 0) {
+                        sprintf(name, "%zu", slot);
+
+                        list[cnt] = malloc(strlen(name) + 1);
+                        if (list[cnt] == NULL)
+                                goto fail_entry;
+
+                        strcpy(list[cnt], name);
+                        ++cnt;
+                }
+        }
+
+        pthread_rwlock_unlock(&eth_data.flows_lock);
+
+        return cnt;
+
+ fail_entry:
+        /* Release the names stored so far, newest first. */
+        for (--cnt; cnt >= 0; --cnt)
+                free(list[cnt]);
+        free(list);
+ fail_entries:
+        pthread_rwlock_unlock(&eth_data.flows_lock);
+        return -ENOMEM;
+}
+
+/*
+ * RIB getattr callback: report size and mtime of one entry.
+ *
+ * path: RIB path; the entry name is whatever follows the first
+ *       RIB_SEPARATOR, either "summary" or a decimal fd.
+ * attr: filled in with size 2048 for "summary" (mtime 0) and for flow
+ *       slots with a non-zero stamp (mtime = stamp); size 0 and
+ *       mtime 0 in every other case.
+ *
+ * Always returns 0.
+ */
+static int eth_rib_getattr(const char *      path,
+                           struct rib_attr * attr)
+{
+        int         fd;
+        char *      entry;
+        struct ef * flow;
+
+        /*
+         * Validate the strstr() result before stepping over the
+         * separator: NULL + 1 is never NULL, so an assert placed after
+         * the increment cannot catch a path without separator.
+         */
+        entry = strstr(path, RIB_SEPARATOR);
+        assert(entry != NULL);
+        if (entry == NULL) {
+                /* Treated like any other unknown entry. */
+                attr->size  = 0;
+                attr->mtime = 0;
+                return 0;
+        }
+
+        ++entry;
+
+        if (strcmp(entry, "summary") == 0) {
+                attr->size  = 2048;
+                attr->mtime = 0;
+                return 0;
+        }
+
+        fd = atoi(entry);
+
+        if (fd < 0 || fd >= SYS_MAX_FLOWS) {
+                attr->size  = 0;
+                attr->mtime = 0;
+                return 0;
+        }
+
+        flow = &eth_data.fd_to_ef[fd];
+
+        pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+        if (flow->stat.stamp != 0) {
+                attr->size  = 2048;
+                attr->mtime = flow->stat.stamp;
+        } else {
+                attr->size  = 0;
+                attr->mtime = 0;
+        }
+
+        pthread_rwlock_unlock(&eth_data.flows_lock);
+
+        return 0;
+}
+
+static struct rib_ops eth_r_ops = { /* RIB callbacks for the eth statistics entries */
+ .read = eth_rib_read,
+ .readdir = eth_rib_readdir,
+ .getattr = eth_rib_getattr
+};
+#endif /* IPCP_ETH_FLOW_STATS */
+
#ifdef BUILD_ETH_LLC
static uint8_t reverse_bits(uint8_t b)
{
@@ -409,12 +682,18 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr,
e_frame->ethertype = eth_data.ethertype;
e_frame->eid = htons(deid);
e_frame->length = htons(len);
+ mem_hash(HASH_CRC8, &e_frame->hcs,
+ (uint8_t *) &e_frame->eid,
+ DIX_EID_SIZE + DIX_LENGTH_SIZE);
frame_len = ETH_HEADER_TOT_SIZE + len;
#elif defined(BUILD_ETH_LLC)
e_frame->length = htons(LLC_HEADER_SIZE + len);
e_frame->dsap = dsap;
e_frame->ssap = ssap;
e_frame->cf = cf;
+ mem_hash(HASH_CRC8, &e_frame->hcs,
+ (uint8_t *) &e_frame->dsap,
+ LLC_FIELDS_SIZE);
frame_len = ETH_HEADER_TOT_SIZE + len;
#endif
@@ -440,10 +719,7 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr,
}
assert(FD_ISSET(eth_data.s_fd, &fds));
- if (sendto(eth_data.s_fd,
- frame,
- frame_len,
- 0,
+ if (sendto(eth_data.s_fd, frame, frame_len, 0,
(struct sockaddr *) &eth_data.device,
sizeof(eth_data.device)) <= 0) {
log_dbg("Failed to send message: %s.", strerror(errno));
@@ -451,6 +727,8 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr,
}
#endif /* HAVE_NETMAP */
+ FETCH_ADD_RELAXED(&eth_data.stat.n_snd, 1);
+
return 0;
}
@@ -490,7 +768,7 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr,
msg->availability = qs.availability;
msg->loss = hton32(qs.loss);
msg->ber = hton32(qs.ber);
- msg->in_order = qs.in_order;
+ msg->service = qs.service;
msg->max_gap = hton32(qs.max_gap);
msg->timeout = hton32(qs.timeout);
@@ -508,6 +786,9 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr,
buf, len + data->len);
free(buf);
+ if (ret == 0)
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
return ret;
}
@@ -558,11 +839,65 @@ static int eth_ipcp_alloc_resp(uint8_t * dst_addr,
return -1;
}
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
free(buf);
return 0;
}
+/*
+ * Send a FLOW_IRM_UPDATE management message, followed by the bytes in
+ * data, to the remote address stored for the flow on fd.
+ *
+ * Returns the result of eth_ipcp_send_frame(), or -1 if the frame
+ * buffer cannot be allocated.
+ */
+static int eth_ipcp_flow_update(int              fd,
+                                const buffer_t * data)
+{
+        struct mgmt_msg * msg;
+        struct ef *       ef;
+        uint8_t *         frame;
+        uint8_t           peer[MAC_SIZE];
+        size_t            msg_len;
+        size_t            frame_len;
+        int               rc;
+
+        msg_len   = sizeof(*msg) + data->len;
+        frame_len = msg_len + ETH_HEADER_TOT_SIZE;
+
+        frame = malloc(frame_len);
+        if (frame == NULL)
+                return -1;
+
+        memset(frame, 0, frame_len);
+
+        /* The message sits behind the room left for the frame header. */
+        msg = (struct mgmt_msg *) (frame + ETH_HEADER_TOT_SIZE);
+
+        msg->code = FLOW_IRM_UPDATE;
+
+        if (data->len > 0)
+                memcpy((uint8_t *) msg + sizeof(*msg),
+                       data->data, data->len);
+
+        /* Snapshot the endpoint ids and peer address under the lock. */
+        pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+        ef = &eth_data.fd_to_ef[fd];
+#if defined(BUILD_ETH_DIX)
+        msg->seid = htons((uint16_t) fd);
+        msg->deid = htons((uint16_t) ef->r_eid);
+#elif defined(BUILD_ETH_LLC)
+        msg->ssap = ef->sap;
+        msg->dsap = (uint8_t) ef->r_sap;
+#endif
+        memcpy(peer, ef->r_addr, MAC_SIZE);
+
+        pthread_rwlock_unlock(&eth_data.flows_lock);
+
+        rc = eth_ipcp_send_frame(peer,
+#if defined(BUILD_ETH_DIX)
+                                 MGMT_EID,
+#elif defined(BUILD_ETH_LLC)
+                                 reverse_bits(MGMT_SAP),
+                                 reverse_bits(MGMT_SAP),
+#endif
+                                 frame, msg_len);
+        free(frame);
+
+        if (rc != 0)
+                return rc;
+
+        FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
+        return 0;
+}
+
static int eth_ipcp_req(uint8_t * r_addr,
#if defined(BUILD_ETH_DIX)
uint16_t r_eid,
@@ -575,7 +910,8 @@ static int eth_ipcp_req(uint8_t * r_addr,
{
int fd;
- fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL, data);
+ fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL,
+ ETH_MAX_PACKET_SIZE, data);
if (fd < 0) {
log_err("Could not get new flow from IRMd.");
return -1;
@@ -622,7 +958,7 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr,
fd = eth_data.ef_to_fd[dsap];
#endif
if (fd < 0) {
- pthread_rwlock_unlock(& eth_data.flows_lock);
+ pthread_rwlock_unlock(&eth_data.flows_lock);
log_err("No flow found with that SAP.");
return -1; /* -EFLOWNOTFOUND */
}
@@ -647,7 +983,8 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr,
#elif defined(BUILD_ETH_LLC)
log_dbg("Flow reply, fd %d, SSAP %d, DSAP %d.", fd, ssap, dsap);
#endif
- if ((ret = ipcp_flow_alloc_reply(fd, response, mpl, data)) < 0) {
+ if ((ret = ipcp_flow_alloc_reply(fd, response, mpl,
+ ETH_MAX_PACKET_SIZE, data)) < 0) {
log_err("Failed to reply to flow allocation.");
return -1;
}
@@ -689,6 +1026,8 @@ static int eth_ipcp_name_query_req(const uint8_t * hash,
return -1;
}
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
free(buf);
}
@@ -709,6 +1048,44 @@ static int eth_ipcp_name_query_reply(const uint8_t * hash,
return 0;
}
+/*
+ * Handle a received FLOW_IRM_UPDATE management message: look up the
+ * local fd addressed by the message and pass the bytes after the
+ * mgmt_msg header on via ipcp_flow_update_arr().
+ *
+ * buf: management message, starting at struct mgmt_msg.
+ * len: number of bytes in buf.
+ *
+ * Returns 0 on success, -1 if the message is shorter than its header,
+ * addresses an unknown endpoint, or cannot be relayed.
+ */
+static int eth_ipcp_flow_update_arr(const uint8_t * buf,
+                                    size_t          len)
+{
+        struct mgmt_msg * msg;
+        buffer_t          data;
+        int               fd;
+        int               flow_id;
+
+        /* len - sizeof(*msg) below would wrap on a truncated message. */
+        if (len < sizeof(*msg))
+                return -1;
+
+        msg = (struct mgmt_msg *) buf;
+
+        data.data = (uint8_t *) buf + sizeof(*msg);
+        data.len  = len - sizeof(*msg);
+
+#if defined(BUILD_ETH_LLC)
+        /*
+         * dsap comes off the wire (0..255) and is used as an index.
+         * NOTE(review): assumes ef_to_fd holds MAX_SAPS entries - confirm
+         * against its allocation.
+         */
+        if (msg->dsap >= MAX_SAPS) {
+                log_err("Flow update for unknown endpoint.");
+                return -1;
+        }
+#endif
+        pthread_rwlock_rdlock(&eth_data.flows_lock);
+#if defined(BUILD_ETH_DIX)
+        fd = ntohs(msg->deid);
+#elif defined(BUILD_ETH_LLC)
+        fd = eth_data.ef_to_fd[msg->dsap];
+#endif
+        pthread_rwlock_unlock(&eth_data.flows_lock);
+
+        if (fd < 0 || fd >= SYS_MAX_FLOWS) {
+                log_err("Flow update for unknown endpoint.");
+                return -1;
+        }
+
+        flow_id = np1_flow_id(fd);
+        if (flow_id < 0)
+                return -1;
+
+        if (ipcp_flow_update_arr(flow_id, &data) < 0) {
+                log_err("Failed to relay flow update on fd %d.", fd);
+                return -1;
+        }
+
+        return 0;
+}
+
static int eth_ipcp_mgmt_frame(const uint8_t * buf,
size_t len,
uint8_t * r_addr)
@@ -718,20 +1095,24 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf,
qosspec_t qs;
buffer_t data;
+ if (len < sizeof(*msg))
+ return -1;
+
msg = (struct mgmt_msg *) buf;
switch (msg->code) {
case FLOW_REQ:
msg_len = sizeof(*msg) + ipcp_dir_hash_len();
- assert(len >= msg_len);
+ if (len < msg_len)
+ return -1;
qs.delay = ntoh32(msg->delay);
qs.bandwidth = ntoh64(msg->bandwidth);
qs.availability = msg->availability;
qs.loss = ntoh32(msg->loss);
qs.ber = ntoh32(msg->ber);
- qs.in_order = msg->in_order;
+ qs.service = msg->service;
qs.max_gap = ntoh32(msg->max_gap);
qs.timeout = ntoh32(msg->timeout);
@@ -752,8 +1133,6 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf,
}
break;
case FLOW_REPLY:
- assert(len >= sizeof(*msg));
-
data.data = (uint8_t *) buf + sizeof(*msg);
data.len = len - sizeof(*msg);
@@ -768,10 +1147,17 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf,
ntoh32(msg->response),
&data);
break;
+ case FLOW_IRM_UPDATE:
+ eth_ipcp_flow_update_arr(buf, len);
+ break;
case NAME_QUERY_REQ:
+ if (len < sizeof(*msg) + ipcp_dir_hash_len())
+ return -1;
eth_ipcp_name_query_req(buf + sizeof(*msg), r_addr);
break;
case NAME_QUERY_REPLY:
+ if (len < sizeof(*msg) + ipcp_dir_hash_len())
+ return -1;
eth_ipcp_name_query_reply(buf + sizeof(*msg), r_addr);
break;
default:
@@ -844,6 +1230,12 @@ static void * eth_ipcp_packet_reader(void * o)
fd_set fds;
int frame_len;
#endif
+#if defined(HAVE_RAW_SOCKETS)
+ struct sockaddr_ll src;
+ socklen_t slen;
+#endif
+ size_t eth_len;
+ uint8_t hcs;
struct eth_frame * e_frame;
struct mgmt_frame * frame;
@@ -881,24 +1273,58 @@ static void * eth_ipcp_packet_reader(void * o)
if (select(eth_data.s_fd + 1, &fds, NULL, NULL, NULL) < 0)
continue;
assert(FD_ISSET(eth_data.s_fd, &fds));
- if (ipcp_spb_reserve(&spb, ETH_MTU))
+ if (ipcp_spb_reserve(&spb, ETH_MTU)) {
+ FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1);
continue;
- buf = ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE);
+ }
+ buf = ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE);
if (buf == NULL) {
log_dbg("Failed to allocate header.");
ipcp_spb_release(spb);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1);
continue;
}
- frame_len = recv(eth_data.s_fd, buf,
- ETH_MTU + ETH_HEADER_TOT_SIZE, 0);
+ slen = sizeof(src);
+ /* MSG_DONTWAIT: RD_THR>1 race-loser bails with EAGAIN. */
+ frame_len = recvfrom(eth_data.s_fd, buf,
+ ETH_MTU + ETH_HEADER_TOT_SIZE,
+ MSG_DONTWAIT,
+ (struct sockaddr *) &src, &slen);
#endif
- if (frame_len <= 0) {
- log_dbg("Failed to receive frame.");
+ if (frame_len == 0) {
ipcp_spb_release(spb);
+ continue; /* Spurious */
+ }
+
+ if (frame_len < 0) {
+ ipcp_spb_release(spb);
+
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ continue;
+
+ log_dbg("Failed to rcv frame: %s.", strerror(errno));
+ FETCH_ADD_RELAXED(&eth_data.stat.n_rcv_f, 1);
continue;
}
#endif
+#if defined(HAVE_NETMAP)
+ eth_len = hdr.len;
+#elif defined(HAVE_BPF)
+ eth_len = ((struct bpf_hdr *) buf)->bh_caplen;
+#else
+ eth_len = (size_t) frame_len;
+#endif
+ /* Defense in depth: reject before parsing dereferences. */
+ if (eth_len < ETH_HEADER_TOT_SIZE)
+ goto fail_frame;
+
+#if defined(HAVE_RAW_SOCKETS)
+ /* Drop our own egress. */
+ if (src.sll_pkttype == PACKET_OUTGOING)
+ goto fail_frame;
+#endif
+
#if defined(HAVE_BPF) && !defined(HAVE_NETMAP)
e_frame = (struct eth_frame *)
(buf + ((struct bpf_hdr *) buf)->bh_hdrlen);
@@ -916,6 +1342,8 @@ static void * eth_ipcp_packet_reader(void * o)
e_frame->dst_hwaddr,
MAC_SIZE) &&
memcmp(br_addr, e_frame->dst_hwaddr, MAC_SIZE)) {
+ FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1);
+ goto fail_frame;
}
#endif
length = ntohs(e_frame->length);
@@ -923,17 +1351,41 @@ static void * eth_ipcp_packet_reader(void * o)
if (e_frame->ethertype != eth_data.ethertype)
goto fail_frame;
+ if (length > ETH_MTU)
+ goto fail_frame;
+
deid = ntohs(e_frame->eid);
- if (deid == MGMT_EID) {
#elif defined (BUILD_ETH_LLC)
if (length > 0x05FF) /* DIX */
goto fail_frame;
+ if (length < LLC_HEADER_SIZE || length > ETH_MTU)
+ goto fail_frame;
+
length -= LLC_HEADER_SIZE;
dsap = reverse_bits(e_frame->dsap);
ssap = reverse_bits(e_frame->ssap);
+#endif
+
+ if (eth_len < ETH_HEADER_TOT_SIZE + (size_t) length)
+ goto fail_frame;
+
+#if defined(BUILD_ETH_DIX)
+ mem_hash(HASH_CRC8, &hcs,
+ (uint8_t *) &e_frame->eid,
+ DIX_EID_SIZE + DIX_LENGTH_SIZE);
+#elif defined(BUILD_ETH_LLC)
+ mem_hash(HASH_CRC8, &hcs,
+ (uint8_t *) &e_frame->dsap,
+ LLC_FIELDS_SIZE);
+#endif
+ if (hcs != e_frame->hcs)
+ goto fail_frame;
+#if defined(BUILD_ETH_DIX)
+ if (deid == MGMT_EID) {
+#elif defined (BUILD_ETH_LLC)
if (ssap == MGMT_SAP && dsap == MGMT_SAP) {
#endif
ipcp_spb_release(spb); /* No need for the N+1 buffer. */
@@ -941,13 +1393,13 @@ static void * eth_ipcp_packet_reader(void * o)
if (length > MGMT_FRAME_SIZE) {
log_warn("Management frame size %u exceeds %u.",
length, MGMT_FRAME_SIZE);
- goto fail_frame;
+ continue;
}
frame = malloc(sizeof(*frame));
if (frame == NULL) {
log_err("Failed to allocate frame.");
- goto fail_frame;
+ continue;
}
memcpy(frame->buf, &e_frame->payload, length);
@@ -958,6 +1410,8 @@ static void * eth_ipcp_packet_reader(void * o)
list_add(&frame->next, &eth_data.mgmt_frames);
pthread_cond_signal(&eth_data.mgmt_cond);
pthread_mutex_unlock(&eth_data.mgmt_lock);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_rcv, 1);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_rcv, 1);
} else {
pthread_rwlock_rdlock(&eth_data.flows_lock);
@@ -968,6 +1422,7 @@ static void * eth_ipcp_packet_reader(void * o)
#endif
if (fd < 0) {
pthread_rwlock_unlock(&eth_data.flows_lock);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1);
goto fail_frame;
}
@@ -976,13 +1431,18 @@ static void * eth_ipcp_packet_reader(void * o)
|| memcmp(eth_data.fd_to_ef[fd].r_addr,
e_frame->src_hwaddr, MAC_SIZE)) {
pthread_rwlock_unlock(&eth_data.flows_lock);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1);
goto fail_frame;
}
#endif
+ FETCH_ADD_RELAXED(&eth_data.fd_to_ef[fd].stat.p_rcv, 1);
+ FETCH_ADD_RELAXED(&eth_data.fd_to_ef[fd].stat.b_rcv,
+ length);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_rcv, 1);
pthread_rwlock_unlock(&eth_data.flows_lock);
#ifndef HAVE_NETMAP
- ssm_pk_buff_head_release(spb, ETH_HEADER_TOT_SIZE);
+ ssm_pk_buff_pop(spb, ETH_HEADER_TOT_SIZE);
ssm_pk_buff_truncate(spb, length);
#else
if (ipcp_spb_reserve(&spb, length))
@@ -991,8 +1451,13 @@ static void * eth_ipcp_packet_reader(void * o)
buf = ssm_pk_buff_head(spb);
memcpy(buf, &e_frame->payload, length);
#endif
- if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0)
+ if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0) {
ipcp_spb_release(spb);
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.p_dlv_f,
+ 1);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_dlv_f, 1);
+ }
continue;
fail_frame:
@@ -1048,10 +1513,11 @@ static void * eth_ipcp_packet_writer(void * o)
len = ssm_pk_buff_len(spb);
- if (ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE)
+ if (ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE)
== NULL) {
log_dbg("Failed to allocate header.");
ipcp_spb_release(spb);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1);
continue;
}
@@ -1075,8 +1541,20 @@ static void * eth_ipcp_packet_writer(void * o)
dsap, ssap,
#endif
ssm_pk_buff_head(spb),
- len))
+ len)) {
log_dbg("Failed to send frame.");
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.p_snd_f,
+ 1);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_snd_f, 1);
+ } else {
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.p_snd,
+ 1);
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.b_snd,
+ len);
+ }
ipcp_spb_release(spb);
}
}
@@ -1424,12 +1902,14 @@ static int eth_init_bpf(struct ifreq * ifr)
return -1;
}
#elif defined(HAVE_RAW_SOCKETS)
+#define SOCKOPT()
static int eth_init_raw_socket(struct ifreq * ifr)
{
int idx;
- int flags;
+ int sndbuf;
+ int rcvbuf;
#if defined(IPCP_ETH_QDISC_BYPASS)
- int qdisc_bypass = 1;
+ int qdisc_bypass = 1;
#endif /* ENABLE_QDISC_BYPASS */
idx = if_nametoindex(ifr->ifr_name);
@@ -1437,6 +1917,7 @@ static int eth_init_raw_socket(struct ifreq * ifr)
log_err("Failed to retrieve interface index.");
return -1;
}
+
memset(&(eth_data.device), 0, sizeof(eth_data.device));
eth_data.device.sll_ifindex = idx;
eth_data.device.sll_family = AF_PACKET;
@@ -1453,17 +1934,6 @@ static int eth_init_raw_socket(struct ifreq * ifr)
goto fail_socket;
}
- flags = fcntl(eth_data.s_fd, F_GETFL, 0);
- if (flags < 0) {
- log_err("Failed to get flags.");
- goto fail_device;
- }
-
- if (fcntl(eth_data.s_fd, F_SETFL, flags | O_NONBLOCK)) {
- log_err("Failed to set socket non-blocking.");
- goto fail_device;
- }
-
#if defined(IPCP_ETH_QDISC_BYPASS)
if (setsockopt(eth_data.s_fd, SOL_PACKET, PACKET_QDISC_BYPASS,
&qdisc_bypass, sizeof(qdisc_bypass))) {
@@ -1471,6 +1941,18 @@ static int eth_init_raw_socket(struct ifreq * ifr)
}
#endif
+ sndbuf = IPCP_ETH_SNDBUF;
+ if (sndbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_SNDBUF,
+ &sndbuf, sizeof(sndbuf))) {
+ log_info("Failed to set SO_SNDBUF to %d.", sndbuf);
+ }
+
+ rcvbuf = IPCP_ETH_RCVBUF;
+ if (rcvbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_RCVBUF,
+ &rcvbuf, sizeof(rcvbuf))) {
+ log_info("Failed to set SO_RCVBUF to %d.", rcvbuf);
+ }
+
if (bind(eth_data.s_fd, (struct sockaddr *) &eth_data.device,
sizeof(eth_data.device)) < 0) {
log_err("Failed to bind socket to interface.");
@@ -1543,6 +2025,12 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf)
return -1;
}
#endif /* HAVE_NETMAP */
+#ifdef IPCP_ETH_FLOW_STATS
+ if (rib_reg(ETH_RIB_PATH, &eth_r_ops)) {
+ log_err("Failed to register RIB.");
+ goto fail_rib_reg;
+ }
+#endif
#if defined(__linux__)
if (pthread_create(&eth_data.if_monitor, NULL,
eth_ipcp_if_monitor, NULL)) {
@@ -1606,6 +2094,10 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf)
#if defined(__linux__)
fail_monitor:
#endif
+#ifdef IPCP_ETH_FLOW_STATS
+ rib_unreg(ETH_RIB_PATH);
+ fail_rib_reg:
+#endif
#if defined(HAVE_NETMAP)
nm_close(eth_data.nmd);
#elif defined(HAVE_BPF)
@@ -1637,12 +2129,14 @@ static int eth_ipcp_unreg(const uint8_t * hash)
static int eth_ipcp_query(const uint8_t * hash)
{
uint8_t r_addr[MAC_SIZE];
- struct timespec timeout = TIMESPEC_INIT_MS(NAME_QUERY_TIMEO);
+ struct timespec timeout;
struct dir_query * query;
int ret;
+ int attempt;
uint8_t * buf;
struct mgmt_msg * msg;
size_t len;
+ long per_ms;
if (shim_data_dir_has(eth_data.shim_data, hash))
return 0;
@@ -1662,32 +2156,46 @@ static int eth_ipcp_query(const uint8_t * hash)
memset(r_addr, 0xff, MAC_SIZE);
- query = shim_data_dir_query_create(eth_data.shim_data, hash);
- if (query == NULL) {
- free(buf);
- return -1;
- }
+ per_ms = NAME_QUERY_TIMEO / (NAME_QUERY_RETRIES + 1);
- if (eth_ipcp_send_frame(r_addr,
+ ret = -1;
+ for (attempt = 0; attempt <= NAME_QUERY_RETRIES; ++attempt) {
+ query = shim_data_dir_query_create(eth_data.shim_data, hash);
+ if (query == NULL) {
+ ret = -1;
+ break;
+ }
+
+ if (eth_ipcp_send_frame(r_addr,
#if defined(BUILD_ETH_DIX)
- MGMT_EID,
+ MGMT_EID,
#elif defined(BUILD_ETH_LLC)
- reverse_bits(MGMT_SAP),
- reverse_bits(MGMT_SAP),
+ reverse_bits(MGMT_SAP),
+ reverse_bits(MGMT_SAP),
#endif
- buf, len)) {
- log_err("Failed to send management frame.");
+ buf, len)) {
+ log_err("Failed to send management frame.");
+ shim_data_dir_query_destroy(eth_data.shim_data,
+ query);
+ ret = -1;
+ break;
+ }
+
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
+ timeout.tv_sec = per_ms / 1000;
+ timeout.tv_nsec = (per_ms % 1000) * 1000000L;
+
+ ret = shim_data_dir_query_wait(query, &timeout);
+
shim_data_dir_query_destroy(eth_data.shim_data, query);
- free(buf);
- return -1;
+
+ if (ret != -ETIMEDOUT)
+ break;
}
free(buf);
- ret = shim_data_dir_query_wait(query, &timeout);
-
- shim_data_dir_query_destroy(eth_data.shim_data, query);
-
return ret;
}
@@ -1748,6 +2256,14 @@ static int eth_ipcp_flow_alloc(int fd,
}
fset_add(eth_data.np1_flows, fd);
+#ifdef IPCP_ETH_FLOW_STATS
+ pthread_rwlock_wrlock(&eth_data.flows_lock);
+ memset(&eth_data.fd_to_ef[fd].stat, 0,
+ sizeof(eth_data.fd_to_ef[fd].stat));
+ eth_data.fd_to_ef[fd].stat.stamp = time(NULL);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_flows, 1);
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+#endif
#if defined(BUILD_ETH_LLC)
log_dbg("Assigned SAP %d for fd %d.", ssap, fd);
#endif
@@ -1808,6 +2324,14 @@ static int eth_ipcp_flow_alloc_resp(int fd,
}
fset_add(eth_data.np1_flows, fd);
+#ifdef IPCP_ETH_FLOW_STATS
+ pthread_rwlock_wrlock(&eth_data.flows_lock);
+ memset(&eth_data.fd_to_ef[fd].stat, 0,
+ sizeof(eth_data.fd_to_ef[fd].stat));
+ eth_data.fd_to_ef[fd].stat.stamp = time(NULL);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_flows, 1);
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+#endif
#if defined(BUILD_ETH_LLC)
log_dbg("Assigned SAP %d for fd %d.", ssap, fd);
#endif
@@ -1836,6 +2360,12 @@ static int eth_ipcp_flow_dealloc(int fd)
#endif
memset(&eth_data.fd_to_ef[fd].r_addr, 0, MAC_SIZE);
+#ifdef IPCP_ETH_FLOW_STATS
+ memset(&eth_data.fd_to_ef[fd].stat, 0,
+ sizeof(eth_data.fd_to_ef[fd].stat));
+ FETCH_SUB_RELAXED(&eth_data.stat.n_flows, 1);
+#endif
+
pthread_rwlock_unlock(&eth_data.flows_lock);
ipcp_flow_dealloc(fd);
@@ -1854,7 +2384,8 @@ static struct ipcp_ops eth_ops = {
.ipcp_flow_alloc = eth_ipcp_flow_alloc,
.ipcp_flow_join = NULL,
.ipcp_flow_alloc_resp = eth_ipcp_flow_alloc_resp,
- .ipcp_flow_dealloc = eth_ipcp_flow_dealloc
+ .ipcp_flow_dealloc = eth_ipcp_flow_dealloc,
+ .ipcp_flow_update = eth_ipcp_flow_update
};
int main(int argc,
@@ -1902,6 +2433,9 @@ int main(int argc,
#ifdef __linux__
pthread_join(eth_data.if_monitor, NULL);
#endif
+#ifdef IPCP_ETH_FLOW_STATS
+ rib_unreg(ETH_RIB_PATH);
+#endif
}
ipcp_stop();
diff --git a/src/ipcpd/ipcp.c b/src/ipcpd/ipcp.c
index 5ad2401f..dcee4b9c 100644
--- a/src/ipcpd/ipcp.c
+++ b/src/ipcpd/ipcp.c
@@ -363,6 +363,7 @@ static void * acceptloop(void * o)
int ipcp_wait_flow_req_arr(const uint8_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct timespec ts = TIMESPEC_INIT_MS(ALLOC_TIMEOUT);
@@ -392,7 +393,7 @@ int ipcp_wait_flow_req_arr(const uint8_t * dst,
assert(ipcpd.alloc_id == -1);
- fd = ipcp_flow_req_arr(&hash, qs, mpl, data);
+ fd = ipcp_flow_req_arr(&hash, qs, mpl, mtu, data);
if (fd < 0) {
pthread_mutex_unlock(&ipcpd.alloc_lock);
log_err("Failed to get fd for flow.");
@@ -819,6 +820,33 @@ static void do_flow_dealloc(int flow_id,
log_info("Finished deallocating flow %d.", flow_id);
}
+/*
+ * Handle a flow update request for flow_id.
+ *
+ * Looks up the fd for flow_id with np1_flow_fd() and passes the update
+ * payload to the IPCP's ipcp_flow_update operation. The outcome is
+ * stored in ret_msg->result:
+ *   -ENOTSUP     the IPCP provides no ipcp_flow_update operation,
+ *   -EIPCPSTATE  the IPCP is not in state IPCP_OPERATIONAL,
+ *   -1           no fd is known for flow_id,
+ *   otherwise    the value returned by ipcp_flow_update().
+ */
+static void do_flow_update(int              flow_id,
+                           const buffer_t * data,
+                           ipcp_msg_t *     ret_msg)
+{
+        int fd;
+
+        if (ipcpd.ops->ipcp_flow_update == NULL) {
+                log_err("Failed to update flow: operation unsupported.");
+                ret_msg->result = -ENOTSUP;
+        } else if (ipcp_get_state() != IPCP_OPERATIONAL) {
+                ret_msg->result = -EIPCPSTATE;
+        } else {
+                fd = np1_flow_fd(flow_id);
+                if (fd < 0) {
+                        log_warn("Flow update for unknown flow_id %d.",
+                                 flow_id);
+                        ret_msg->result = -1;
+                } else {
+                        ret_msg->result =
+                                ipcpd.ops->ipcp_flow_update(fd, data);
+                }
+        }
+}
+
static void * mainloop(void * o)
{
int sfd;
@@ -917,6 +945,13 @@ static void * mainloop(void * o)
case IPCP_MSG_CODE__IPCP_FLOW_DEALLOC:
do_flow_dealloc(msg->flow_id, msg->timeo_sec, &ret_msg);
break;
+ case IPCP_MSG_CODE__IPCP_FLOW_UPDATE:
+ assert(msg->pk.len > 0 ? msg->pk.data != NULL
+ : msg->pk.data == NULL);
+ data.len = msg->pk.len;
+ data.data = msg->pk.data;
+ do_flow_update(msg->flow_id, &data, &ret_msg);
+ break;
default:
ret_msg.result = -1;
log_err("Unknown message code: %d.", msg->code);
diff --git a/src/ipcpd/ipcp.h b/src/ipcpd/ipcp.h
index 26a780a3..210157ec 100644
--- a/src/ipcpd/ipcp.h
+++ b/src/ipcpd/ipcp.h
@@ -68,6 +68,9 @@ struct ipcp_ops {
const buffer_t * data);
int (* ipcp_flow_dealloc)(int fd);
+
+ int (* ipcp_flow_update)(int fd,
+ const buffer_t * data);
};
int ipcp_init(int argc,
@@ -98,6 +101,7 @@ enum ipcp_state ipcp_get_state(void);
int ipcp_wait_flow_req_arr(const uint8_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data);
int ipcp_wait_flow_resp(const int fd);
diff --git a/src/ipcpd/local/main.c b/src/ipcpd/local/main.c
index 2c867317..c0aeb51e 100644
--- a/src/ipcpd/local/main.c
+++ b/src/ipcpd/local/main.c
@@ -38,6 +38,7 @@
#include <ouroboros/ipcp.h>
#include <ouroboros/ipcp-dev.h>
#include <ouroboros/local-dev.h>
+#include <ouroboros/np1_flow.h>
#include "ipcp.h"
#include "np1.h"
@@ -203,7 +204,8 @@ static int local_ipcp_flow_alloc(int fd,
HASH_VAL32(dst), fd);
assert(dst);
- out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL, data);
+ out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL,
+ IPCP_LOCAL_MTU, data);
if (out_fd < 0) {
log_dbg("Flow allocation failed: %d", out_fd);
return -1;
@@ -255,14 +257,16 @@ static int local_ipcp_flow_alloc_resp(int fd,
}
if (response < 0) {
- ipcp_flow_alloc_reply(out_fd, response, mpl, data);
+ ipcp_flow_alloc_reply(out_fd, response, mpl,
+ IPCP_LOCAL_MTU, data);
log_info("Flow allocation rejected, fds (%d, %d).", out_fd, fd);
return 0;
}
fset_add(local_data.flows, fd);
- if (ipcp_flow_alloc_reply(out_fd, response, mpl, data) < 0) {
+ if (ipcp_flow_alloc_reply(out_fd, response, mpl,
+ IPCP_LOCAL_MTU, data) < 0) {
log_err("Failed to reply to allocation");
fset_del(local_data.flows, fd);
return -1;
@@ -294,6 +298,38 @@ static int local_ipcp_flow_dealloc(int fd)
return 0;
}
+/*
+ * Loopback relay: deliver the update back to the peer end (same IRMd).
+ *
+ * The peer fd is read from local_data.in_out[fd], translated to a
+ * flow_id with np1_flow_id() and the payload is handed to
+ * ipcp_flow_update_arr(). Returns 0 on success and -1 when fd has no
+ * peer, the peer has no flow_id, or the relay call fails.
+ */
+static int local_ipcp_flow_update(int              fd,
+                                  const buffer_t * data)
+{
+        int peer_fd;
+        int peer_id;
+
+        /* The lock is held for the table lookup only. */
+        pthread_rwlock_rdlock(&local_data.lock);
+        peer_fd = local_data.in_out[fd];
+        pthread_rwlock_unlock(&local_data.lock);
+
+        if (peer_fd == -1) {
+                log_err("Flow update on fd %d with no peer.", fd);
+                return -1;
+        }
+
+        peer_id = np1_flow_id(peer_fd);
+        if (peer_id < 0) {
+                log_err("No flow_id for peer fd %d.", peer_fd);
+                return -1;
+        }
+
+        if (ipcp_flow_update_arr(peer_id, data) >= 0)
+                return 0;
+
+        log_err("Failed to relay flow update to fd %d.", peer_fd);
+
+        return -1;
+}
+
static struct ipcp_ops local_ops = {
.ipcp_bootstrap = local_ipcp_bootstrap,
.ipcp_enroll = NULL,
@@ -305,7 +341,8 @@ static struct ipcp_ops local_ops = {
.ipcp_flow_alloc = local_ipcp_flow_alloc,
.ipcp_flow_join = NULL,
.ipcp_flow_alloc_resp = local_ipcp_flow_alloc_resp,
- .ipcp_flow_dealloc = local_ipcp_flow_dealloc
+ .ipcp_flow_dealloc = local_ipcp_flow_dealloc,
+ .ipcp_flow_update = local_ipcp_flow_update
};
int main(int argc,
diff --git a/src/ipcpd/udp/udp.c b/src/ipcpd/udp/udp.c
index 452bbc1a..db57e2f4 100644
--- a/src/ipcpd/udp/udp.c
+++ b/src/ipcpd/udp/udp.c
@@ -28,6 +28,8 @@
#include <ouroboros/list.h>
#include <ouroboros/utils.h>
#include <ouroboros/dev.h>
+#include <ouroboros/ipcp-dev.h>
+#include <ouroboros/np1_flow.h>
#include <ouroboros/fqueue.h>
#include <ouroboros/errno.h>
#include <ouroboros/logs.h>
@@ -47,9 +49,14 @@
#include <stdlib.h>
#include <sys/wait.h>
#include <fcntl.h>
+#include <unistd.h>
+#if defined(__linux__)
+#include <netinet/ip.h>
+#endif
#define FLOW_REQ 1
#define FLOW_REPLY 2
+#define FLOW_IRM_UPDATE 3
#define OUR_HEADER_LEN sizeof(uint32_t) /* adds eid */
@@ -87,7 +94,7 @@ struct mgmt_msg {
uint8_t code;
/* QoS parameters from spec */
uint8_t availability;
- uint8_t in_order;
+ uint8_t service;
} __attribute__((packed));
struct mgmt_frame {
@@ -130,6 +137,53 @@ static const char * __inet_ntop(const struct __ADDR * addr,
return inet_ntop(__AF, addr, buf, __ADDRSTRLEN);
}
+#if defined(BUILD_IPCP_UDP4)
+#define UDP_MTU_FALLBACK IPCP_UDP4_MTU
+#define UDP_IP_OVERHEAD 28U /* IPv4 + UDP */
+#else
+#define UDP_MTU_FALLBACK IPCP_UDP6_MTU
+#define UDP_IP_OVERHEAD 48U /* IPv6 + UDP */
+#endif
+
+/*
+ * Return the UDP payload size usable towards saddr.
+ *
+ * On Linux builds where IP_MTU or IPV6_MTU is defined, a temporary
+ * datagram socket is connected to saddr and the MTU is read back with
+ * getsockopt(); UDP_IP_OVERHEAD is subtracted from it. Whenever the
+ * socket, connect or getsockopt call fails, the reported MTU does not
+ * exceed UDP_IP_OVERHEAD, or the query is not compiled in, the result
+ * is UDP_MTU_FALLBACK.
+ */
+static uint32_t udp_query_mtu(const struct __SOCKADDR * saddr)
+{
+        uint32_t  payload = UDP_MTU_FALLBACK;
+#if defined(__linux__) && (defined(IP_MTU) || defined(IPV6_MTU))
+        int       path_mtu = 0;
+        socklen_t optlen   = sizeof(path_mtu);
+        int       sk;
+
+        sk = socket(__AF, SOCK_DGRAM, IPPROTO_UDP);
+        if (sk < 0)
+                return payload;
+
+        /* The MTU option is queried on a connected socket. */
+        if (connect(sk, (const struct sockaddr *) saddr,
+                    sizeof(*saddr)) < 0)
+                goto out;
+
+#if defined(BUILD_IPCP_UDP4) && defined(IP_MTU)
+        if (getsockopt(sk, IPPROTO_IP, IP_MTU, &path_mtu, &optlen) < 0)
+                goto out;
+#elif defined(BUILD_IPCP_UDP6) && defined(IPV6_MTU)
+        if (getsockopt(sk, IPPROTO_IPV6, IPV6_MTU, &path_mtu, &optlen) < 0)
+                goto out;
+#else
+        goto out;
+#endif
+        /* Keep the fallback unless the MTU leaves room for payload. */
+        if (path_mtu > (int) UDP_IP_OVERHEAD)
+                payload = (uint32_t) path_mtu - UDP_IP_OVERHEAD;
+
+ out:
+        close(sk);
+#else
+        (void) saddr;
+#endif
+        return payload;
+}
+
static int udp_data_init(void)
{
int i;
@@ -220,7 +274,7 @@ static int udp_ipcp_port_alloc(const struct __SOCKADDR * r_saddr,
msg->availability = qs.availability;
msg->loss = hton32(qs.loss);
msg->ber = hton32(qs.ber);
- msg->in_order = qs.in_order;
+ msg->service = qs.service;
msg->max_gap = hton32(qs.max_gap);
msg->timeout = hton32(qs.timeout);
@@ -277,6 +331,48 @@ static int udp_ipcp_port_alloc_resp(const struct __SOCKADDR * r_saddr,
return 0;
}
+/*
+ * Send a flow update for fd to the remote end of the flow.
+ *
+ * Builds a management message with code FLOW_IRM_UPDATE followed by
+ * the data->len bytes of payload and sends it with sendto() on
+ * udp_data.s_fd to the remote address stored for fd. The message's
+ * s_eid carries the flow's stored d_eid and its d_eid carries fd.
+ * Returns 0 on success, -1 if allocation or sending fails.
+ */
+static int udp_ipcp_flow_update(int              fd,
+                                const buffer_t * data)
+{
+        struct __SOCKADDR r_saddr;
+        struct mgmt_msg * msg;
+        size_t            msg_len;
+        uint32_t          d_eid;
+        int               ret = 0;
+
+        msg_len = sizeof(*msg) + data->len;
+
+        /* Zeroed allocation: fields not set below stay 0. */
+        msg = calloc(1, msg_len);
+        if (msg == NULL)
+                return -1;
+
+        /* Copy the flow's addressing info out under the read lock. */
+        pthread_rwlock_rdlock(&udp_data.flows_lock);
+        r_saddr = udp_data.fd_to_uf[fd].r_saddr;
+        d_eid   = (uint32_t) udp_data.fd_to_uf[fd].d_eid;
+        pthread_rwlock_unlock(&udp_data.flows_lock);
+
+        msg->eid   = hton32(MGMT_EID);
+        msg->code  = FLOW_IRM_UPDATE;
+        msg->s_eid = hton32(d_eid);
+        msg->d_eid = hton32((uint32_t) fd);
+
+        if (data->len > 0)
+                memcpy(msg + 1, data->data, data->len);
+
+        if (sendto(udp_data.s_fd, msg, msg_len,
+                   SENDTO_FLAGS,
+                   (const struct sockaddr *) &r_saddr,
+                   sizeof(r_saddr)) < 0) {
+                /* Log before free() so errno is still sendto's. */
+                log_err("Failed to send flow update: %s.", strerror(errno));
+                ret = -1;
+        }
+
+        free(msg);
+
+        return ret;
+}
+
static int udp_ipcp_port_req(struct __SOCKADDR * c_saddr,
int d_eid,
const uint8_t * dst,
@@ -285,7 +381,8 @@ static int udp_ipcp_port_req(struct __SOCKADDR * c_saddr,
{
int fd;
- fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL, data);
+ fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL,
+ udp_query_mtu(c_saddr), data);
if (fd < 0) {
log_err("Could not get new flow from IRMd.");
return -1;
@@ -332,7 +429,8 @@ static int udp_ipcp_port_alloc_reply(const struct __SOCKADDR * saddr,
pthread_rwlock_unlock(&udp_data.flows_lock);
- if (ipcp_flow_alloc_reply(s_eid, response, mpl, data) < 0) {
+ if (ipcp_flow_alloc_reply(s_eid, response, mpl,
+ udp_query_mtu(saddr), data) < 0) {
log_err("Failed to reply to flow allocation.");
return -1;
}
@@ -343,6 +441,37 @@ static int udp_ipcp_port_alloc_reply(const struct __SOCKADDR * saddr,
return 0;
}
+/*
+ * Handle a FLOW_IRM_UPDATE management frame received from the wire.
+ *
+ * buf holds a struct mgmt_msg followed by the update payload; len is
+ * the total frame length. The message's s_eid is taken as the local
+ * fd, mapped to a flow_id with np1_flow_id(), and the payload is
+ * passed to ipcp_flow_update_arr(). Returns 0 on success, -1 if the
+ * frame is shorter than the header, the eid is out of range, no
+ * flow_id is found, or the relay call fails.
+ */
+static int udp_ipcp_flow_update_arr(const uint8_t * buf,
+                                    size_t          len)
+{
+        struct mgmt_msg * msg;
+        buffer_t          data;
+        int               fd;
+        int               flow_id;
+
+        /*
+         * Defense in depth: this is untrusted wire input and
+         * len - sizeof(*msg) below would wrap for a short frame.
+         */
+        if (len < sizeof(*msg))
+                return -1;
+
+        msg = (struct mgmt_msg *) buf;
+
+        fd = (int) ntoh32(msg->s_eid);
+        if (fd < 0 || fd >= SYS_MAX_FLOWS) {
+                log_err("Flow update for invalid eid %d.", fd);
+                return -1;
+        }
+
+        /* The payload is whatever follows the fixed header. */
+        data.len  = len - sizeof(*msg);
+        data.data = (uint8_t *) buf + sizeof(*msg);
+
+        flow_id = np1_flow_id(fd);
+        if (flow_id < 0)
+                return -1;
+
+        if (ipcp_flow_update_arr(flow_id, &data) < 0) {
+                log_err("Failed to relay flow update on fd %d.", fd);
+                return -1;
+        }
+
+        return 0;
+}
+
static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
const uint8_t * buf,
size_t len)
@@ -352,13 +481,18 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
qosspec_t qs;
buffer_t data;
+ /* Defence against malformed/corrupted wire input. */
+ if (len < sizeof(*msg))
+ return -1;
+
msg = (struct mgmt_msg *) buf;
switch (msg->code) {
case FLOW_REQ:
msg_len = sizeof(*msg) + ipcp_dir_hash_len();
- assert(len >= msg_len);
+ if (len < msg_len)
+ return -1;
data.len = len - msg_len;
data.data = (uint8_t *) buf + msg_len;
@@ -369,7 +503,7 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
qs.availability = msg->availability;
qs.loss = ntoh32(msg->loss);
qs.ber = ntoh32(msg->ber);
- qs.in_order = msg->in_order;
+ qs.service = msg->service;
qs.max_gap = ntoh32(msg->max_gap);
qs.timeout = ntoh32(msg->timeout);
@@ -377,8 +511,6 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
(uint8_t *) (msg + 1), qs,
&data);
case FLOW_REPLY:
- assert(len >= sizeof(*msg));
-
data.len = len - sizeof(*msg);
data.data = (uint8_t *) buf + sizeof(*msg);
@@ -387,6 +519,8 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
ntoh32(msg->d_eid),
ntoh32(msg->response),
&data);
+ case FLOW_IRM_UPDATE:
+ return udp_ipcp_flow_update_arr(buf, len);
default:
log_err("Unknown message received %d.", msg->code);
return -1;
@@ -549,7 +683,7 @@ static void * udp_ipcp_packet_writer(void * o)
continue;
}
- buf = ssm_pk_buff_head_alloc(spb, OUR_HEADER_LEN);
+ buf = ssm_pk_buff_push(spb, OUR_HEADER_LEN);
if (buf == NULL) {
log_dbg("Failed to allocate header.");
ipcp_spb_release(spb);
@@ -1140,7 +1274,8 @@ static struct ipcp_ops udp_ops = {
.ipcp_flow_alloc = udp_ipcp_flow_alloc,
.ipcp_flow_join = NULL,
.ipcp_flow_alloc_resp = udp_ipcp_flow_alloc_resp,
- .ipcp_flow_dealloc = udp_ipcp_flow_dealloc
+ .ipcp_flow_dealloc = udp_ipcp_flow_dealloc,
+ .ipcp_flow_update = udp_ipcp_flow_update
};
int main(int argc,
diff --git a/src/ipcpd/unicast/dt.c b/src/ipcpd/unicast/dt.c
index 252477f4..e89cb17e 100644
--- a/src/ipcpd/unicast/dt.c
+++ b/src/ipcpd/unicast/dt.c
@@ -31,6 +31,7 @@
#define DT "dt"
#define OUROBOROS_PREFIX DT
+#include <ouroboros/atomics.h>
#include <ouroboros/bitmap.h>
#include <ouroboros/errno.h>
#include <ouroboros/logs.h>
@@ -139,7 +140,7 @@ static void dt_pci_shrink(struct ssm_pk_buff * spb)
{
assert(spb);
- ssm_pk_buff_head_release(spb, dt_pci_info.head_size);
+ ssm_pk_buff_pop(spb, dt_pci_info.head_size);
}
struct {
@@ -168,22 +169,33 @@ struct {
size_t f_nhp_pkt[QOS_CUBE_MAX];
size_t f_nhp_bytes[QOS_CUBE_MAX];
pthread_mutex_t lock;
- } stat[PROG_MAX_FLOWS];
+ } stat[PROC_MAX_FLOWS];
size_t n_flows;
#endif
struct bmp * res_fds;
- struct comp_info comps[PROG_RES_FDS];
+ struct comp_info comps[PROC_RES_FDS];
pthread_rwlock_t lock;
pthread_t listener;
} dt;
+/*
+ * Flow stats are lock-free relaxed atomics on the data path; the per-flow
+ * lock still guards the stamp/addr/n_flows lifecycle (see stat_used).
+ */
+#ifdef IPCP_FLOW_STATS
+#define dt_stat_inc(idx, name, qc, len) \
+ do { \
+ FETCH_ADD_RELAXED(&dt.stat[idx].name ## _pkt[qc], 1); \
+ FETCH_ADD_RELAXED(&dt.stat[idx].name ## _bytes[qc], (len)); \
+ } while (0)
+#define dt_stat_load(idx, field, qc) LOAD_RELAXED(&dt.stat[idx].field[qc])
+
static int dt_rib_read(const char * path,
char * buf,
size_t len)
{
-#ifdef IPCP_FLOW_STATS
int fd;
int i;
char str[QOS_BLOCK_LEN + 1];
@@ -220,7 +232,7 @@ static int dt_rib_read(const char * path,
tm = gmtime(&dt.stat[fd].stamp);
strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm);
- if (fd >= PROG_RES_FDS) {
+ if (fd >= PROC_RES_FDS) {
fccntl(fd, FLOWGRXQLEN, &rxqlen);
fccntl(fd, FLOWGTXQLEN, &txqlen);
}
@@ -249,20 +261,20 @@ static int dt_rib_read(const char * path,
" failed nhop (packets): %20zu\n"
" failed nhop (bytes): %20zu\n",
i,
- dt.stat[fd].snd_pkt[i],
- dt.stat[fd].snd_bytes[i],
- dt.stat[fd].rcv_pkt[i],
- dt.stat[fd].rcv_bytes[i],
- dt.stat[fd].lcl_w_pkt[i],
- dt.stat[fd].lcl_w_bytes[i],
- dt.stat[fd].lcl_r_pkt[i],
- dt.stat[fd].lcl_r_bytes[i],
- dt.stat[fd].r_drp_pkt[i],
- dt.stat[fd].r_drp_bytes[i],
- dt.stat[fd].w_drp_pkt[i],
- dt.stat[fd].w_drp_bytes[i],
- dt.stat[fd].f_nhp_pkt[i],
- dt.stat[fd].f_nhp_bytes[i]
+ dt_stat_load(fd, snd_pkt, i),
+ dt_stat_load(fd, snd_bytes, i),
+ dt_stat_load(fd, rcv_pkt, i),
+ dt_stat_load(fd, rcv_bytes, i),
+ dt_stat_load(fd, lcl_w_pkt, i),
+ dt_stat_load(fd, lcl_w_bytes, i),
+ dt_stat_load(fd, lcl_r_pkt, i),
+ dt_stat_load(fd, lcl_r_bytes, i),
+ dt_stat_load(fd, r_drp_pkt, i),
+ dt_stat_load(fd, r_drp_bytes, i),
+ dt_stat_load(fd, w_drp_pkt, i),
+ dt_stat_load(fd, w_drp_bytes, i),
+ dt_stat_load(fd, f_nhp_pkt, i),
+ dt_stat_load(fd, f_nhp_bytes, i)
);
strcat(buf, str);
}
@@ -270,17 +282,10 @@ static int dt_rib_read(const char * path,
pthread_mutex_unlock(&dt.stat[fd].lock);
return RIB_FILE_STRLEN;
-#else
- (void) path;
- (void) buf;
- (void) len;
- return 0;
-#endif
}
static int dt_rib_readdir(char *** buf)
{
-#ifdef IPCP_FLOW_STATS
char entry[RIB_PATH_LEN + 1];
size_t i;
int idx = 0;
@@ -296,7 +301,7 @@ static int dt_rib_readdir(char *** buf)
if (*buf == NULL)
goto fail_entries;
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
pthread_mutex_lock(&dt.stat[i].lock);
if (dt.stat[i].stamp == 0) {
@@ -327,16 +332,11 @@ static int dt_rib_readdir(char *** buf)
fail_entries:
pthread_rwlock_unlock(&dt.lock);
return -ENOMEM;
-#else
- (void) buf;
- return 0;
-#endif
}
static int dt_rib_getattr(const char * path,
struct rib_attr * attr)
{
-#ifdef IPCP_FLOW_STATS
int fd;
char * entry;
@@ -356,10 +356,7 @@ static int dt_rib_getattr(const char * path,
}
pthread_mutex_unlock(&dt.stat[fd].lock);
-#else
- (void) path;
- (void) attr;
-#endif
+
return 0;
}
@@ -369,7 +366,12 @@ static struct rib_ops r_ops = {
.getattr = dt_rib_getattr
};
-#ifdef IPCP_FLOW_STATS
+/*
+ * Hold dt.lock + per-stat together: dt_rib_readdir samples n_flows
+ * under rdlock and walks stamps under per-stat; updates must be
+ * atomic w.r.t. that snapshot or the malloc(n_flows) buffer can
+ * overflow.
+ */
static void stat_used(int fd,
uint64_t addr)
{
@@ -377,6 +379,7 @@ static void stat_used(int fd,
clock_gettime(CLOCK_REALTIME_COARSE, &now);
+ pthread_rwlock_wrlock(&dt.lock);
pthread_mutex_lock(&dt.stat[fd].lock);
memset(&dt.stat[fd], 0, sizeof(dt.stat[fd]));
@@ -384,14 +387,13 @@ static void stat_used(int fd,
dt.stat[fd].stamp = (addr != INVALID_ADDR) ? now.tv_sec : 0;
dt.stat[fd].addr = addr;
- pthread_mutex_unlock(&dt.stat[fd].lock);
-
- pthread_rwlock_wrlock(&dt.lock);
-
(addr != INVALID_ADDR) ? ++dt.n_flows : --dt.n_flows;
+ pthread_mutex_unlock(&dt.stat[fd].lock);
pthread_rwlock_unlock(&dt.lock);
}
+#else
+#define dt_stat_inc(idx, name, qc, len) ((void) 0)
#endif
static void handle_event(void * self,
@@ -440,15 +442,10 @@ static void packet_handler(int fd,
len = ssm_pk_buff_len(spb);
#ifndef IPCP_FLOW_STATS
- (void) fd;
-#else
- pthread_mutex_lock(&dt.stat[fd].lock);
-
- ++dt.stat[fd].rcv_pkt[qc];
- dt.stat[fd].rcv_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[fd].lock);
+ (void) fd;
#endif
+ dt_stat_inc(fd, rcv, qc, len);
+
memset(&dt_pci, 0, sizeof(dt_pci));
head = ssm_pk_buff_head(spb);
@@ -458,14 +455,7 @@ static void packet_handler(int fd,
if (dt_pci.ttl == 0) {
log_dbg("TTL was zero.");
ipcp_spb_release(spb);
-#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[fd].lock);
-
- ++dt.stat[fd].r_drp_pkt[qc];
- dt.stat[fd].r_drp_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[fd].lock);
-#endif
+ dt_stat_inc(fd, r_drp, qc, len);
return;
}
@@ -475,14 +465,7 @@ static void packet_handler(int fd,
log_dbg("No next hop for %" PRIu64 ".",
dt_pci.dst_addr);
ipcp_spb_release(spb);
-#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[fd].lock);
-
- ++dt.stat[fd].f_nhp_pkt[qc];
- dt.stat[fd].f_nhp_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[fd].lock);
-#endif
+ dt_stat_inc(fd, f_nhp, qc, len);
return;
}
@@ -494,27 +477,14 @@ static void packet_handler(int fd,
if (ret == -EFLOWDOWN)
notifier_event(NOTIFY_DT_FLOW_DOWN, &ofd);
ipcp_spb_release(spb);
-#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[ofd].lock);
-
- ++dt.stat[ofd].w_drp_pkt[qc];
- dt.stat[ofd].w_drp_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[ofd].lock);
-#endif
+ dt_stat_inc(ofd, w_drp, qc, len);
return;
}
-#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[ofd].lock);
- ++dt.stat[ofd].snd_pkt[qc];
- dt.stat[ofd].snd_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[ofd].lock);
-#endif
+ dt_stat_inc(ofd, snd, qc, len);
} else {
dt_pci_shrink(spb);
- if (dt_pci.eid >= PROG_RES_FDS) {
+ if (dt_pci.eid >= PROC_RES_FDS) {
uint8_t ecn = *(head + dt_pci_info.ecn_o);
fa_np1_rcv(dt_pci.eid, ecn, spb);
return;
@@ -526,20 +496,9 @@ static void packet_handler(int fd,
ipcp_spb_release(spb);
return;
}
-#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[fd].lock);
-
- ++dt.stat[fd].lcl_r_pkt[qc];
- dt.stat[fd].lcl_r_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[fd].lock);
- pthread_mutex_lock(&dt.stat[dt_pci.eid].lock);
-
- ++dt.stat[dt_pci.eid].snd_pkt[qc];
- dt.stat[dt_pci.eid].snd_bytes[qc] += len;
+ dt_stat_inc(fd, lcl_r, qc, len);
+ dt_stat_inc(dt_pci.eid, snd, qc, len);
- pthread_mutex_unlock(&dt.stat[dt_pci.eid].lock);
-#endif
dt.comps[dt_pci.eid].post_packet(dt.comps[dt_pci.eid].comp,
spb);
}
@@ -569,7 +528,9 @@ int dt_init(struct dt_config cfg)
{
int i;
int j;
+#ifdef IPCP_FLOW_STATS
char dtstr[RIB_NAME_STRLEN + 1];
+#endif
enum pol_pff pp;
struct conn_info info;
@@ -636,13 +597,13 @@ int dt_init(struct dt_config cfg)
goto fail_rwlock_init;
}
- dt.res_fds = bmp_create(PROG_RES_FDS, 0);
+ dt.res_fds = bmp_create(PROC_RES_FDS, 0);
if (dt.res_fds == NULL)
goto fail_res_fds;
#ifdef IPCP_FLOW_STATS
memset(dt.stat, 0, sizeof(dt.stat));
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
if (pthread_mutex_init(&dt.stat[i].lock, NULL)) {
log_err("Failed to init mutex for flow %d.", i);
for (j = 0; j < i; ++j)
@@ -651,18 +612,19 @@ int dt_init(struct dt_config cfg)
}
dt.n_flows = 0;
-#endif
+
sprintf(dtstr, "%s." ADDR_FMT32, DT, ADDR_VAL32(&dt.addr));
if (rib_reg(dtstr, &r_ops)) {
log_err("Failed to register RIB.");
goto fail_rib_reg;
}
+#endif
return 0;
- fail_rib_reg:
#ifdef IPCP_FLOW_STATS
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ fail_rib_reg:
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
pthread_mutex_destroy(&dt.stat[i].lock);
fail_stat_lock:
#endif
@@ -685,13 +647,15 @@ int dt_init(struct dt_config cfg)
void dt_fini(void)
{
+#ifdef IPCP_FLOW_STATS
char dtstr[RIB_NAME_STRLEN + 1];
+#endif
int i;
+#ifdef IPCP_FLOW_STATS
sprintf(dtstr, "%s.%" PRIu64, DT, dt.addr);
rib_unreg(dtstr);
-#ifdef IPCP_FLOW_STATS
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
pthread_mutex_destroy(&dt.stat[i].lock);
#endif
bmp_destroy(dt.res_fds);
@@ -791,7 +755,7 @@ int dt_reg_comp(void * comp,
void dt_unreg_comp(int eid)
{
- assert(eid >= 0 && eid < PROG_RES_FDS);
+ assert(eid >= 0 && eid < PROC_RES_FDS);
pthread_rwlock_wrlock(&dt.lock);
@@ -823,33 +787,21 @@ int dt_write_packet(uint64_t dst_addr,
#ifdef IPCP_FLOW_STATS
len = ssm_pk_buff_len(spb);
- if (eid < PROG_RES_FDS) {
- pthread_mutex_lock(&dt.stat[eid].lock);
-
- ++dt.stat[eid].lcl_r_pkt[qc];
- dt.stat[eid].lcl_r_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[eid].lock);
- }
+ if (eid < PROC_RES_FDS)
+ dt_stat_inc(eid, lcl_r, qc, len);
#endif
fd = pff_nhop(dt.pff[qc], dst_addr);
if (fd < 0) {
log_dbg("Could not get nhop for " ADDR_FMT32 ".",
ADDR_VAL32(&dst_addr));
#ifdef IPCP_FLOW_STATS
- if (eid < PROG_RES_FDS) {
- pthread_mutex_lock(&dt.stat[eid].lock);
-
- ++dt.stat[eid].lcl_r_pkt[qc];
- dt.stat[eid].lcl_r_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[eid].lock);
- }
+ if (eid < PROC_RES_FDS)
+ dt_stat_inc(eid, lcl_r, qc, len);
#endif
return -EPERM;
}
- head = ssm_pk_buff_head_alloc(spb, dt_pci_info.head_size);
+ head = ssm_pk_buff_push(spb, dt_pci_info.head_size);
if (head == NULL) {
log_dbg("Failed to allocate DT header.");
goto fail_write;
@@ -874,31 +826,17 @@ int dt_write_packet(uint64_t dst_addr,
goto fail_write;
}
#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[fd].lock);
-
- if (dt_pci.eid < PROG_RES_FDS) {
- ++dt.stat[fd].lcl_w_pkt[qc];
- dt.stat[fd].lcl_w_bytes[qc] += len;
- }
- ++dt.stat[fd].snd_pkt[qc];
- dt.stat[fd].snd_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[fd].lock);
+ if (dt_pci.eid < PROC_RES_FDS)
+ dt_stat_inc(fd, lcl_w, qc, len);
+ dt_stat_inc(fd, snd, qc, len);
#endif
return 0;
fail_write:
#ifdef IPCP_FLOW_STATS
- pthread_mutex_lock(&dt.stat[fd].lock);
-
- if (eid < PROG_RES_FDS) {
- ++dt.stat[fd].lcl_w_pkt[qc];
- dt.stat[fd].lcl_w_bytes[qc] += len;
- }
- ++dt.stat[fd].w_drp_pkt[qc];
- dt.stat[fd].w_drp_bytes[qc] += len;
-
- pthread_mutex_unlock(&dt.stat[fd].lock);
+ if (eid < PROC_RES_FDS)
+ dt_stat_inc(fd, lcl_w, qc, len);
+ dt_stat_inc(fd, w_drp, qc, len);
#endif
return -1;
}
diff --git a/src/ipcpd/unicast/fa.c b/src/ipcpd/unicast/fa.c
index c157d71c..c6eca175 100644
--- a/src/ipcpd/unicast/fa.c
+++ b/src/ipcpd/unicast/fa.c
@@ -37,6 +37,7 @@
#include <ouroboros/errno.h>
#include <ouroboros/dev.h>
#include <ouroboros/ipcp-dev.h>
+#include <ouroboros/np1_flow.h>
#include <ouroboros/rib.h>
#include <ouroboros/random.h>
#include <ouroboros/pthread.h>
@@ -61,9 +62,10 @@
#define TIMEOUT 10 * MILLION /* nanoseconds */
#define MSGBUFSZ 32768
-#define FLOW_REQ 0
-#define FLOW_REPLY 1
-#define FLOW_UPDATE 2
+#define FLOW_REQ 0
+#define FLOW_REPLY 1
+#define FLOW_UPDATE 2
+#define FLOW_IRM_UPDATE 3
#define STAT_FILE_LEN 0
@@ -81,7 +83,7 @@ struct fa_msg {
uint16_t ece;
uint8_t code;
uint8_t availability;
- uint8_t in_order;
+ uint8_t service;
} __attribute__((packed));
struct cmd {
@@ -111,7 +113,7 @@ struct fa_flow {
struct {
pthread_rwlock_t flows_lock;
- struct fa_flow flows[PROG_MAX_FLOWS];
+ struct fa_flow flows[PROC_MAX_FLOWS];
#ifdef IPCP_FLOW_STATS
size_t n_flows;
#endif
@@ -125,11 +127,11 @@ struct {
struct psched * psched;
} fa;
+#ifdef IPCP_FLOW_STATS
static int fa_rib_read(const char * path,
char * buf,
size_t len)
{
-#ifdef IPCP_FLOW_STATS
struct fa_flow * flow;
int fd;
char r_addrstr[21];
@@ -145,7 +147,7 @@ static int fa_rib_read(const char * path,
fd = atoi(entry);
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -1;
if (len < 1536)
@@ -199,17 +201,10 @@ static int fa_rib_read(const char * path,
pthread_rwlock_unlock(&fa.flows_lock);
return strlen(buf);
-#else
- (void) path;
- (void) buf;
- (void) len;
- return 0;
-#endif
}
static int fa_rib_readdir(char *** buf)
{
-#ifdef IPCP_FLOW_STATS
char entry[RIB_PATH_LEN + 1];
size_t i;
int idx = 0;
@@ -225,7 +220,7 @@ static int fa_rib_readdir(char *** buf)
if (*buf == NULL)
goto fail_entries;
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
struct fa_flow * flow;
flow = &fa.flows[i];
@@ -254,16 +249,11 @@ static int fa_rib_readdir(char *** buf)
fail_entries:
pthread_rwlock_unlock(&fa.flows_lock);
return -ENOMEM;
-#else
- (void) buf;
- return 0;
-#endif
}
static int fa_rib_getattr(const char * path,
struct rib_attr * attr)
{
-#ifdef IPCP_FLOW_STATS
int fd;
char * entry;
struct fa_flow * flow;
@@ -286,10 +276,7 @@ static int fa_rib_getattr(const char * path,
}
pthread_rwlock_unlock(&fa.flows_lock);
-#else
- (void) path;
- (void) attr;
-#endif
+
return 0;
}
@@ -298,6 +285,7 @@ static struct rib_ops r_ops = {
.readdir = fa_rib_readdir,
.getattr = fa_rib_getattr
};
+#endif /* IPCP_FLOW_STATS */
static int eid_to_fd(uint64_t eid)
{
@@ -306,7 +294,7 @@ static int eid_to_fd(uint64_t eid)
fd = eid & 0xFFFFFFFF;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -1;
flow = &fa.flows[fd];
@@ -496,11 +484,12 @@ static int fa_handle_flow_req(struct fa_msg * msg,
qs.availability = msg->availability;
qs.loss = ntoh32(msg->loss);
qs.ber = ntoh32(msg->ber);
- qs.in_order = msg->in_order;
+ qs.service = msg->service;
qs.max_gap = ntoh32(msg->max_gap);
qs.timeout = ntoh32(msg->timeout);
- fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL, &data);
+ fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL,
+ IPCP_UNICAST_MTU, &data);
if (fd < 0)
return fd;
@@ -528,7 +517,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg,
time_t mpl = IPCP_UNICAST_MPL;
int response;
- assert(len >= sizeof(*msg));
+ if (len < sizeof(*msg))
+ return -EINVAL;
data.data = (uint8_t *) msg + sizeof(*msg);
data.len = len - sizeof(*msg);
@@ -558,7 +548,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg,
pthread_rwlock_unlock(&fa.flows_lock);
- if (ipcp_flow_alloc_reply(fd, response, mpl, &data) < 0) {
+ if (ipcp_flow_alloc_reply(fd, response, mpl,
+ IPCP_UNICAST_MTU, &data) < 0) {
log_err("Failed to reply for flow allocation on fd %d.", fd);
return -EIRMD;
}
@@ -572,8 +563,8 @@ static int fa_handle_flow_update(struct fa_msg * msg,
struct fa_flow * flow;
int fd;
- (void) len;
- assert(len >= sizeof(*msg));
+ if (len < sizeof(*msg))
+ return -EINVAL;
pthread_rwlock_wrlock(&fa.flows_lock);
@@ -596,6 +587,43 @@ static int fa_handle_flow_update(struct fa_msg * msg,
return 0;
}
+static int fa_handle_flow_irm_update(struct fa_msg * msg,
+ size_t len)
+{
+ buffer_t data;
+ int fd;
+ int flow_id;
+
+ if (len < sizeof(*msg))
+ return -EINVAL;
+
+ data.data = (uint8_t *) msg + sizeof(*msg);
+ data.len = len - sizeof(*msg);
+
+ pthread_rwlock_rdlock(&fa.flows_lock);
+
+ fd = eid_to_fd(ntoh64(msg->r_eid));
+
+ pthread_rwlock_unlock(&fa.flows_lock);
+
+ if (fd < 0) {
+ log_err("Flow update for unknown EID %" PRIu64 ".",
+ ntoh64(msg->r_eid));
+ return -ENOTALLOC;
+ }
+
+ flow_id = np1_flow_id(fd);
+ if (flow_id < 0)
+ return -ENOTALLOC;
+
+ if (ipcp_flow_update_arr(flow_id, &data) < 0) {
+ log_err("Failed to relay flow update on fd %d.", fd);
+ return -EIRMD;
+ }
+
+ return 0;
+}
+
static void * fa_handle_packet(void * o)
{
(void) o;
@@ -624,6 +652,10 @@ static void * fa_handle_packet(void * o)
if (fa_handle_flow_update(msg, len) < 0)
log_err("Error handling flow update.");
break;
+ case FLOW_IRM_UPDATE:
+ if (fa_handle_flow_irm_update(msg, len) < 0)
+ log_err("Error handling flow update.");
+ break;
default:
log_warn("Recieved unknown flow allocation message.");
break;
@@ -652,8 +684,10 @@ int fa_init(void)
if (pthread_cond_init(&fa.cond, &cattr))
goto fail_cond;
+#ifdef IPCP_FLOW_STATS
if (rib_reg(FA, &r_ops))
goto fail_rib_reg;
+#endif
fa.eid = dt_reg_comp(&fa, &fa_post_packet, FA);
if ((int) fa.eid < 0)
@@ -666,8 +700,10 @@ int fa_init(void)
return 0;
fail_dt_reg:
+#ifdef IPCP_FLOW_STATS
rib_unreg(FA);
fail_rib_reg:
+#endif
pthread_cond_destroy(&fa.cond);
fail_cond:
pthread_condattr_destroy(&cattr);
@@ -681,8 +717,9 @@ int fa_init(void)
void fa_fini(void)
{
+#ifdef IPCP_FLOW_STATS
rib_unreg(FA);
-
+#endif
pthread_cond_destroy(&fa.cond);;
pthread_mutex_destroy(&fa.mtx);
pthread_rwlock_destroy(&fa.flows_lock);
@@ -789,7 +826,7 @@ int fa_alloc(int fd,
msg->availability = qs.availability;
msg->loss = hton32(qs.loss);
msg->ber = hton32(qs.ber);
- msg->in_order = qs.in_order;
+ msg->service = qs.service;
msg->max_gap = hton32(qs.max_gap);
msg->timeout = hton32(qs.timeout);
@@ -878,6 +915,44 @@ int fa_alloc_resp(int fd,
return -1;
}
+int fa_irm_update(int fd,
+ const buffer_t * data)
+{
+ struct fa_msg * msg;
+ struct ssm_pk_buff * spb;
+ struct fa_flow * flow;
+ qoscube_t qc = QOS_CUBE_BE;
+ uint64_t r_addr;
+
+ flow = &fa.flows[fd];
+
+ if (ipcp_spb_reserve(&spb, sizeof(*msg) + data->len))
+ return -1;
+
+ msg = (struct fa_msg *) ssm_pk_buff_head(spb);
+ memset(msg, 0, sizeof(*msg));
+
+ msg->code = FLOW_IRM_UPDATE;
+ if (data->len > 0)
+ memcpy(msg + 1, data->data, data->len);
+
+ pthread_rwlock_rdlock(&fa.flows_lock);
+
+ msg->r_eid = hton64(flow->r_eid);
+ msg->s_eid = hton64(flow->s_eid);
+ r_addr = flow->r_addr;
+
+ pthread_rwlock_unlock(&fa.flows_lock);
+
+ if (dt_write_packet(r_addr, qc, fa.eid, spb)) {
+ log_err("Failed to send flow update packet.");
+ ipcp_spb_release(spb);
+ return -1;
+ }
+
+ return 0;
+}
+
int fa_dealloc(int fd)
{
if (ipcp_flow_fini(fd) < 0)
diff --git a/src/ipcpd/unicast/fa.h b/src/ipcpd/unicast/fa.h
index 0c19dc25..f31b40e9 100644
--- a/src/ipcpd/unicast/fa.h
+++ b/src/ipcpd/unicast/fa.h
@@ -45,6 +45,9 @@ int fa_alloc_resp(int fd,
int fa_dealloc(int fd);
+int fa_irm_update(int fd,
+ const buffer_t * data);
+
void fa_np1_rcv(uint64_t eid,
uint8_t ecn,
struct ssm_pk_buff * spb);
diff --git a/src/ipcpd/unicast/main.c b/src/ipcpd/unicast/main.c
index 583a04ff..1155b88b 100644
--- a/src/ipcpd/unicast/main.c
+++ b/src/ipcpd/unicast/main.c
@@ -273,7 +273,8 @@ static struct ipcp_ops unicast_ops = {
.ipcp_flow_alloc = fa_alloc,
.ipcp_flow_join = NULL,
.ipcp_flow_alloc_resp = fa_alloc_resp,
- .ipcp_flow_dealloc = fa_dealloc
+ .ipcp_flow_dealloc = fa_dealloc,
+ .ipcp_flow_update = fa_irm_update
};
int main(int argc,
@@ -307,8 +308,8 @@ int main(int argc,
ipcp_sigwait();
if (ipcp_get_state() == IPCP_SHUTDOWN) {
- stop_components();
ipcp_stop();
+ stop_components();
finalize_components();
} else {
ipcp_stop();
diff --git a/src/ipcpd/unicast/pff/alternate.c b/src/ipcpd/unicast/pff/alternate.c
index be1c35c0..1c508c1b 100644
--- a/src/ipcpd/unicast/pff/alternate.c
+++ b/src/ipcpd/unicast/pff/alternate.c
@@ -211,7 +211,7 @@ struct pff_i * alternate_pff_create(void)
if (pthread_rwlock_init(&tmp->lock, NULL))
goto fail_lock;
- tmp->pft = pft_create(PFT_SIZE, false);
+ tmp->pft = pft_create(PFT_SIZE);
if (tmp->pft == NULL)
goto fail_pft;
diff --git a/src/ipcpd/unicast/pff/multipath.c b/src/ipcpd/unicast/pff/multipath.c
index c636e789..9ba59592 100644
--- a/src/ipcpd/unicast/pff/multipath.c
+++ b/src/ipcpd/unicast/pff/multipath.c
@@ -63,7 +63,7 @@ struct pff_i * multipath_pff_create(void)
if (pthread_rwlock_init(&tmp->lock, NULL))
goto fail_rwlock;
- tmp->pft = pft_create(PFT_SIZE, false);
+ tmp->pft = pft_create(PFT_SIZE);
if (tmp->pft == NULL)
goto fail_pft;
diff --git a/src/ipcpd/unicast/pff/pft.c b/src/ipcpd/unicast/pff/pft.c
index a0d70799..d0e562d6 100644
--- a/src/ipcpd/unicast/pff/pft.c
+++ b/src/ipcpd/unicast/pff/pft.c
@@ -43,12 +43,10 @@ struct pft_entry {
struct pft {
struct list_head * buckets;
- bool hash_key;
uint64_t buckets_size;
};
-struct pft * pft_create(uint64_t buckets,
- bool hash_key)
+struct pft * pft_create(uint64_t buckets)
{
struct pft * tmp;
unsigned int i;
@@ -69,7 +67,6 @@ struct pft * pft_create(uint64_t buckets,
if (tmp == NULL)
return NULL;
- tmp->hash_key = hash_key;
tmp->buckets_size = buckets;
tmp->buckets = malloc(buckets * sizeof(*tmp->buckets));
@@ -113,22 +110,10 @@ void pft_flush(struct pft * pft)
}
}
-static uint64_t hash(uint64_t key)
-{
- uint64_t res[2];
-
- mem_hash(HASH_MD5, res, (uint8_t *) &key, sizeof(key));
-
- return res[0];
-}
-
static uint64_t calc_key(struct pft * pft,
uint64_t dst)
{
- if (pft->hash_key)
- dst = hash(dst);
-
- return (dst & (pft->buckets_size - 1));
+ return hash_mix64(dst) & (pft->buckets_size - 1);
}
int pft_insert(struct pft * pft,
diff --git a/src/ipcpd/unicast/pff/pft.h b/src/ipcpd/unicast/pff/pft.h
index 3bb9cff7..15bbe451 100644
--- a/src/ipcpd/unicast/pff/pft.h
+++ b/src/ipcpd/unicast/pff/pft.h
@@ -24,14 +24,12 @@
#define OUROBOROS_PFT_H
#include <stdint.h>
-#include <stdbool.h>
#include <stdlib.h>
struct pft;
/* Buckets is rounded up to the nearest power of 2 */
-struct pft * pft_create(uint64_t buckets,
- bool hash_key);
+struct pft * pft_create(uint64_t buckets);
void pft_destroy(struct pft * table);
diff --git a/src/ipcpd/unicast/pff/simple.c b/src/ipcpd/unicast/pff/simple.c
index be542bdb..7befa42f 100644
--- a/src/ipcpd/unicast/pff/simple.c
+++ b/src/ipcpd/unicast/pff/simple.c
@@ -63,7 +63,7 @@ struct pff_i * simple_pff_create(void)
return NULL;
}
- tmp->pft = pft_create(PFT_SIZE, false);
+ tmp->pft = pft_create(PFT_SIZE);
if (tmp->pft == NULL) {
pthread_rwlock_destroy(&tmp->lock);
free(tmp);
diff --git a/src/ipcpd/unicast/pff/tests/pft_test.c b/src/ipcpd/unicast/pff/tests/pft_test.c
index 4962c241..20e73a94 100644
--- a/src/ipcpd/unicast/pff/tests/pft_test.c
+++ b/src/ipcpd/unicast/pff/tests/pft_test.c
@@ -38,15 +38,7 @@ int pft_test(int argc,
(void) argc;
(void) argv;
- pft = pft_create(TBL_SIZE, true);
- if (pft == NULL) {
- printf("Failed to create.\n");
- return -1;
- }
-
- pft_destroy(pft);
-
- pft = pft_create(TBL_SIZE, false);
+ pft = pft_create(TBL_SIZE);
if (pft == NULL) {
printf("Failed to create.\n");
return -1;
diff --git a/src/ipcpd/unicast/routing/graph.c b/src/ipcpd/unicast/routing/graph.c
index 0226c762..c168eb7d 100644
--- a/src/ipcpd/unicast/routing/graph.c
+++ b/src/ipcpd/unicast/routing/graph.c
@@ -603,9 +603,9 @@ static int graph_routing_table_lfa(struct graph * graph,
struct list_head * table,
int ** dist)
{
- int * n_dist[PROG_MAX_FLOWS];
- uint64_t addrs[PROG_MAX_FLOWS];
- int n_index[PROG_MAX_FLOWS];
+ int * n_dist[PROC_MAX_FLOWS];
+ uint64_t addrs[PROC_MAX_FLOWS];
+ int n_index[PROC_MAX_FLOWS];
struct list_head * p;
struct list_head * q;
struct vertex * v;
@@ -618,7 +618,7 @@ static int graph_routing_table_lfa(struct graph * graph,
if (graph_routing_table_simple(graph, s_addr, table, dist))
goto fail_table;
- for (j = 0; j < PROG_MAX_FLOWS; j++) {
+ for (j = 0; j < PROC_MAX_FLOWS; j++) {
n_dist[j] = NULL;
n_index[j] = -1;
addrs[j] = -1;
diff --git a/src/ipcpd/unicast/routing/link-state.c b/src/ipcpd/unicast/routing/link-state.c
index 051dd98d..c4ea9e1c 100644
--- a/src/ipcpd/unicast/routing/link-state.c
+++ b/src/ipcpd/unicast/routing/link-state.c
@@ -415,7 +415,7 @@ static void calculate_pff(struct routing_i * instance)
struct list_head table;
struct list_head * p;
struct list_head * q;
- int fds[PROG_MAX_FLOWS];
+ int fds[PROC_MAX_FLOWS];
assert(instance);
diff --git a/src/irmd/CMakeLists.txt b/src/irmd/CMakeLists.txt
index 9aa747ca..5aa457ff 100644
--- a/src/irmd/CMakeLists.txt
+++ b/src/irmd/CMakeLists.txt
@@ -7,11 +7,11 @@ if(HAVE_TOML)
set(INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
configure_file("${CMAKE_SOURCE_DIR}/irmd.conf.in"
"${CMAKE_BINARY_DIR}/${OUROBOROS_CONFIG_FILE}.example" @ONLY)
- configure_file("${CMAKE_SOURCE_DIR}/enc.conf.in"
- "${CMAKE_BINARY_DIR}/enc.conf.example" @ONLY)
+ configure_file("${CMAKE_SOURCE_DIR}/sec.conf.in"
+ "${CMAKE_BINARY_DIR}/sec.conf.example" @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/${OUROBOROS_CONFIG_FILE}.example"
DESTINATION "${OUROBOROS_CONFIG_DIR}")
- install(FILES "${CMAKE_BINARY_DIR}/enc.conf.example"
+ install(FILES "${CMAKE_BINARY_DIR}/sec.conf.example"
DESTINATION "${OUROBOROS_CONFIG_DIR}")
install(CODE "
if(NOT EXISTS \"${OUROBOROS_CONFIG_DIR}/${OUROBOROS_CONFIG_FILE}\")
diff --git a/src/irmd/config.h.in b/src/irmd/config.h.in
index df0cd718..e14cff75 100644
--- a/src/irmd/config.h.in
+++ b/src/irmd/config.h.in
@@ -42,6 +42,9 @@
#define FLOW_DEALLOC_TIMEOUT @FLOW_DEALLOC_TIMEOUT@
#define OAP_REPLAY_TIMER @OAP_REPLAY_TIMER@
+#define OAP_REPLAY_MAX @OAP_REPLAY_MAX@
+#define OAP_REKEY_TIMER @OAP_REKEY_TIMER@
+#cmakedefine01 OAP_CLIENT_AUTH_DEFAULT
#define BOOTSTRAP_TIMEOUT @BOOTSTRAP_TIMEOUT@
#define ENROLL_TIMEOUT @ENROLL_TIMEOUT@
diff --git a/src/irmd/configfile.c b/src/irmd/configfile.c
index 53608eee..35cf4292 100644
--- a/src/irmd/configfile.c
+++ b/src/irmd/configfile.c
@@ -922,10 +922,10 @@ static int toml_name(toml_table_t * table,
toml_array_t * progs;
toml_array_t * args;
toml_datum_t lb;
- toml_datum_t senc;
+ toml_datum_t ssec;
toml_datum_t scrt;
toml_datum_t skey;
- toml_datum_t cenc;
+ toml_datum_t csec;
toml_datum_t ccrt;
toml_datum_t ckey;
@@ -957,8 +957,8 @@ static int toml_name(toml_table_t * table,
log_err("Invalid load-balancing policy for %s.", name);
return -1;
}
- senc = toml_string_in(table, "server_enc_file");
- if (senc.ok && cp_chk_path(info.s.enc, senc.u.s) < 0)
+ ssec = toml_string_in(table, "server_sec_file");
+ if (ssec.ok && cp_chk_path(info.s.sec, ssec.u.s) < 0)
return -1;
scrt = toml_string_in(table, "server_crt_file");
@@ -969,8 +969,8 @@ static int toml_name(toml_table_t * table,
if (skey.ok && cp_chk_path(info.s.key, skey.u.s) < 0)
return -1;
- cenc = toml_string_in(table, "client_enc_file");
- if (cenc.ok && cp_chk_path(info.c.enc, cenc.u.s) < 0)
+ csec = toml_string_in(table, "client_sec_file");
+ if (csec.ok && cp_chk_path(info.c.sec, csec.u.s) < 0)
return -1;
ccrt = toml_string_in(table, "client_crt_file");
diff --git a/src/irmd/ipcp.c b/src/irmd/ipcp.c
index a7da186c..7eccfc80 100644
--- a/src/irmd/ipcp.c
+++ b/src/irmd/ipcp.c
@@ -444,6 +444,38 @@ int ipcp_flow_join(const struct flow_info * flow,
return ret;
}
+int ipcp_flow_update(const struct flow_info * flow,
+ const buffer_t data)
+{
+ ipcp_msg_t msg = IPCP_MSG__INIT;
+ ipcp_msg_t * recv_msg;
+ int ret;
+
+ msg.code = IPCP_MSG_CODE__IPCP_FLOW_UPDATE;
+ msg.has_flow_id = true;
+ msg.flow_id = flow->id;
+ msg.has_pk = true;
+ msg.pk.data = data.data;
+ msg.pk.len = data.len;
+
+ recv_msg = send_recv_ipcp_msg(flow->n_1_pid, &msg);
+ if (recv_msg == NULL) {
+ log_err("Did not receive message.");
+ return -EIPCP;
+ }
+
+ if (!recv_msg->has_result) {
+ log_err("Message has no result");
+ ipcp_msg__free_unpacked(recv_msg, NULL);
+ return -EIPCP;
+ }
+
+ ret = recv_msg->result;
+ ipcp_msg__free_unpacked(recv_msg, NULL);
+
+ return ret;
+}
+
int ipcp_flow_alloc(const struct flow_info * flow,
const buffer_t dst,
const buffer_t data)
diff --git a/src/irmd/ipcp.h b/src/irmd/ipcp.h
index f1025096..8d06623c 100644
--- a/src/irmd/ipcp.h
+++ b/src/irmd/ipcp.h
@@ -68,4 +68,7 @@ int ipcp_flow_dealloc(pid_t pid,
int flow_id,
time_t timeo);
+int ipcp_flow_update(const struct flow_info * flow,
+ const buffer_t data);
+
#endif /* OUROBOROS_IRMD_IPCP_H */
diff --git a/src/irmd/main.c b/src/irmd/main.c
index a85a9bf0..19be4ab9 100644
--- a/src/irmd/main.c
+++ b/src/irmd/main.c
@@ -36,6 +36,7 @@
#include <ouroboros/crypt.h>
#include <ouroboros/errno.h>
#include <ouroboros/flow.h>
+#include <ouroboros/fqueue.h>
#include <ouroboros/hash.h>
#include <ouroboros/irm.h>
#include <ouroboros/list.h>
@@ -86,7 +87,11 @@
#define TIMESYNC_SLACK 100 /* ms */
#define OAP_SEEN_TIMER 20 /* s */
#define DEALLOC_TIME 300 /* s */
-#define DIRECT_MPL 1 /* s */
+#define REKEY_BATCH 64 /* flows re-keyed per timer pass */
+#define REKEY_RESP_TIMEO 20 /* s; give-up on a re-key RESPONSE */
+#define DIRECT_MPL 20 /* ms */
+/* bytes; in-process, bounded only by PUP/GSPP. */
+#define DIRECT_MTU 65000
enum irm_state {
IRMD_NULL = 0,
@@ -103,13 +108,38 @@ struct cmd {
int fd;
};
+/* In-flight Tier-2 re-key, owned solely by the re-key worker thread. */
+struct rekey_ctx {
+ struct list_head next;
+
+ int flow_id;
+ void * ctx; /* OAP client ctx (opaque) */
+ struct timespec deadline; /* reap if no RESPONSE by then */
+};
+
+enum rekey_evt_type {
+ REKEY_INIT = 0, /* start an exchange for flow_id */
+ REKEY_REQ, /* a REQUEST arrived for flow_id */
+ REKEY_RESP, /* a RESPONSE arrived for flow_id */
+ REKEY_DIRECT /* in-process re-key, direct flow */
+};
+
+struct rekey_evt {
+ struct list_head next;
+
+ enum rekey_evt_type type;
+ int flow_id;
+ pid_t n_1_pid; /* INIT: flow's lower IPCP */
+ buffer_t buf; /* RESP: owned RESPONSE payload */
+};
+
struct {
bool log_stdout; /* log to stdout */
#ifdef HAVE_TOML
char * cfg_file; /* configuration file path */
#endif
struct lockfile * lf; /* single irmd per system */
- struct ssm_pool * gspp; /* pool for packets */
+ struct ssm_pool * gspp; /* pool for packets */
int sockfd; /* UNIX socket */
@@ -124,6 +154,13 @@ struct {
pthread_t irm_sanitize; /* clean up irmd resources */
pthread_t acceptor; /* accept new commands */
+
+ struct {
+ pthread_t worker; /* Tier-2 re-key orchestrator */
+ struct list_head inbox; /* re-key events for worker */
+ pthread_cond_t cond; /* inbox signal condvar */
+ pthread_mutex_t mtx; /* inbox lock */
+ } rk;
} irmd;
static enum irm_state irmd_get_state(void)
@@ -452,8 +489,8 @@ static void name_update_sec_paths(struct name_info * info)
assert(info != NULL);
- if (strlen(info->s.enc) == 0)
- sprintf(info->s.enc, "%s/%s/enc.conf", srv_dir, info->name);
+ if (strlen(info->s.sec) == 0)
+ sprintf(info->s.sec, "%s/%s/sec.conf", srv_dir, info->name);
if (strlen(info->s.crt) == 0)
sprintf(info->s.crt, "%s/%s/crt.pem", srv_dir, info->name);
@@ -461,8 +498,8 @@ static void name_update_sec_paths(struct name_info * info)
if (strlen(info->s.key) == 0)
sprintf(info->s.key, "%s/%s/key.pem", srv_dir, info->name);
- if (strlen(info->c.enc) == 0)
- sprintf(info->c.enc, "%s/%s/enc.conf", cli_dir, info->name);
+ if (strlen(info->c.sec) == 0)
+ sprintf(info->c.sec, "%s/%s/sec.conf", cli_dir, info->name);
if (strlen(info->c.crt) == 0)
sprintf(info->c.crt, "%s/%s/crt.pem", cli_dir, info->name);
@@ -782,7 +819,8 @@ static int name_unreg(const char * name,
static int get_peer_ids(int fd,
uid_t * uid,
- gid_t * gid)
+ gid_t * gid,
+ pid_t * pid)
{
#if defined(__linux__)
struct ucred ucred;
@@ -795,9 +833,14 @@ static int get_peer_ids(int fd,
*uid = ucred.uid;
*gid = ucred.gid;
+ if (pid != NULL)
+ *pid = ucred.pid;
#else
if (getpeereid(fd, uid, gid) < 0)
goto fail;
+
+ if (pid != NULL)
+ *pid = -1; /* no portable SO_PEERCRED.pid equivalent */
#endif
return 0;
fail:
@@ -846,6 +889,7 @@ static int flow_accept(struct flow_info * flow,
{
buffer_t req_hdr;
buffer_t resp_hdr;
+ buffer_t peer_crt = BUF_INIT;
char name[NAME_SIZE + 1];
struct name_info info;
int err;
@@ -909,7 +953,12 @@ static int flow_accept(struct flow_info * flow,
flow->uid = reg_get_proc_uid(flow->n_pid);
- err = oap_srv_process(&info, req_hdr, &resp_hdr, data, sk);
+ err = oap_srv_process(&info, req_hdr, &resp_hdr, data, sk,
+ false, NULL, &peer_crt);
+ if (err == -EREPLAY) {
+ log_warn("Dropping replayed alloc request for %s.", name);
+ goto fail_replay;
+ }
if (err < 0) {
log_err("OAP processing failed for %s.", name);
goto fail_oap;
@@ -920,16 +969,21 @@ static int flow_accept(struct flow_info * flow,
log_err("Failed to respond to direct flow.");
goto fail_resp;
}
+ if (sk->nid != NID_undef)
+ reg_flow_set_rekey(flow->id, false, peer_crt);
log_info("Flow %d accepted (direct) by %d for %s.",
flow->id, flow->n_pid, name);
} else if (ipcp_flow_alloc_resp(flow, 0, resp_hdr) < 0) {
log_err("Failed to respond to flow allocation.");
goto fail_resp;
} else {
+ if (sk->nid != NID_undef)
+ reg_flow_set_rekey(flow->id, false, peer_crt);
log_info("Flow %d accepted by %d for %s (uid %d).",
flow->id, flow->n_pid, name, flow->uid);
}
+ freebuf(peer_crt);
freebuf(req_hdr);
freebuf(resp_hdr);
@@ -938,6 +992,10 @@ static int flow_accept(struct flow_info * flow,
fail_oap:
if (!reg_flow_is_direct(flow->id))
ipcp_flow_alloc_resp(flow, err, resp_hdr);
+ fail_replay:
+ freebuf(peer_crt);
+ freebuf(req_hdr);
+ freebuf(resp_hdr);
fail_wait:
reg_destroy_flow(flow->id);
fail_flow:
@@ -945,6 +1003,7 @@ static int flow_accept(struct flow_info * flow,
fail_resp:
flow->state = FLOW_NULL;
+ freebuf(peer_crt);
freebuf(req_hdr);
freebuf(resp_hdr);
reg_destroy_flow(flow->id);
@@ -1193,6 +1252,7 @@ static int flow_alloc_direct(const char * dst,
struct flow_info acc; /* server side flow */
buffer_t req_hdr = BUF_INIT;
buffer_t resp_hdr = BUF_INIT;
+ buffer_t no_crt = BUF_INIT;
void * ctx;
int err;
@@ -1202,13 +1262,14 @@ static int flow_alloc_direct(const char * dst,
return -EAGAIN;
}
- if (oap_cli_prepare(&ctx, info, &req_hdr, *data) < 0) {
+ if (oap_cli_prepare(&ctx, info, &req_hdr, *data, false) < 0) {
log_err("Failed to prepare OAP for %s.", dst);
return -EBADF;
}
acc.n_1_pid = flow->n_pid;
acc.mpl = DIRECT_MPL;
+ acc.mtu = DIRECT_MTU;
acc.qs = flow->qs;
acc.state = FLOW_ALLOCATED;
@@ -1234,7 +1295,7 @@ static int flow_alloc_direct(const char * dst,
return -ETIMEDOUT;
}
- err = oap_cli_complete(ctx, info, resp_hdr, data, sk);
+ err = oap_cli_complete(ctx, info, resp_hdr, data, sk, NULL, NULL);
if (err < 0) {
log_err("OAP completion failed for %s.", dst);
freebuf(resp_hdr);
@@ -1244,8 +1305,13 @@ static int flow_alloc_direct(const char * dst,
flow->id = acc.id;
flow->n_1_pid = acc.n_pid;
flow->mpl = DIRECT_MPL;
+ flow->mtu = DIRECT_MTU;
flow->state = FLOW_ALLOCATED;
+ /* Mark encrypted for re-key; the acceptor caches the cert. */
+ if (sk->nid != NID_undef)
+ reg_flow_set_rekey(acc.id, true, no_crt);
+
log_info("Flow %d allocated (direct) for %d to %s.",
flow->id, flow->n_pid, dst);
@@ -1264,6 +1330,7 @@ static int flow_alloc(const char * dst,
buffer_t req_hdr = BUF_INIT;
buffer_t resp_hdr = BUF_INIT;
buffer_t hash = BUF_INIT;
+ buffer_t peer_crt = BUF_INIT;
struct name_info info;
void * ctx;
int err;
@@ -1297,6 +1364,8 @@ static int flow_alloc(const char * dst,
goto fail_flow;
}
+ reg_set_name_for_flow_id(dst, flow->id);
+
if (get_ipcp_by_dst(dst, &flow->n_1_pid, &hash) < 0) {
log_err("Failed to find IPCP for %s.", dst);
err = -EIPCP;
@@ -1309,7 +1378,7 @@ static int flow_alloc(const char * dst,
goto fail_prepare;
}
- if (oap_cli_prepare(&ctx, &info, &req_hdr, *data) < 0) {
+ if (oap_cli_prepare(&ctx, &info, &req_hdr, *data, false) < 0) {
log_err("Failed to prepare OAP request for %s.", dst);
err = -EBADF;
goto fail_prepare;
@@ -1341,12 +1410,16 @@ static int flow_alloc(const char * dst,
goto fail_peer;
}
- err = oap_cli_complete(ctx, &info, resp_hdr, data, sk);
+ err = oap_cli_complete(ctx, &info, resp_hdr, data, sk, NULL, &peer_crt);
if (err < 0) {
log_err("OAP completion failed for %s.", dst);
goto fail_complete;
}
+ if (sk->nid != NID_undef)
+ reg_flow_set_rekey(flow->id, true, peer_crt);
+
+ freebuf(peer_crt);
freebuf(req_hdr);
freebuf(resp_hdr);
freebuf(hash);
@@ -1354,7 +1427,8 @@ static int flow_alloc(const char * dst,
return 0;
fail_complete:
- ctx = NULL; /* freee'd on complete */
+ freebuf(peer_crt);
+ ctx = NULL; /* free'd on complete */
fail_peer:
flow->state = FLOW_DEALLOCATED;
fail_wait:
@@ -1421,6 +1495,741 @@ static int flow_dealloc_resp(struct flow_info * flow)
return 0;
}
+/*
+ * Inbox producers. Any thread may post; the worker drains. INIT carries
+ * the flow's lower IPCP pid; RESP transfers ownership of buf.
+ */
+static void rekey_post(enum rekey_evt_type type,
+ int flow_id,
+ pid_t n_1_pid,
+ buffer_t * buf)
+{
+ struct rekey_evt * evt;
+
+ evt = malloc(sizeof(*evt));
+ if (evt == NULL) {
+ log_err("Failed to malloc re-key event for flow %d.", flow_id);
+ if (type == REKEY_INIT || type == REKEY_DIRECT)
+ reg_flow_clear_in_flight(flow_id);
+ else
+ reg_flow_rekey_arr_done(flow_id, type == REKEY_REQ);
+
+ if (buf != NULL)
+ freebuf(*buf);
+
+ return;
+ }
+
+ list_head_init(&evt->next);
+ evt->type = type;
+ evt->flow_id = flow_id;
+ evt->n_1_pid = n_1_pid;
+ clrbuf(evt->buf);
+ if (buf != NULL) {
+ evt->buf = *buf;
+ clrbuf(*buf);
+ }
+
+ pthread_mutex_lock(&irmd.rk.mtx);
+
+ list_add_tail(&evt->next, &irmd.rk.inbox);
+ pthread_cond_signal(&irmd.rk.cond);
+
+ pthread_mutex_unlock(&irmd.rk.mtx);
+}
+
+static void rekey_post_init(int flow_id,
+ pid_t n_1_pid)
+{
+ rekey_post(REKEY_INIT, flow_id, n_1_pid, NULL);
+}
+
+static void rekey_post_resp(int flow_id,
+ buffer_t * buf)
+{
+ rekey_post(REKEY_RESP, flow_id, 0, buf);
+}
+
+static void rekey_post_req(int flow_id,
+ pid_t n_1_pid,
+ buffer_t * buf)
+{
+ rekey_post(REKEY_REQ, flow_id, n_1_pid, buf);
+}
+
+static void rekey_post_direct(int flow_id)
+{
+ rekey_post(REKEY_DIRECT, flow_id, 0, NULL);
+}
+
+/* Worker-only: find an in-flight entry by flow_id. */
+static struct rekey_ctx * rekey_find(struct list_head * tbl,
+ int flow_id)
+{
+ struct list_head * p;
+
+ list_for_each(p, tbl) {
+ struct rekey_ctx * e = list_entry(p, struct rekey_ctx, next);
+ if (e->flow_id == flow_id)
+ return e;
+ }
+
+ return NULL;
+}
+
+/* Worker-only: drop an entry, freeing its OAP ctx. */
+static void rekey_drop(struct rekey_ctx * e)
+{
+ if (e->ctx != NULL)
+ oap_ctx_free(e->ctx);
+
+ list_del(&e->next);
+ free(e);
+}
+
+/* Resolve a flow's registered name info; < 0 if the flow or name is gone. */
+static int rekey_name_info(int flow_id,
+ struct name_info * info)
+{
+ char name[NAME_SIZE + 1];
+
+ if (reg_get_name_for_flow_id(name, flow_id) < 0)
+ return -1;
+
+ return reg_get_name_info(name, info);
+}
+
+/* Flow-update relay payload: a 1-byte type prefix on an opaque body. */
+enum flow_upd_type {
+ FLOW_UPD_REKEY_REQ = 0,
+ FLOW_UPD_REKEY_RESP = 1,
+};
+
+/* Prepend the update type to body; caller frees out on success. */
+static int flow_upd_wrap(buffer_t * out,
+ uint8_t type,
+ const buffer_t * body)
+{
+ out->len = body->len + 1;
+ out->data = malloc(out->len);
+ if (out->data == NULL)
+ return -ENOMEM;
+
+ out->data[0] = type;
+ memcpy(out->data + 1, body->data, body->len);
+
+ return 0;
+}
+
+/* Cleanup handlers — the re-key worker is cancelled at shutdown. */
+static void rk_free_evt(void * o)
+{
+ struct rekey_evt * evt = o;
+
+ freebuf(evt->buf);
+ free(evt);
+}
+
+static void rk_freebuf(void * o)
+{
+ freebuf(*(buffer_t *) o);
+}
+
+static void rk_clear_in_flight(void * o)
+{
+ reg_flow_clear_in_flight(*(int *) o);
+}
+
+static void rk_clear_key(void * o)
+{
+ crypt_secure_clear(o, SYMMKEYSZ);
+}
+
+static void rekey_do_initiate(struct list_head * tbl,
+ int flow_id,
+ pid_t n_1_pid)
+{
+ struct rekey_ctx * e;
+ struct flow_info info;
+ struct name_info name;
+ buffer_t req = BUF_INIT;
+ buffer_t upd = BUF_INIT;
+ buffer_t data = BUF_INIT;
+ void * ctx = NULL;
+ int ret;
+
+ e = rekey_find(tbl, flow_id);
+ if (e != NULL)
+ rekey_drop(e); /* Replace in-flight entries */
+
+ if (rekey_name_info(flow_id, &name) < 0) {
+ log_err("Failed to get name info to re-key flow %d.", flow_id);
+ goto fail;
+ }
+
+ if (oap_cli_prepare(&ctx, &name, &req, data, true) < 0) {
+ log_err("Failed to prepare re-key for flow %d.", flow_id);
+ goto fail;
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.id = flow_id;
+ info.n_1_pid = n_1_pid;
+
+ if (flow_upd_wrap(&upd, FLOW_UPD_REKEY_REQ, &req) < 0) {
+ log_err("Failed to wrap re-key request for flow %d.", flow_id);
+ goto fail_ctx;
+ }
+
+ pthread_cleanup_push(rk_clear_in_flight, &flow_id);
+ pthread_cleanup_push(oap_ctx_free, ctx);
+ pthread_cleanup_push(rk_freebuf, &req);
+ pthread_cleanup_push(rk_freebuf, &upd);
+ ret = ipcp_flow_update(&info, upd);
+ pthread_cleanup_pop(false);
+ pthread_cleanup_pop(false);
+ pthread_cleanup_pop(false);
+ pthread_cleanup_pop(false);
+ freebuf(upd);
+ if (ret < 0) {
+ log_err("Failed to send re-key request for flow %d.", flow_id);
+ goto fail_ctx;
+ }
+
+ e = malloc(sizeof(*e));
+ if (e == NULL) {
+ log_err("Failed to malloc re-key ctx for flow %d.", flow_id);
+ goto fail_ctx;
+ }
+
+ list_head_init(&e->next);
+ e->flow_id = flow_id;
+ e->ctx = ctx;
+ clock_gettime(PTHREAD_COND_CLOCK, &e->deadline);
+ e->deadline.tv_sec += REKEY_RESP_TIMEO;
+
+ list_add(&e->next, tbl);
+
+ log_dbg("Re-key request sent for flow %d.", flow_id);
+
+ freebuf(req);
+
+ return;
+
+ fail_ctx:
+ oap_ctx_free(ctx);
+ freebuf(req);
+ fail:
+ reg_flow_clear_in_flight(flow_id);
+}
+
+/* Worker-only: complete the exchange, install the pending seed. */
+static void rekey_do_complete(struct list_head * tbl,
+ int flow_id,
+ buffer_t buf)
+{
+ struct rekey_ctx * e;
+ struct name_info info;
+ struct crypt_sk sk;
+ uint8_t kbuf[SYMMKEYSZ];
+ buffer_t data = BUF_INIT;
+ buffer_t crt = BUF_INIT;
+ uint8_t newgen;
+
+ e = rekey_find(tbl, flow_id);
+ if (e == NULL) {
+ log_dbg("Stale re-key RESPONSE for flow %d.", flow_id);
+ return;
+ }
+
+ /* A concurrent responder already parked a seed; don't overwrite. */
+ if (reg_flow_rekey_pending(flow_id)) {
+ log_dbg("Re-key already pending for flow %d.", flow_id);
+ goto finish;
+ }
+
+ if (rekey_name_info(flow_id, &info) < 0) {
+ log_err("Failed to get name info to re-key flow %d.", flow_id);
+ goto finish;
+ }
+
+ sk.key = kbuf;
+
+ reg_flow_get_peer_crt(flow_id, &crt);
+
+ /* oap_cli_complete frees the ctx on every path. */
+ if (oap_cli_complete(e->ctx, &info, buf, &data, &sk, &crt, NULL) < 0) {
+ log_warn("Failed to complete re-key for flow %d.", flow_id);
+ e->ctx = NULL;
+ goto finish_clear;
+ }
+
+ e->ctx = NULL;
+
+ if (data.len != 1) {
+ log_warn("Re-key reply malformed for flow %d.", flow_id);
+ goto finish_clear;
+ }
+
+ newgen = *(uint8_t *) data.data;
+
+ if (newgen >= 16) {
+ log_warn("Re-key gen %u out of range for flow %d.",
+ newgen, flow_id);
+ goto finish_clear;
+ }
+
+ if (reg_flow_store_pending(flow_id, kbuf, newgen, true) < 0)
+ log_warn("Flow %d gone during re-key.", flow_id);
+ else
+ reg_notify_flow(flow_id, FLOW_UPD);
+
+ log_dbg("Re-key completed for flow %d (gen %u).", flow_id, newgen);
+
+ finish_clear:
+ crypt_secure_clear(kbuf, SYMMKEYSZ);
+ freebuf(data);
+ finish:
+ freebuf(crt);
+ rekey_drop(e);
+ reg_flow_clear_in_flight(flow_id);
+}
+
+/* Worker-only: reap entries whose RESPONSE never arrived. */
+static void rekey_reap_expired(struct list_head * tbl)
+{
+ struct list_head * p;
+ struct list_head * h;
+ struct timespec now;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ list_for_each_safe(p, h, tbl) {
+ struct rekey_ctx * e = list_entry(p, struct rekey_ctx, next);
+ if (ts_diff_ns(&e->deadline, &now) > 0)
+ continue;
+
+ log_warn("Re-key timed out for flow %d.", e->flow_id);
+ reg_flow_clear_in_flight(e->flow_id);
+ rekey_drop(e);
+ }
+}
+
+/*
+ * Responder side: process a re-key request, install the pending seed and
+ * send the response.
+ *
+ * flow: flow the request belongs to.
+ * pk:   OAP re-key request as handed to oap_srv_process().
+ *
+ * Returns 0 when the request was handled or deliberately ignored (own
+ * exchange in flight, duplicate request); -EBADF if the flow is unknown or
+ * disappeared, -ENAME if no name info is available, or the error from
+ * oap_srv_process(). A failure to wrap or send the response is logged but
+ * still returns 0, since the pending seed has already been installed.
+ */
+static int rekey_respond(struct flow_info * flow,
+                         buffer_t *         pk)
+{
+        struct name_info info;
+        struct crypt_sk  sk;
+        uint8_t          kbuf[SYMMKEYSZ];
+        buffer_t         rsp  = BUF_INIT;
+        buffer_t         upd  = BUF_INIT;
+        buffer_t         data = BUF_INIT;
+        buffer_t         crt  = BUF_INIT;
+        uint8_t          newgen;
+        int              epoch;
+        int              err;
+
+        epoch = reg_flow_get_epoch(flow->id);
+        if (epoch < 0) {
+                log_warn("Re-key for unknown flow %d.", flow->id);
+                return -EBADF;
+        }
+
+        /* Collision: we are driving our own exchange; let it win. */
+        if (reg_flow_rekey_should_yield(flow->id)) {
+                log_dbg("Yielding to own re-key for flow %d.", flow->id);
+                return 0;
+        }
+
+        if (rekey_name_info(flow->id, &info) < 0) {
+                log_err("Failed to get name info to re-key flow %d.",
+                        flow->id);
+                return -ENAME;
+        }
+
+        if (reg_flow_rekey_pending(flow->id)) {
+                log_dbg("Duplicate re-key request for flow %d.", flow->id);
+                return 0;
+        }
+
+        /* The generation is a 4-bit counter: next epoch, wrapped at 16. */
+        newgen = (uint8_t) ((epoch + 1) & 0x0F);
+        data.data = &newgen;
+        data.len  = 1;
+
+        sk.key = kbuf;
+
+        reg_flow_get_peer_crt(flow->id, &crt);
+
+        err = oap_srv_process(&info, *pk, &rsp, &data, &sk, true, &crt, NULL);
+        if (err < 0) {
+                /* data still points to stack newgen; don't free it. */
+                log_err("Failed to process re-key OAP for flow %d.",
+                        flow->id);
+                goto finish;
+        }
+
+        /* On success oap_srv_process repointed data to client output. */
+        freebuf(data);
+
+        if (reg_flow_store_pending(flow->id, kbuf, newgen, false) < 0) {
+                log_warn("Flow %d gone during re-key.", flow->id);
+                err = -EBADF;
+                goto finish;
+        }
+
+        reg_notify_flow(flow->id, FLOW_UPD);
+
+        if (flow_upd_wrap(&upd, FLOW_UPD_REKEY_RESP, &rsp) == 0) {
+                /* Cover the key and buffers while the send may be cancelled. */
+                pthread_cleanup_push(rk_clear_key, kbuf);
+                pthread_cleanup_push(rk_freebuf, &rsp);
+                pthread_cleanup_push(rk_freebuf, &crt);
+                pthread_cleanup_push(rk_freebuf, &upd);
+                if (ipcp_flow_update(flow, upd) < 0)
+                        log_err("Failed to send re-key response for flow %d.",
+                                flow->id);
+                pthread_cleanup_pop(false);
+                pthread_cleanup_pop(false);
+                pthread_cleanup_pop(false);
+                pthread_cleanup_pop(false);
+                freebuf(upd);
+        } else {
+                /* Without this the peer times out with no trace on our side. */
+                log_err("Failed to wrap re-key response for flow %d.",
+                        flow->id);
+        }
+
+        err = 0;
+ finish:
+        crypt_secure_clear(kbuf, SYMMKEYSZ);
+        freebuf(rsp);
+        freebuf(crt);
+
+        return err;
+}
+
+/*
+ * Worker-only: re-key a direct (loopback) flow, the exchange runs in-process:
+ * build a client request, then derive the shared seed, and hand the one seed
+ * to both apps with RB_REKEY.
+ *
+ * The new generation is (epoch + 1) & 0x0F. Every failure path calls
+ * reg_flow_clear_in_flight() for the flow before leaving.
+ * NOTE(review): the success path does not call reg_flow_clear_in_flight()
+ * here; presumably reg_flow_store_pending_direct() or the consuming side
+ * resets that state - confirm.
+ */
+static void rekey_do_direct(int flow_id)
+{
+ struct name_info info;
+ struct crypt_sk sk;
+ uint8_t kbuf[SYMMKEYSZ];
+ buffer_t req = BUF_INIT;
+ buffer_t rsp = BUF_INIT;
+ buffer_t data = BUF_INIT;
+ buffer_t crt = BUF_INIT;
+ void * ctx = NULL;
+ uint8_t newgen;
+ int epoch;
+
+ epoch = reg_flow_get_epoch(flow_id);
+ if (epoch < 0) {
+ log_warn("Re-key for unknown flow %d.", flow_id);
+ reg_flow_clear_in_flight(flow_id);
+ return;
+ }
+
+ if (rekey_name_info(flow_id, &info) < 0) {
+ log_err("Failed to get name info to re-key flow %d.", flow_id);
+ reg_flow_clear_in_flight(flow_id);
+ return;
+ }
+
+ /* data is still BUF_INIT here: the client side carries no payload. */
+ if (oap_cli_prepare(&ctx, &info, &req, data, true) < 0) {
+ log_err("Failed to prepare re-key for flow %d.", flow_id);
+ reg_flow_clear_in_flight(flow_id);
+ return;
+ }
+
+ /* Server-side payload: the one-byte new generation. */
+ newgen = (uint8_t) ((epoch + 1) & 0x0F);
+ data.data = &newgen;
+ data.len = 1;
+
+ sk.key = kbuf;
+
+ reg_flow_get_peer_crt(flow_id, &crt);
+
+ if (oap_srv_process(&info, req, &rsp, &data, &sk, true,
+ &crt, NULL) < 0) {
+ /* data still points to stack newgen; don't free it. */
+ log_err("Failed to process re-key OAP for flow %d.", flow_id);
+ reg_flow_clear_in_flight(flow_id);
+ goto out;
+ }
+
+ /* On success oap_srv_process repointed data to its output. */
+ freebuf(data);
+
+ if (reg_flow_store_pending_direct(flow_id, kbuf, newgen) < 0) {
+ log_warn("Flow %d gone during re-key.", flow_id);
+ reg_flow_clear_in_flight(flow_id);
+ goto out;
+ }
+
+ reg_notify_flow_peers(flow_id, FLOW_UPD);
+
+ log_dbg("Re-key completed (direct) for flow %d (gen %u).",
+ flow_id, newgen);
+ out:
+ /* Scrub the seed; ctx is never passed to oap_cli_complete, free it. */
+ crypt_secure_clear(kbuf, SYMMKEYSZ);
+ oap_ctx_free(ctx);
+ freebuf(req);
+ freebuf(rsp);
+ freebuf(crt);
+}
+
+/*
+ * Send one snapshot entry down the matching re-key path: in-process for a
+ * direct flow, over the wire (tracked in tbl) otherwise.
+ */
+static void rekey_dispatch(struct list_head *        tbl,
+                           const struct rekey_info * ri)
+{
+        if (!ri->direct) {
+                rekey_do_initiate(tbl, ri->flow_id, ri->n_1_pid);
+                return;
+        }
+
+        rekey_do_direct(ri->flow_id);
+}
+
+/*
+ * Handle an arriving flow update. The first byte of pk selects the update
+ * type; the rest is queued as a re-key request or response event.
+ * Returns 0 (also when the update is not admitted), or -EINVAL when pk is
+ * empty or carries an unknown type.
+ */
+static int flow_update_arr(struct flow_info * flow,
+                           buffer_t *         pk)
+{
+        uint8_t kind;
+        bool    request;
+
+        if (pk->len < 1)
+                return -EINVAL;
+
+        kind = pk->data[0];
+
+        if (kind == FLOW_UPD_REKEY_REQ) {
+                request = true;
+        } else if (kind == FLOW_UPD_REKEY_RESP) {
+                request = false;
+        } else {
+                log_warn("Unknown flow update type %u.", kind);
+                return -EINVAL;
+        }
+
+        /* Refuse floods/spoofs before a worker event gets allocated. */
+        if (!reg_flow_rekey_arr_admit(flow->id, flow->n_1_pid, request))
+                return 0;
+
+        /* Shift the type byte out in place so data keeps its malloc base. */
+        pk->len -= 1;
+        memmove(pk->data, pk->data + 1, pk->len);
+
+        /* Hand off to the worker; sending a RESP inline deadlocks loopback. */
+        if (!request)
+                rekey_post_resp(flow->id, pk);
+        else
+                rekey_post_req(flow->id, flow->n_1_pid, pk);
+
+        return 0;
+}
+
+/*
+ * Handle a flow update call from an application.
+ *
+ * With rekey set, the caller requests a new key: after the ownership check
+ * (reg_flow_owned_by) a re-key is posted via rekey_post_direct() for direct
+ * flows or rekey_post_init() otherwise, but only if reg_flow_rekey_begin()
+ * grants the exchange. No key is returned on this path.
+ *
+ * Without rekey, reg_flow_take_pending() is asked for a pending seed; when
+ * it returns 0 the seed is copied to sk->key, its generation to sk->epoch
+ * and *has_key is set. initiator is passed through to that call.
+ *
+ * Returns 0, or -EPERM when the ownership check or reg_flow_take_pending()
+ * refuses the caller.
+ */
+static int flow_update(struct flow_info * flow,
+ uid_t uid,
+ pid_t cpid,
+ bool rekey,
+ struct crypt_sk * sk,
+ bool * has_key,
+ bool * initiator)
+{
+ uint8_t seed[SYMMKEYSZ];
+ uint8_t epoch;
+ int rc;
+
+ *has_key = false;
+ *initiator = false;
+
+ if (rekey) {
+ pid_t n_1_pid;
+
+ if (!reg_flow_owned_by(flow->id, uid))
+ return -EPERM;
+
+ /* Direct flows re-key in-process; no lower IPCP carrier. */
+ if (reg_flow_is_direct(flow->id)) {
+ if (reg_flow_rekey_begin(flow->id))
+ rekey_post_direct(flow->id);
+
+ return 0;
+ }
+
+ /* Watermark re-key: the app can't know its lower IPCP. */
+ n_1_pid = reg_flow_get_n_1_pid(flow->id);
+ if (n_1_pid <= 0)
+ return 0;
+
+ /* One exchange per flow; the latch arbitrates collisions. */
+ if (reg_flow_rekey_begin(flow->id))
+ rekey_post_init(flow->id, n_1_pid);
+
+ return 0;
+ }
+
+ rc = reg_flow_take_pending(flow->id, uid, cpid, seed, &epoch,
+ initiator);
+ if (rc == -EPERM)
+ return -EPERM;
+
+ /* Any other non-zero result is reported as success without a key. */
+ if (rc != 0)
+ return 0;
+
+ memcpy(sk->key, seed, SYMMKEYSZ);
+ sk->epoch = epoch;
+ *has_key = true;
+
+ /* The caller now holds the seed in sk->key; scrub the stack copy. */
+ crypt_secure_clear(seed, SYMMKEYSZ);
+
+ log_dbg("Delivered re-key seed for flow %d (gen %u).",
+ flow->id, epoch);
+
+ return 0;
+}
+
+/* Cleanup handler: drop every rekey_ctx still linked in the table o. */
+static void rekey_table_cleanup(void * o)
+{
+        struct list_head * tbl = o;
+        struct list_head * pos;
+        struct list_head * tmp;
+
+        list_for_each_safe(pos, tmp, tbl)
+                rekey_drop(list_entry(pos, struct rekey_ctx, next));
+}
+
+/*
+ * Wait on the inbox condvar until an event is queued or the timed wait
+ * reports ETIMEDOUT for deadline dl, then unlink and return the first
+ * queued event. Returns NULL if the inbox is still empty.
+ */
+static struct rekey_evt * rekey_event_wait(const struct timespec * dl)
+{
+ struct rekey_evt * evt = NULL;
+ int ret = 0;
+
+ pthread_mutex_lock(&irmd.rk.mtx);
+ /* The timed wait is a cancellation point; unlock on cancel. */
+ pthread_cleanup_push(__cleanup_mutex_unlock, &irmd.rk.mtx);
+
+ /* timedwait returns a positive errno; negate to compare. */
+ while (list_is_empty(&irmd.rk.inbox) && ret != -ETIMEDOUT)
+ ret = -pthread_cond_timedwait(&irmd.rk.cond, &irmd.rk.mtx, dl);
+
+ if (!list_is_empty(&irmd.rk.inbox)) {
+ evt = list_first_entry(&irmd.rk.inbox, struct rekey_evt, next);
+ list_del(&evt->next);
+ }
+
+ /* pop(true) runs the handler, which releases the mutex. */
+ pthread_cleanup_pop(true);
+
+ return evt;
+}
+
+/*
+ * Pick the worker's wait deadline: next, unless an entry in tbl carries a
+ * deadline that ts_diff_ns orders before it.
+ */
+static struct timespec rekey_deadline(struct list_head * tbl,
+                                      struct timespec    next)
+{
+        struct list_head * pos;
+
+        /* next is a by-value copy, so it doubles as the running minimum. */
+        list_for_each(pos, tbl) {
+                struct rekey_ctx * ctx;
+
+                ctx = list_entry(pos, struct rekey_ctx, next);
+                if (ts_diff_ns(&ctx->deadline, &next) < 0)
+                        next = ctx->deadline;
+        }
+
+        return next;
+}
+
+/*
+ * Dispatch one inbox event to the handler for its type. The event is
+ * released through rk_free_evt when the function leaves, and also if the
+ * thread is cancelled inside a handler (cleanup handler).
+ */
+static void rekey_handle_evt(struct list_head * tbl,
+ struct rekey_evt * evt)
+{
+ struct flow_info rinfo;
+
+ pthread_cleanup_push(rk_free_evt, evt);
+
+ switch (evt->type) {
+ case REKEY_INIT:
+ rekey_do_initiate(tbl, evt->flow_id, evt->n_1_pid);
+ break;
+ case REKEY_REQ:
+ /* Rebuild a minimal flow_info: only id and n_1_pid are set. */
+ memset(&rinfo, 0, sizeof(rinfo));
+ rinfo.id = evt->flow_id;
+ rinfo.n_1_pid = evt->n_1_pid;
+ /* Result is not checked; rekey_respond logs its own failures. */
+ rekey_respond(&rinfo, &evt->buf);
+ reg_flow_rekey_arr_done(evt->flow_id, true);
+ break;
+ case REKEY_RESP:
+ rekey_do_complete(tbl, evt->flow_id, evt->buf);
+ reg_flow_rekey_arr_done(evt->flow_id, false);
+ break;
+ case REKEY_DIRECT:
+ rekey_do_direct(evt->flow_id);
+ break;
+ default:
+ break;
+ }
+
+ /* pop(true) runs rk_free_evt on the normal path. */
+ pthread_cleanup_pop(true);
+}
+
+/*
+ * Periodic tick: once *next is no longer ahead of the clock, dispatch the
+ * flows reported due (at most REKEY_BATCH per tick) and re-arm *next
+ * OAP_REKEY_TIMER seconds from now.
+ */
+static void rekey_run_periodic(struct list_head * tbl,
+                               struct timespec *  next)
+{
+        struct rekey_info due[REKEY_BATCH];
+        struct timespec   now;
+        int               cnt;
+        int               idx = 0;
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+        if (ts_diff_ns(next, &now) > 0)
+                return;
+
+        cnt = reg_flow_snapshot_rekey_due(due, REKEY_BATCH);
+        while (idx < cnt)
+                rekey_dispatch(tbl, &due[idx++]);
+
+        clock_gettime(PTHREAD_COND_CLOCK, next);
+        next->tv_sec += OAP_REKEY_TIMER;
+}
+
+/*
+ * Single worker owning all in-flight Tier-2 re-keys. It drains the
+ * inbox, runs the periodic snapshot, and reaps timed-out exchanges.
+ * The table is touched only here, so it needs no lock.
+ *
+ * The loop has no exit of its own; entries left in the table are dropped
+ * by rekey_table_cleanup when the thread is cancelled.
+ */
+static void * rekey_worker(void * o)
+{
+ struct list_head table;
+ struct timespec next;
+
+ (void) o;
+
+ list_head_init(&table);
+
+ /* First periodic snapshot is due OAP_REKEY_TIMER seconds from now. */
+ clock_gettime(PTHREAD_COND_CLOCK, &next);
+ next.tv_sec += OAP_REKEY_TIMER;
+
+ pthread_cleanup_push(rekey_table_cleanup, &table);
+
+ while (true) {
+ struct rekey_evt * evt;
+ struct timespec deadline;
+
+ /* Wake for the periodic tick or an entry deadline. */
+ deadline = rekey_deadline(&table, next);
+
+ evt = rekey_event_wait(&deadline);
+
+ if (evt != NULL)
+ rekey_handle_evt(&table, evt);
+
+ rekey_run_periodic(&table, &next);
+
+ rekey_reap_expired(&table);
+ }
+
+ /* Not reached; pairs lexically with the push above. */
+ pthread_cleanup_pop(true);
+
+ return (void *) 0;
+}
+
static void * acceptloop(void * o)
{
int csockfd;
@@ -1491,6 +2300,11 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg,
struct timespec now;
struct timespec ts = TIMESPEC_INIT_S(0); /* static analysis */
int res;
+ bool has_key = false;
+ bool initiator = false;
+ uid_t uid;
+ gid_t gid;
+ pid_t cpid;
irm_msg_t * ret_msg;
buffer_t data;
@@ -1557,7 +2371,7 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg,
case IRM_MSG_CODE__IRM_PROC_ANNOUNCE:
proc.pid = msg->pid;
strcpy(proc.prog, msg->prog);
- res = get_peer_ids(fd, &proc.uid, &proc.gid);
+ res = get_peer_ids(fd, &proc.uid, &proc.gid, NULL);
if (res < 0)
log_err("Failed to get UID/GID for pid %d.", msg->pid);
else
@@ -1600,26 +2414,29 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg,
flow = flow_info_msg_to_s(msg->flow_info);
sk.key = kbuf;
res = flow_accept(&flow, &data, abstime, &sk);
- if (res == 0) {
- ret_msg->flow_info = flow_info_s_to_msg(&flow);
- ret_msg->has_pk = data.len != 0;
- ret_msg->pk.data = data.data;
- ret_msg->pk.len = data.len;
- ret_msg->has_cipher_nid = true;
- ret_msg->cipher_nid = sk.nid;
- if (sk.nid != NID_undef) {
- hbuf = malloc(SYMMKEYSZ);
- if (hbuf == NULL) {
- log_err("Failed to malloc key buf");
- return NULL;
- }
-
- memcpy(hbuf, kbuf, SYMMKEYSZ);
- ret_msg->sym_key.data = hbuf;
- ret_msg->sym_key.len = SYMMKEYSZ;
- ret_msg->has_sym_key = true;
- }
+ if (res != 0)
+ break;
+
+ ret_msg->flow_info = flow_info_s_to_msg(&flow);
+ ret_msg->has_pk = data.len != 0;
+ ret_msg->pk.data = data.data;
+ ret_msg->pk.len = data.len;
+ ret_msg->has_cipher_nid = true;
+ ret_msg->cipher_nid = sk.nid;
+ if (sk.nid == NID_undef)
+ break;
+
+ hbuf = malloc(SYMMKEYSZ);
+ if (hbuf == NULL) {
+ log_err("Failed to malloc key buf");
+ res = -ENOMEM;
+ break;
}
+
+ memcpy(hbuf, kbuf, SYMMKEYSZ);
+ ret_msg->sym_key.data = hbuf;
+ ret_msg->sym_key.len = SYMMKEYSZ;
+ ret_msg->has_sym_key = true;
break;
case IRM_MSG_CODE__IRM_FLOW_ALLOC:
data.len = msg->pk.len;
@@ -1630,25 +2447,29 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg,
abstime = abstime == NULL ? &max : abstime;
sk.key = kbuf;
res = flow_alloc(msg->dst, &flow, &data, abstime, &sk);
- if (res == 0) {
- ret_msg->flow_info = flow_info_s_to_msg(&flow);
- ret_msg->has_pk = data.len != 0;
- ret_msg->pk.data = data.data;
- ret_msg->pk.len = data.len;
- ret_msg->has_cipher_nid = true;
- ret_msg->cipher_nid = sk.nid;
- if (sk.nid != NID_undef) {
- hbuf = malloc(SYMMKEYSZ);
- if (hbuf == NULL) {
- log_err("Failed to malloc key buf");
- return NULL;
- }
- memcpy(hbuf, kbuf, SYMMKEYSZ);
- ret_msg->sym_key.data = hbuf;
- ret_msg->sym_key.len = SYMMKEYSZ;
- ret_msg->has_sym_key = true;
- }
+ if (res != 0)
+ break;
+
+ ret_msg->flow_info = flow_info_s_to_msg(&flow);
+ ret_msg->has_pk = data.len != 0;
+ ret_msg->pk.data = data.data;
+ ret_msg->pk.len = data.len;
+ ret_msg->has_cipher_nid = true;
+ ret_msg->cipher_nid = sk.nid;
+ if (sk.nid == NID_undef)
+ break;
+
+ hbuf = malloc(SYMMKEYSZ);
+ if (hbuf == NULL) {
+ log_err("Failed to malloc key buf");
+ res = -ENOMEM;
+ break;
}
+
+ memcpy(hbuf, kbuf, SYMMKEYSZ);
+ ret_msg->sym_key.data = hbuf;
+ ret_msg->sym_key.len = SYMMKEYSZ;
+ ret_msg->has_sym_key = true;
break;
case IRM_MSG_CODE__IRM_FLOW_JOIN:
assert(msg->pk.len == 0 && msg->pk.data == NULL);
@@ -1687,6 +2508,51 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg,
flow = flow_info_msg_to_s(msg->flow_info);
res = flow_alloc_reply(&flow, msg->response, &data);
break;
+ case IRM_MSG_CODE__IPCP_FLOW_UPDATE_ARR:
+ data.len = msg->pk.len;
+ data.data = msg->pk.data;
+ msg->pk.data = NULL; /* pass data */
+ msg->pk.len = 0;
+ flow = flow_info_msg_to_s(msg->flow_info);
+ res = flow_update_arr(&flow, &data);
+ freebuf(data);
+ break;
+ case IRM_MSG_CODE__IRM_FLOW_UPDATE:
+ flow = flow_info_msg_to_s(msg->flow_info);
+ if (get_peer_ids(fd, &uid, &gid, &cpid) < 0) {
+ res = -EPERM;
+ break;
+ }
+
+ if (cpid <= 0) /* non-Linux: fall back to asserted pid */
+ cpid = flow.n_pid;
+
+ sk.key = kbuf;
+ res = flow_update(&flow, uid, cpid, msg->rekey, &sk, &has_key,
+ &initiator);
+ if (res != 0)
+ break;
+
+ ret_msg->flow_info = flow_info_s_to_msg(&flow);
+ if (!has_key)
+ break;
+
+ hbuf = malloc(SYMMKEYSZ);
+ if (hbuf == NULL) {
+ log_err("Failed to malloc key buf");
+ res = -ENOMEM;
+ break;
+ }
+
+ memcpy(hbuf, kbuf, SYMMKEYSZ);
+ ret_msg->sym_key.data = hbuf;
+ ret_msg->sym_key.len = SYMMKEYSZ;
+ ret_msg->has_sym_key = true;
+ ret_msg->has_generation = true;
+ ret_msg->generation = sk.epoch;
+ ret_msg->has_rk_initiator = true;
+ ret_msg->rk_initiator = initiator;
+ break;
default:
log_err("Don't know that message code.");
res = -1;
@@ -1706,6 +2572,13 @@ static irm_msg_t * do_command_msg(irm_msg_t * msg,
return ret_msg;
}
+/*
+ * Scrub the symmetric key carried in a reply message, if it has one, so
+ * that freeing the message afterwards does not leave the key behind.
+ */
+static void clear_msg_key(irm_msg_t * msg)
+{
+        if (msg == NULL)
+                return;
+
+        if (!msg->has_sym_key)
+                return;
+
+        crypt_secure_clear(msg->sym_key.data, msg->sym_key.len);
+}
+
static void * mainloop(void * o)
{
int sfd;
@@ -1717,6 +2590,7 @@ static void * mainloop(void * o)
while (true) {
irm_msg_t * ret_msg;
struct cmd * cmd;
+ bool had_key;
pthread_mutex_lock(&irmd.cmd_lock);
@@ -1780,6 +2654,9 @@ static void * mainloop(void * o)
irm_msg__pack(ret_msg, buffer.data);
+ had_key = ret_msg->has_sym_key;
+ clear_msg_key(ret_msg);
+
irm_msg__free_unpacked(ret_msg, NULL);
pthread_cleanup_push(__cleanup_close_ptr, &sfd);
@@ -1794,6 +2671,9 @@ static void * mainloop(void * o)
strerror(errno));
}
+ if (had_key)
+ crypt_secure_clear(buffer.data, buffer.len);
+
pthread_cleanup_pop(true);
pthread_cleanup_pop(true);
@@ -1801,6 +2681,7 @@ static void * mainloop(void * o)
continue;
fail:
+ clear_msg_key(ret_msg);
irm_msg__free_unpacked(ret_msg, NULL);
fail_msg:
close(sfd);
@@ -1884,12 +2765,14 @@ void * irm_sanitize(void * o)
return (void *) 0;
}
-static int irm_load_store(char * dpath)
+static int irm_load_store(char * dpath,
+ bool anchor)
{
struct stat st;
struct dirent * dent;
DIR * dir;
void * crt;
+ int ret;
if (stat(dpath, &st) == -1) {
log_dbg("Store directory %s not found.", dpath);
@@ -1933,7 +2816,9 @@ static int irm_load_store(char * dpath)
goto fail_file;
}
- if (oap_auth_add_ca_crt(crt) < 0) {
+ ret = anchor ? oap_auth_add_ca_crt(crt)
+ : oap_auth_add_chain_crt(crt);
+ if (ret < 0) {
log_err("Failed to add certificate from %s to store.",
path);
goto fail_crt_add;
@@ -2030,6 +2915,29 @@ static int irm_init(void)
list_head_init(&irmd.cmds);
+ /* Re-key worker inbox: mutex, condvar on PTHREAD_COND_CLOCK, event list. */
+ if (pthread_mutex_init(&irmd.rk.mtx, NULL)) {
+ log_err("Failed to initialize mutex.");
+ goto fail_rk_mtx;
+ }
+
+ if (pthread_condattr_init(&cattr)) {
+ log_err("Failed to initialize condattr.");
+ /* rk.mtx is live here: unwind via fail_rk_cond to destroy it. */
+ goto fail_rk_cond;
+ }
+
+#ifndef __APPLE__
+ pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK);
+#endif
+ if (pthread_cond_init(&irmd.rk.cond, &cattr)) {
+ log_err("Failed to initialize condvar.");
+ pthread_condattr_destroy(&cattr);
+ goto fail_rk_cond;
+ }
+
+ pthread_condattr_destroy(&cattr);
+
+ list_head_init(&irmd.rk.inbox);
+
+
if (stat(SOCK_PATH, &st) == -1) {
if (mkdir(SOCK_PATH, 0777)) {
log_err("Failed to create sockets directory.");
@@ -2077,12 +2985,12 @@ static int irm_init(void)
goto fail_oap;
}
- if (irm_load_store(OUROBOROS_CA_CRT_DIR) < 0) {
+ if (irm_load_store(OUROBOROS_CA_CRT_DIR, true) < 0) {
log_err("Failed to load CA certificates.");
goto fail_load_store;
}
- if (irm_load_store(OUROBOROS_CHAIN_DIR) < 0) {
+ if (irm_load_store(OUROBOROS_CHAIN_DIR, false) < 0) {
log_err("Failed to load intermediate certificates.");
goto fail_load_store;
}
@@ -2133,6 +3041,10 @@ static int irm_init(void)
fail_sock_path:
unlink(IRM_SOCK_PATH);
fail_stat:
+ pthread_cond_destroy(&irmd.rk.cond);
+ fail_rk_cond:
+ pthread_mutex_destroy(&irmd.rk.mtx);
+ fail_rk_mtx:
pthread_cond_destroy(&irmd.cmd_cond);
fail_cmd_cond:
pthread_mutex_destroy(&irmd.cmd_lock);
@@ -2181,13 +3093,28 @@ static void irm_fini(void)
pthread_mutex_unlock(&irmd.cmd_lock);
+ pthread_mutex_lock(&irmd.rk.mtx);
+
+ list_for_each_safe(p, h, &irmd.rk.inbox) {
+ struct rekey_evt * evt;
+ evt = list_entry(p, struct rekey_evt, next);
+ list_del(&evt->next);
+ freebuf(evt->buf);
+ free(evt);
+ }
+
+ pthread_mutex_unlock(&irmd.rk.mtx);
+
pthread_mutex_destroy(&irmd.cmd_lock);
pthread_cond_destroy(&irmd.cmd_cond);
+ pthread_mutex_destroy(&irmd.rk.mtx);
+ pthread_cond_destroy(&irmd.rk.cond);
pthread_rwlock_destroy(&irmd.state_lock);
#ifdef HAVE_FUSE
while (rmdir(FUSE_PREFIX) < 0 && retries-- > 0)
nanosleep(&wait, NULL);
+
if (retries < 0)
log_err("Failed to remove " FUSE_PREFIX);
#endif
@@ -2220,10 +3147,18 @@ static int irm_start(void)
if (pthread_create(&irmd.acceptor, NULL, acceptloop, NULL))
goto fail_acceptor;
+ if (OAP_REKEY_TIMER > 0) {
+ if (pthread_create(&irmd.rk.worker, NULL, rekey_worker, NULL))
+ goto fail_rekey_worker;
+ }
+
log_info("Ouroboros IPC Resource Manager daemon started...");
return 0;
+ fail_rekey_worker:
+ pthread_cancel(irmd.acceptor);
+ pthread_join(irmd.acceptor, NULL);
fail_acceptor:
pthread_cancel(irmd.irm_sanitize);
pthread_join(irmd.irm_sanitize, NULL);
@@ -2263,6 +3198,11 @@ static void irm_sigwait(sigset_t sigset)
static void irm_stop(void)
{
+ if (OAP_REKEY_TIMER > 0) {
+ pthread_cancel(irmd.rk.worker);
+ pthread_join(irmd.rk.worker, NULL);
+ }
+
pthread_cancel(irmd.acceptor);
pthread_cancel(irmd.irm_sanitize);
@@ -2383,26 +3323,31 @@ int main(int argc,
goto fail_irm_init;
}
- if (irm_init() < 0)
+ if (crypt_secure_malloc_init(IRMD_SECMEM_MAX) < 0) {
+ log_err("Failed to initialize secure memory allocation.");
+ goto fail_secmem;
+ }
+
+ if (irm_init() < 0) {
+ log_err("Failed to initialize IRMd.");
goto fail_irm_init;
+ }
if (reg_init() < 0) {
log_err("Failed to initialize registry.");
goto fail_reg;
}
- if (crypt_secure_malloc_init(IRMD_SECMEM_MAX) < 0) {
- log_err("Failed to initialize secure memory allocation.");
- goto fail_reg;
- }
-
pthread_sigmask(SIG_BLOCK, &sigset, NULL);
- if (irm_start() < 0)
+ if (irm_start() < 0) {
+ log_err("Failed to start IRMd.");
goto fail_irm_start;
+ }
#ifdef HAVE_TOML
if (irm_configure(irmd.cfg_file) < 0) {
+ log_err("Failed to load IRMd configuration.");
irmd_set_state(IRMD_SHUTDOWN);
ret = EXIT_FAILURE;
}
@@ -2415,15 +3360,16 @@ int main(int argc,
pthread_sigmask(SIG_UNBLOCK, &sigset, NULL);
- crypt_secure_malloc_fini();
- crypt_cleanup();
-
reg_clear();
reg_fini();
irm_fini();
+ crypt_secure_malloc_fini();
+
+ crypt_cleanup();
+
log_info("Ouroboros IPC Resource Manager daemon exited. Bye.");
log_fini();
@@ -2435,5 +3381,8 @@ int main(int argc,
fail_reg:
irm_fini();
fail_irm_init:
+ crypt_secure_malloc_fini();
+ crypt_cleanup();
+ fail_secmem:
exit(EXIT_FAILURE);
}
diff --git a/src/irmd/oap.c b/src/irmd/oap.c
deleted file mode 100644
index 1831f533..00000000
--- a/src/irmd/oap.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Ouroboros - Copyright (C) 2016 - 2026
- *
- * OAP - Shared credential and configuration loading
- *
- * Dimitri Staessens <dimitri@ouroboros.rocks>
- * Sander Vrijders <sander@ouroboros.rocks>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., http://www.fsf.org/about/contact/.
- */
-
-#if defined(__linux__) || defined(__CYGWIN__)
- #define _DEFAULT_SOURCE
-#else
- #define _POSIX_C_SOURCE 200809L
-#endif
-
-#define OUROBOROS_PREFIX "irmd/oap"
-
-#include <ouroboros/crypt.h>
-#include <ouroboros/errno.h>
-#include <ouroboros/logs.h>
-
-#include "config.h"
-
-#include <assert.h>
-#include <string.h>
-#include <sys/stat.h>
-
-/*
- * Shared credential and configuration loading helpers
- */
-
-#ifndef OAP_TEST_MODE
-
-static bool file_exists(const char * path)
-{
- struct stat s;
-
- if (stat(path, &s) < 0 && errno == ENOENT) {
- log_dbg("File %s does not exist.", path);
- return false;
- }
-
- return true;
-}
-
-int load_credentials(const char * name,
- const struct name_sec_paths * paths,
- void ** pkp,
- void ** crt)
-{
- assert(paths != NULL);
- assert(pkp != NULL);
- assert(crt != NULL);
-
- *pkp = NULL;
- *crt = NULL;
-
- if (!file_exists(paths->crt) || !file_exists(paths->key)) {
- log_info("No authentication certificates for %s.", name);
- return 0;
- }
-
- if (crypt_load_crt_file(paths->crt, crt) < 0) {
- log_err("Failed to load %s for %s.", paths->crt, name);
- goto fail_crt;
- }
-
- if (crypt_load_privkey_file(paths->key, pkp) < 0) {
- log_err("Failed to load %s for %s.", paths->key, name);
- goto fail_key;
- }
-
- log_info("Loaded authentication certificates for %s.", name);
-
- return 0;
-
- fail_key:
- crypt_free_crt(*crt);
- *crt = NULL;
- fail_crt:
- return -EAUTH;
-}
-
-int load_kex_config(const char * name,
- const char * path,
- struct sec_config * cfg)
-{
- assert(name != NULL);
- assert(cfg != NULL);
-
- memset(cfg, 0, sizeof(*cfg));
-
- /* Load encryption config */
- if (!file_exists(path))
- log_dbg("No encryption %s for %s.", path, name);
-
- if (load_sec_config_file(cfg, path) < 0) {
- log_warn("Failed to load %s for %s.", path, name);
- return -1;
- }
-
- if (!IS_KEX_ALGO_SET(cfg)) {
- log_info("Key exchange not configured for %s.", name);
- return 0;
- }
-
- if (cfg->c.nid == NID_undef || crypt_nid_to_str(cfg->c.nid) == NULL) {
- log_err("Invalid cipher NID %d for %s.", cfg->c.nid, name);
- return -ECRYPT;
- }
-
- log_info("Encryption enabled for %s.", name);
-
- return 0;
-}
-
-#endif /* OAP_TEST_MODE */
diff --git a/src/irmd/oap.h b/src/irmd/oap.h
index d6d8dfe2..86f11e21 100644
--- a/src/irmd/oap.h
+++ b/src/irmd/oap.h
@@ -28,6 +28,8 @@
#include <ouroboros/name.h>
#include <ouroboros/utils.h>
+#include <stdbool.h>
+
/* OAP authentication state (in oap/auth.c) */
int oap_auth_init(void);
@@ -35,31 +37,46 @@ void oap_auth_fini(void);
int oap_auth_add_ca_crt(void * crt);
+int oap_auth_add_chain_crt(void * crt);
+
/*
* Prepare OAP request header for server, returns context
* Passes client data for srv, returns srv data for client
+ * rekey forces ephemeral server-encap KEX (no client-encap; preserves FS/PCS)
*/
int oap_cli_prepare(void ** ctx,
const struct name_info * info,
buffer_t * req_buf,
- buffer_t data);
+ buffer_t data,
+ bool rekey);
/*
* Server processes header, creates response header, returns secret key.
* data is in/out: input=srv data to send, output=cli data received.
+ * rekey drops the cert and verifies against cached_crt; peer_crt (or NULL)
+ * receives a copy of the peer cert to cache at the initial handshake.
*/
int oap_srv_process(const struct name_info * info,
buffer_t req_buf,
buffer_t * rsp_buf,
buffer_t * data,
- struct crypt_sk * sk);
+ struct crypt_sk * sk,
+ bool rekey,
+ const buffer_t * cached_crt,
+ buffer_t * peer_crt);
-/* Complete OAP, returns secret key and server data, frees ctx */
+/*
+ * Complete OAP, returns secret key and server data, frees ctx.
+ * cached_crt verifies a cert-less re-key; peer_crt (or NULL) receives a
+ * copy of the peer cert to cache at the initial handshake.
+ */
int oap_cli_complete(void * ctx,
const struct name_info * info,
buffer_t rsp_buf,
buffer_t * data,
- struct crypt_sk * sk);
+ struct crypt_sk * sk,
+ const buffer_t * cached_crt,
+ buffer_t * peer_crt);
/* Free OAP state (on failure before complete) */
void oap_ctx_free(void * ctx);
diff --git a/src/irmd/oap/auth.c b/src/irmd/oap/auth.c
index 4b86f055..f70f9df1 100644
--- a/src/irmd/oap/auth.c
+++ b/src/irmd/oap/auth.c
@@ -29,8 +29,8 @@
#define OUROBOROS_PREFIX "irmd/oap"
#include <ouroboros/crypt.h>
+#include <ouroboros/endian.h>
#include <ouroboros/errno.h>
-#include <ouroboros/list.h>
#include <ouroboros/logs.h>
#include <ouroboros/pthread.h>
#include <ouroboros/time.h>
@@ -44,38 +44,99 @@
#include <stdlib.h>
#include <string.h>
-struct oap_replay_entry {
- struct list_head next;
- uint64_t timestamp;
- uint8_t id[OAP_ID_SIZE];
+/*
+ * Replay cache: three timestamp-generation hash buckets. A header's bucket
+ * is gen(T) = T / OAP_REPLAY_TIMER, taken mod 3. Staleness bounds a valid T
+ * to generations {G-1, G, G+1} (G is now's generation; a within-slack future
+ * stamp can reach G+1), which are distinct mod 3; the aliasing generation
+ * G-3 is always rejected as too old first. Each bucket is an open-addressed
+ * hash set whose slots are live iff slot.gen == bucket.gen, so a stale bucket
+ * clears in O(1) by bumping its gen. Overflow fails closed (reject), never
+ * evicts, so a flood cannot displace a genuine entry into a replayable state.
+ */
+#define OAP_REPLAY_GENS 3
+
+struct oap_replay_slot {
+ uint64_t gen; /* live iff == bucket gen; 0 = never used */
+ uint64_t ts;
+ uint8_t id[OAP_ID_SIZE];
+};
+
+struct oap_replay_bucket {
+ uint64_t gen;
+ size_t count;
+ struct oap_replay_slot * slots;
};
static struct {
struct auth_ctx * ca_ctx;
struct {
- struct list_head list;
- pthread_mutex_t mtx;
+ size_t mask; /* slots per bucket - 1 */
+ size_t cap; /* fail-closed threshold */
+ struct oap_replay_bucket bucket[OAP_REPLAY_GENS];
+ pthread_mutex_t mtx;
} replay;
} oap_auth;
+/*
+ * 64-bit FNV-1a over the OAP_ID_SIZE id bytes followed by the eight bytes
+ * of ts, least significant first. The caller masks the result down to a
+ * slot index.
+ */
+static size_t replay_hash(const uint8_t * id,
+                          uint64_t        ts)
+{
+        const uint64_t prime = 1099511628211ULL;
+        uint64_t       acc   = 14695981039346656037ULL;
+        size_t         n;
+
+        for (n = 0; n < OAP_ID_SIZE; n++)
+                acc = (acc ^ id[n]) * prime;
+
+        /* ts is a by-value copy: shift it down one byte per round. */
+        for (n = 0; n < sizeof(ts); n++, ts >>= 8)
+                acc = (acc ^ (uint8_t) ts) * prime;
+
+        return (size_t) acc;
+}
+
int oap_auth_init(void)
{
+ size_t m = 1;
+ int i;
+
oap_auth.ca_ctx = auth_create_ctx();
if (oap_auth.ca_ctx == NULL) {
log_err("Failed to create OAP auth context.");
goto fail_ctx;
}
- list_head_init(&oap_auth.replay.list);
+ while (m < (size_t) OAP_REPLAY_MAX * 2)
+ m <<= 1;
+
+ oap_auth.replay.mask = m - 1;
+ oap_auth.replay.cap = OAP_REPLAY_MAX;
+
+ for (i = 0; i < OAP_REPLAY_GENS; i++) {
+ struct oap_replay_bucket * b = &oap_auth.replay.bucket[i];
+ b->gen = 0;
+ b->count = 0;
+ b->slots = calloc(m, sizeof(*b->slots));
+ if (b->slots == NULL) {
+ log_err("Failed to alloc OAP replay bucket.");
+ goto fail_bucket;
+ }
+ }
if (pthread_mutex_init(&oap_auth.replay.mtx, NULL)) {
log_err("Failed to init OAP replay mutex.");
- goto fail_mtx;
+ goto fail_bucket;
}
return 0;
- fail_mtx:
+ fail_bucket:
+ for (i = 0; i < OAP_REPLAY_GENS; i++)
+ free(oap_auth.replay.bucket[i].slots);
+
auth_destroy_ctx(oap_auth.ca_ctx);
fail_ctx:
return -1;
@@ -83,16 +144,13 @@ int oap_auth_init(void)
void oap_auth_fini(void)
{
- struct list_head * p;
- struct list_head * h;
+ int i;
pthread_mutex_lock(&oap_auth.replay.mtx);
- list_for_each_safe(p, h, &oap_auth.replay.list) {
- struct oap_replay_entry * e;
- e = list_entry(p, struct oap_replay_entry, next);
- list_del(&e->next);
- free(e);
+ for (i = 0; i < OAP_REPLAY_GENS; i++) {
+ free(oap_auth.replay.bucket[i].slots);
+ oap_auth.replay.bucket[i].slots = NULL;
}
pthread_mutex_unlock(&oap_auth.replay.mtx);
@@ -106,18 +164,214 @@ int oap_auth_add_ca_crt(void * crt)
return auth_add_crt_to_store(oap_auth.ca_ctx, crt);
}
+/*
+ * Pass crt to auth_add_crt_to_chain() on the shared OAP auth context and
+ * return that call's result. Counterpart of oap_auth_add_ca_crt(), which
+ * uses auth_add_crt_to_store() instead.
+ */
+int oap_auth_add_chain_crt(void * crt)
+{
+ return auth_add_crt_to_chain(oap_auth.ca_ctx, crt);
+}
+
+/* HKDF info = LABEL (incl. NUL separator) || request-hash [|| response-hash] */
+#define OAP_BIND_LABEL "o7s-oap-bind"
+#define OAP_KC_LABEL "o7s-oap-kc"
+#define OAP_HS_LABEL "o7s-oap-hs"
+
+/*
+ * Digest the concatenation kex || data || crt with md_nid into out->data
+ * and set out->len to the digest length. Empty parts are skipped.
+ * out->data must be provided by the caller (asserted); it presumably needs
+ * room for the md_nid digest, at most MAX_HASH_SIZE - confirm.
+ *
+ * Returns 0, -EINVAL if all three inputs are empty, -ENOMEM if the scratch
+ * buffer cannot be allocated, -ECRYPT if the digest fails.
+ */
+int oap_resp_hash(int md_nid,
+ buffer_t kex,
+ buffer_t data,
+ buffer_t crt,
+ buffer_t * out)
+{
+ buffer_t cat = BUF_INIT;
+ uint8_t * p;
+ ssize_t len;
+
+ assert(out != NULL);
+ assert(out->data != NULL);
+
+ cat.len = kex.len + data.len + crt.len;
+ if (cat.len == 0)
+ return -EINVAL;
+
+ cat.data = malloc(cat.len);
+ if (cat.data == NULL)
+ return -ENOMEM;
+
+ /* Pack the non-empty parts back to back, in kex, data, crt order. */
+ p = cat.data;
+ if (kex.len > 0) {
+ memcpy(p, kex.data, kex.len);
+ p += kex.len;
+ }
+
+ if (data.len > 0) {
+ memcpy(p, data.data, data.len);
+ p += data.len;
+ }
+
+ if (crt.len > 0)
+ memcpy(p, crt.data, crt.len);
+
+ len = md_digest(md_nid, cat, out->data);
+
+ freebuf(cat);
+
+ if (len < 0)
+ return -ECRYPT;
+
+ out->len = (size_t) len;
+
+ return 0;
+}
+
+/*
+ * Expand sk->key (SYMMKEYSZ bytes, used as the PRK) with info into outlen
+ * bytes at out. Returns 0 on success, -ECRYPT on failure.
+ */
+static int oap_hkdf_expand(const struct crypt_sk * sk,
+                           buffer_t                info,
+                           uint8_t *               out,
+                           size_t                  outlen)
+{
+        buffer_t key;
+        buffer_t res;
+
+        key.data = sk->key;
+        key.len  = SYMMKEYSZ;
+
+        res.data = out;
+        res.len  = outlen;
+
+        return crypt_hkdf_expand(key, info, res) < 0 ? -ECRYPT : 0;
+}
+
+/* info = label || H(req) */
+#define OAP_HS_INFO_SZ (sizeof(OAP_HS_LABEL) + MAX_HASH_SIZE)
+/*
+ * Derive SYMMKEYSZ bytes into out by HKDF-expanding sk->key with
+ * info = OAP_HS_LABEL (including its terminating NUL) || req_hash.
+ * sk->key itself is not modified.
+ *
+ * Returns 0, -EINVAL if req_hash is empty or longer than MAX_HASH_SIZE,
+ * -ECRYPT if the expansion fails.
+ */
+int oap_derive_hs_key(const struct crypt_sk * sk,
+ buffer_t req_hash,
+ uint8_t * out)
+{
+ uint8_t info_buf[OAP_HS_INFO_SZ];
+ buffer_t info;
+ size_t len;
+
+ assert(sk != NULL);
+ assert(req_hash.data != NULL);
+ assert(out != NULL);
+
+ /* The length check keeps the copy below inside info_buf. */
+ if (req_hash.len == 0 || req_hash.len > MAX_HASH_SIZE)
+ return -EINVAL;
+
+ /* sizeof, not strlen: the label's NUL is part of info. */
+ len = sizeof(OAP_HS_LABEL);
+ memcpy(info_buf, OAP_HS_LABEL, len);
+ memcpy(info_buf + len, req_hash.data, req_hash.len);
+ len += req_hash.len;
+
+ info.len = len;
+ info.data = info_buf;
+
+ return oap_hkdf_expand(sk, info, out, SYMMKEYSZ);
+}
+
+/* info = label || H(req) || H(resp) || cipher_nid || kdf_nid */
+#define OAP_BIND_INFO_SZ \
+        (sizeof(OAP_BIND_LABEL) + 2 * MAX_HASH_SIZE + 2 * sizeof(uint16_t))
+/*
+ * Bind the session key to the exchange: replace sk->key in place with the
+ * HKDF expansion of sk->key over
+ * info = OAP_BIND_LABEL (including its NUL) || req_hash || resp_hash ||
+ *        hton16(sk->nid) || hton16(kdf_nid).
+ * sk->key is left untouched when the function fails.
+ *
+ * Returns 0, -EINVAL if either hash is empty or longer than MAX_HASH_SIZE,
+ * -ECRYPT if the expansion fails.
+ */
+int oap_bind_session_key(struct crypt_sk * sk,
+                         buffer_t          req_hash,
+                         buffer_t          resp_hash,
+                         int               kdf_nid)
+{
+        uint8_t  info_buf[OAP_BIND_INFO_SZ];
+        uint8_t  tmp[SYMMKEYSZ];
+        uint16_t suite[2];
+        buffer_t info;
+        size_t   len;
+
+        assert(sk != NULL);
+        assert(req_hash.data != NULL);
+        assert(resp_hash.data != NULL);
+
+        /* The length checks keep the copies below inside info_buf. */
+        if (req_hash.len == 0 || req_hash.len > MAX_HASH_SIZE)
+                return -EINVAL;
+
+        if (resp_hash.len == 0 || resp_hash.len > MAX_HASH_SIZE)
+                return -EINVAL;
+
+        /* sizeof, not strlen: the label's NUL is part of info. */
+        len = sizeof(OAP_BIND_LABEL);
+        memcpy(info_buf, OAP_BIND_LABEL, len);
+        memcpy(info_buf + len, req_hash.data, req_hash.len);
+        len += req_hash.len;
+
+        memcpy(info_buf + len, resp_hash.data, resp_hash.len);
+        len += resp_hash.len;
+
+        suite[0] = hton16((uint16_t) sk->nid);
+        suite[1] = hton16((uint16_t) kdf_nid);
+        memcpy(info_buf + len, suite, sizeof(suite));
+        len += sizeof(suite);
+
+        info.len  = len;
+        info.data = info_buf;
+
+        /* Expand into tmp first: sk->key is the input of the expansion. */
+        if (oap_hkdf_expand(sk, info, tmp, SYMMKEYSZ) < 0) {
+                /* A failed expansion may still have written key bytes. */
+                crypt_secure_clear(tmp, SYMMKEYSZ);
+                return -ECRYPT;
+        }
+
+        memcpy(sk->key, tmp, SYMMKEYSZ);
+        crypt_secure_clear(tmp, SYMMKEYSZ);
+
+        return 0;
+}
+
+/* info = label || H(req) || H(resp) */
+#define OAP_KC_INFO_SZ (sizeof(OAP_KC_LABEL) + 2 * MAX_HASH_SIZE)
+/*
+ * Compute a key-confirmation tag of outlen bytes into out by
+ * HKDF-expanding sk->key with
+ * info = OAP_KC_LABEL (including its NUL) || req_hash || resp_hash.
+ *
+ * Returns 0, -EINVAL if either hash is empty or longer than MAX_HASH_SIZE
+ * or if outlen exceeds MAX_HASH_SIZE, -ECRYPT if the expansion fails.
+ */
+int oap_key_confirm_tag(const struct crypt_sk * sk,
+ buffer_t req_hash,
+ buffer_t resp_hash,
+ uint8_t * out,
+ size_t outlen)
+{
+ uint8_t info_buf[OAP_KC_INFO_SZ];
+ buffer_t info;
+ size_t len;
+
+ assert(sk != NULL);
+ assert(req_hash.data != NULL);
+ assert(resp_hash.data != NULL);
+ assert(out != NULL);
+
+ /* The length checks keep the copies below inside info_buf. */
+ if (req_hash.len == 0 || req_hash.len > MAX_HASH_SIZE)
+ return -EINVAL;
+
+ if (resp_hash.len == 0 || resp_hash.len > MAX_HASH_SIZE)
+ return -EINVAL;
+
+ if (outlen > MAX_HASH_SIZE)
+ return -EINVAL;
+
+ /* sizeof, not strlen: the label's NUL is part of info. */
+ len = sizeof(OAP_KC_LABEL);
+ memcpy(info_buf, OAP_KC_LABEL, len);
+ memcpy(info_buf + len, req_hash.data, req_hash.len);
+ len += req_hash.len;
+
+ memcpy(info_buf + len, resp_hash.data, resp_hash.len);
+ len += resp_hash.len;
+
+ info.len = len;
+ info.data = info_buf;
+
+ return oap_hkdf_expand(sk, info, out, outlen);
+}
+
#define TIMESYNC_SLACK 100 /* ms */
#define ID_IS_EQUAL(id1, id2) (memcmp(id1, id2, OAP_ID_SIZE) == 0)
int oap_check_hdr(const struct oap_hdr * hdr)
{
- struct list_head * p;
- struct list_head * h;
- struct timespec now;
- struct oap_replay_entry * new;
- uint64_t stamp;
- uint64_t cur;
- uint8_t * id;
- ssize_t delta;
+ struct oap_replay_bucket * b;
+ struct oap_replay_slot * slots;
+ struct timespec now;
+ uint64_t stamp;
+ uint64_t cur;
+ uint64_t gen;
+ uint8_t * id;
+ size_t h;
+ ssize_t delta;
+ int ret = 0;
assert(hdr != NULL);
@@ -131,63 +385,72 @@ int oap_check_hdr(const struct oap_hdr * hdr)
delta = (ssize_t)(cur - stamp) / MILLION;
if (delta < -TIMESYNC_SLACK) {
log_err_id(id, "OAP header from %zd ms into future.", -delta);
- goto fail_stamp;
+ return -EAUTH;
}
if (delta > OAP_REPLAY_TIMER * 1000) {
log_err_id(id, "OAP header too old (%zd ms).", delta);
- goto fail_stamp;
+ return -EAUTH;
}
- new = malloc(sizeof(*new));
- if (new == NULL) {
- log_err_id(id, "Failed to allocate memory for OAP element.");
- goto fail_stamp;
- }
+ gen = stamp / ((uint64_t) OAP_REPLAY_TIMER * BILLION);
pthread_mutex_lock(&oap_auth.replay.mtx);
- list_for_each_safe(p, h, &oap_auth.replay.list) {
- struct oap_replay_entry * e;
- e = list_entry(p, struct oap_replay_entry, next);
- if (cur > e->timestamp + OAP_REPLAY_TIMER * BILLION) {
- list_del(&e->next);
- free(e);
- continue;
- }
+ b = &oap_auth.replay.bucket[gen % OAP_REPLAY_GENS];
- if (e->timestamp == stamp && ID_IS_EQUAL(e->id, id)) {
- log_warn_id(id, "OAP header already known.");
- goto fail_replay;
- }
+ /* Rotate a stale bucket in O(1): its old-gen slots become free. */
+ if (b->gen != gen) {
+ b->gen = gen;
+ b->count = 0;
}
- memcpy(new->id, id, OAP_ID_SIZE);
- new->timestamp = stamp;
+ slots = b->slots;
- list_add_tail(&new->next, &oap_auth.replay.list);
+ h = replay_hash(id, stamp) & oap_auth.replay.mask;
+ while (slots[h].gen == gen) {
+ if (slots[h].ts == stamp && ID_IS_EQUAL(slots[h].id, id)) {
+ log_warn_id(id, "OAP header already known.");
+ ret = -EREPLAY;
+ goto out;
+ }
- pthread_mutex_unlock(&oap_auth.replay.mtx);
+ h = (h + 1) & oap_auth.replay.mask;
+ }
- return 0;
+ /* Empty slot found; fail closed when the window is at capacity. */
+ if (b->count >= oap_auth.replay.cap) {
+ log_warn_id(id, "OAP replay cache full; rejecting.");
+ ret = -EAUTH;
+ goto out;
+ }
- fail_replay:
+ slots[h].gen = gen;
+ slots[h].ts = stamp;
+ memcpy(slots[h].id, id, OAP_ID_SIZE);
+ b->count++;
+ out:
pthread_mutex_unlock(&oap_auth.replay.mtx);
- free(new);
- fail_stamp:
- return -EAUTH;
+
+ return ret;
}
-int oap_auth_peer(char * name,
- const struct oap_hdr * local_hdr,
- const struct oap_hdr * peer_hdr)
+int oap_auth_peer(char * name,
+ const struct sec_config * cfg,
+ const struct oap_hdr * local_hdr,
+ const struct oap_hdr * peer_hdr,
+ const buffer_t * cached_crt)
{
void * crt;
void * pk = NULL;
- buffer_t sign; /* Signed region */
+ void * pin = NULL;
+ buffer_t crt_der; /* cert source: wire, else cached (re-key) */
+ buffer_t sign; /* Signed region */
uint8_t * id = peer_hdr->id.data;
+ int ret;
assert(name != NULL);
+ assert(cfg != NULL);
assert(local_hdr != NULL);
assert(peer_hdr != NULL);
@@ -196,13 +459,22 @@ int oap_auth_peer(char * name,
goto fail_check;
}
- if (peer_hdr->crt.len == 0) {
+ /* Re-key drops the wire cert; fall back to the cached peer cert. */
+ crt_der = peer_hdr->crt;
+ if (crt_der.len == 0 && cached_crt != NULL)
+ crt_der = *cached_crt;
+
+ if (crt_der.len == 0) {
+ if (cfg->a.req) {
+ log_err_id(id, "Peer did not provide a certificate.");
+ goto fail_check;
+ }
log_dbg_id(id, "No crt provided.");
name[0] = '\0';
return 0;
}
- if (crypt_load_crt_der(peer_hdr->crt, &crt) < 0) {
+ if (crypt_load_crt_der(crt_der, &crt) < 0) {
log_err_id(id, "Failed to load crt.");
goto fail_check;
}
@@ -216,26 +488,58 @@ int oap_auth_peer(char * name,
log_dbg_id(id, "Got public key from crt.");
- if (auth_verify_crt(oap_auth.ca_ctx, crt) < 0) {
+ if (cfg->a.cacert[0] != '\0') {
+ if (crypt_load_crt_file(cfg->a.cacert, &pin) < 0) {
+ log_err_id(id, "Failed to load pinned CA %s.",
+ cfg->a.cacert);
+ goto fail_crt;
+ }
+ }
+
+ ret = auth_verify_crt_pin(oap_auth.ca_ctx, crt, pin);
+ if (ret == -ENOENT) {
+ log_err_id(id, "Peer crt not issued by pinned CA %s.",
+ cfg->a.cacert);
+ goto fail_pin;
+ }
+
+ if (ret < 0) {
log_err_id(id, "Failed to verify peer with CA store.");
- goto fail_crt;
+ goto fail_pin;
}
log_dbg_id(id, "Successfully verified peer crt.");
- sign = peer_hdr->hdr;
+ /* Digest pin: peer must sign with the configured digest */
+ if (crypt_pk_requires_md(pk) &&
+ cfg->d.nid != NID_undef && peer_hdr->md_nid != cfg->d.nid) {
+ log_err_id(id, "Peer did not sign with %s.",
+ md_nid_to_str(cfg->d.nid));
+ goto fail_pin;
+ }
+
+ /* Sealed responses verify over the reconstructed plaintext. */
+ sign = peer_hdr->sealed_pt.data != NULL ?
+ peer_hdr->sealed_pt : peer_hdr->hdr;
sign.len -= peer_hdr->sig.len;
if (auth_verify_sig(pk, peer_hdr->md_nid, sign, peer_hdr->sig) < 0) {
log_err_id(id, "Failed to verify signature.");
- goto fail_check_sig;
+ goto fail_pin;
}
- if (crypt_get_crt_name(crt, name) < 0) {
- log_warn_id(id, "Failed to extract name from certificate.");
- name[0] = '\0';
+ ret = crypt_get_crt_name(crt, name);
+ if (ret < 0) {
+ if (ret == -ENAME)
+ log_err_id(id, "Certificate CN too long.");
+ else
+ log_err_id(id, "No name in certificate.");
+ goto fail_pin;
}
+ if (pin != NULL)
+ crypt_free_crt(pin);
+
crypt_free_key(pk);
crypt_free_crt(crt);
@@ -243,7 +547,9 @@ int oap_auth_peer(char * name,
return 0;
- fail_check_sig:
+ fail_pin:
+ if (pin != NULL)
+ crypt_free_crt(pin);
fail_crt:
crypt_free_key(pk);
crypt_free_crt(crt);
diff --git a/src/irmd/oap/auth.h b/src/irmd/oap/auth.h
index 4f748750..72938b53 100644
--- a/src/irmd/oap/auth.h
+++ b/src/irmd/oap/auth.h
@@ -23,13 +23,46 @@
#ifndef OUROBOROS_IRMD_OAP_AUTH_H
#define OUROBOROS_IRMD_OAP_AUTH_H
+#include <ouroboros/crypt.h>
+
#include "hdr.h"
int oap_check_hdr(const struct oap_hdr * hdr);
-/* name is updated with the peer's certificate name if available */
-int oap_auth_peer(char * name,
- const struct oap_hdr * local_hdr,
- const struct oap_hdr * peer_hdr);
+/*
+ * name is set to the peer crt CN, "" if no crt was presented.
+ * cached_crt (or NULL) is the peer cert from the initial handshake, used
+ * to verify a cert-less re-key.
+ */
+int oap_auth_peer(char * name,
+ const struct sec_config * cfg,
+ const struct oap_hdr * local_hdr,
+ const struct oap_hdr * peer_hdr,
+ const buffer_t * cached_crt);
+
+/* Derive the handshake key that seals the response identity block. */
+int oap_derive_hs_key(const struct crypt_sk * sk,
+ buffer_t req_hash,
+ uint8_t * out);
+
+/* resp_hash = H(kex || data || crt): binds the server response transcript. */
+int oap_resp_hash(int md_nid,
+ buffer_t kex,
+ buffer_t data,
+ buffer_t crt,
+ buffer_t * out);
+
+/* Fold request + response transcript + negotiated suite into the key. */
+int oap_bind_session_key(struct crypt_sk * sk,
+ buffer_t req_hash,
+ buffer_t resp_hash,
+ int kdf_nid);
+
+/* Server->client key-confirmation tag derived from the bound key. */
+int oap_key_confirm_tag(const struct crypt_sk * sk,
+ buffer_t req_hash,
+ buffer_t resp_hash,
+ uint8_t * out,
+ size_t outlen);
#endif /* OUROBOROS_IRMD_OAP_AUTH_H */
diff --git a/src/irmd/oap/cli.c b/src/irmd/oap/cli.c
index 7a202da7..689d67ca 100644
--- a/src/irmd/oap/cli.c
+++ b/src/irmd/oap/cli.c
@@ -54,7 +54,7 @@ struct oap_cli_ctx {
uint8_t req_hash[MAX_HASH_SIZE];
size_t req_hash_len;
int req_md_nid;
- struct sec_config kcfg;
+ struct sec_config scfg;
struct oap_hdr local_hdr;
void * pkp; /* Ephemeral keypair */
uint8_t * key; /* For client-encap KEM */
@@ -69,7 +69,7 @@ struct oap_cli_ctx {
extern int load_cli_credentials(const struct name_info * info,
void ** pkp,
void ** crt);
-extern int load_cli_kex_config(const struct name_info * info,
+extern int load_cli_sec_config(const struct name_info * info,
struct sec_config * cfg);
extern int load_server_kem_pk(const char * name,
struct sec_config * cfg,
@@ -87,13 +87,18 @@ int load_cli_credentials(const struct name_info * info,
return load_credentials(info->name, &info->c, pkp, crt);
}
-int load_cli_kex_config(const struct name_info * info,
+int load_cli_sec_config(const struct name_info * info,
struct sec_config * cfg)
{
assert(info != NULL);
assert(cfg != NULL);
- return load_kex_config(info->name, info->c.enc, cfg);
+ memset(cfg, 0, sizeof(*cfg));
+
+ /* A client authenticates the server by default, like an https client */
+ cfg->a.req = OAP_CLIENT_AUTH_DEFAULT;
+
+ return load_sec_config(info->name, info->c.sec, cfg);
}
int load_server_kem_pk(const char * name,
@@ -133,13 +138,13 @@ int load_server_kem_pk(const char * name,
static int do_client_kex_prepare_dhe(struct oap_cli_ctx * s)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
buffer_t * kex = &s->local_hdr.kex;
uint8_t * id = s->id.data;
ssize_t len;
/* Generate ephemeral keypair, send PK */
- len = kex_pkp_create(kcfg, &s->pkp, kex->data);
+ len = kex_pkp_create(scfg, &s->pkp, kex->data);
if (len < 0) {
log_err_id(id, "Failed to generate DHE keypair.");
return -ECRYPT;
@@ -147,7 +152,7 @@ static int do_client_kex_prepare_dhe(struct oap_cli_ctx * s)
kex->len = (size_t) len;
log_dbg_id(id, "Generated ephemeral %s keys (%zd bytes).",
- kcfg->x.str, len);
+ scfg->x.str, len);
return 0;
}
@@ -155,24 +160,24 @@ static int do_client_kex_prepare_dhe(struct oap_cli_ctx * s)
static int do_client_kex_prepare_kem_encap(const char * server_name,
struct oap_cli_ctx * s)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
buffer_t * kex = &s->local_hdr.kex;
uint8_t * id = s->id.data;
buffer_t server_pk = BUF_INIT;
uint8_t key_buf[SYMMKEYSZ];
ssize_t len;
- if (load_server_kem_pk(server_name, kcfg, &server_pk) < 0) {
+ if (load_server_kem_pk(server_name, scfg, &server_pk) < 0) {
log_err_id(id, "Failed to load server KEM pk.");
return -ECRYPT;
}
- if (IS_HYBRID_KEM(kcfg->x.str))
+ if (IS_HYBRID_KEM(scfg->x.str))
len = kex_kem_encap_raw(server_pk, kex->data,
- kcfg->k.nid, key_buf);
+ scfg->k.nid, key_buf);
else
len = kex_kem_encap(server_pk, kex->data,
- kcfg->k.nid, key_buf);
+ scfg->k.nid, key_buf);
freebuf(server_pk);
@@ -198,13 +203,13 @@ static int do_client_kex_prepare_kem_encap(const char * server_name,
static int do_client_kex_prepare_kem_decap(struct oap_cli_ctx * s)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
buffer_t * kex = &s->local_hdr.kex;
uint8_t * id = s->id.data;
ssize_t len;
/* Server encaps: generate keypair, send PK */
- len = kex_pkp_create(kcfg, &s->pkp, kex->data);
+ len = kex_pkp_create(scfg, &s->pkp, kex->data);
if (len < 0) {
log_err_id(id, "Failed to generate KEM keypair.");
return -ECRYPT;
@@ -219,13 +224,13 @@ static int do_client_kex_prepare_kem_decap(struct oap_cli_ctx * s)
static int do_client_kex_prepare(const char * server_name,
struct oap_cli_ctx * s)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
- if (!IS_KEX_ALGO_SET(kcfg))
+ if (!IS_KEX_ALGO_SET(scfg))
return 0;
- if (IS_KEM_ALGORITHM(kcfg->x.str)) {
- if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP)
+ if (IS_KEM_ALGORITHM(scfg->x.str)) {
+ if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP)
return do_client_kex_prepare_kem_encap(server_name, s);
else
return do_client_kex_prepare_kem_decap(s);
@@ -237,11 +242,13 @@ static int do_client_kex_prepare(const char * server_name,
int oap_cli_prepare(void ** ctx,
const struct name_info * info,
buffer_t * req_buf,
- buffer_t data)
+ buffer_t data,
+ bool rekey)
{
struct oap_cli_ctx * s;
void * pkp = NULL;
void * crt = NULL;
+ buffer_t no_tag = BUF_INIT;
ssize_t ret;
assert(ctx != NULL);
@@ -276,22 +283,34 @@ int oap_cli_prepare(void ** ctx,
goto fail_id;
}
- /* Load KEX config */
- if (load_cli_kex_config(info, &s->kcfg) < 0) {
- log_err_id(s->id.data, "Failed to load KEX config for %s.",
+ /* Load security config */
+ if (load_cli_sec_config(info, &s->scfg) < 0) {
+ log_err_id(s->id.data, "Failed to load security config for %s.",
info->name);
goto fail_kex;
}
- oap_hdr_init(&s->local_hdr, s->id, s->kex_buf, data, s->kcfg.c.nid);
+ /* Re-key forces server-encap: client-encap forfeits FS/PCS. */
+ if (rekey && s->scfg.x.mode == KEM_MODE_CLIENT_ENCAP) {
+ s->scfg.x.mode = KEM_MODE_SERVER_ENCAP;
+ log_dbg_id(s->id.data, "Re-key forcing ephemeral server KEX.");
+ }
+
+ /* Re-key omits the cert; the server verifies against its cache. */
+ if (rekey && crt != NULL) {
+ crypt_free_crt(crt);
+ crt = NULL;
+ }
+
+ oap_hdr_init(&s->local_hdr, s->id, s->kex_buf, data, s->scfg.c.nid);
if (do_client_kex_prepare(info->name, s) < 0) {
log_err_id(s->id.data, "Failed to prepare client KEX.");
goto fail_kex;
}
- if (oap_hdr_encode(&s->local_hdr, pkp, crt, &s->kcfg,
- (buffer_t) BUF_INIT, NID_undef)) {
+ if (oap_hdr_encode(&s->local_hdr, pkp, crt, &s->scfg,
+ no_tag, NID_undef, NULL)) {
log_err_id(s->id.data, "Failed to create OAP request header.");
goto fail_hdr;
}
@@ -299,7 +318,7 @@ int oap_cli_prepare(void ** ctx,
debug_oap_hdr_snd(&s->local_hdr);
/* Compute and store hash of request for verification in complete */
- s->req_md_nid = s->kcfg.d.nid != NID_undef ? s->kcfg.d.nid : NID_sha384;
+ s->req_md_nid = s->scfg.d.nid != NID_undef ? s->scfg.d.nid : NID_sha384;
ret = md_digest(s->req_md_nid, s->local_hdr.hdr, s->req_hash);
if (ret < 0) {
log_err_id(s->id.data, "Failed to hash request.");
@@ -324,6 +343,7 @@ int oap_cli_prepare(void ** ctx,
return 0;
fail_hash:
+ oap_hdr_fini(&s->local_hdr);
fail_hdr:
crypt_secure_free(s->key, SYMMKEYSZ);
crypt_free_key(s->pkp);
@@ -358,11 +378,11 @@ static int do_client_kex_complete_kem(struct oap_cli_ctx * s,
const struct oap_hdr * peer_hdr,
struct crypt_sk * sk)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
uint8_t * id = s->id.data;
uint8_t key_buf[SYMMKEYSZ];
- if (kcfg->x.mode == KEM_MODE_SERVER_ENCAP) {
+ if (scfg->x.mode == KEM_MODE_SERVER_ENCAP) {
buffer_t ct;
if (peer_hdr->kex.len == 0) {
@@ -373,27 +393,27 @@ static int do_client_kex_complete_kem(struct oap_cli_ctx * s,
ct.data = peer_hdr->kex.data;
ct.len = peer_hdr->kex.len;
- if (kex_kem_decap(s->pkp, ct, kcfg->k.nid, key_buf) < 0) {
+ if (kex_kem_decap(s->pkp, ct, scfg->k.nid, key_buf) < 0) {
log_err_id(id, "Failed to decapsulate KEM.");
return -ECRYPT;
}
log_dbg_id(id, "Client decapsulated server CT.");
- } else if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP) {
+ } else if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP) {
/* Key already derived during prepare */
memcpy(sk->key, s->key, SYMMKEYSZ);
- sk->nid = kcfg->c.nid;
- log_info_id(id, "Negotiated %s + %s.", kcfg->x.str,
- kcfg->c.str);
+ sk->nid = scfg->c.nid;
+ log_info_id(id, "Negotiated %s + %s.", scfg->x.str,
+ scfg->c.str);
return 0;
}
memcpy(sk->key, key_buf, SYMMKEYSZ);
- sk->nid = kcfg->c.nid;
+ sk->nid = scfg->c.nid;
crypt_secure_clear(key_buf, SYMMKEYSZ);
- log_info_id(id, "Negotiated %s + %s.", kcfg->x.str, kcfg->c.str);
+ log_info_id(id, "Negotiated %s + %s.", scfg->x.str, scfg->c.str);
return 0;
}
@@ -402,7 +422,7 @@ static int do_client_kex_complete_dhe(struct oap_cli_ctx * s,
const struct oap_hdr * peer_hdr,
struct crypt_sk * sk)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
uint8_t * id = s->id.data;
uint8_t key_buf[SYMMKEYSZ];
@@ -412,7 +432,7 @@ static int do_client_kex_complete_dhe(struct oap_cli_ctx * s,
return -ECRYPT;
}
- if (kex_dhe_derive(kcfg, s->pkp, peer_hdr->kex, key_buf) < 0) {
+ if (kex_dhe_derive(scfg, s->pkp, peer_hdr->kex, key_buf) < 0) {
log_err_id(id, "Failed to derive DHE secret.");
return -ECRYPT;
}
@@ -420,10 +440,10 @@ static int do_client_kex_complete_dhe(struct oap_cli_ctx * s,
log_dbg_id(id, "DHE: derived shared secret.");
memcpy(sk->key, key_buf, SYMMKEYSZ);
- sk->nid = kcfg->c.nid;
+ sk->nid = scfg->c.nid;
crypt_secure_clear(key_buf, SYMMKEYSZ);
- log_info_id(id, "Negotiated %s + %s.", kcfg->x.str, kcfg->c.str);
+ log_info_id(id, "Negotiated %s + %s.", scfg->x.str, scfg->c.str);
return 0;
}
@@ -433,17 +453,17 @@ static int do_client_kex_complete(struct oap_cli_ctx * s,
const struct oap_hdr * peer_hdr,
struct crypt_sk * sk)
{
- struct sec_config * kcfg = &s->kcfg;
+ struct sec_config * scfg = &s->scfg;
uint8_t * id = s->id.data;
int cipher_nid;
int kdf_nid;
- if (!IS_KEX_ALGO_SET(kcfg))
+ if (!IS_KEX_ALGO_SET(scfg))
return 0;
/* Save client's configured minimums */
- cipher_nid = kcfg->c.nid;
- kdf_nid = kcfg->k.nid;
+ cipher_nid = scfg->c.nid;
+ kdf_nid = scfg->k.nid;
/* Accept server's cipher choice */
if (peer_hdr->cipher_str == NULL) {
@@ -451,15 +471,15 @@ static int do_client_kex_complete(struct oap_cli_ctx * s,
return -ECRYPT;
}
- SET_KEX_CIPHER(kcfg, peer_hdr->cipher_str);
- if (crypt_validate_nid(kcfg->c.nid) < 0) {
+ SET_KEX_CIPHER(scfg, peer_hdr->cipher_str);
+ if (crypt_validate_nid(scfg->c.nid) < 0) {
log_err_id(id, "Server cipher '%s' not supported.",
peer_hdr->cipher_str);
return -ENOTSUP;
}
/* Verify server cipher >= client's minimum */
- if (crypt_cipher_rank(kcfg->c.nid) < crypt_cipher_rank(cipher_nid)) {
+ if (crypt_cipher_rank(scfg->c.nid) < crypt_cipher_rank(cipher_nid)) {
log_err_id(id, "Server cipher %s too weak.",
peer_hdr->cipher_str);
return -ECRYPT;
@@ -469,20 +489,20 @@ static int do_client_kex_complete(struct oap_cli_ctx * s,
peer_hdr->cipher_str);
/* Accept server's KDF for non-client-encap modes */
- if (kcfg->x.mode != KEM_MODE_CLIENT_ENCAP
+ if (scfg->x.mode != KEM_MODE_CLIENT_ENCAP
&& peer_hdr->kdf_nid != NID_undef) {
if (crypt_kdf_rank(peer_hdr->kdf_nid)
< crypt_kdf_rank(kdf_nid)) {
log_err_id(id, "Server KDF too weak.");
return -ECRYPT;
}
- SET_KEX_KDF_NID(kcfg, peer_hdr->kdf_nid);
+ SET_KEX_KDF_NID(scfg, peer_hdr->kdf_nid);
log_dbg_id(id, "Accepted server KDF %s.",
- md_nid_to_str(kcfg->k.nid));
+ md_nid_to_str(scfg->k.nid));
}
/* Derive shared secret */
- if (IS_KEM_ALGORITHM(kcfg->x.str))
+ if (IS_KEM_ALGORITHM(scfg->x.str))
return do_client_kex_complete_kem(s, peer_hdr, sk);
return do_client_kex_complete_dhe(s, peer_hdr, sk);
@@ -492,12 +512,20 @@ int oap_cli_complete(void * ctx,
const struct name_info * info,
buffer_t rsp_buf,
buffer_t * data,
- struct crypt_sk * sk)
+ struct crypt_sk * sk,
+ const buffer_t * cached_crt,
+ buffer_t * peer_crt)
{
struct oap_cli_ctx * s = ctx;
struct oap_hdr peer_hdr;
char peer[NAME_SIZE + 1];
+ uint8_t kc_buf[MAX_HASH_SIZE];
+ uint8_t resp_hash_buf[MAX_HASH_SIZE];
+ uint8_t hs_key[SYMMKEYSZ];
+ buffer_t req_hash = BUF_INIT;
+ buffer_t resp_hash = BUF_INIT;
uint8_t * id;
+ int rc;
assert(ctx != NULL);
assert(info != NULL);
@@ -515,7 +543,7 @@ int oap_cli_complete(void * ctx,
log_dbg_id(id, "Completing OAP for %s.", info->name);
/* Decode response header using client's md_nid for hash length */
- if (oap_hdr_decode(&peer_hdr, rsp_buf, s->req_md_nid) < 0) {
+ if (oap_hdr_decode(&peer_hdr, rsp_buf, s->req_md_nid, false) < 0) {
log_err_id(id, "Failed to decode OAP response header.");
goto fail_oap;
}
@@ -528,20 +556,52 @@ int oap_cli_complete(void * ctx,
goto fail_oap;
}
- /* Authenticate server */
- if (oap_auth_peer(peer, &s->local_hdr, &peer_hdr) < 0) {
- log_err_id(id, "Failed to authenticate server.");
+ /* Complete key exchange first; the sealed identity needs the secret */
+ if (do_client_kex_complete(s, &peer_hdr, sk) < 0) {
+ log_err_id(id, "Failed to complete key exchange.");
goto fail_oap;
}
- /* Verify request hash in authenticated response */
- if (peer_hdr.req_hash.len == 0) {
- log_err_id(id, "Response missing req_hash.");
+ req_hash.data = s->req_hash;
+ req_hash.len = s->req_hash_len;
+
+ /* Decrypt the sealed server identity (data+cert+sig) before auth */
+ if (sk->nid != NID_undef && peer_hdr.sealed.data != NULL) {
+ if (oap_derive_hs_key(sk, req_hash, hs_key) < 0) {
+ log_err_id(id, "Failed to derive handshake key.");
+ goto fail_oap;
+ }
+
+ rc = oap_hdr_unseal(&peer_hdr, hs_key);
+
+ crypt_secure_clear(hs_key, SYMMKEYSZ);
+
+ if (rc < 0) {
+ log_err_id(id, "Failed to unseal server identity.");
+ goto fail_oap;
+ }
+ }
+
+ /* Authenticate server (cert + signature now in cleartext) */
+ if (oap_auth_peer(peer, &s->scfg, &s->local_hdr, &peer_hdr,
+ cached_crt) < 0) {
+ log_err_id(id, "Failed to authenticate server.");
goto fail_oap;
}
- if (memcmp(peer_hdr.req_hash.data, s->req_hash, s->req_hash_len) != 0) {
- log_err_id(id, "Response req_hash mismatch.");
+ /* Surface the peer cert so the caller can cache it for re-key. */
+ if (peer_crt != NULL && peer_hdr.crt.len > 0) {
+ peer_crt->data = malloc(peer_hdr.crt.len);
+ if (peer_crt->data == NULL)
+ goto fail_oap;
+
+ memcpy(peer_crt->data, peer_hdr.crt.data, peer_hdr.crt.len);
+ peer_crt->len = peer_hdr.crt.len;
+ }
+
+ /* Response must carry a transcript tag of the expected length */
+ if (peer_hdr.rsp_tag.len != s->req_hash_len) {
+ log_err_id(id, "Response transcript tag mismatch.");
goto fail_oap;
}
@@ -552,10 +612,43 @@ int oap_cli_complete(void * ctx,
goto fail_oap;
}
- /* Complete key exchange */
- if (do_client_kex_complete(s, &peer_hdr, sk) < 0) {
- log_err_id(id, "Failed to complete key exchange.");
- goto fail_oap;
+ if (sk->nid != NID_undef) {
+ /* Encrypted: bind the key and verify key confirmation */
+ resp_hash.data = resp_hash_buf;
+
+ if (oap_resp_hash(s->req_md_nid, peer_hdr.kex,
+ peer_hdr.data, peer_hdr.crt,
+ &resp_hash) < 0) {
+ log_err_id(id, "Failed to hash response.");
+ goto fail_oap;
+ }
+
+ if (oap_bind_session_key(sk, req_hash, resp_hash,
+ s->scfg.k.nid) < 0) {
+ log_err_id(id, "Failed to bind session key.");
+ goto fail_oap;
+ }
+
+ if (oap_key_confirm_tag(sk, req_hash, resp_hash, kc_buf,
+ s->req_hash_len) < 0) {
+ log_err_id(id, "Failed to confirm session key.");
+ goto fail_oap;
+ }
+
+ if (crypt_ct_cmp(peer_hdr.rsp_tag.data, kc_buf,
+ s->req_hash_len) != 0) {
+ log_err_id(id, "Key confirmation mismatch.");
+ goto fail_oap;
+ }
+ } else {
+ /* Cleartext path is config-driven, never a wire downgrade */
+ assert(!IS_KEX_ALGO_SET(&s->scfg));
+ /* Unencrypted: verify request-echo integrity */
+ if (crypt_ct_cmp(peer_hdr.rsp_tag.data, s->req_hash,
+ s->req_hash_len) != 0) {
+ log_err_id(id, "Response tag mismatch.");
+ goto fail_oap;
+ }
}
/* Copy piggybacked data from server response */
@@ -566,11 +659,14 @@ int oap_cli_complete(void * ctx,
log_info_id(id, "OAP completed for %s.", info->name);
+ freebuf(peer_hdr.sealed_pt);
+
oap_ctx_free(s);
return 0;
fail_oap:
+ freebuf(peer_hdr.sealed_pt);
oap_ctx_free(s);
return -ECRYPT;
}
diff --git a/src/irmd/oap/hdr.c b/src/irmd/oap/hdr.c
index 5465dd2a..f8400b46 100644
--- a/src/irmd/oap/hdr.c
+++ b/src/irmd/oap/hdr.c
@@ -30,6 +30,7 @@
#include <ouroboros/crypt.h>
#include <ouroboros/endian.h>
+#include <ouroboros/errno.h>
#include <ouroboros/hash.h>
#include <ouroboros/logs.h>
#include <ouroboros/rib.h>
@@ -45,9 +46,17 @@
#include <string.h>
#include <time.h>
+#define OAP_SEAL_TAGSZ 16 /* AEAD tag on the sealed identity block */
+/* Sealed length prefix: data_len ‖ crt_len. */
+#define OAP_SEAL_LENSZ (sizeof(uint16_t) + sizeof(uint16_t))
+
+/* hs_key is single-use per handshake, so a fixed nonce is reuse-safe. */
+static const uint8_t oap_seal_nonce[12];
+
int oap_hdr_decode(struct oap_hdr * oap_hdr,
buffer_t hdr,
- int req_md_nid)
+ int req_md_nid,
+ bool rekey)
{
off_t offset;
uint16_t kex_len;
@@ -88,11 +97,13 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr,
oap_hdr->md_str = md_nid_to_str(oap_hdr->md_nid);
offset += sizeof(uint16_t);
- /* Validate NIDs: NID_undef is valid at parse time, else must be known.
+ /*
+ * Validate NIDs: NID_undef is valid at parse time, else must be known.
* Note: md_nid=NID_undef only valid for PQC; enforced at sign/verify.
*/
if (ciph_nid != NID_undef && crypt_validate_nid(ciph_nid) < 0)
goto fail_decode;
+
if (oap_hdr->kdf_nid != NID_undef &&
md_validate_nid(oap_hdr->kdf_nid) < 0)
goto fail_decode;
@@ -115,10 +126,37 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr,
data_len = (size_t) ntoh16(*(uint16_t *)(hdr.data + offset));
offset += sizeof(uint16_t);
- /* Response includes req_hash when md_nid is set */
+ assert((size_t) offset == OAP_HDR_MIN_SIZE);
+
+ /* Response includes rsp_tag when md_nid is set */
hash_len = (req_md_nid != NID_undef) ?
(size_t) md_len(req_md_nid) : 0;
+ /* Encrypted response: sealed block is data_len‖crt_len‖data‖crt‖sig. */
+ if (req_md_nid != NID_undef && ciph_nid != NID_undef) {
+ if (hdr.len < (size_t) offset + oap_hdr->kex.len + hash_len +
+ OAP_SEAL_TAGSZ + OAP_SEAL_LENSZ)
+ goto fail_decode;
+
+ oap_hdr->kex.data = hdr.data + offset;
+ offset += oap_hdr->kex.len;
+
+ oap_hdr->rsp_tag.data = hdr.data + offset;
+ oap_hdr->rsp_tag.len = hash_len;
+ offset += hash_len;
+
+ oap_hdr->sealed.data = hdr.data + offset;
+ oap_hdr->sealed.len = hdr.len - offset;
+
+ /* crt/data/sig lengths are sealed; set by oap_hdr_unseal. */
+ oap_hdr->crt.len = crt_len;
+ oap_hdr->data.len = data_len;
+
+ oap_hdr->hdr = hdr;
+
+ return 0;
+ }
+
/* Validate total length */
if (hdr.len < (size_t) offset + crt_len + oap_hdr->kex.len +
data_len + hash_len)
@@ -128,8 +166,12 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr,
sig_len = hdr.len - offset - crt_len - oap_hdr->kex.len -
data_len - hash_len;
- /* Unsigned packets must not have trailing bytes */
- if (crt_len == 0 && sig_len != 0)
+ /*
+ * Unsigned packets must not have trailing bytes. A re-key request
+ * is signed but cert-less (verified against the cached peer cert),
+ * so the rekey caller permits crt_len==0 with a signature.
+ */
+ if (crt_len == 0 && sig_len != 0 && !rekey)
goto fail_decode;
/* Parse variable fields */
@@ -144,8 +186,8 @@ int oap_hdr_decode(struct oap_hdr * oap_hdr,
oap_hdr->data.len = data_len;
offset += data_len;
- oap_hdr->req_hash.data = hdr.data + offset;
- oap_hdr->req_hash.len = hash_len;
+ oap_hdr->rsp_tag.data = hdr.data + offset;
+ oap_hdr->rsp_tag.len = hash_len;
offset += hash_len;
oap_hdr->sig.data = hdr.data + offset;
@@ -164,6 +206,7 @@ void oap_hdr_fini(struct oap_hdr * oap_hdr)
{
assert(oap_hdr != NULL);
+ freebuf(oap_hdr->sealed_pt);
freebuf(oap_hdr->hdr);
memset(oap_hdr, 0, sizeof(*oap_hdr));
}
@@ -207,12 +250,229 @@ void oap_hdr_init(struct oap_hdr * hdr,
hdr->nid = nid;
}
+/*
+ * Write the 36-byte fixed header into buf; stamp is already in network
+ * order and is copied verbatim.
+ *
+ * Field order: id, stamp, cipher NID (hdr->nid), KDF NID (scfg->k.nid),
+ * digest NID (scfg->d.nid), crt_len, kex_len, data_len. All 16-bit fields
+ * are written in network order. For a KEM exchange with a non-empty kex
+ * payload, kex_len additionally carries OAP_KEX_FMT_BIT (hybrid KEM) and
+ * OAP_KEX_ROLE_BIT (client-encap mode).
+ *
+ * buf must have room for OAP_HDR_MIN_SIZE bytes.
+ */
+static void write_oap_fixed(uint8_t *                 buf,
+                            const struct oap_hdr *    hdr,
+                            const struct sec_config * scfg,
+                            size_t                    crt_len,
+                            size_t                    data_len,
+                            uint64_t                  stamp)
+{
+        uint16_t v;
+        uint16_t kex_len;
+        off_t    offset = 0;
+
+        /* id */
+        memcpy(buf + offset, hdr->id.data, hdr->id.len);
+        offset += hdr->id.len;
+
+        /* timestamp */
+        memcpy(buf + offset, &stamp, sizeof(stamp));
+        offset += sizeof(stamp);
+
+        /* cipher NID */
+        v = hton16(hdr->nid);
+        memcpy(buf + offset, &v, sizeof(v));
+        offset += sizeof(v);
+
+        /* KDF NID */
+        v = hton16(scfg->k.nid);
+        memcpy(buf + offset, &v, sizeof(v));
+        offset += sizeof(v);
+
+        /* digest NID */
+        v = hton16(scfg->d.nid);
+        memcpy(buf + offset, &v, sizeof(v));
+        offset += sizeof(v);
+
+        /* crt_len */
+        v = hton16((uint16_t) crt_len);
+        memcpy(buf + offset, &v, sizeof(v));
+        offset += sizeof(v);
+
+        /* kex_len + flags */
+        kex_len = (uint16_t) hdr->kex.len;
+        if (hdr->kex.len > 0 && IS_KEM_ALGORITHM(scfg->x.str)) {
+                if (IS_HYBRID_KEM(scfg->x.str))
+                        kex_len |= OAP_KEX_FMT_BIT;
+                if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP)
+                        kex_len |= OAP_KEX_ROLE_BIT;
+        }
+
+        kex_len = hton16(kex_len);
+        memcpy(buf + offset, &kex_len, sizeof(kex_len));
+        offset += sizeof(kex_len);
+
+        /* data_len */
+        v = hton16((uint16_t) data_len);
+        memcpy(buf + offset, &v, sizeof(v));
+        offset += sizeof(v);
+
+        /* Callers continue writing at OAP_HDR_MIN_SIZE; keep them in sync. */
+        assert((size_t) offset == OAP_HDR_MIN_SIZE);
+}
+
+/*
+ * Sign and seal the response identity block.
+ *
+ * A scratch buffer is laid out as prefix || lens || data || crt, where
+ * lens is data.len followed by crt.len, each a 16-bit network-order
+ * value. When pkp is not NULL the whole scratch buffer is signed with
+ * auth_sign() and the signature is appended. lens || data || crt || sig
+ * is then sealed with crypt_oneshot_seal() under seal_key and the fixed
+ * oap_seal_nonce, with prefix as AAD. The cert, app data and their sizes
+ * stay confidential, and the signature rides inside the seal so it can't
+ * deanonymise the server.
+ *
+ * *out receives the opaque sealed block. Returns 0 on success and -1 on
+ * failure, including a data or crt length that does not fit the 16-bit
+ * length fields.
+ */
+static int oap_seal_body(int             nid,
+                         const uint8_t * seal_key,
+                         void *          pkp,
+                         int             md_nid,
+                         buffer_t        prefix,
+                         buffer_t        data,
+                         buffer_t        crt,
+                         buffer_t *      out)
+{
+        buffer_t  sig = BUF_INIT;
+        buffer_t  sign;
+        buffer_t  aad;
+        buffer_t  plain;
+        uint8_t * buf;
+        uint8_t * tmp;
+        uint16_t  datalen;
+        uint16_t  crtlen;
+        size_t    body_len;
+        off_t     offset;
+
+        /* Lengths travel as 16-bit fields; refuse rather than truncate. */
+        if (data.len > UINT16_MAX || crt.len > UINT16_MAX)
+                return -1;
+
+        datalen = hton16((uint16_t) data.len);
+        crtlen  = hton16((uint16_t) crt.len);
+
+        body_len = OAP_SEAL_LENSZ + data.len + crt.len;
+
+        buf = malloc(prefix.len + body_len);
+        if (buf == NULL)
+                return -1;
+
+        memcpy(buf, prefix.data, prefix.len);
+        offset = (off_t) prefix.len;
+
+        memcpy(buf + offset, &datalen, sizeof(datalen));
+        offset += sizeof(datalen);
+
+        memcpy(buf + offset, &crtlen, sizeof(crtlen));
+        offset += sizeof(crtlen);
+
+        if (data.len != 0)
+                memcpy(buf + offset, data.data, data.len);
+
+        offset += data.len;
+
+        if (crt.len != 0)
+                memcpy(buf + offset, crt.data, crt.len);
+
+        /* Sign prefix || lens || data || crt (plaintext, before sealing). */
+        sign.data = buf;
+        sign.len  = prefix.len + body_len;
+
+        if (pkp != NULL && auth_sign(pkp, md_nid, sign, &sig) < 0)
+                goto fail_buf;
+
+        /* Append the signature so the seal covers lens || data || crt || sig. */
+        if (sig.len != 0) {
+                /* Keep buf valid if realloc fails; fail_sig frees it. */
+                tmp = realloc(buf, prefix.len + body_len + sig.len);
+                if (tmp == NULL)
+                        goto fail_sig;
+
+                buf = tmp;
+                memcpy(buf + prefix.len + body_len, sig.data, sig.len);
+        }
+
+        aad.data   = buf;
+        aad.len    = prefix.len;
+        plain.data = buf + prefix.len;
+        plain.len  = body_len + sig.len;
+
+        if (crypt_oneshot_seal(nid, seal_key, oap_seal_nonce,
+                               aad, plain, out) < 0)
+                goto fail_sig;
+
+        free(buf);
+        freebuf(sig);
+
+        return 0;
+
+ fail_sig:
+        freebuf(sig);
+ fail_buf:
+        free(buf);
+        return -1;
+}
+
+/*
+ * Encode an identity-hidden response into hdr->hdr:
+ * wire = prefix || oap_seal_body(...), where prefix is the fixed header
+ * followed by the kex payload and rsp_tag. After the wire image is
+ * assembled, hdr is re-parsed from it with oap_hdr_decode().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int oap_hdr_encode_sealed(struct oap_hdr *    hdr,
+                                 void *              pkp,
+                                 void *              crt,
+                                 struct sec_config * scfg,
+                                 buffer_t            rsp_tag,
+                                 int                 req_md_nid,
+                                 const uint8_t *     seal_key)
+{
+        struct timespec ts;
+        uint64_t        stamp_be;
+        buffer_t        crt_der = BUF_INIT;
+        buffer_t        body    = BUF_INIT;
+        buffer_t        pfx;
+        size_t          pos;
+
+        clock_gettime(CLOCK_REALTIME, &ts);
+        stamp_be = hton64(TS_TO_UINT64(ts));
+
+        if (crt != NULL) {
+                if (crypt_crt_der(crt, &crt_der) < 0)
+                        goto fail_der;
+        }
+
+        pfx.len  = OAP_HDR_MIN_SIZE + hdr->kex.len + rsp_tag.len;
+        pfx.data = malloc(pfx.len);
+        if (pfx.data == NULL)
+                goto fail_der;
+
+        /* The cleartext header advertises no crt/data; lengths are sealed. */
+        write_oap_fixed(pfx.data, hdr, scfg, 0, 0, stamp_be);
+        pos = OAP_HDR_MIN_SIZE;
+
+        if (hdr->kex.len != 0)
+                memcpy(pfx.data + pos, hdr->kex.data, hdr->kex.len);
+
+        pos += hdr->kex.len;
+
+        if (rsp_tag.len != 0)
+                memcpy(pfx.data + pos, rsp_tag.data, rsp_tag.len);
+
+        pos += rsp_tag.len;
+
+        assert(pos == pfx.len);
+
+        if (oap_seal_body(hdr->nid, seal_key, pkp, scfg->d.nid,
+                          pfx, hdr->data, crt_der, &body) < 0)
+                goto fail_prefix;
+
+        hdr->hdr.len  = pfx.len + body.len;
+        hdr->hdr.data = malloc(hdr->hdr.len);
+        if (hdr->hdr.data == NULL)
+                goto fail_sealed;
+
+        memcpy(hdr->hdr.data, pfx.data, pfx.len);
+        memcpy(hdr->hdr.data + pfx.len, body.data, body.len);
+
+        /* The wire image is complete; the intermediates can go. */
+        freebuf(body);
+        free(pfx.data);
+        freebuf(crt_der);
+
+        if (oap_hdr_decode(hdr, hdr->hdr, req_md_nid, false) < 0) {
+                oap_hdr_fini(hdr);
+                return -1;
+        }
+
+        return 0;
+
+ fail_sealed:
+        freebuf(body);
+ fail_prefix:
+        free(pfx.data);
+ fail_der:
+        freebuf(crt_der);
+        return -1;
+}
+
int oap_hdr_encode(struct oap_hdr * hdr,
void * pkp,
void * crt,
- struct sec_config * kcfg,
- buffer_t req_hash,
- int req_md_nid)
+ struct sec_config * scfg,
+ buffer_t rsp_tag,
+ int req_md_nid,
+ const uint8_t * seal_key)
{
struct timespec now;
uint64_t stamp;
@@ -220,16 +480,15 @@ int oap_hdr_encode(struct oap_hdr * hdr,
buffer_t der = BUF_INIT;
buffer_t sig = BUF_INIT;
buffer_t sign;
- uint16_t len;
- uint16_t ciph_nid;
- uint16_t kdf_nid;
- uint16_t md_nid;
- uint16_t kex_len;
off_t offset;
assert(hdr != NULL);
assert(hdr->id.data != NULL && hdr->id.len == OAP_ID_SIZE);
- assert(kcfg != NULL);
+ assert(scfg != NULL);
+
+ if (seal_key != NULL)
+ return oap_hdr_encode_sealed(hdr, pkp, crt, scfg, rsp_tag,
+ req_md_nid, seal_key);
clock_gettime(CLOCK_REALTIME, &now);
stamp = hton64(TS_TO_UINT64(now));
@@ -237,86 +496,40 @@ int oap_hdr_encode(struct oap_hdr * hdr,
if (crt != NULL && crypt_crt_der(crt, &der) < 0)
goto fail_der;
- ciph_nid = hton16(hdr->nid);
- kdf_nid = hton16(kcfg->k.nid);
- md_nid = hton16(kcfg->d.nid);
-
- /* Build kex_len with flags */
- kex_len = (uint16_t) hdr->kex.len;
- if (hdr->kex.len > 0 && IS_KEM_ALGORITHM(kcfg->x.str)) {
- if (IS_HYBRID_KEM(kcfg->x.str))
- kex_len |= OAP_KEX_FMT_BIT;
- if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP)
- kex_len |= OAP_KEX_ROLE_BIT;
- }
- kex_len = hton16(kex_len);
-
- /* Fixed header (36 bytes) + variable fields + req_hash (if auth) */
+ /* Fixed header (36 bytes) + variable fields + rsp_tag (rsp only) */
out.len = OAP_HDR_MIN_SIZE + der.len + hdr->kex.len + hdr->data.len +
- req_hash.len;
+ rsp_tag.len;
out.data = malloc(out.len);
if (out.data == NULL)
goto fail_out;
- offset = 0;
-
- /* id (16 bytes) */
- memcpy(out.data + offset, hdr->id.data, hdr->id.len);
- offset += hdr->id.len;
-
- /* timestamp (8 bytes) */
- memcpy(out.data + offset, &stamp, sizeof(stamp));
- offset += sizeof(stamp);
-
- /* cipher_nid (2 bytes) */
- memcpy(out.data + offset, &ciph_nid, sizeof(ciph_nid));
- offset += sizeof(ciph_nid);
-
- /* kdf_nid (2 bytes) */
- memcpy(out.data + offset, &kdf_nid, sizeof(kdf_nid));
- offset += sizeof(kdf_nid);
-
- /* md_nid (2 bytes) */
- memcpy(out.data + offset, &md_nid, sizeof(md_nid));
- offset += sizeof(md_nid);
-
- /* crt_len (2 bytes) */
- len = hton16((uint16_t) der.len);
- memcpy(out.data + offset, &len, sizeof(len));
- offset += sizeof(len);
-
- /* kex_len + flags (2 bytes) */
- memcpy(out.data + offset, &kex_len, sizeof(kex_len));
- offset += sizeof(kex_len);
-
- /* data_len (2 bytes) */
- len = hton16((uint16_t) hdr->data.len);
- memcpy(out.data + offset, &len, sizeof(len));
- offset += sizeof(len);
-
- /* Fixed header complete (36 bytes) */
- assert((size_t) offset == OAP_HDR_MIN_SIZE);
+ write_oap_fixed(out.data, hdr, scfg, der.len, hdr->data.len, stamp);
+ offset = OAP_HDR_MIN_SIZE;
/* certificate (variable) */
if (der.len != 0)
memcpy(out.data + offset, der.data, der.len);
+
offset += der.len;
/* kex data (variable) */
if (hdr->kex.len != 0)
memcpy(out.data + offset, hdr->kex.data, hdr->kex.len);
+
offset += hdr->kex.len;
/* data (variable) */
if (hdr->data.len != 0)
memcpy(out.data + offset, hdr->data.data, hdr->data.len);
+
offset += hdr->data.len;
- /* req_hash (variable, only for authenticated responses) */
- if (req_hash.len != 0)
- memcpy(out.data + offset, req_hash.data, req_hash.len);
- offset += req_hash.len;
+ /* rsp_tag (variable, response only) */
+ if (rsp_tag.len != 0)
+ memcpy(out.data + offset, rsp_tag.data, rsp_tag.len);
+
+ offset += rsp_tag.len;
assert((size_t) offset == out.len);
@@ -324,7 +537,7 @@ int oap_hdr_encode(struct oap_hdr * hdr,
sign.data = out.data;
sign.len = out.len;
- if (pkp != NULL && auth_sign(pkp, kcfg->d.nid, sign, &sig) < 0)
+ if (pkp != NULL && auth_sign(pkp, scfg->d.nid, sign, &sig) < 0)
goto fail_sig;
hdr->hdr = out;
@@ -340,7 +553,7 @@ int oap_hdr_encode(struct oap_hdr * hdr,
clrbuf(out);
}
- if (oap_hdr_decode(hdr, hdr->hdr, req_md_nid) < 0)
+ if (oap_hdr_decode(hdr, hdr->hdr, req_md_nid, false) < 0)
goto fail_decode;
freebuf(der);
@@ -360,28 +573,99 @@ int oap_hdr_encode(struct oap_hdr * hdr,
return -1;
}
+/*
+ * Open the sealed block of a response header.
+ *
+ * Decrypts hdr->sealed with key, passing the cleartext bytes of hdr->hdr
+ * that precede the sealed block as the AAD prefix. On success,
+ * hdr->sealed_pt holds a newly allocated copy of prefix ‖ plaintext and
+ * hdr->data, hdr->crt and hdr->sig point into that copy.
+ *
+ * Returns 0 on success, -EINVAL if hdr carries no sealed block, -ECRYPT
+ * if crypt_oneshot_open fails, -EAUTH if the plaintext is shorter than
+ * its own length fields claim, and -ENOMEM if the copy cannot be
+ * allocated. hdr is left untouched on failure.
+ */
+int oap_hdr_unseal(struct oap_hdr * hdr,
+                   const uint8_t *  key)
+{
+        buffer_t  pt = BUF_INIT;
+        buffer_t  prefix;
+        uint8_t * recon;
+        uint16_t  len16;
+        size_t    body_len;
+        size_t    pt_len;
+        size_t    data_len;
+        size_t    crt_len;
+
+        assert(hdr != NULL);
+        assert(key != NULL);
+
+        if (hdr->sealed.data == NULL || hdr->sealed.len == 0)
+                return -EINVAL;
+
+        /* AAD prefix is fixed‖kex‖rsp_tag; sealed starts right after. */
+        prefix.data = hdr->hdr.data;
+        prefix.len  = (size_t) (hdr->sealed.data - hdr->hdr.data);
+
+        if (crypt_oneshot_open(hdr->nid, key, oap_seal_nonce, prefix,
+                               hdr->sealed, &pt) < 0)
+                return -ECRYPT;
+
+        pt_len = pt.len;
+
+        /* Plaintext = data_len ‖ crt_len ‖ data ‖ crt ‖ sig. */
+        if (pt_len < OAP_SEAL_LENSZ)
+                goto fail_auth;
+
+        /*
+         * Copy the network-order lengths out bytewise: pt.data is a byte
+         * buffer, so loading it through a uint16_t pointer is avoided.
+         */
+        memcpy(&len16, pt.data, sizeof(len16));
+        data_len = (size_t) ntoh16(len16);
+
+        memcpy(&len16, pt.data + sizeof(len16), sizeof(len16));
+        crt_len = (size_t) ntoh16(len16);
+
+        /* Whatever follows lens ‖ data ‖ crt is taken as the signature. */
+        body_len = OAP_SEAL_LENSZ + data_len + crt_len;
+        if (pt_len < body_len)
+                goto fail_auth;
+
+        /* Rebuild prefix ‖ lens ‖ data ‖ crt ‖ sig (whole signed region). */
+        recon = malloc(prefix.len + pt_len);
+        if (recon == NULL)
+                goto fail_mem;
+
+        memcpy(recon, prefix.data, prefix.len);
+        memcpy(recon + prefix.len, pt.data, pt_len);
+
+        freebuf(pt);
+
+        hdr->sealed_pt.data = recon;
+        hdr->sealed_pt.len  = prefix.len + pt_len;
+
+        /* The field buffers alias the rebuilt region; they are not copies. */
+        hdr->data.data = recon + prefix.len + OAP_SEAL_LENSZ;
+        hdr->data.len  = data_len;
+        hdr->crt.data  = recon + prefix.len + OAP_SEAL_LENSZ + data_len;
+        hdr->crt.len   = crt_len;
+        hdr->sig.data  = recon + prefix.len + body_len;
+        hdr->sig.len   = pt_len - body_len;
+
+        return 0;
+
+ fail_mem:
+        freebuf(pt);
+        return -ENOMEM;
+ fail_auth:
+        freebuf(pt);
+        return -EAUTH;
+}
+
+
#ifdef DEBUG_PROTO_OAP
#define OAP_KEX_IS_KEM(hdr) ((hdr)->kex_flags.role | (hdr)->kex_flags.fmt)
static void debug_oap_hdr(const struct oap_hdr * hdr)
{
assert(hdr);
+ if (hdr->sealed.len > 0)
+ log_proto(" Sealed block: [%zu bytes] on wire",
+ hdr->sealed.len);
+
if (hdr->crt.len > 0)
log_proto(" crt: [%zu bytes]", hdr->crt.len);
+ else if (hdr->sealed.len > 0)
+ log_proto(" crt: <sealed>");
else
log_proto(" crt: <none>");
if (hdr->kex.len > 0) {
if (OAP_KEX_IS_KEM(hdr))
- log_proto(" Key Exchange Data:"
- " [%zu bytes] [%s]",
+ log_proto(" Key Exchange Data: [%zu bytes] [%s]",
hdr->kex.len,
hdr->kex_flags.role ?
- "Client encaps" :
- "Server encaps");
+ "Client encaps" : "Server encaps");
else
- log_proto(" Key Exchange Data:"
- " [%zu bytes]",
+ log_proto(" Key Exchange Data: [%zu bytes]",
hdr->kex.len);
} else
log_proto(" Key Exchange Data: <none>");
@@ -403,16 +687,20 @@ static void debug_oap_hdr(const struct oap_hdr * hdr)
if (hdr->data.len > 0)
log_proto(" Data: [%zu bytes]", hdr->data.len);
+ else if (hdr->sealed.len > 0)
+ log_proto(" Data: <sealed>");
else
log_proto(" Data: <none>");
- if (hdr->req_hash.len > 0)
- log_proto(" Req Hash: [%zu bytes]", hdr->req_hash.len);
+ if (hdr->rsp_tag.len > 0)
+ log_proto(" Rsp Tag: [%zu bytes]", hdr->rsp_tag.len);
else
- log_proto(" Req Hash: <none>");
+ log_proto(" Rsp Tag: <none>");
if (hdr->sig.len > 0)
log_proto(" Signature: [%zu bytes]", hdr->sig.len);
+ else if (hdr->sealed.len > 0)
+ log_proto(" Signature: <sealed>");
else
log_proto(" Signature: <none>");
}
@@ -432,8 +720,9 @@ void debug_oap_hdr_rcv(const struct oap_hdr * hdr)
tm = gmtime(&stamp);
strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm);
- log_proto("OAP_HDR [" HASH_FMT64 " @ %s ] <--",
- HASH_VAL64(hdr->id.data), tmstr);
+ log_proto("OAP_HDR [" HASH_FMT64 " @ %s ]%s <--",
+ HASH_VAL64(hdr->id.data), tmstr,
+ hdr->sealed.len > 0 ? " [sealed]" : "");
debug_oap_hdr(hdr);
#else
@@ -455,8 +744,9 @@ void debug_oap_hdr_snd(const struct oap_hdr * hdr)
tm = gmtime(&stamp);
strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm);
- log_proto("OAP_HDR [" HASH_FMT64 " @ %s ] -->",
- HASH_VAL64(hdr->id.data), tmstr);
+ log_proto("OAP_HDR [" HASH_FMT64 " @ %s ]%s -->",
+ HASH_VAL64(hdr->id.data), tmstr,
+ hdr->sealed.len > 0 ? " [sealed]" : "");
debug_oap_hdr(hdr);
#else
diff --git a/src/irmd/oap/hdr.h b/src/irmd/oap/hdr.h
index 6016452c..1a599727 100644
--- a/src/irmd/oap/hdr.h
+++ b/src/irmd/oap/hdr.h
@@ -43,6 +43,9 @@
#define OAP_KEX_IS_RAW_FMT(hdr) (((hdr)->kex_flags.fmt) == 1)
/*
+ * Plaintext layout (request, and unencrypted/signed response). The
+ * signature covers the whole packet except itself.
+ *
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+
@@ -83,8 +86,8 @@
* | | |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
* | | |
- * + req_hash (variable, response only) + |
- * | H(request) using req md_nid / sha384 | |
+ * + rsp_tag (variable, response only) + |
+ * | key-confirm tag (enc), else H(request) | |
* | | |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+
* | |
@@ -92,6 +95,25 @@
* | DSA signature over signed region |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
+ * Encrypted response - wire layout. The certificate, application data and
+ * signature are AEAD-sealed - hiding the server identity and the cert/data
+ * sizes; kex and rsp_tag move ahead of the sealed block as cleartext AAD.
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+
+ * | fixed header (36 bytes, see above) | |
+ * + id, timestamp, NIDs, crt_len=0, kex_len, data_len=0 + | AAD
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ * | kex_data (variable) | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ * | rsp_tag (variable, response only) | |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+
+ * | SEAL( data_len ‖ crt_len ‖ data ‖ crt ‖ sig ) | |
+ * + encrypted cert, app data and signature + | Sealed
+ * | + AEAD tag (128 bits) | | area
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ---+
+ *
* cipher_nid: NID value for symmetric cipher (0 = none)
* kdf_nid: NID value for KDF function (0 = none)
* md_nid: NID value for signature hash (0 = PQC/no signature)
@@ -105,6 +127,11 @@
* Request: sig_len = total - 36 - crt_len - kex_len - data_len
* Response: sig_len = total - 36 - crt_len - kex_len - data_len - hash_len
* where hash_len = md_len(req_md_nid / sha384)
+ *
+ * The sealed plaintext is data_len ‖ crt_len ‖ data ‖ crt ‖ sig; the
+ * cleartext prefix (fixed ‖ kex ‖ rsp_tag) is the AEAD AAD and is not
+ * itself encrypted. Cleartext crt_len/data_len are 0 - the real lengths
+ * are sealed, hiding the cert and data sizes; oap_hdr_unseal reads them to
+ * split the plaintext and rebuilds prefix ‖ data_len ‖ crt_len ‖ data ‖
+ * crt ‖ sig.
*/
/* Parsed OAP header - buffers pointing to a single memory region */
@@ -120,12 +147,15 @@ struct oap_hdr {
bool fmt; /* Format */
bool role; /* Role */
} kex_flags;
+
buffer_t id;
buffer_t crt;
buffer_t kex;
buffer_t data;
- buffer_t req_hash; /* H(request) - response only */
+ buffer_t rsp_tag; /* key-confirm tag / H(req), rsp only */
buffer_t sig;
+ buffer_t sealed; /* wire ciphertext ‖ tag (sealed rsp) */
+ buffer_t sealed_pt; /* prefix‖lens‖data‖crt‖sig, owned */
buffer_t hdr;
};
@@ -141,13 +171,19 @@ void oap_hdr_fini(struct oap_hdr * oap_hdr);
int oap_hdr_encode(struct oap_hdr * hdr,
void * pkp,
void * crt,
- struct sec_config * kcfg,
- buffer_t req_hash,
- int req_md_nid);
+ struct sec_config * scfg,
+ buffer_t rsp_tag,
+ int req_md_nid,
+ const uint8_t * seal_key);
int oap_hdr_decode(struct oap_hdr * hdr,
buffer_t buf,
- int req_md_nid);
+ int req_md_nid,
+ bool rekey);
+
+/* Decrypt a sealed response identity block; fills data, crt and sig. */
+int oap_hdr_unseal(struct oap_hdr * hdr,
+ const uint8_t * key);
void debug_oap_hdr_rcv(const struct oap_hdr * hdr);
diff --git a/src/irmd/oap/internal.h b/src/irmd/oap/internal.h
index 6dd44d56..4a156723 100644
--- a/src/irmd/oap/internal.h
+++ b/src/irmd/oap/internal.h
@@ -36,12 +36,13 @@
int oap_check_hdr(const struct oap_hdr * hdr);
-int oap_auth_peer(char * name,
- const struct oap_hdr * local_hdr,
- const struct oap_hdr * peer_hdr);
+int oap_auth_peer(char * name,
+ const struct sec_config * cfg,
+ const struct oap_hdr * local_hdr,
+ const struct oap_hdr * peer_hdr);
int oap_negotiate_cipher(const struct oap_hdr * peer_hdr,
- struct sec_config * kcfg);
+ struct sec_config * scfg);
#ifndef OAP_TEST_MODE
int load_credentials(const char * name,
@@ -49,7 +50,7 @@ int load_credentials(const char * name,
void ** pkp,
void ** crt);
-int load_kex_config(const char * name,
+int load_sec_config(const char * name,
const char * path,
struct sec_config * cfg);
#endif
@@ -59,7 +60,7 @@ int load_srv_credentials(const struct name_info * info,
void ** pkp,
void ** crt);
-int load_srv_kex_config(const struct name_info * info,
+int load_srv_sec_config(const struct name_info * info,
struct sec_config * cfg);
int load_server_kem_keypair(const char * name,
@@ -69,7 +70,7 @@ int load_server_kem_keypair(const char * name,
extern int load_srv_credentials(const struct name_info * info,
void ** pkp,
void ** crt);
-extern int load_srv_kex_config(const struct name_info * info,
+extern int load_srv_sec_config(const struct name_info * info,
struct sec_config * cfg);
extern int load_server_kem_keypair(const char * name,
struct sec_config * cfg,
@@ -78,7 +79,7 @@ extern int load_server_kem_keypair(const char * name,
int do_server_kex(const struct name_info * info,
struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
buffer_t * kex,
struct crypt_sk * sk);
@@ -87,7 +88,7 @@ int load_cli_credentials(const struct name_info * info,
void ** pkp,
void ** crt);
-int load_cli_kex_config(const struct name_info * info,
+int load_cli_sec_config(const struct name_info * info,
struct sec_config * cfg);
int load_server_kem_pk(const char * name,
@@ -97,21 +98,21 @@ int load_server_kem_pk(const char * name,
extern int load_cli_credentials(const struct name_info * info,
void ** pkp,
void ** crt);
-extern int load_cli_kex_config(const struct name_info * info,
+extern int load_cli_sec_config(const struct name_info * info,
struct sec_config * cfg);
extern int load_server_kem_pk(const char * name,
struct sec_config * cfg,
buffer_t * pk);
#endif
-int oap_client_kex_prepare(struct sec_config * kcfg,
+int oap_client_kex_prepare(struct sec_config * scfg,
buffer_t server_pk,
buffer_t * kex,
uint8_t * key,
void ** ephemeral_pkp);
int oap_client_kex_complete(const struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
void * pkp,
uint8_t * key);
diff --git a/src/irmd/oap/io.c b/src/irmd/oap/io.c
index c2c91b91..b5daa432 100644
--- a/src/irmd/oap/io.c
+++ b/src/irmd/oap/io.c
@@ -50,11 +50,17 @@ static bool file_exists(const char * path)
{
struct stat s;
- if (stat(path, &s) < 0 && errno == ENOENT) {
+ if (stat(path, &s) == 0)
+ return true;
+
+ if (errno == ENOENT) {
log_dbg("File %s does not exist.", path);
return false;
}
+ /* Can't stat for another reason; assume present, fail on load */
+ log_warn("Failed to stat %s: %s.", path, strerror(errno));
+
return true;
}
@@ -96,16 +102,16 @@ int load_credentials(const char * name,
return -EAUTH;
}
-int load_kex_config(const char * name,
+int load_sec_config(const char * name,
const char * path,
struct sec_config * cfg)
{
+ void * pin;
+
assert(name != NULL);
assert(cfg != NULL);
- memset(cfg, 0, sizeof(*cfg));
-
- /* Load encryption config */
+ /* Load security config */
if (!file_exists(path))
log_dbg("No encryption %s for %s.", path, name);
@@ -114,6 +120,15 @@ int load_kex_config(const char * name,
return -1;
}
+ if (cfg->a.cacert[0] != '\0') {
+ if (crypt_load_crt_file(cfg->a.cacert, &pin) < 0) {
+ log_err("Failed to load pinned CA %s for %s.",
+ cfg->a.cacert, name);
+ return -EAUTH;
+ }
+ crypt_free_crt(pin);
+ }
+
if (!IS_KEX_ALGO_SET(cfg)) {
log_info("Key exchange not configured for %s.", name);
return 0;
@@ -125,8 +140,13 @@ int load_kex_config(const char * name,
return -ENOTSUP;
}
#endif
- if (cfg->c.nid == NID_undef) {
- log_err("Invalid cipher for %s.", name);
+ if (crypt_kex_rank(cfg->x.nid) < 1) {
+ log_err("Key exchange not supported for %s.", name);
+ return -ENOTSUP;
+ }
+
+ if (crypt_cipher_rank(cfg->c.nid) < 1) {
+ log_err("Cipher not supported for %s.", name);
return -ECRYPT;
}
diff --git a/src/irmd/oap/io.h b/src/irmd/oap/io.h
index 2d47c62f..953e3898 100644
--- a/src/irmd/oap/io.h
+++ b/src/irmd/oap/io.h
@@ -32,7 +32,7 @@ int load_credentials(const char * name,
void ** pkp,
void ** crt);
-int load_kex_config(const char * name,
+int load_sec_config(const char * name,
const char * path,
struct sec_config * cfg);
#endif
diff --git a/src/irmd/oap/srv.c b/src/irmd/oap/srv.c
index afc54acc..5d631618 100644
--- a/src/irmd/oap/srv.c
+++ b/src/irmd/oap/srv.c
@@ -49,7 +49,7 @@
extern int load_srv_credentials(const struct name_info * info,
void ** pkp,
void ** crt);
-extern int load_srv_kex_config(const struct name_info * info,
+extern int load_srv_sec_config(const struct name_info * info,
struct sec_config * cfg);
extern int load_server_kem_keypair(const char * name,
bool raw_fmt,
@@ -67,13 +67,16 @@ int load_srv_credentials(const struct name_info * info,
return load_credentials(info->name, &info->s, pkp, crt);
}
-int load_srv_kex_config(const struct name_info * info,
+int load_srv_sec_config(const struct name_info * info,
struct sec_config * cfg)
{
assert(info != NULL);
assert(cfg != NULL);
- return load_kex_config(info->name, info->s.enc, cfg);
+ memset(cfg, 0, sizeof(*cfg));
+
+ /* Client auth stays opt-in (mTLS); enable with auth=required */
+ return load_sec_config(info->name, info->s.sec, cfg);
}
int load_server_kem_keypair(const char * name,
@@ -135,7 +138,7 @@ static int get_algo_from_peer_key(const struct oap_hdr * peer_hdr,
}
static int negotiate_cipher(const struct oap_hdr * peer_hdr,
- struct sec_config * kcfg)
+ struct sec_config * scfg)
{
uint8_t * id = peer_hdr->id.data;
int cli_nid;
@@ -143,27 +146,25 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr,
int srv_rank;
/* Cipher: select the strongest of client and server */
- cli_nid = peer_hdr->cipher_str != NULL
- ? (int) crypt_str_to_nid(peer_hdr->cipher_str)
- : NID_undef;
+ if (peer_hdr->cipher_str != NULL)
+ cli_nid = (int) crypt_str_to_nid(peer_hdr->cipher_str);
+ else
+ cli_nid = NID_undef;
- if (cli_nid != NID_undef
- && crypt_cipher_rank(cli_nid) < 0) {
+ if (cli_nid != NID_undef && crypt_cipher_rank(cli_nid) < 0) {
log_err_id(id, "Unsupported cipher '%s'.",
peer_hdr->cipher_str);
return -ENOTSUP;
}
cli_rank = crypt_cipher_rank(cli_nid);
- srv_rank = crypt_cipher_rank(kcfg->c.nid);
+ srv_rank = crypt_cipher_rank(scfg->c.nid);
if (cli_rank > srv_rank) {
- SET_KEX_CIPHER_NID(kcfg, cli_nid);
- log_dbg_id(id, "Selected client cipher %s.",
- kcfg->c.str);
+ SET_KEX_CIPHER_NID(scfg, cli_nid);
+ log_dbg_id(id, "Selected client cipher %s.", scfg->c.str);
} else if (srv_rank > 0) {
- log_dbg_id(id, "Selected server cipher %s.",
- kcfg->c.str);
+ log_dbg_id(id, "Selected server cipher %s.", scfg->c.str);
} else {
log_err_id(id, "Encryption requested, no cipher.");
return -ECRYPT;
@@ -178,31 +179,27 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr,
}
cli_rank = crypt_kdf_rank(peer_hdr->kdf_nid);
- srv_rank = crypt_kdf_rank(kcfg->k.nid);
+ srv_rank = crypt_kdf_rank(scfg->k.nid);
- /*
- * For client-encap KEM, the KDF is baked into
- * the ciphertext. The server must use the client's
- * KDF and can only verify the minimum.
- */
+ /* Client-encap KEM bakes KDF into ciphertext; verify min. */
if (OAP_KEX_ROLE(peer_hdr) == KEM_MODE_CLIENT_ENCAP) {
if (srv_rank > cli_rank) {
log_err_id(id, "Client KDF too weak.");
return -ECRYPT;
}
- SET_KEX_KDF_NID(kcfg, peer_hdr->kdf_nid);
+ SET_KEX_KDF_NID(scfg, peer_hdr->kdf_nid);
} else if (cli_rank > srv_rank) {
- SET_KEX_KDF_NID(kcfg, peer_hdr->kdf_nid);
+ SET_KEX_KDF_NID(scfg, peer_hdr->kdf_nid);
log_dbg_id(id, "Selected client KDF %s.",
- md_nid_to_str(kcfg->k.nid));
+ md_nid_to_str(scfg->k.nid));
} else if (srv_rank > 0) {
log_dbg_id(id, "Selected server KDF %s.",
- md_nid_to_str(kcfg->k.nid));
+ md_nid_to_str(scfg->k.nid));
}
- if (IS_KEX_ALGO_SET(kcfg))
+ if (IS_KEX_ALGO_SET(scfg))
log_info_id(id, "Negotiated %s + %s.",
- kcfg->x.str, kcfg->c.str);
+ scfg->x.str, scfg->c.str);
else
log_info_id(id, "No key exchange.");
@@ -211,7 +208,7 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr,
static int do_server_kem_decap(const struct name_info * info,
const struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
struct crypt_sk * sk)
{
buffer_t ct;
@@ -228,7 +225,7 @@ static int do_server_kem_decap(const struct name_info * info,
ct.data = peer_hdr->kex.data;
ct.len = peer_hdr->kex.len;
- ret = kex_kem_decap(server_pkp, ct, kcfg->k.nid, sk->key);
+ ret = kex_kem_decap(server_pkp, ct, scfg->k.nid, sk->key);
crypt_free_key(server_pkp);
@@ -243,7 +240,7 @@ static int do_server_kem_decap(const struct name_info * info,
}
static int do_server_kem_encap(const struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
buffer_t * kex,
struct crypt_sk * sk)
{
@@ -254,12 +251,12 @@ static int do_server_kem_encap(const struct oap_hdr * peer_hdr,
client_pk.data = peer_hdr->kex.data;
client_pk.len = peer_hdr->kex.len;
- if (IS_HYBRID_KEM(kcfg->x.str))
+ if (IS_HYBRID_KEM(scfg->x.str))
ct_len = kex_kem_encap_raw(client_pk, kex->data,
- kcfg->k.nid, sk->key);
+ scfg->k.nid, sk->key);
else
ct_len = kex_kem_encap(client_pk, kex->data,
- kcfg->k.nid, sk->key);
+ scfg->k.nid, sk->key);
if (ct_len < 0) {
log_err_id(id, "Failed to encapsulate KEM.");
@@ -275,26 +272,26 @@ static int do_server_kem_encap(const struct oap_hdr * peer_hdr,
static int do_server_kex_kem(const struct name_info * info,
struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
buffer_t * kex,
struct crypt_sk * sk)
{
int ret;
- kcfg->x.mode = peer_hdr->kex_flags.role;
+ scfg->x.mode = peer_hdr->kex_flags.role;
- if (kcfg->x.mode == KEM_MODE_CLIENT_ENCAP) {
- ret = do_server_kem_decap(info, peer_hdr, kcfg, sk);
+ if (scfg->x.mode == KEM_MODE_CLIENT_ENCAP) {
+ ret = do_server_kem_decap(info, peer_hdr, scfg, sk);
kex->len = 0;
} else {
- ret = do_server_kem_encap(peer_hdr, kcfg, kex, sk);
+ ret = do_server_kem_encap(peer_hdr, scfg, kex, sk);
}
return ret;
}
static int do_server_kex_dhe(const struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
buffer_t * kex,
struct crypt_sk * sk)
{
@@ -303,7 +300,7 @@ static int do_server_kex_dhe(const struct oap_hdr * peer_hdr,
int ret;
uint8_t * id = peer_hdr->id.data;
- key_len = kex_pkp_create(kcfg, &epkp, kex->data);
+ key_len = kex_pkp_create(scfg, &epkp, kex->data);
if (key_len < 0) {
log_err_id(id, "Failed to generate key pair.");
return -ECRYPT;
@@ -311,9 +308,9 @@ static int do_server_kex_dhe(const struct oap_hdr * peer_hdr,
kex->len = (size_t) key_len;
- log_dbg_id(id, "Generated %s ephemeral keys.", kcfg->x.str);
+ log_dbg_id(id, "Generated %s ephemeral keys.", scfg->x.str);
- ret = kex_dhe_derive(kcfg, epkp, peer_hdr->kex, sk->key);
+ ret = kex_dhe_derive(scfg, epkp, peer_hdr->kex, sk->key);
if (ret < 0) {
log_err_id(id, "Failed to derive secret.");
kex_pkp_destroy(epkp);
@@ -327,7 +324,7 @@ static int do_server_kex_dhe(const struct oap_hdr * peer_hdr,
int do_server_kex(const struct name_info * info,
struct oap_hdr * peer_hdr,
- struct sec_config * kcfg,
+ struct sec_config * scfg,
buffer_t * kex,
struct crypt_sk * sk)
{
@@ -339,60 +336,71 @@ int do_server_kex(const struct name_info * info,
/* No KEX data from client */
if (peer_hdr->kex.len == 0) {
- if (IS_KEX_ALGO_SET(kcfg)) {
+ if (IS_KEX_ALGO_SET(scfg)) {
log_warn_id(id, "KEX requested without info.");
return -ECRYPT;
}
return 0;
}
- if (negotiate_cipher(peer_hdr, kcfg) < 0)
+ if (negotiate_cipher(peer_hdr, scfg) < 0)
return -ECRYPT;
/* Save server's configured KEX before overwriting */
- srv_kex_nid = kcfg->x.nid;
+ srv_kex_nid = scfg->x.nid;
if (OAP_KEX_ROLE(peer_hdr) != KEM_MODE_CLIENT_ENCAP) {
/* Server encapsulation or DHE: extract algo from DER PK */
if (get_algo_from_peer_key(peer_hdr, algo_buf) < 0)
return -ECRYPT;
- SET_KEX_ALGO(kcfg, algo_buf);
+ SET_KEX_ALGO(scfg, algo_buf);
/* Reject if client KEX is weaker than server's */
- if (crypt_kex_rank(kcfg->x.nid)
+ if (crypt_kex_rank(scfg->x.nid)
< crypt_kex_rank(srv_kex_nid)) {
log_err_id(id, "Client KEX %s too weak.",
- kcfg->x.str);
+ scfg->x.str);
return -ECRYPT;
}
}
/* Dispatch based on algorithm type */
- if (IS_KEM_ALGORITHM(kcfg->x.str))
- return do_server_kex_kem(info, peer_hdr, kcfg, kex, sk);
+ if (IS_KEM_ALGORITHM(scfg->x.str))
+ return do_server_kex_kem(info, peer_hdr, scfg, kex, sk);
else
- return do_server_kex_dhe(peer_hdr, kcfg, kex, sk);
+ return do_server_kex_dhe(peer_hdr, scfg, kex, sk);
}
int oap_srv_process(const struct name_info * info,
buffer_t req_buf,
buffer_t * rsp_buf,
buffer_t * data,
- struct crypt_sk * sk)
+ struct crypt_sk * sk,
+ bool rekey,
+ const buffer_t * cached_crt,
+ buffer_t * peer_crt)
{
struct oap_hdr peer_hdr;
struct oap_hdr local_hdr;
- struct sec_config kcfg;
+ struct sec_config scfg;
uint8_t kex_buf[CRYPT_KEY_BUFSZ];
uint8_t hash_buf[MAX_HASH_SIZE];
- buffer_t req_hash = BUF_INIT;
+ uint8_t kc_buf[MAX_HASH_SIZE];
+ uint8_t resp_hash_buf[MAX_HASH_SIZE];
+ uint8_t hs_key[SYMMKEYSZ];
+ const uint8_t * seal_key = NULL;
+ buffer_t req_hash = BUF_INIT;
+ buffer_t resp_hash = BUF_INIT;
+ buffer_t crt_der = BUF_INIT;
+ buffer_t rsp_tag = BUF_INIT;
ssize_t hash_ret;
- char cli_name[NAME_SIZE + 1]; /* TODO */
+ char cli_name[NAME_SIZE + 1];
uint8_t * id;
void * pkp = NULL;
void * crt = NULL;
int req_md_nid;
+ int ret;
assert(info != NULL);
assert(rsp_buf != NULL);
@@ -412,13 +420,19 @@ int oap_srv_process(const struct name_info * info,
goto fail_cred;
}
- if (load_srv_kex_config(info, &kcfg) < 0) {
- log_err("Failed to load KEX config for %s.", info->name);
+ /* Re-key omits the cert; the peer verifies against its cache. */
+ if (rekey && crt != NULL) {
+ crypt_free_crt(crt);
+ crt = NULL;
+ }
+
+ if (load_srv_sec_config(info, &scfg) < 0) {
+ log_err("Failed to load security config for %s.", info->name);
goto fail_kex;
}
/* Decode incoming header (NID_undef = request, no hash) */
- if (oap_hdr_decode(&peer_hdr, req_buf, NID_undef) < 0) {
+ if (oap_hdr_decode(&peer_hdr, req_buf, NID_undef, rekey) < 0) {
log_err("Failed to decode OAP header.");
goto fail_auth;
}
@@ -427,22 +441,38 @@ int oap_srv_process(const struct name_info * info,
id = peer_hdr.id.data; /* Logging */
- if (oap_check_hdr(&peer_hdr) < 0) {
- log_err_id(id, "OAP header failed replay check.");
+ ret = oap_check_hdr(&peer_hdr);
+ if (ret == -EREPLAY) {
+ log_warn_id(id, "OAP header failed replay check.");
+ goto fail_replay;
+ }
+ if (ret < 0) {
+ log_err_id(id, "OAP header check failed.");
goto fail_auth;
}
oap_hdr_init(&local_hdr, peer_hdr.id, kex_buf, *data, NID_undef);
- if (oap_auth_peer(cli_name, &local_hdr, &peer_hdr) < 0) {
+ if (oap_auth_peer(cli_name, &scfg, &local_hdr, &peer_hdr,
+ cached_crt) < 0) {
log_err_id(id, "Failed to authenticate client.");
goto fail_auth;
}
- if (do_server_kex(info, &peer_hdr, &kcfg, &local_hdr.kex, sk) < 0)
+ /* Surface the peer cert so the caller can cache it for re-key. */
+ if (peer_crt != NULL && peer_hdr.crt.len > 0) {
+ peer_crt->data = malloc(peer_hdr.crt.len);
+ if (peer_crt->data == NULL)
+ goto fail_auth;
+
+ memcpy(peer_crt->data, peer_hdr.crt.data, peer_hdr.crt.len);
+ peer_crt->len = peer_hdr.crt.len;
+ }
+
+ if (do_server_kex(info, &peer_hdr, &scfg, &local_hdr.kex, sk) < 0)
goto fail_kex;
- sk->nid = kcfg.c.nid;
+ sk->nid = scfg.c.nid;
/* Build response header with hash of client request */
local_hdr.nid = sk->nid;
@@ -458,10 +488,58 @@ int oap_srv_process(const struct name_info * info,
goto fail_auth;
}
req_hash.data = hash_buf;
- req_hash.len = (size_t) hash_ret;
+ req_hash.len = (size_t) hash_ret;
- if (oap_hdr_encode(&local_hdr, pkp, crt, &kcfg,
- req_hash, req_md_nid) < 0) {
+ rsp_tag = req_hash;
+
+ /* Bind the key to the transcript and confirm it to the client */
+ if (sk->nid != NID_undef) {
+ if (crt != NULL && crypt_crt_der(crt, &crt_der) < 0) {
+ log_err_id(id, "Failed to serialize cert.");
+ goto fail_auth;
+ }
+
+ resp_hash.data = resp_hash_buf;
+
+ ret = oap_resp_hash(req_md_nid, local_hdr.kex, *data,
+ crt_der, &resp_hash);
+
+ freebuf(crt_der);
+
+ if (ret < 0) {
+ log_err_id(id, "Failed to hash response.");
+ goto fail_auth;
+ }
+
+ /* Derive the identity-seal key before bind mutates sk->key */
+ if (oap_derive_hs_key(sk, req_hash, hs_key) < 0) {
+ log_err_id(id, "Failed to derive handshake key.");
+ goto fail_auth;
+ }
+
+ seal_key = hs_key;
+
+ if (oap_bind_session_key(sk, req_hash, resp_hash,
+ scfg.k.nid) < 0) {
+ log_err_id(id, "Failed to bind session key.");
+ goto fail_auth;
+ }
+
+ if (oap_key_confirm_tag(sk, req_hash, resp_hash, kc_buf,
+ (size_t) hash_ret) < 0) {
+ log_err_id(id, "Failed to confirm session key.");
+ goto fail_auth;
+ }
+
+ rsp_tag.data = kc_buf;
+ }
+
+ ret = oap_hdr_encode(&local_hdr, pkp, crt, &scfg,
+ rsp_tag, req_md_nid, seal_key);
+
+ crypt_secure_clear(hs_key, SYMMKEYSZ);
+
+ if (ret < 0) {
log_err_id(id, "Failed to create OAP response header.");
goto fail_auth;
}
@@ -486,11 +564,17 @@ int oap_srv_process(const struct name_info * info,
fail_data:
oap_hdr_fini(&local_hdr);
fail_auth:
+ crypt_secure_clear(hs_key, SYMMKEYSZ);
crypt_free_crt(crt);
crypt_free_key(pkp);
fail_cred:
return -EAUTH;
+ fail_replay:
+ crypt_free_crt(crt);
+ crypt_free_key(pkp);
+ return -EREPLAY;
+
fail_kex:
crypt_free_crt(crt);
crypt_free_key(pkp);
diff --git a/src/irmd/oap/tests/common.c b/src/irmd/oap/tests/common.c
index 0a1af100..49ea9187 100644
--- a/src/irmd/oap/tests/common.c
+++ b/src/irmd/oap/tests/common.c
@@ -29,39 +29,51 @@
#include <string.h>
#include <stdio.h>
-int load_srv_kex_config(const struct name_info * info,
+int load_srv_sec_config(const struct name_info * info,
struct sec_config * cfg)
{
(void) info;
memset(cfg, 0, sizeof(*cfg));
+ cfg->a.req = test_cfg.srv.req_auth;
+ if (test_cfg.srv.cacert != NULL)
+ strcpy(cfg->a.cacert, test_cfg.srv.cacert);
+
+ /* Digest is kept without kex, as in parse_sec_config */
+ SET_KEX_DIGEST_NID(cfg, test_cfg.srv.md);
+
if (test_cfg.srv.kex == NID_undef)
return 0;
SET_KEX_ALGO_NID(cfg, test_cfg.srv.kex);
SET_KEX_CIPHER_NID(cfg, test_cfg.srv.cipher);
SET_KEX_KDF_NID(cfg, test_cfg.srv.kdf);
- SET_KEX_DIGEST_NID(cfg, test_cfg.srv.md);
SET_KEX_KEM_MODE(cfg, test_cfg.srv.kem_mode);
return 0;
}
-int load_cli_kex_config(const struct name_info * info,
+int load_cli_sec_config(const struct name_info * info,
struct sec_config * cfg)
{
(void) info;
memset(cfg, 0, sizeof(*cfg));
+ cfg->a.req = test_cfg.cli.req_auth;
+ if (test_cfg.cli.cacert != NULL)
+ strcpy(cfg->a.cacert, test_cfg.cli.cacert);
+
+ /* Digest is kept without kex, as in parse_sec_config */
+ SET_KEX_DIGEST_NID(cfg, test_cfg.cli.md);
+
if (test_cfg.cli.kex == NID_undef)
return 0;
SET_KEX_ALGO_NID(cfg, test_cfg.cli.kex);
SET_KEX_CIPHER_NID(cfg, test_cfg.cli.cipher);
SET_KEX_KDF_NID(cfg, test_cfg.cli.kdf);
- SET_KEX_DIGEST_NID(cfg, test_cfg.cli.md);
SET_KEX_KEM_MODE(cfg, test_cfg.cli.kem_mode);
return 0;
@@ -152,13 +164,15 @@ void oap_test_teardown(struct oap_test_ctx * ctx)
if (ctx->cli.state != NULL) {
res.key = ctx->cli.key;
oap_cli_complete(ctx->cli.state, &ctx->cli.info, dummy,
- &ctx->data, &res);
+ &ctx->data, &res, NULL, NULL);
ctx->cli.state = NULL;
}
freebuf(ctx->data);
freebuf(ctx->resp_hdr);
freebuf(ctx->req_hdr);
+ freebuf(ctx->srv_crt);
+ freebuf(ctx->cli_crt);
crypt_free_crt(ctx->im_ca);
crypt_free_crt(ctx->root_ca);
@@ -170,7 +184,7 @@ void oap_test_teardown(struct oap_test_ctx * ctx)
int oap_cli_prepare_ctx(struct oap_test_ctx * ctx)
{
return oap_cli_prepare(&ctx->cli.state, &ctx->cli.info, &ctx->req_hdr,
- ctx->data);
+ ctx->data, ctx->rekey);
}
int oap_srv_process_ctx(struct oap_test_ctx * ctx)
@@ -179,7 +193,9 @@ int oap_srv_process_ctx(struct oap_test_ctx * ctx)
int ret;
ret = oap_srv_process(&ctx->srv.info, ctx->req_hdr,
- &ctx->resp_hdr, &ctx->data, &res);
+ &ctx->resp_hdr, &ctx->data, &res, ctx->rekey,
+ ctx->rekey ? &ctx->srv_crt : NULL,
+ ctx->rekey ? NULL : &ctx->srv_crt);
if (ret == 0)
ctx->srv.nid = res.nid;
@@ -192,7 +208,9 @@ int oap_cli_complete_ctx(struct oap_test_ctx * ctx)
int ret;
ret = oap_cli_complete(ctx->cli.state, &ctx->cli.info, ctx->resp_hdr,
- &ctx->data, &res);
+ &ctx->data, &res,
+ ctx->rekey ? &ctx->cli_crt : NULL,
+ ctx->rekey ? NULL : &ctx->cli_crt);
ctx->cli.state = NULL;
if (ret == 0)
@@ -243,6 +261,147 @@ int roundtrip_auth_only(const char * root_ca,
return TEST_RC_FAIL;
}
+int roundtrip_rekey(const char * root_ca,
+ const char * im_ca_str)
+{ /* Test body: one full handshake, then a re-key on the same context. */
+ struct oap_test_ctx ctx;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca, im_ca_str) < 0)
+ goto fail; /* setup failed: skip the teardown */
+
+ /*
+ * Initial handshake: the client caches the server cert.
+ * NOTE(review): relies on oap_test_setup() leaving ctx.rekey false
+ * and the cert buffers empty - confirm against its definition.
+ */
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Initial client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Initial server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) < 0) {
+ printf("Initial client complete failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) {
+ printf("Initial keys do not match.\n");
+ goto fail_cleanup;
+ }
+
+ if (ctx.cli_crt.len == 0) { /* the re-key below needs a cached cert */
+ printf("Server cert was not cached for re-key.\n");
+ goto fail_cleanup;
+ }
+
+ /*
+ * Re-key: cert dropped on the wire, verified against the cache.
+ * The first round's headers and data are released before the second
+ * round; ctx.cli_crt and ctx.srv_crt are kept.
+ */
+ freebuf(ctx.req_hdr);
+ freebuf(ctx.resp_hdr);
+ freebuf(ctx.data);
+
+ ctx.rekey = true; /* the *_ctx wrappers switch to their re-key arguments */
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Re-key client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Re-key server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) < 0) {
+ printf("Re-key client complete failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) {
+ printf("Re-key keys do not match.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+int roundtrip_rekey_badcache(const char * root_ca,
+ const char * im_ca_str)
+{ /* Test body: handshake, corrupt the cached cert, expect re-key failure. */
+ struct oap_test_ctx ctx;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca, im_ca_str) < 0)
+ goto fail; /* setup failed: skip the teardown */
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Initial client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Initial server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) < 0) {
+ printf("Initial client complete failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (ctx.cli_crt.len == 0) { /* also guards the data[] access below */
+ printf("Server cert was not cached.\n");
+ goto fail_cleanup;
+ }
+
+ /*
+ * Corrupt the cached cert: the re-key must fail closed.
+ * Every bit of the middle byte of the client-side cache is inverted.
+ */
+ ctx.cli_crt.data[ctx.cli_crt.len / 2] ^= 0xFF;
+
+ freebuf(ctx.req_hdr);
+ freebuf(ctx.resp_hdr);
+ freebuf(ctx.data);
+
+ ctx.rekey = true; /* the *_ctx wrappers switch to their re-key arguments */
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Re-key client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Re-key server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) == 0) { /* any non-zero result counts as rejection */
+ printf("Re-key accepted a corrupted cached cert.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
int roundtrip_kex_only(void)
{
struct name_info cli_info;
@@ -271,14 +430,15 @@ int roundtrip_kex_only(void)
}
if (oap_cli_prepare(&cli_state, &cli_info, &req_hdr,
- data) < 0) {
+ data, false) < 0) {
printf("Client prepare failed.\n");
goto fail_cleanup;
}
res.key = srv_key;
- if (oap_srv_process(&srv_info, req_hdr, &resp_hdr, &data, &res) < 0) {
+ if (oap_srv_process(&srv_info, req_hdr, &resp_hdr, &data, &res,
+ false, NULL, NULL) < 0) {
printf("Server process failed.\n");
goto fail_cleanup;
}
@@ -287,7 +447,8 @@ int roundtrip_kex_only(void)
res.key = cli_key;
- if (oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, &res) < 0) {
+ if (oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, &res,
+ NULL, NULL) < 0) {
printf("Client complete failed.\n");
cli_state = NULL;
goto fail_cleanup;
@@ -316,7 +477,8 @@ int roundtrip_kex_only(void)
fail_cleanup:
if (cli_state != NULL) {
res.key = cli_key;
- oap_cli_complete(cli_state, &cli_info, resp_hdr, &data, &res);
+ oap_cli_complete(cli_state, &cli_info, resp_hdr, &data,
+ &res, NULL, NULL);
}
freebuf(resp_hdr);
freebuf(req_hdr);
@@ -396,7 +558,7 @@ int corrupted_response(const char * root_ca,
res.key = ctx.cli.key;
if (oap_cli_complete(ctx.cli.state, &ctx.cli.info, ctx.resp_hdr,
- &ctx.data, &res) == 0) {
+ &ctx.data, &res, NULL, NULL) == 0) {
printf("Client should reject corrupted response.\n");
ctx.cli.state = NULL;
goto fail_cleanup;
diff --git a/src/irmd/oap/tests/common.h b/src/irmd/oap/tests/common.h
index d4b6733a..c47096fb 100644
--- a/src/irmd/oap/tests/common.h
+++ b/src/irmd/oap/tests/common.h
@@ -32,12 +32,14 @@
/* Per-side security configuration for tests */
struct test_sec_cfg {
- int kex; /* KEX algorithm NID */
- int cipher; /* Cipher NID for encryption */
- int kdf; /* KDF NID for key derivation */
- int md; /* Digest NID for signatures */
- int kem_mode; /* KEM encapsulation mode (0 for ECDH) */
- bool auth; /* Use authentication (certificates) */
+ int kex; /* KEX algorithm NID */
+ int cipher; /* Cipher NID for encryption */
+ int kdf; /* KDF NID for key derivation */
+ int md; /* Digest NID for signatures */
+ int kem_mode; /* KEM encapsulation mode (0 for ECDH) */
+ bool auth; /* Use authentication (certificates) */
+ bool req_auth; /* Require peer authentication */
+ const char * cacert; /* Pinned issuing CA path */
};
/* Test configuration - set by each test before running roundtrip */
@@ -69,6 +71,11 @@ struct oap_test_ctx {
buffer_t data;
void * root_ca;
void * im_ca;
+
+ /* Re-key (tier iii): drop the cert, verify against the cache. */
+ bool rekey;
+ buffer_t srv_crt; /* client cert cached by server */
+ buffer_t cli_crt; /* server cert cached by client */
};
int oap_test_setup(struct oap_test_ctx * ctx,
@@ -86,6 +93,12 @@ int oap_cli_complete_ctx(struct oap_test_ctx * ctx);
int roundtrip_auth_only(const char * root_ca,
const char * im_ca_str);
+int roundtrip_rekey(const char * root_ca,
+ const char * im_ca_str);
+
+int roundtrip_rekey_badcache(const char * root_ca,
+ const char * im_ca_str);
+
int roundtrip_kex_only(void);
int corrupted_request(const char * root_ca,
diff --git a/src/irmd/oap/tests/oap_test.c b/src/irmd/oap/tests/oap_test.c
index a324b586..fc10150b 100644
--- a/src/irmd/oap/tests/oap_test.c
+++ b/src/irmd/oap/tests/oap_test.c
@@ -32,6 +32,7 @@
#include <ouroboros/crypt.h>
#include <ouroboros/endian.h>
+#include <ouroboros/errno.h>
#include <ouroboros/flow.h>
#include <ouroboros/name.h>
#include <ouroboros/random.h>
@@ -41,9 +42,12 @@
#include <test/certs/ecdsa.h>
#include "oap.h"
+#include "oap/auth.h"
#include "common.h"
#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#ifdef HAVE_OPENSSL
@@ -174,6 +178,7 @@ static int test_oap_roundtrip(int kex)
oap_test_teardown(&ctx);
TEST_SUCCESS("(%s)", kex_str);
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -198,6 +203,20 @@ static int test_oap_roundtrip_auth_only(void)
return roundtrip_auth_only(root_ca_crt_ec, im_ca_crt_ec);
}
+static int test_oap_rekey(void)
+{ /* Runs roundtrip_rekey() with root_ca_crt_ec and im_ca_crt_ec. */
+ test_default_cfg(); /* defined outside this view; presumably resets test_cfg */
+
+ return roundtrip_rekey(root_ca_crt_ec, im_ca_crt_ec);
+}
+
+static int test_oap_rekey_badcache(void)
+{ /* Runs roundtrip_rekey_badcache() with root_ca_crt_ec and im_ca_crt_ec. */
+ test_default_cfg(); /* defined outside this view; presumably resets test_cfg */
+
+ return roundtrip_rekey_badcache(root_ca_crt_ec, im_ca_crt_ec);
+}
+
static int test_oap_roundtrip_kex_only(void)
{
memset(&test_cfg, 0, sizeof(test_cfg));
@@ -238,6 +257,7 @@ static int test_oap_piggyback_data(void)
ctx.data.data = malloc(ctx.data.len);
if (ctx.data.data == NULL)
goto fail_cleanup;
+
memcpy(ctx.data.data, cli_data_str, ctx.data.len);
if (oap_cli_prepare_ctx(&ctx) < 0)
@@ -288,6 +308,7 @@ static int test_oap_piggyback_data(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -356,6 +377,7 @@ static int test_oap_inflated_length_field(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -400,6 +422,7 @@ static int test_oap_deflated_length_field(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -458,6 +481,7 @@ static int test_oap_nid_without_kex(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -509,6 +533,61 @@ static int test_oap_unsupported_nid(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Client rejects a response whose key-confirmation tag is tampered */
+static int test_oap_key_confirm_mismatch(void)
+{
+ struct oap_test_ctx ctx;
+
+ TEST_START();
+
+ /* Unauthenticated + encrypted: response unsigned, KC is the gate */
+ memset(&test_cfg, 0, sizeof(test_cfg));
+ test_cfg.srv.kex = NID_X25519;
+ test_cfg.srv.cipher = NID_aes_256_gcm;
+ test_cfg.srv.kdf = NID_sha256;
+ test_cfg.srv.md = NID_sha256;
+ test_cfg.srv.auth = NO_AUTH;
+ test_cfg.cli.kex = NID_X25519;
+ test_cfg.cli.cipher = NID_aes_256_gcm;
+ test_cfg.cli.kdf = NID_sha256;
+ test_cfg.cli.md = NID_sha256;
+ test_cfg.cli.auth = NO_AUTH;
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ /* The key-confirm tag is the last field of an unsigned response */
+ ctx.resp_hdr.data[ctx.resp_hdr.len - 1] ^= 0xFF;
+
+ if (oap_cli_complete_ctx(&ctx) == 0) {
+ printf("Client accepted a bad key-confirmation tag.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -609,6 +688,7 @@ static int test_oap_cipher_mismatch(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -655,6 +735,7 @@ static int test_oap_srv_enc_cli_none(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -724,6 +805,7 @@ static int test_oap_cli_enc_srv_none(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -733,7 +815,7 @@ static int test_oap_cli_enc_srv_none(void)
return TEST_RC_FAIL;
}
-/* Client rejects server response with downgraded cipher */
+/* Unauthenticated server: client floor-rejects a downgraded cipher */
static int test_oap_cli_rejects_downgrade(void)
{
struct oap_test_ctx ctx;
@@ -747,7 +829,7 @@ static int test_oap_cli_rejects_downgrade(void)
test_cfg.srv.cipher = NID_aes_256_gcm;
test_cfg.srv.kdf = NID_sha256;
test_cfg.srv.md = NID_sha256;
- test_cfg.srv.auth = AUTH;
+ test_cfg.srv.auth = NO_AUTH;
test_cfg.cli.kex = NID_X25519;
test_cfg.cli.cipher = NID_aes_256_gcm;
@@ -769,7 +851,7 @@ static int test_oap_cli_rejects_downgrade(void)
}
/* Tamper: replace cipher NID with weaker one */
- weak = hton16(NID_aes_128_ctr);
+ weak = hton16(NID_aes_128_gcm);
memcpy(ctx.resp_hdr.data + OAP_CIPHER_NID_OFFSET,
&weak, sizeof(weak));
@@ -782,6 +864,69 @@ static int test_oap_cli_rejects_downgrade(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Suite binding: a cipher swapped to a higher rank clears the client floor
+ * check, but the bound key commits to the negotiated suite, so the swap must
+ * still fail key confirmation.
+ */
+static int test_oap_cli_rejects_suite_swap(void)
+{
+ struct oap_test_ctx ctx;
+ uint16_t swap;
+
+ TEST_START();
+
+ memset(&test_cfg, 0, sizeof(test_cfg));
+
+ /* Both AES-128-GCM: a swap to AES-256 outranks the client floor */
+ test_cfg.srv.kex = NID_X25519;
+ test_cfg.srv.cipher = NID_aes_128_gcm;
+ test_cfg.srv.kdf = NID_sha256;
+ test_cfg.srv.md = NID_sha256;
+ test_cfg.srv.auth = NO_AUTH;
+ test_cfg.cli.kex = NID_X25519;
+ test_cfg.cli.cipher = NID_aes_128_gcm;
+ test_cfg.cli.kdf = NID_sha256;
+ test_cfg.cli.md = NID_sha256;
+ test_cfg.cli.auth = NO_AUTH;
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ /* Swap the response cipher to a higher-ranked one */
+ swap = hton16(NID_aes_256_gcm);
+ memcpy(ctx.resp_hdr.data + OAP_CIPHER_NID_OFFSET,
+ &swap, sizeof(swap));
+
+ if (oap_cli_complete_ctx(&ctx) == 0) {
+ printf("Client accepted a swapped cipher suite.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -831,6 +976,7 @@ static int test_oap_srv_rejects_weak_kex(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -890,6 +1036,7 @@ static int test_oap_roundtrip_md(int md)
oap_test_teardown(&ctx);
TEST_SUCCESS("(%s)", md_str ? md_str : "default");
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -955,6 +1102,7 @@ static int test_oap_outdated_packet(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -1003,6 +1151,7 @@ static int test_oap_future_packet(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -1053,15 +1202,16 @@ static int test_oap_replay_packet(void)
freebuf(ctx.req_hdr);
ctx.req_hdr = saved_req;
- /* Replayed request should fail */
- if (oap_srv_process_ctx(&ctx) == 0) {
- printf("Server should reject replayed packet.\n");
+ /* Replay must return -EREPLAY so callers can drop silently. */
+ if (oap_srv_process_ctx(&ctx) != -EREPLAY) {
+ printf("Replayed packet rejection != -EREPLAY.\n");
goto fail_cleanup;
}
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -1071,6 +1221,150 @@ static int test_oap_replay_packet(void)
return TEST_RC_FAIL;
}
+/*
+ * Encode a distinct OAP session ID from an index: the buffer is zeroed over
+ * OAP_ID_SIZE bytes, then the raw bytes of idx (host byte order) are copied
+ * to its start. Assumes OAP_ID_SIZE >= sizeof(size_t); not checked here.
+ */
+static void make_id(uint8_t * id,
+ size_t idx)
+{
+ memset(id, 0, OAP_ID_SIZE);
+ memcpy(id, &idx, sizeof(idx));
+}
+
+/*
+ * Replay cache fails closed at capacity: a flood is rejected and no genuine
+ * entry is evicted (so it cannot be replayed).
+ *
+ * Calls oap_check_hdr() directly with one fixed timestamp: OAP_REPLAY_MAX
+ * distinct IDs must be accepted, one further new ID must give -EAUTH, and
+ * the very first ID must then give -EREPLAY.
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_replay_cap(void)
+{
+ struct oap_hdr h;
+ struct timespec now;
+ uint8_t id[OAP_ID_SIZE];
+ uint64_t stamp;
+ size_t i;
+
+ TEST_START();
+
+ if (oap_auth_init() < 0) {
+ printf("Failed to init OAP.\n");
+ goto fail;
+ }
+
+ clock_gettime(CLOCK_REALTIME, &now);
+ stamp = TS_TO_UINT64(now);
+
+ memset(&h, 0, sizeof(h));
+ h.id.data = id; /* header aliases the local buffer, so make_id() updates it */
+ h.id.len = OAP_ID_SIZE;
+ h.timestamp = stamp; /* same timestamp for every header in this test */
+
+ /* Fill one generation bucket to capacity with distinct IDs */
+ for (i = 0; i < OAP_REPLAY_MAX; i++) {
+ make_id(id, i);
+ if (oap_check_hdr(&h) != 0) {
+ printf("Distinct header %zu rejected.\n", i);
+ goto fail_fini;
+ }
+ }
+
+ /* One past capacity fails closed (rejected, not evict-oldest) */
+ make_id(id, OAP_REPLAY_MAX);
+ if (oap_check_hdr(&h) != -EAUTH) {
+ printf("Header past capacity not fail-closed.\n");
+ goto fail_fini;
+ }
+
+ /* No genuine entry was evicted: the oldest still reads as a replay */
+ make_id(id, 0);
+ if (oap_check_hdr(&h) != -EREPLAY) {
+ printf("Genuine entry evicted under flood.\n");
+ goto fail_fini;
+ }
+
+ oap_auth_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_fini:
+ oap_auth_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Distinct timestamp generations use separate buckets and are detected
+ * independently (covers the multi-generation / rotation path).
+ *
+ * The same ID is submitted to oap_check_hdr() under two timestamps: each
+ * must be accepted once and then rejected with -EREPLAY on resubmission.
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_replay_generations(void)
+{
+ struct oap_hdr h;
+ struct timespec now;
+ uint8_t id[OAP_ID_SIZE];
+ uint64_t cur;
+ uint64_t gen_ns;
+ uint64_t stamp_a;
+ uint64_t stamp_b;
+
+ TEST_START();
+
+ if (oap_auth_init() < 0) {
+ printf("Failed to init OAP.\n");
+ goto fail;
+ }
+
+ clock_gettime(CLOCK_REALTIME, &now);
+ cur = TS_TO_UINT64(now);
+ gen_ns = (uint64_t) OAP_REPLAY_TIMER * BILLION;
+
+ /*
+ * stamp_a in the current generation, stamp_b one generation older:
+ * cur rounded down to a multiple of gen_ns, minus one.
+ * NOTE(review): assumes generations are aligned to multiples of
+ * gen_ns and that stamp_b is still inside the accepted time window
+ * - confirm against oap_check_hdr().
+ */
+ stamp_a = cur;
+ stamp_b = (cur / gen_ns) * gen_ns - 1;
+
+ memset(&h, 0, sizeof(h));
+ h.id.data = id;
+ h.id.len = OAP_ID_SIZE;
+ make_id(id, 1); /* one ID reused for both timestamps */
+
+ /* First sighting in each generation is accepted */
+ h.timestamp = stamp_a;
+ if (oap_check_hdr(&h) != 0) {
+ printf("Gen-A header rejected.\n");
+ goto fail_fini;
+ }
+
+ h.timestamp = stamp_b;
+ if (oap_check_hdr(&h) != 0) {
+ printf("Gen-B header rejected.\n");
+ goto fail_fini;
+ }
+
+ /* Each generation independently detects its own replay */
+ h.timestamp = stamp_a;
+ if (oap_check_hdr(&h) != -EREPLAY) {
+ printf("Gen-A replay not detected.\n");
+ goto fail_fini;
+ }
+
+ h.timestamp = stamp_b;
+ if (oap_check_hdr(&h) != -EREPLAY) {
+ printf("Gen-B replay not detected.\n");
+ goto fail_fini;
+ }
+
+ oap_auth_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_fini:
+ oap_auth_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
/* Server rejects client certificate when root CA is missing from store */
static int test_oap_missing_root_ca(void)
{
@@ -1125,6 +1419,7 @@ static int test_oap_missing_root_ca(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_teardown:
@@ -1173,6 +1468,355 @@ static int test_oap_server_name_mismatch(void)
oap_test_teardown(&ctx);
TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Client requiring auth rejects a response without certificate.
+ *
+ * The server side is set to NO_AUTH and the client sets req_auth; prepare
+ * and server processing must succeed, and oap_cli_complete_ctx() must
+ * return non-zero. Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_cli_requires_srv_auth(void)
+{
+ struct oap_test_ctx ctx;
+
+ test_default_cfg();
+ test_cfg.srv.auth = NO_AUTH;
+ test_cfg.cli.req_auth = true;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) == 0) { /* success here is the failure case */
+ printf("Client should reject unauthenticated server.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Server requiring auth rejects a request without certificate.
+ *
+ * Only srv.req_auth is changed from the defaults; the test stops after
+ * oap_srv_process_ctx(), which must return non-zero.
+ * NOTE(review): presumes test_default_cfg() leaves client auth off
+ * - confirm. Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_srv_requires_cli_auth(void)
+{
+ struct oap_test_ctx ctx;
+
+ test_default_cfg();
+ test_cfg.srv.req_auth = true;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) == 0) { /* success here is the failure case */
+ printf("Server should reject unauthenticated client.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Roundtrip succeeds when both sides require and provide auth.
+ *
+ * Sets req_auth on both sides and AUTH on the client, runs the three
+ * handshake steps, then compares SYMMKEYSZ bytes of both derived keys.
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_mutual_req_auth(void)
+{
+ struct oap_test_ctx ctx;
+
+ test_default_cfg();
+ test_cfg.srv.req_auth = true;
+ test_cfg.cli.auth = AUTH;
+ test_cfg.cli.req_auth = true;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) < 0) {
+ printf("Client complete failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) {
+ printf("Client and server keys do not match!\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Client rejects a server signature with a different digest.
+ *
+ * Only srv.md is changed (to NID_sha384); prepare and server processing
+ * must succeed and oap_cli_complete_ctx() must return non-zero.
+ * NOTE(review): presumes the default client digest differs from SHA-384
+ * - confirm in test_default_cfg(). Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_cli_rejects_md_mismatch(void)
+{
+ struct oap_test_ctx ctx;
+
+ test_default_cfg();
+ test_cfg.srv.md = NID_sha384;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_cli_complete_ctx(&ctx) == 0) { /* success here is the failure case */
+ printf("Client should reject digest mismatch.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Server rejects a client signature with a different digest.
+ *
+ * The client is set to AUTH with md NID_sha384; the test stops after
+ * oap_srv_process_ctx(), which must return non-zero.
+ * NOTE(review): presumes the default server digest differs from SHA-384
+ * - confirm in test_default_cfg(). Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_srv_rejects_md_mismatch(void)
+{
+ struct oap_test_ctx ctx;
+
+ test_default_cfg();
+ test_cfg.cli.auth = AUTH;
+ test_cfg.cli.md = NID_sha384;
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) == 0) { /* success here is the failure case */
+ printf("Server should reject digest mismatch.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Naive substring search over raw bytes (memmem is not portable here).
+ *
+ * Returns true when the nlen bytes at needle occur contiguously somewhere
+ * in the hlen bytes at hay. An empty needle, or a needle longer than the
+ * haystack, yields false.
+ */
+static bool buf_contains(const uint8_t * hay,
+ size_t hlen,
+ const uint8_t * needle,
+ size_t nlen)
+{
+ size_t i;
+
+ if (nlen == 0 || nlen > hlen)
+ return false;
+
+ for (i = 0; i + nlen <= hlen; i++) { /* bound keeps memcmp inside hay */
+ if (memcmp(hay + i, needle, nlen) == 0)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * The server certificate must not appear in cleartext on the wire.
+ *
+ * After the server has produced its response, signed_server_crt_ec is
+ * loaded and DER-encoded, and ctx.resp_hdr is searched for those bytes.
+ * The handshake is then completed and both keys are compared.
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_oap_server_cert_hidden(void)
+{
+ struct oap_test_ctx ctx;
+ void * crt = NULL;
+ buffer_t der = BUF_INIT;
+
+ test_default_cfg();
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (crypt_load_crt_str(signed_server_crt_ec, &crt) < 0) {
+ printf("Failed to load server crt.\n");
+ goto fail_cleanup;
+ }
+
+ if (crypt_crt_der(crt, &der) < 0) {
+ printf("Failed to DER-encode server crt.\n");
+ goto fail_crt;
+ }
+
+ /* A response shorter than the cert is treated as a test failure. */
+ if (der.len == 0 || der.len > ctx.resp_hdr.len) {
+ printf("Unexpected cert/response sizes.\n");
+ goto fail_der;
+ }
+
+ if (buf_contains(ctx.resp_hdr.data, ctx.resp_hdr.len,
+ der.data, der.len)) {
+ printf("Server certificate found in cleartext.\n");
+ goto fail_der;
+ }
+
+ /* The handshake must still complete and agree on a key */
+ if (oap_cli_complete_ctx(&ctx) < 0) {
+ printf("Client complete failed.\n");
+ goto fail_der;
+ }
+
+ if (memcmp(ctx.cli.key, ctx.srv.key, SYMMKEYSZ) != 0) {
+ printf("Client and server keys do not match!\n");
+ goto fail_der;
+ }
+
+ freebuf(der);
+ crypt_free_crt(crt);
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+
+ fail_der: /* labels unwind in reverse order of acquisition */
+ freebuf(der);
+ fail_crt:
+ crypt_free_crt(crt);
+ fail_cleanup:
+ oap_test_teardown(&ctx);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Tampering the sealed identity block fails the handshake */
+static int test_oap_sealed_tamper(void)
+{
+ struct oap_test_ctx ctx;
+ size_t pos;
+
+ test_default_cfg();
+
+ TEST_START();
+
+ if (oap_test_setup(&ctx, root_ca_crt_ec, im_ca_crt_ec) < 0)
+ goto fail;
+
+ if (oap_cli_prepare_ctx(&ctx) < 0) {
+ printf("Client prepare failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (oap_srv_process_ctx(&ctx) < 0) {
+ printf("Server process failed.\n");
+ goto fail_cleanup;
+ }
+
+ if (ctx.resp_hdr.len < 64) {
+ printf("Response too short for test.\n");
+ goto fail_cleanup;
+ }
+
+ /* Flip a byte inside the sealed ciphertext, before the AEAD tag */
+ pos = ctx.resp_hdr.len - 32;
+ ctx.resp_hdr.data[pos] ^= 0xFF;
+
+ if (oap_cli_complete_ctx(&ctx) == 0) {
+ printf("Client accepted a tampered identity block.\n");
+ goto fail_cleanup;
+ }
+
+ oap_test_teardown(&ctx);
+
+ TEST_SUCCESS();
+
return TEST_RC_SUCCESS;
fail_cleanup:
@@ -1191,17 +1835,22 @@ int oap_test(int argc,
(void) argv;
ret |= test_oap_auth_init_fini();
+ ret |= test_oap_replay_cap();
+ ret |= test_oap_replay_generations();
#ifdef HAVE_OPENSSL
ret |= test_oap_roundtrip_auth_only();
ret |= test_oap_roundtrip_kex_only();
ret |= test_oap_piggyback_data();
+ ret |= test_oap_rekey();
+ ret |= test_oap_rekey_badcache();
ret |= test_oap_roundtrip_all();
ret |= test_oap_roundtrip_md_all();
ret |= test_oap_corrupted_request();
ret |= test_oap_corrupted_response();
+ ret |= test_oap_key_confirm_mismatch();
ret |= test_oap_truncated_request();
ret |= test_oap_inflated_length_field();
ret |= test_oap_deflated_length_field();
@@ -1212,6 +1861,7 @@ int oap_test(int argc,
ret |= test_oap_srv_enc_cli_none();
ret |= test_oap_cli_enc_srv_none();
ret |= test_oap_cli_rejects_downgrade();
+ ret |= test_oap_cli_rejects_suite_swap();
ret |= test_oap_srv_rejects_weak_kex();
ret |= test_oap_outdated_packet();
@@ -1219,6 +1869,17 @@ int oap_test(int argc,
ret |= test_oap_replay_packet();
ret |= test_oap_missing_root_ca();
ret |= test_oap_server_name_mismatch();
+
+ ret |= test_oap_cli_requires_srv_auth();
+ ret |= test_oap_srv_requires_cli_auth();
+ ret |= test_oap_mutual_req_auth();
+
+
+ ret |= test_oap_cli_rejects_md_mismatch();
+ ret |= test_oap_srv_rejects_md_mismatch();
+
+ ret |= test_oap_server_cert_hidden();
+ ret |= test_oap_sealed_tamper();
#else
(void) test_oap_roundtrip_auth_only;
(void) test_oap_roundtrip_kex_only;
@@ -1229,6 +1890,7 @@ int oap_test(int argc,
(void) test_oap_roundtrip_md_all;
(void) test_oap_corrupted_request;
(void) test_oap_corrupted_response;
+ (void) test_oap_key_confirm_mismatch;
(void) test_oap_truncated_request;
(void) test_oap_inflated_length_field;
(void) test_oap_deflated_length_field;
@@ -1238,12 +1900,23 @@ int oap_test(int argc,
(void) test_oap_srv_enc_cli_none;
(void) test_oap_cli_enc_srv_none;
(void) test_oap_cli_rejects_downgrade;
+ (void) test_oap_cli_rejects_suite_swap;
(void) test_oap_srv_rejects_weak_kex;
(void) test_oap_outdated_packet;
(void) test_oap_future_packet;
(void) test_oap_replay_packet;
+ (void) test_oap_replay_generations;
(void) test_oap_missing_root_ca;
(void) test_oap_server_name_mismatch;
+ (void) test_oap_cli_requires_srv_auth;
+ (void) test_oap_srv_requires_cli_auth;
+ (void) test_oap_mutual_req_auth;
+ (void) test_oap_cli_rejects_md_mismatch;
+ (void) test_oap_srv_rejects_md_mismatch;
+ (void) test_oap_server_cert_hidden;
+ (void) test_oap_sealed_tamper;
+ (void) test_oap_rekey;
+ (void) test_oap_rekey_badcache;
ret = TEST_RC_SKIP;
#endif
diff --git a/src/irmd/oap/tests/oap_test_ml_dsa.c b/src/irmd/oap/tests/oap_test_ml_dsa.c
index 81b307ab..8691aa00 100644
--- a/src/irmd/oap/tests/oap_test_ml_dsa.c
+++ b/src/irmd/oap/tests/oap_test_ml_dsa.c
@@ -179,6 +179,7 @@ int load_server_kem_pk(const char * name,
pk->data = malloc(test_kem_pk_len);
if (pk->data == NULL)
return -1;
+
memcpy(pk->data, test_kem_pk, test_kem_pk_len);
pk->len = test_kem_pk_len;
@@ -237,6 +238,39 @@ static int test_oap_roundtrip_auth_only(void)
return roundtrip_auth_only(root_ca_crt_ml, im_ca_crt_ml);
}
+/*
+ * Digest pin does not apply to PQC: the digest is intrinsic.
+ *
+ * Client side: test_cfg.cli.md is set to NID_sha256 and the auth-only
+ * roundtrip with root_ca_crt_ml / im_ca_crt_ml must still succeed.
+ */
+static int test_oap_cli_md_pin_exempts_pqc(void)
+{
+ test_cfg_init(NID_undef, NID_undef, NID_undef, 0, NO_CLI_AUTH);
+ test_cfg.cli.md = NID_sha256;
+
+ return roundtrip_auth_only(root_ca_crt_ml, im_ca_crt_ml);
+}
+
+static int test_oap_srv_md_pin_exempts_pqc(void)
+{ /* Server-side digest pin with CLI_AUTH: auth-only roundtrip must pass. */
+ test_cfg_init(NID_undef, NID_undef, NID_undef, 0, CLI_AUTH);
+ test_cfg.srv.md = NID_sha256; /* pin set on the server side only */
+
+ return roundtrip_auth_only(root_ca_crt_ml, im_ca_crt_ml);
+}
+
+static int test_oap_rekey(void)
+{ /* Runs roundtrip_rekey() with root_ca_crt_ml and im_ca_crt_ml. */
+ test_cfg_init(NID_X25519, NID_aes_256_gcm, NID_sha256,
+ 0, NO_CLI_AUTH); /* presumably kex, cipher, kdf, kem mode, client auth - confirm */
+
+ return roundtrip_rekey(root_ca_crt_ml, im_ca_crt_ml);
+}
+
+static int test_oap_rekey_badcache(void)
+{ /* Runs roundtrip_rekey_badcache() with root_ca_crt_ml and im_ca_crt_ml. */
+ test_cfg_init(NID_X25519, NID_aes_256_gcm, NID_sha256,
+ 0, NO_CLI_AUTH); /* presumably kex, cipher, kdf, kem mode, client auth - confirm */
+
+ return roundtrip_rekey_badcache(root_ca_crt_ml, im_ca_crt_ml);
+}
+
static int test_oap_corrupted_request(void)
{
test_cfg_init(NID_MLKEM768, NID_aes_256_gcm, get_random_kdf(),
@@ -422,6 +456,8 @@ int oap_test_ml_dsa(int argc,
#ifdef HAVE_OPENSSL_ML_KEM
ret |= test_oap_roundtrip_auth_only();
+ ret |= test_oap_cli_md_pin_exempts_pqc();
+ ret |= test_oap_srv_md_pin_exempts_pqc();
ret |= test_oap_roundtrip_kem_all();
@@ -430,8 +466,15 @@ int oap_test_ml_dsa(int argc,
ret |= test_oap_corrupted_request();
ret |= test_oap_corrupted_response();
ret |= test_oap_truncated_request();
+
+ ret |= test_oap_rekey();
+ ret |= test_oap_rekey_badcache();
#else
(void) test_oap_roundtrip_auth_only;
+ (void) test_oap_cli_md_pin_exempts_pqc;
+ (void) test_oap_srv_md_pin_exempts_pqc;
+ (void) test_oap_rekey;
+ (void) test_oap_rekey_badcache;
(void) test_oap_roundtrip_kem;
(void) test_oap_roundtrip_kem_all;
(void) test_oap_kem_srv_uncfg;
diff --git a/src/irmd/reg/flow.c b/src/irmd/reg/flow.c
index 93c3e128..8be2dfc7 100644
--- a/src/irmd/reg/flow.c
+++ b/src/irmd/reg/flow.c
@@ -24,6 +24,7 @@
#define OUROBOROS_PREFIX "reg/flow"
+#include <ouroboros/crypt.h>
#include <ouroboros/logs.h>
#include "flow.h"
@@ -32,6 +33,7 @@
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <string.h>
struct reg_flow * reg_flow_create(const struct flow_info * info)
{
@@ -42,6 +44,7 @@ struct reg_flow * reg_flow_create(const struct flow_info * info)
assert(info->n_pid != 0);
assert(info->n_1_pid == 0);
assert(info->mpl == 0);
+ assert(info->mtu == 0);
assert(info->state == FLOW_INIT);
flow = malloc(sizeof(*flow));
@@ -67,10 +70,12 @@ static void destroy_rbuffs(struct reg_flow * flow)
{
if (flow->n_rb != NULL)
ssm_rbuff_destroy(flow->n_rb);
+
flow->n_rb = NULL;
if (flow->n_1_rb != NULL)
ssm_rbuff_destroy(flow->n_1_rb);
+
flow->n_1_rb = NULL;
}
@@ -78,6 +83,11 @@ void reg_flow_destroy(struct reg_flow * flow)
{
assert(flow != NULL);
+ if (flow->rk.pending_seed != NULL)
+ crypt_secure_free(flow->rk.pending_seed, SYMMKEYSZ);
+
+ freebuf(flow->rk.peer_crt);
+
switch(flow->info.state) {
case FLOW_ACCEPT_PENDING:
clrbuf(flow->req_data);
@@ -160,6 +170,7 @@ int reg_flow_update(struct reg_flow * flow,
assert(info->mpl != 0);
flow->info.mpl = info->mpl;
+ flow->info.mtu = info->mtu;
if (flow->info.state == FLOW_ALLOC_PENDING)
break;
diff --git a/src/irmd/reg/flow.h b/src/irmd/reg/flow.h
index 9a4046d3..166bed61 100644
--- a/src/irmd/reg/flow.h
+++ b/src/irmd/reg/flow.h
@@ -49,6 +49,22 @@ struct reg_flow {
bool direct;
+ /* Tier-2 re-key state (encrypted flows only) */
+ struct {
+ bool encrypted; /* flow carries a cipher */
+ uint8_t epoch; /* last epoch installed by app */
+ bool initiator; /* OAP initiator (role 0) */
+ bool in_flight; /* a re-key is in progress */
+ bool req_queued; /* a peer REQ is in the inbox */
+ bool resp_queued; /* a peer RESP is in the inbox */
+ uint8_t * pending_seed; /* secure heap; NULL until set */
+ uint8_t pending_epoch;
+ bool pending_initiator; /* pending seed: oap_cli side */
+ bool has_pending; /* new seed awaits app pull */
+ uint8_t pulled; /* direct: per-app pull mask */
+ buffer_t peer_crt; /* peer cert DER, cached at HS */
+ } rk;
+
struct ssm_rbuff * n_rb;
struct ssm_rbuff * n_1_rb;
};
diff --git a/src/irmd/reg/reg.c b/src/irmd/reg/reg.c
index 0025f695..ebf3959d 100644
--- a/src/irmd/reg/reg.c
+++ b/src/irmd/reg/reg.c
@@ -25,6 +25,7 @@ The IPC Resource Manager - Registry
#define OUROBOROS_PREFIX "reg"
#include <ouroboros/bitmap.h>
+#include <ouroboros/crypt.h>
#include <ouroboros/errno.h>
#include <ouroboros/list.h>
#include <ouroboros/logs.h>
@@ -871,6 +872,7 @@ int reg_list_ipcps(ipcp_list_msg_t *** ipcps)
fail:
while (i-- > 0)
ipcp_list_msg__free_unpacked((*ipcps)[i], NULL);
+
free(*ipcps);
fail_malloc:
pthread_mutex_unlock(&reg.mtx);
@@ -1032,6 +1034,20 @@ int reg_get_name_for_flow_id(char * buf,
return f == NULL ? -ENOENT : 0;
}
+/*
+ * Overwrite the name stored on flow flow_id with name, under reg.mtx.
+ * Does nothing if __reg_get_flow() finds no such flow.
+ */
+void reg_set_name_for_flow_id(const char * name,
+ int flow_id)
+{
+ struct reg_flow * f;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ f = __reg_get_flow(flow_id);
+ /* NOTE(review): unbounded copy; assumes name fits f->name - confirm. */
+ if (f != NULL)
+ strcpy(f->name, name);
+
+ pthread_mutex_unlock(&reg.mtx);
+}
+
int reg_list_names(name_info_msg_t *** names)
{
struct list_head * p;
@@ -1076,6 +1092,7 @@ int reg_list_names(name_info_msg_t *** names)
fail:
while (i-- > 0)
name_info_msg__free_unpacked((*names)[i], NULL);
+
free(*names);
fail_malloc:
pthread_mutex_unlock(&reg.mtx);
@@ -1820,7 +1837,11 @@ int reg_respond_alloc(struct flow_info * info,
goto fail_flow;
}
- assert(flow->info.state == FLOW_ALLOC_PENDING);
+ if (flow->info.state != FLOW_ALLOC_PENDING) {
+ log_warn("Flow %d already responded.", info->id);
+ goto fail_flow;
+ }
+
assert(flow->rsp_data.len == 0);
assert(flow->rsp_data.data == NULL);
@@ -2098,6 +2119,511 @@ bool reg_flow_is_direct(int flow_id)
return ret;
}
+/*
+ * Enable re-key bookkeeping on a flow: marks it encrypted, records the
+ * initiator role and resets the epoch to 0. If peer_crt is non-empty, a
+ * private heap copy is installed on the flow after freebuf() on the
+ * previously cached buffer. A failed copy is only logged, and nothing
+ * happens if the flow is not found.
+ */
+void reg_flow_set_rekey(int flow_id,
+ bool initiator,
+ buffer_t peer_crt)
+{
+ struct reg_flow * flow;
+ uint8_t * crt = NULL;
+
+ /* Copy the cert outside the lock; publish it with rk.encrypted. */
+ if (peer_crt.len > 0) {
+ crt = malloc(peer_crt.len);
+ if (crt != NULL)
+ memcpy(crt, peer_crt.data, peer_crt.len);
+ else
+ log_warn("Failed to cache peer cert for re-key.");
+ }
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL) {
+ flow->rk.encrypted = true;
+ flow->rk.initiator = initiator;
+ flow->rk.epoch = 0;
+ if (crt != NULL) {
+ freebuf(flow->rk.peer_crt);
+ flow->rk.peer_crt.data = crt;
+ flow->rk.peer_crt.len = peer_crt.len;
+ crt = NULL; /* the flow now holds the copy */
+ }
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ free(crt); /* still non-NULL only when the flow was not found */
+}
+
+/*
+ * Copy the flow's cached peer certificate into a freshly malloc'ed buffer
+ * returned through *crt, which is passed through clrbuf() first.
+ * Returns 0 on success, -ENOMEM if the copy cannot be allocated, and
+ * -ENOENT if the flow is not found or has no cached certificate.
+ * The buffer is not freed here; releasing it is left to the caller.
+ */
+int reg_flow_get_peer_crt(int flow_id,
+ buffer_t * crt)
+{
+ struct reg_flow * flow;
+ int ret = -ENOENT;
+
+ assert(crt != NULL);
+
+ clrbuf(*crt);
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL && flow->rk.peer_crt.len > 0) {
+ crt->data = malloc(flow->rk.peer_crt.len);
+ if (crt->data == NULL) {
+ ret = -ENOMEM;
+ } else {
+ memcpy(crt->data, flow->rk.peer_crt.data,
+ flow->rk.peer_crt.len);
+ crt->len = flow->rk.peer_crt.len;
+ ret = 0;
+ }
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/*
+ * Return the flow's current re-key epoch (rk.epoch), or -1 if the flow is
+ * not found or is not marked encrypted.
+ */
+int reg_flow_get_epoch(int flow_id)
+{
+ struct reg_flow * flow;
+ int epoch = -1;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL && flow->rk.encrypted)
+ epoch = flow->rk.epoch;
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return epoch;
+}
+
+/*
+ * Report whether the flow has a parked seed awaiting pickup
+ * (rk.has_pending). Returns false if the flow is not found.
+ */
+bool reg_flow_rekey_pending(int flow_id)
+{
+ struct reg_flow * flow;
+ bool ret = false;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL)
+ ret = flow->rk.has_pending;
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/* Return the flow's info.n_1_pid, or -1 if the flow is not found. */
+pid_t reg_flow_get_n_1_pid(int flow_id)
+{
+ struct reg_flow * flow;
+ pid_t pid = -1;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL)
+ pid = flow->info.n_1_pid;
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return pid;
+}
+
+/*
+ * Walk reg.flows and fill snap with up to max flows that may start a
+ * re-key, returning the number of entries written. A flow is selected
+ * when it is FLOW_ALLOCATED, encrypted, either direct or with this side
+ * as initiator, and has neither a re-key in flight nor a seed pending.
+ * Each selected flow gets its in-flight latch set here, under reg.mtx.
+ * NOTE(review): assumes snap has room for max entries - confirm callers.
+ */
+int reg_flow_snapshot_rekey_due(struct rekey_info * snap,
+ int max)
+{
+ struct list_head * p;
+ int n = 0;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ llist_for_each(p, &reg.flows) {
+ struct reg_flow * f;
+
+ if (n == max)
+ break;
+
+ f = list_entry(p, struct reg_flow, next);
+
+ if (f->info.state != FLOW_ALLOCATED)
+ continue;
+
+ if (!f->rk.encrypted)
+ continue;
+
+ /* Direct flows have no IPCP initiator; either side drives. */
+ if (!f->direct && !f->rk.initiator)
+ continue;
+
+ if (f->rk.in_flight || f->rk.has_pending)
+ continue;
+
+ /* Latch now: later snapshots and reg_flow_rekey_begin() skip it. */
+ f->rk.in_flight = true;
+
+ snap[n].flow_id = f->info.id;
+ snap[n].n_pid = f->info.n_pid;
+ snap[n].n_1_pid = f->info.n_1_pid;
+ snap[n].epoch = f->rk.epoch;
+ snap[n].direct = f->direct;
+ strcpy(snap[n].name, f->name);
+ ++n;
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return n;
+}
+
+/*
+ * Release the flow's in-flight re-key latch (rk.in_flight = false).
+ * Does nothing if the flow is not found.
+ */
+void reg_flow_clear_in_flight(int flow_id)
+{
+ struct reg_flow * flow;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL)
+ flow->rk.in_flight = false;
+
+ pthread_mutex_unlock(&reg.mtx);
+}
+
+/*
+ * Test-and-set the in-flight latch; refuse if a re-key is already active.
+ * Returns true only when the latch was taken by this call. Returns false
+ * if the flow is not found, is not encrypted, already has a re-key in
+ * flight, or still has a seed pending.
+ */
+bool reg_flow_rekey_begin(int flow_id)
+{
+ struct reg_flow * flow;
+ bool ret = false;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL && flow->rk.encrypted) {
+ if (!flow->rk.in_flight && !flow->rk.has_pending) {
+ flow->rk.in_flight = true;
+ ret = true;
+ }
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/*
+ * Initiator yields the responder role while driving its own exchange.
+ * Returns true when the flow is flagged initiator and has a re-key in
+ * flight; false otherwise, including when the flow is not found.
+ */
+bool reg_flow_rekey_should_yield(int flow_id)
+{
+ struct reg_flow * flow;
+ bool ret = false;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL)
+ ret = flow->rk.initiator && flow->rk.in_flight;
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/*
+ * Park a re-key seed on the flow together with its epoch and initiator
+ * role, then set RB_REKEY on the flow's n_rb (when present). SYMMKEYSZ
+ * bytes are copied from seed into a buffer from crypt_secure_malloc(),
+ * which is allocated on first use and reused afterwards.
+ * Returns 0 on success, -ENOENT if the flow is not found, or -ENOMEM if
+ * the seed buffer cannot be allocated.
+ */
+int reg_flow_store_pending(int flow_id,
+ const uint8_t * seed,
+ uint8_t epoch,
+ bool initiator)
+{
+ struct reg_flow * flow;
+ int ret = -ENOENT;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL) {
+ /* Exchange done: release the latch regardless of parking. */
+ flow->rk.in_flight = false;
+
+ if (flow->rk.pending_seed == NULL)
+ flow->rk.pending_seed = crypt_secure_malloc(SYMMKEYSZ);
+
+ if (flow->rk.pending_seed != NULL) {
+ memcpy(flow->rk.pending_seed, seed, SYMMKEYSZ);
+ flow->rk.pending_epoch = epoch;
+ flow->rk.pending_initiator = initiator;
+ flow->rk.has_pending = true;
+ /* Doorbell raised only after the seed is parked. */
+ if (flow->n_rb != NULL)
+ ssm_rbuff_set_bits(flow->n_rb, RB_REKEY);
+ ret = 0;
+ } else {
+ ret = -ENOMEM;
+ }
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/* Direct re-key: which of the two local apps has pulled the seed. */
+#define RK_N_PID 0x1 /* acceptor (n_pid) pulled the seed */
+#define RK_N_1_PID 0x2 /* allocator (n_1_pid) pulled the seed */
+#define RK_PID_MASK (RK_N_PID | RK_N_1_PID)
+
+/*
+ * Park a single re-key seed for a direct flow and ring BOTH apps'
+ * doorbells. The seed is the one shared secret; each app pulls it once
+ * (reg_flow_take_pending), so it is held until both have taken it.
+ * Returns 0 on success, -ENOENT if the flow is not found, or -ENOMEM if
+ * the seed buffer cannot be allocated.
+ */
+int reg_flow_store_pending_direct(int flow_id,
+ const uint8_t * seed,
+ uint8_t epoch)
+{
+ struct reg_flow * flow;
+ int ret = -ENOENT;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow == NULL)
+ goto out;
+
+ /* Exchange done: release the latch regardless of parking. */
+ flow->rk.in_flight = false;
+
+ if (flow->rk.pending_seed == NULL)
+ flow->rk.pending_seed = crypt_secure_malloc(SYMMKEYSZ);
+
+ if (flow->rk.pending_seed == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(flow->rk.pending_seed, seed, SYMMKEYSZ);
+ flow->rk.pending_epoch = epoch;
+ flow->rk.has_pending = true;
+ flow->rk.pulled = 0; /* fresh seed: neither side has pulled yet */
+
+ /* A departed peer never pulls; treat its side as already done. */
+ if (flow->info.n_pid <= 0)
+ flow->rk.pulled |= RK_N_PID;
+
+ if (flow->info.n_1_pid <= 0)
+ flow->rk.pulled |= RK_N_1_PID;
+
+ /* Ring only the sides that still have to pull. */
+ if (flow->n_rb != NULL && !(flow->rk.pulled & RK_N_PID))
+ ssm_rbuff_set_bits(flow->n_rb, RB_REKEY);
+
+ if (flow->n_1_rb != NULL && !(flow->rk.pulled & RK_N_1_PID))
+ ssm_rbuff_set_bits(flow->n_1_rb, RB_REKEY);
+
+ ret = 0;
+ out:
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/*
+ * A caller may act on a flow if it is privileged or owns the flow.
+ * Returns true when is_ouroboros_member_uid(caller) holds or when caller
+ * equals owner.
+ */
+static bool uid_may_access(uid_t caller,
+ uid_t owner)
+{
+ return is_ouroboros_member_uid(caller) || caller == owner;
+}
+
+/*
+ * Caller holds reg.mtx. The direct seed is shared by both apps, so the
+ * per-app initiator role is resolved from the verified caller pid (the
+ * allocator is n_1_pid), and the seed is held until both have pulled.
+ * Writes SYMMKEYSZ bytes to seed, the pending epoch to *epoch and the
+ * resolved role to *initiator.
+ */
+static void __take_pending_direct(struct reg_flow * flow,
+ pid_t cpid,
+ uint8_t * seed,
+ uint8_t * epoch,
+ bool * initiator)
+{
+ bool allocator;
+
+ /* Any caller that is not n_1_pid is handled as the acceptor side. */
+ allocator = cpid == flow->info.n_1_pid;
+
+ memcpy(seed, flow->rk.pending_seed, SYMMKEYSZ);
+ *epoch = flow->rk.pending_epoch;
+ *initiator = allocator;
+ flow->rk.epoch = flow->rk.pending_epoch; /* set on every pull */
+
+ if (allocator) {
+ flow->rk.pulled |= RK_N_1_PID;
+ if (flow->n_1_rb != NULL)
+ ssm_rbuff_clr_bits(flow->n_1_rb, RB_REKEY);
+ } else {
+ flow->rk.pulled |= RK_N_PID;
+ if (flow->n_rb != NULL)
+ ssm_rbuff_clr_bits(flow->n_rb, RB_REKEY);
+ }
+
+ /* One side has yet to pull: keep the seed parked. */
+ if ((flow->rk.pulled & RK_PID_MASK) != RK_PID_MASK)
+ return;
+
+ /* Both sides pulled: drop the pending state and clear the seed. */
+ flow->rk.has_pending = false;
+ flow->rk.pulled = 0;
+ crypt_secure_clear(flow->rk.pending_seed, SYMMKEYSZ);
+}
+
+/*
+ * Hand a parked re-key seed to the caller: copies SYMMKEYSZ bytes into
+ * seed and reports the pending epoch and initiator role.
+ * Returns 0 on success, -ENOENT if the flow is not found or has nothing
+ * pending, or -EPERM if uid fails uid_may_access() against the flow's uid.
+ * Direct flows are delegated to __take_pending_direct(), the only path
+ * that uses cpid. For other flows the flow epoch is advanced, the parked
+ * seed is cleared and RB_REKEY is cleared on n_rb.
+ */
+int reg_flow_take_pending(int flow_id,
+ uid_t uid,
+ pid_t cpid,
+ uint8_t * seed,
+ uint8_t * epoch,
+ bool * initiator)
+{
+ struct reg_flow * flow;
+ int ret = -ENOENT;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow == NULL || !flow->rk.has_pending)
+ goto out;
+
+ if (!uid_may_access(uid, flow->info.uid)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (flow->direct) {
+ __take_pending_direct(flow, cpid, seed, epoch, initiator);
+ ret = 0;
+ goto out;
+ }
+
+ memcpy(seed, flow->rk.pending_seed, SYMMKEYSZ);
+ *epoch = flow->rk.pending_epoch;
+ *initiator = flow->rk.pending_initiator;
+ flow->rk.epoch = flow->rk.pending_epoch;
+ flow->rk.has_pending = false;
+ crypt_secure_clear(flow->rk.pending_seed, SYMMKEYSZ);
+ if (flow->n_rb != NULL)
+ ssm_rbuff_clr_bits(flow->n_rb, RB_REKEY);
+
+ ret = 0;
+ out:
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/*
+ * Admit a peer-driven re-key arrival before a worker event is allocated:
+ * the flow must exist, carry a cipher, and the update must come from its
+ * own lower IPCP. Coalesces to one queued REQ and one queued RESP per flow
+ * so a flooding peer cannot grow the inbox without bound.
+ * Returns true when the arrival was admitted and its queued marker set;
+ * the marker is cleared again by reg_flow_rekey_arr_done().
+ */
+bool reg_flow_rekey_arr_admit(int flow_id,
+ pid_t n_1_pid,
+ bool is_req)
+{
+ struct reg_flow * flow;
+ bool admit = false;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL && flow->rk.encrypted
+ && flow->info.n_1_pid == n_1_pid) {
+ if (is_req && !flow->rk.req_queued) {
+ flow->rk.req_queued = true;
+ admit = true;
+ /* A RESP is admitted only while a re-key is in flight here. */
+ } else if (!is_req && flow->rk.in_flight
+ && !flow->rk.resp_queued) {
+ flow->rk.resp_queued = true;
+ admit = true;
+ }
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return admit;
+}
+
+/*
+ * Clear the flow's queued-REQ marker (is_req) or queued-RESP marker
+ * (!is_req), the flags set by reg_flow_rekey_arr_admit(). Does nothing
+ * if the flow is not found.
+ */
+void reg_flow_rekey_arr_done(int flow_id,
+ bool is_req)
+{
+ struct reg_flow * flow;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL) {
+ if (is_req)
+ flow->rk.req_queued = false;
+ else
+ flow->rk.resp_queued = false;
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+}
+
+/*
+ * Report whether uid may access the flow, as decided by uid_may_access()
+ * against the flow's info.uid; this also passes for privileged callers,
+ * not only the owner. Returns false if the flow is not found.
+ */
+bool reg_flow_owned_by(int flow_id,
+ uid_t uid)
+{
+ struct reg_flow * flow;
+ bool ret = false;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL)
+ ret = uid_may_access(uid, flow->info.uid);
+
+ pthread_mutex_unlock(&reg.mtx);
+
+ return ret;
+}
+
+/*
+ * Caller holds reg.mtx. Passes event for flow_id to ssm_flow_set_notify()
+ * on the flow set of process pid; does nothing if __reg_get_proc() finds
+ * no such process.
+ */
+static void __notify_proc(pid_t pid,
+ int flow_id,
+ int event)
+{
+ struct reg_proc * proc;
+
+ proc = __reg_get_proc(pid);
+ if (proc != NULL)
+ ssm_flow_set_notify(proc->set, flow_id, event);
+}
+
+/*
+ * Notify the flow's n_pid process of event, under reg.mtx. Does nothing
+ * if the flow is not found.
+ */
+void reg_notify_flow(int flow_id,
+ int event)
+{
+ struct reg_flow * flow;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL)
+ __notify_proc(flow->info.n_pid, flow_id, event);
+
+ pthread_mutex_unlock(&reg.mtx);
+}
+
+/*
+ * Wake both endpoints of a direct flow (acceptor and allocator): event is
+ * delivered to n_pid and then to n_1_pid, under reg.mtx. Does nothing if
+ * the flow is not found.
+ */
+void reg_notify_flow_peers(int flow_id,
+ int event)
+{
+ struct reg_flow * flow;
+
+ pthread_mutex_lock(&reg.mtx);
+
+ flow = __reg_get_flow(flow_id);
+ if (flow != NULL) {
+ __notify_proc(flow->info.n_pid, flow_id, event);
+ __notify_proc(flow->info.n_1_pid, flow_id, event);
+ }
+
+ pthread_mutex_unlock(&reg.mtx);
+}
+
int reg_respond_flow_direct(int flow_id,
buffer_t * pbuf)
{
diff --git a/src/irmd/reg/reg.h b/src/irmd/reg/reg.h
index 6b576471..8a313d46 100644
--- a/src/irmd/reg/reg.h
+++ b/src/irmd/reg/reg.h
@@ -109,6 +109,9 @@ int reg_get_name_for_hash(char * buf,
int reg_get_name_for_flow_id(char * buf,
int flow_id);
+void reg_set_name_for_flow_id(const char * name,
+ int flow_id);
+
/* TODO don't rely on protobuf here */
int reg_list_names(name_info_msg_t *** names);
@@ -163,6 +166,70 @@ int reg_wait_flow_direct(int flow_id,
bool reg_flow_is_direct(int flow_id);
+/*
+ * Per-flow snapshot for the re-key timer.
+ * Filled by reg_flow_snapshot_rekey_due() from the registry's flow entry.
+ */
+struct rekey_info {
+ int flow_id; /* copy of flow info.id */
+ pid_t n_pid; /* copy of flow info.n_pid */
+ pid_t n_1_pid; /* copy of flow info.n_1_pid */
+ char name[NAME_SIZE + 1]; /* copy of the flow's name */
+ uint8_t epoch; /* rk.epoch at snapshot time */
+ bool direct; /* copy of the flow's direct flag */
+};
+
+void reg_flow_set_rekey(int flow_id,
+ bool initiator,
+ buffer_t peer_crt);
+
+int reg_flow_get_peer_crt(int flow_id,
+ buffer_t * crt);
+
+int reg_flow_get_epoch(int flow_id);
+
+bool reg_flow_rekey_pending(int flow_id);
+
+pid_t reg_flow_get_n_1_pid(int flow_id);
+
+int reg_flow_snapshot_rekey_due(struct rekey_info * snap,
+ int max);
+
+void reg_flow_clear_in_flight(int flow_id);
+
+bool reg_flow_rekey_begin(int flow_id);
+
+bool reg_flow_rekey_should_yield(int flow_id);
+
+int reg_flow_store_pending(int flow_id,
+ const uint8_t * seed,
+ uint8_t epoch,
+ bool initiator);
+
+int reg_flow_store_pending_direct(int flow_id,
+ const uint8_t * seed,
+ uint8_t epoch);
+
+int reg_flow_take_pending(int flow_id,
+ uid_t uid,
+ pid_t cpid,
+ uint8_t * seed,
+ uint8_t * epoch,
+ bool * initiator);
+
+bool reg_flow_rekey_arr_admit(int flow_id,
+ pid_t n_1_pid,
+ bool is_req);
+
+void reg_flow_rekey_arr_done(int flow_id,
+ bool is_req);
+
+bool reg_flow_owned_by(int flow_id,
+ uid_t uid);
+
+void reg_notify_flow(int flow_id,
+ int event);
+
+void reg_notify_flow_peers(int flow_id,
+ int event);
+
void reg_dealloc_flow(struct flow_info * info);
void reg_dealloc_flow_resp(struct flow_info * info);
diff --git a/src/irmd/reg/tests/flow_test.c b/src/irmd/reg/tests/flow_test.c
index 7e1c1360..18214078 100644
--- a/src/irmd/reg/tests/flow_test.c
+++ b/src/irmd/reg/tests/flow_test.c
@@ -122,6 +122,21 @@ static int test_reg_flow_create_has_mpl(void) {
return TEST_RC_SUCCESS;
}
+/*
+ * Negative test run through test_assert_fail(): reg_flow_create() is
+ * expected to assert on this info, so the return below is not reached.
+ * NOTE(review): presumably the non-zero .mtu triggers the assert - confirm
+ * against reg_flow_create().
+ */
+static int test_reg_flow_create_has_mtu(void) {
+ struct flow_info info = {
+ .id = 1,
+ .n_pid = 1,
+ .n_1_pid = 0,
+ .mtu = 1400,
+ .qs = qos_raw,
+ .state = FLOW_ALLOC_PENDING
+ };
+
+ reg_flow_create(&info); /* assert fail */
+
+ return TEST_RC_SUCCESS;
+}
+
static int test_reg_flow_update(void)
{
struct reg_flow * f;
@@ -136,7 +151,7 @@ static int test_reg_flow_update(void)
struct flow_info upd = {
.id = 1,
.n_pid = 1,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_DEALLOCATED
};
@@ -179,7 +194,7 @@ static int test_reg_flow_update_wrong_id(void)
struct flow_info upd = {
.id = 2,
.n_pid = 1,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_DEALLOCATED
};
@@ -210,6 +225,7 @@ static int test_reg_flow_assert_fails(void)
ret |= test_assert_fail(test_reg_flow_create_has_n_1_pid);
ret |= test_assert_fail(test_reg_flow_create_wrong_state);
ret |= test_assert_fail(test_reg_flow_create_has_mpl);
+ ret |= test_assert_fail(test_reg_flow_create_has_mtu);
ret |= test_assert_fail(test_reg_flow_update_wrong_id);
return ret;
diff --git a/src/irmd/reg/tests/reg_test.c b/src/irmd/reg/tests/reg_test.c
index f4b0188b..a8c1b1fa 100644
--- a/src/irmd/reg/tests/reg_test.c
+++ b/src/irmd/reg/tests/reg_test.c
@@ -31,6 +31,7 @@
#define TEST_N_1_PID 3999
#define TEST_FAKE_ID 9128349
#define TEST_MPL 5
+#define TEST_MTU 1400
#define TEST_PROG "reg_test" /* own binary for binary check */
#define TEST_IPCP "testipcp"
#define TEST_NAME "testname"
@@ -239,7 +240,7 @@ static int test_reg_accept_flow_success(void)
struct flow_info n_1_info = {
.n_1_pid = TEST_N_1_PID,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_ALLOCATED /* RESPONSE SUCCESS */
};
@@ -266,6 +267,7 @@ static int test_reg_accept_flow_success(void)
n_1_info.id = info.id;
n_1_info.mpl = 1;
+ n_1_info.mtu = TEST_MTU;
pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info);
@@ -284,6 +286,11 @@ static int test_reg_accept_flow_success(void)
goto fail;
}
+ if (info.mtu != TEST_MTU) {
+ printf("MTU not propagated.\n");
+ goto fail;
+ }
+
if (rbuf.data == NULL) {
printf("rbuf data not returned.\n");
goto fail;
@@ -336,7 +343,7 @@ static int test_reg_accept_flow_success_no_crypt(void)
struct flow_info n_1_info = {
.n_1_pid = TEST_N_1_PID,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_ALLOCATED /* RESPONSE SUCCESS */
};
@@ -363,6 +370,7 @@ static int test_reg_accept_flow_success_no_crypt(void)
n_1_info.id = info.id;
n_1_info.mpl = 1;
+ n_1_info.mtu = TEST_MTU;
pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info);
@@ -381,6 +389,11 @@ static int test_reg_accept_flow_success_no_crypt(void)
goto fail;
}
+ if (info.mtu != TEST_MTU) {
+ printf("MTU not propagated.\n");
+ goto fail;
+ }
+
if (rbuf.data == NULL) {
printf("rbuf data was not returned.\n");
goto fail;
@@ -431,7 +444,7 @@ static int test_reg_allocate_flow_fail(void)
struct flow_info n_1_info = {
.n_1_pid = TEST_N_1_PID,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_DEALLOCATED /* RESPONSE FAIL */
};
@@ -489,6 +502,93 @@ static int test_reg_allocate_flow_fail(void)
return TEST_RC_FAIL;
}
+/*
+ * Allocate a flow through the normal respond path, check that the MTU was
+ * propagated, then send a second reg_respond_alloc() for the same flow and
+ * require it to return -1 rather than trip an assert.
+ */
+static int test_reg_respond_alloc_duplicate(void)
+{
+ pthread_t thr;
+ struct timespec abstime;
+ struct timespec timeo = TIMESPEC_INIT_S(1);
+ buffer_t rbuf = BUF_INIT;
+ buffer_t empty = BUF_INIT;
+ struct flow_info dup_info;
+
+ struct flow_info info = {
+ .n_pid = TEST_PID,
+ .qs = qos_raw
+ };
+
+ struct flow_info n_1_info = {
+ .n_1_pid = TEST_N_1_PID,
+ .qs = qos_msg,
+ .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */
+ };
+
+ TEST_START();
+
+ clock_gettime(PTHREAD_COND_CLOCK, &abstime);
+ ts_add(&abstime, &timeo, &abstime);
+
+ if (reg_init() < 0) {
+ printf("Failed to init registry.\n");
+ goto fail;
+ }
+
+ if (reg_create_flow(&info) < 0) {
+ printf("Failed to add flow.\n");
+ goto fail;
+ }
+
+ info.n_1_pid = TEST_N_1_PID;
+
+ if (reg_prepare_flow_alloc(&info) < 0) {
+ printf("Failed to prepare flow for alloc.\n");
+ goto fail;
+ }
+
+ n_1_info.id = info.id;
+ n_1_info.mpl = 1;
+ n_1_info.mtu = TEST_MTU;
+
+ /* First (legitimate) response is delivered from a helper thread. */
+ pthread_create(&thr, NULL, test_flow_respond_alloc, &n_1_info);
+
+ if (reg_wait_flow_allocated(&info, &rbuf, &abstime) < 0) {
+ printf("Flow allocation failed.\n");
+ pthread_join(thr, NULL);
+ reg_destroy_flow(info.id);
+ reg_fini();
+ goto fail;
+ }
+
+ pthread_join(thr, NULL);
+ freebuf(rbuf);
+
+ if (info.mtu != TEST_MTU) {
+ printf("MTU not propagated.\n");
+ goto fail;
+ }
+
+ /* Duplicate reply on an already-ALLOCATED flow must not assert. */
+ dup_info = n_1_info;
+ dup_info.state = FLOW_DEALLOCATED;
+
+ if (reg_respond_alloc(&dup_info, &empty, -EREPLAY) != -1) {
+ printf("Duplicate respond_alloc should return -1.\n");
+ goto fail;
+ }
+
+ reg_dealloc_flow(&info);
+ reg_dealloc_flow_resp(&info);
+ reg_destroy_flow(n_1_info.id);
+
+ reg_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail:
+ REG_TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
struct direct_alloc_info {
struct flow_info info;
buffer_t rsp;
@@ -564,7 +664,7 @@ static int test_reg_direct_flow_success(void)
dai.info.id = info.id;
dai.info.n_1_pid = TEST_N_1_PID;
dai.info.mpl = TEST_MPL;
- dai.info.qs = qos_data;
+ dai.info.qs = qos_msg;
dai.info.state = FLOW_ALLOCATED;
dai.rsp.len = 0;
dai.rsp.data = NULL;
@@ -671,6 +771,167 @@ static int test_reg_direct_flow_success(void)
return TEST_RC_FAIL;
}
+/*
+ * Direct-flow re-key: one shared seed is parked for both local apps. The
+ * per-app initiator role is resolved from the verified caller pid (the
+ * allocator is n_1_pid), and the seed is held until both have pulled it.
+ * The test sets up a direct flow, parks a seed at epoch 5, pulls it first
+ * as TEST_N_1_PID and then as TEST_PID, and checks role, epoch, seed bytes
+ * and the pending flag after each pull.
+ */
+static int test_reg_direct_flow_rekey(void)
+{
+ pthread_t thr;
+ struct timespec abstime;
+ struct timespec timeo = TIMESPEC_INIT_S(1);
+ buffer_t rbuf = BUF_INIT;
+ buffer_t rsp;
+ buffer_t no_crt = BUF_INIT;
+ struct direct_alloc_info dai;
+ uint8_t seed[SYMMKEYSZ];
+ uint8_t out[SYMMKEYSZ];
+ uint8_t epoch;
+ bool initiator;
+ size_t i;
+
+ struct flow_info info = {
+ .n_pid = TEST_PID,
+ .qs = qos_raw
+ };
+
+ TEST_START();
+
+ /* Recognisable seed pattern: byte i holds the value i. */
+ for (i = 0; i < SYMMKEYSZ; ++i)
+ seed[i] = (uint8_t) i;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &abstime);
+
+ ts_add(&abstime, &timeo, &abstime);
+
+ if (reg_init() < 0) {
+ printf("Failed to init registry.\n");
+ goto fail;
+ }
+
+ if (reg_create_flow(&info) < 0) {
+ printf("Failed to add flow.\n");
+ goto fail;
+ }
+
+ if (reg_prepare_flow_accept(&info) < 0) {
+ printf("Failed to prepare for accept.\n");
+ goto fail;
+ }
+
+ dai.info.id = info.id;
+ dai.info.n_1_pid = TEST_N_1_PID;
+ dai.info.mpl = TEST_MPL;
+ dai.info.qs = qos_msg;
+ dai.info.state = FLOW_ALLOCATED;
+ dai.rsp.len = 0;
+ dai.rsp.data = NULL;
+ dai.abstime = abstime;
+
+ pthread_create(&thr, NULL, test_flow_alloc_direct, &dai);
+
+ if (reg_wait_flow_accepted(&info, &rbuf, &abstime) < 0) {
+ printf("Flow accept failed.\n");
+ pthread_join(thr, NULL);
+ goto fail;
+ }
+
+ freebuf(rbuf);
+
+ rsp.data = (uint8_t *) strdup(TEST_DATA2);
+ if (rsp.data == NULL) {
+ printf("Failed to strdup rsp data.\n");
+ pthread_join(thr, NULL);
+ goto fail;
+ }
+ rsp.len = strlen(TEST_DATA2) + 1;
+
+ if (reg_respond_flow_direct(info.id, &rsp) < 0) {
+ printf("Failed to respond direct.\n");
+ freebuf(rsp);
+ pthread_join(thr, NULL);
+ goto fail;
+ }
+
+ pthread_join(thr, NULL);
+
+ freebuf(dai.rsp);
+
+ if (!reg_flow_is_direct(info.id)) {
+ printf("Flow not marked direct.\n");
+ goto fail;
+ }
+
+ /* Mark the flow encrypted; the empty buffer caches no peer cert. */
+ reg_flow_set_rekey(info.id, false, no_crt);
+
+ /* Park the shared seed at epoch 5. */
+ if (reg_flow_store_pending_direct(info.id, seed, 5) < 0) {
+ printf("Failed to store pending direct seed.\n");
+ goto fail;
+ }
+
+ if (!reg_flow_rekey_pending(info.id)) {
+ printf("Seed not pending after store.\n");
+ goto fail;
+ }
+
+ /* Allocator (n_1_pid) pulls: initiator role, seed still held. */
+ if (reg_flow_take_pending(info.id, 0, TEST_N_1_PID, out,
+ &epoch, &initiator) != 0) {
+ printf("Allocator failed to take pending seed.\n");
+ goto fail;
+ }
+
+ if (!initiator || epoch != 5 || memcmp(out, seed, SYMMKEYSZ) != 0) {
+ printf("Allocator got wrong seed/role/epoch.\n");
+ goto fail;
+ }
+
+ if (!reg_flow_rekey_pending(info.id)) {
+ printf("Seed cleared before both apps pulled.\n");
+ goto fail;
+ }
+
+ /* Acceptor (n_pid) pulls: responder role, seed now released. */
+ if (reg_flow_take_pending(info.id, 0, TEST_PID, out,
+ &epoch, &initiator) != 0) {
+ printf("Acceptor failed to take pending seed.\n");
+ goto fail;
+ }
+
+ if (initiator || epoch != 5 || memcmp(out, seed, SYMMKEYSZ) != 0) {
+ printf("Acceptor got wrong seed/role/epoch.\n");
+ goto fail;
+ }
+
+ if (reg_flow_rekey_pending(info.id)) {
+ printf("Seed still pending after both pulled.\n");
+ goto fail;
+ }
+
+ if (reg_flow_get_epoch(info.id) != 5) {
+ printf("Flow epoch not advanced.\n");
+ goto fail;
+ }
+
+ /* Deallocate once per endpoint pid, then destroy the flow. */
+ info.n_pid = TEST_PID;
+ reg_dealloc_flow(&info);
+
+ info.n_pid = TEST_N_1_PID;
+ reg_dealloc_flow(&info);
+
+ reg_destroy_flow(info.id);
+
+ reg_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail:
+ REG_TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
static int test_reg_flow(void) {
int rc = 0;
@@ -679,7 +940,9 @@ static int test_reg_flow(void) {
rc |= test_reg_accept_flow_success();
rc |= test_reg_accept_flow_success_no_crypt();
rc |= test_reg_allocate_flow_fail();
+ rc |= test_reg_respond_alloc_duplicate();
rc |= test_reg_direct_flow_success();
+ rc |= test_reg_direct_flow_rekey();
return rc;
}
@@ -774,6 +1037,7 @@ static int test_reg_list_ipcps(void)
while (len-- > 0)
ipcp_list_msg__free_unpacked(ipcps[len], NULL);
+
free(ipcps);
for (i = 0; i < 10; i++)
@@ -840,6 +1104,7 @@ static int test_insert_ipcps(void)
while (len-- > 0)
ipcp_list_msg__free_unpacked(ipcps[len], NULL);
+
free(ipcps);
reg_clear();
@@ -1017,6 +1282,7 @@ static int test_reg_list_names(void)
for (i = 0; i < len; i++)
name_info_msg__free_unpacked(names[i], NULL);
+
free(names);
for (i = 0; i < 10; i++) {
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 79263924..3abf39d0 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -17,8 +17,12 @@ protobuf_generate_c(IPCP_PROTO_SRCS IPCP_PROTO_HDRS
set(SOURCE_FILES_COMMON
bitmap.c
btree.c
- crc32.c
+ crc/crc8.c
+ crc/crc16.c
+ crc/crc32.c
+ crc/crc64.c
crypt.c
+ crypt/keyrot.c
hash.c
lockfile.c
logs.c
@@ -36,6 +40,7 @@ set(SOURCE_FILES_COMMON
ssm/pool.c
sockets.c
tpm.c
+ tw.c
utils.c
)
@@ -88,6 +93,13 @@ if(HAVE_FUSE)
target_link_libraries(ouroboros-common PRIVATE Fuse::Fuse)
endif()
+if(HAVE_LIBURCU)
+ target_link_libraries(ouroboros-common PRIVATE Urcu::Urcu)
+ # urcu headers require C99; override the global -std=c89 for this TU only.
+ set_source_files_properties(crypt/keyrot.c PROPERTIES
+ COMPILE_OPTIONS "-std=gnu99")
+endif()
+
install(TARGETS ouroboros-common
EXPORT OuroborosTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
@@ -155,5 +167,6 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/ssm/ssm.h.in"
if(BUILD_TESTS)
add_subdirectory(tests)
+ add_subdirectory(crc/tests)
add_subdirectory(ssm/tests)
endif()
diff --git a/src/lib/config.h.in b/src/lib/config.h.in
index 08e9baf6..26ebe56b 100644
--- a/src/lib/config.h.in
+++ b/src/lib/config.h.in
@@ -20,6 +20,14 @@
* Foundation, Inc., http://www.fsf.org/about/contact/.
*/
+#ifndef MILLION
+#define MILLION 1000000LL
+#endif
+
+#ifndef BILLION
+#define BILLION 1000000000LL
+#endif
+
#cmakedefine HAVE_SYS_RANDOM
#cmakedefine HAVE_EXPLICIT_BZERO
#cmakedefine HAVE_LIBGCRYPT
@@ -29,7 +37,7 @@
#cmakedefine HAVE_OPENSSL_ML_DSA
#cmakedefine HAVE_OPENSSL_SLH_DSA
#define HAVE_ENCRYPTION
-#define SECMEM_GUARD @SECMEM_GUARD@
+#define SECMEM_MINSIZE @SECMEM_MINSIZE@
#endif
#define PROC_SECMEM_MAX @PROC_SECMEM_MAX@
@@ -37,6 +45,8 @@
#cmakedefine QOS_DISABLE_CRC
#cmakedefine HAVE_OPENSSL_RNG
+#cmakedefine HAVE_PCLMUL
+#cmakedefine HAVE_PMULL
#define SHM_LOCKFILE_NAME "@SHM_LOCKFILE_NAME@"
#define FLOW_ALLOC_TIMEOUT @FLOW_ALLOC_TIMEOUT@
@@ -60,16 +70,20 @@
#cmakedefine PROC_FLOW_STATS
#endif
+#cmakedefine HAVE_LIBURCU
+
+#cmakedefine FRCT_DEBUG_STDOUT
+
#define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@
-#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@
-#define PROG_RES_FDS @PROG_RES_FDS@
-#define PROG_MAX_FQUEUES @PROG_MAX_FQUEUES@
+#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@
+#define PROC_RES_FDS @PROC_RES_FDS@
+#define PROC_MAX_FQUEUES @PROC_MAX_FQUEUES@
/* Default Delta-t parameters */
#cmakedefine FRCT_LINUX_RTT_ESTIMATOR
-#define DELT_A (@DELTA_T_ACK@) /* ns */
-#define DELT_R (@DELTA_T_RTX@) /* ns */
+#define DELT_A (@DELTA_T_ACK@) /* ms */
+#define DELT_R (@DELTA_T_RTX@) /* ms */
#define RQ_SIZE (@FRCT_REORDER_QUEUE_SIZE@)
#define START_WINDOW (@FRCT_START_WINDOW@)
@@ -80,9 +94,6 @@
#define TICTIME (@FRCT_TICK_TIME@ * 1000) /* ns */
/* Retransmission tuning */
-#cmakedefine RXM_BUFFER_ON_HEAP
-#cmakedefine RXM_BLOCKING
-
#define RXMQ_RES (@RXM_MIN_RESOLUTION@) /* 2^N ns */
#define RXMQ_BUMP (@RXM_WHEEL_MULTIPLIER@)
#define RXMQ_LVLS (@RXM_WHEEL_LEVELS@)
@@ -91,4 +102,9 @@
#define ACKQ_SLOTS (@ACK_WHEEL_SLOTS@)
#define ACKQ_RES (@ACK_WHEEL_RESOLUTION@) /* 2^N ns */
-#define KEY_ROTATION_BIT (@KEY_ROTATION_BIT@) /* Bit for key rotation */
+#define KEY_LEAF_BITS (@KEY_LEAF_BITS@) /* pkts/leaf-key = 2^n */
+#define KEY_NODE_BITS (@KEY_NODE_BITS@) /* leaf-keys/node = 2^n */
+#define KEY_NODE_COUNT (@KEY_NODE_COUNT@) /* node keys/batch N */
+#define KEY_REKEY_WATERMARK (@KEY_REKEY_WATERMARK@) /* node-keys-left trig */
+#define KEY_REPLAY_WINDOW (@KEY_REPLAY_WINDOW@) /* rx replay win pkts */
+#define FLOW_WM_CHECK (1u << @KEY_REKEY_WM_CHECK_BITS@) /* wm chk/n wr */
diff --git a/src/lib/crc/crc16.c b/src/lib/crc/crc16.c
new file mode 100644
index 00000000..9dc59429
--- /dev/null
+++ b/src/lib/crc/crc16.c
@@ -0,0 +1,61 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-16/CCITT-FALSE (reveng catalog, alias CRC-16/IBM-3740):
+ * poly = 0x1021
+ * init = 0xffff
+ * refin = false
+ * refout = false
+ * xorout = 0x0000
+ * check = crc16_ccitt_false("123456789") == 0x29b1
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc16.h>
+
+/*
+ * Bit-by-bit MSB-first CRC over len bytes at buf, polynomial 0x1021.
+ * *crc is XORed with 0xffff on entry and the resulting register is
+ * written back without a final XOR, so *crc == 0 on entry yields the
+ * 0xffff initial value.
+ * NOTE(review): since the entry XOR is not undone on exit, passing the
+ * returned value back in for a further buffer does not continue the same
+ * CRC - confirm that callers only use this single-shot.
+ */
+void crc16_ccitt_false(uint16_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint16_t c;
+ size_t n;
+ int i;
+
+ p = (const uint8_t *) buf;
+ c = *crc ^ 0xffff;
+
+ for (n = 0; n < len; n++) {
+ c ^= ((uint16_t) p[n]) << 8; /* feed next byte into the top bits */
+ for (i = 0; i < 8; i++) {
+ if (c & 0x8000)
+ c = (uint16_t) ((c << 1) ^ 0x1021);
+ else
+ c = (uint16_t) (c << 1);
+ }
+ }
+
+ *crc = c;
+}
diff --git a/src/lib/crc32.c b/src/lib/crc/crc32.c
index 0fdb62b1..0fdb62b1 100644
--- a/src/lib/crc32.c
+++ b/src/lib/crc/crc32.c
diff --git a/src/lib/crc/crc64.c b/src/lib/crc/crc64.c
new file mode 100644
index 00000000..1b6fb5f6
--- /dev/null
+++ b/src/lib/crc/crc64.c
@@ -0,0 +1,363 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 64-bit Cyclic Redundancy Check (NVMe variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-64/NVMe (reveng catalog):
+ * poly = 0xad93d23594c93659
+ * init = 0xffffffffffffffff
+ * refin = true
+ * refout = true
+ * xorout = 0xffffffffffffffff
+ * check = crc64_nvme("123456789") == 0xae8b14860a799888
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc64.h>
+
+/*
+ * Reflected CRC-64/NVMe table. Polynomial in reflected form:
+ * 0x9a6c9329ac4bc9b5 (bitrev of 0xad93d23594c93659).
+ */
+static const uint64_t crc64_nvme_tab[256] = {
+ 0x0000000000000000ULL, 0x7f6ef0c830358979ULL,
+ 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL,
+ 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL,
+ 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL,
+ 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL,
+ 0x58c10d24087fec87ULL, 0x27affdec384a65feULL,
+ 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL,
+ 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL,
+ 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL,
+ 0x863d1eabe8d57d73ULL, 0xf953ee63d8e0f40aULL,
+ 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL,
+ 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL,
+ 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL,
+ 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL,
+ 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL,
+ 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL,
+ 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL,
+ 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL,
+ 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL,
+ 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL,
+ 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL,
+ 0xa900f35319033385ULL, 0xd66e039b2936bafcULL,
+ 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL,
+ 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL,
+ 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL,
+ 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL,
+ 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL,
+ 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL,
+ 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL,
+ 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL,
+ 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL,
+ 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL,
+ 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL,
+ 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL,
+ 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL,
+ 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL,
+ 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL,
+ 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL,
+ 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL,
+ 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL,
+ 0xafba2586f2d042eeULL, 0xd0d4d54ec2e5cb97ULL,
+ 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL,
+ 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL,
+ 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL,
+ 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL,
+ 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL,
+ 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL,
+ 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL,
+ 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL,
+ 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL,
+ 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL,
+ 0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL,
+ 0x8087c87e03060c18ULL, 0xffe938b633338561ULL,
+ 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL,
+ 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL,
+ 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL,
+ 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL,
+ 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL,
+ 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL,
+ 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL,
+ 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL,
+ 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL,
+ 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL,
+ 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL,
+ 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL,
+ 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL,
+ 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL,
+ 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL,
+ 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL,
+ 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL,
+ 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL,
+ 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL,
+ 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL,
+ 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL,
+ 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL,
+ 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL,
+ 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL,
+ 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL,
+ 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL,
+ 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL,
+ 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL,
+ 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL,
+ 0xa2cf882d2576a038ULL, 0xdda178e515432941ULL,
+ 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL,
+ 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL,
+ 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL,
+ 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL,
+ 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL,
+ 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL,
+ 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL,
+ 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL,
+ 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL,
+ 0xb5517ed15d9d8743ULL, 0xca3f8e196da80e3aULL,
+ 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL,
+ 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL,
+ 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL,
+ 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL,
+ 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL,
+ 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL,
+ 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL,
+ 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL,
+ 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL,
+ 0x224840532670ac20ULL, 0x5d26b09b16452559ULL,
+ 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL,
+ 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL,
+ 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL,
+ 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL,
+ 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL,
+ 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL,
+ 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL,
+ 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL,
+ 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL,
+ 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL,
+ 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL,
+ 0x759552905f188d57ULL, 0x0afba2586f2d042eULL,
+ 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL,
+ 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL,
+ 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL,
+ 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL,
+ 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL,
+ 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL,
+ 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL,
+ 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL,
+ 0xf3a84c3bb7cdf024ULL, 0x8cc6bcf387f8795dULL,
+ 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL,
+ 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL,
+ 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL,
+ 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL
+};
+
+static __inline__ uint64_t crc64_nvme_step(uint64_t        c,
+                                           const uint8_t * p,
+                                           size_t          len)
+{
+        while (len-- > 0) {
+                /* low state byte ^ input byte selects the table entry */
+                c = (c >> 8) ^ crc64_nvme_tab[(uint8_t) (c ^ *p++)];
+        }
+
+        return c;
+}
+
+void crc64_nvme_table(uint64_t *   crc,
+                      const void * buf,
+                      size_t       len)
+{
+        const uint8_t * bytes = (const uint8_t *) buf;
+        uint64_t        state = ~*crc; /* undo the output inversion */
+
+        state = crc64_nvme_step(state, bytes, len);
+
+        *crc = ~state;
+}
+
+#ifdef HAVE_PCLMUL
+
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+/*
+ * Fold-by-16 constants for reflected CRC-64/NVMe. Properties of the
+ * polynomial; identical between the PCLMUL and PMULL backends.
+ * k3 = bitrev64(x^(128+64) mod P) << 1
+ * k4 = bitrev64(x^(128+0) mod P) << 1
+ */
+static const uint64_t k3_clmul = 0xeadc41fd2ba3d420ULL;
+static const uint64_t k4_clmul = 0x21e9761e252621acULL;
+
+/* One fold step: clmul(x.lo, k.lo) ^ clmul(x.hi, k.hi). */
+__attribute__((target("pclmul,sse4.1")))
+static __m128i fold16(__m128i x,
+                      __m128i k)
+{
+        /* imm 0x00 pairs the low qwords of x and k, 0x11 the high ones. */
+        const __m128i lo_prod = _mm_clmulepi64_si128(x, k, 0x00);
+        const __m128i hi_prod = _mm_clmulepi64_si128(x, k, 0x11);
+
+        return _mm_xor_si128(lo_prod, hi_prod);
+}
+
+/*
+ * Fold-by-16 over 16-byte chunks; the 128-bit folded state is then
+ * emitted as 16 little-endian bytes and run through the byte-table
+ * loop together with any tail (<=15 bytes). The 16-byte minimum on
+ * the bulk loop is why the short-input path uses the table directly.
+ */
+__attribute__((target("pclmul,sse4.1")))
+static void crc64_nvme_clmul(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint64_t seed;
+ uint64_t c;
+ size_t off;
+ __m128i x;
+ __m128i k;
+ uint8_t post[16];
+
+ p = (const uint8_t *) buf;
+ seed = *crc;
+
+ if (len < 16) { /* nothing to fold: byte-table only */
+ c = crc64_nvme_step(seed ^ UINT64_MAX, p, len);
+ *crc = c ^ UINT64_MAX;
+ return;
+ }
+
+ x = _mm_loadu_si128((const __m128i *) p);
+ x = _mm_xor_si128(x, _mm_cvtsi64_si128((int64_t)
+ (seed ^ UINT64_MAX)));
+
+ k = _mm_set_epi64x((int64_t) k4_clmul, (int64_t) k3_clmul);
+
+ off = 16; /* bytes 0..15 are already in x */
+ while (off + 16 <= len) {
+ __m128i d;
+
+ d = _mm_loadu_si128((const __m128i *) (p + off));
+ x = _mm_xor_si128(fold16(x, k), d);
+ off += 16;
+ }
+
+ _mm_storeu_si128((__m128i *) post, x);
+
+ c = crc64_nvme_step(0, post, 16); /* reduce the folded state */
+ c = crc64_nvme_step(c, p + off, len - off); /* then the tail */
+
+ *crc = c ^ UINT64_MAX;
+}
+
+#endif /* HAVE_PCLMUL */
+
+#ifdef HAVE_PMULL
+
+#include <arm_neon.h>
+
+/* Same fold-by-16 constants as the PCLMUL path (poly properties). */
+static const uint64_t k3_pmull = 0xeadc41fd2ba3d420ULL;
+static const uint64_t k4_pmull = 0x21e9761e252621acULL;
+
+/* One fold step: pmull(x.lo, k.lo) ^ pmull(x.hi, k.hi). */
+__attribute__((target("+crypto")))
+static uint64x2_t fold16_pmull(uint64x2_t x,
+                               uint64x2_t k)
+{
+        poly128_t lo_prod;
+        poly128_t hi_prod;
+
+        /* Low qwords go in as scalars, high qwords straight from x and k. */
+        lo_prod = vmull_p64((poly64_t) vgetq_lane_u64(x, 0),
+                            (poly64_t) vgetq_lane_u64(k, 0));
+        hi_prod = vmull_high_p64(vreinterpretq_p64_u64(x),
+                                 vreinterpretq_p64_u64(k));
+
+        return veorq_u64(vreinterpretq_u64_p128(lo_prod),
+                         vreinterpretq_u64_p128(hi_prod));
+}
+
+/* PMULL fold-by-16: fold 16-byte blocks, then reduce via the byte table. */
+__attribute__((target("+crypto")))
+static void crc64_nvme_pmull(uint64_t *   crc,
+                             const void * buf,
+                             size_t       len)
+{
+        const uint8_t * p;
+        uint64_t        c;
+        size_t          off;
+        uint64x2_t      x;
+        uint64x2_t      k;
+        uint64_t        seed_lane[2];
+        uint64_t        k_lanes[2];
+        uint8_t         post[16];
+
+        p = (const uint8_t *) buf;
+        c = *crc ^ UINT64_MAX;
+
+        if (len < 16) {
+                /* Too short to fold: plain byte-table path. */
+                *crc = crc64_nvme_step(c, p, len) ^ UINT64_MAX;
+                return;
+        }
+
+        /* Byte loads: buf need not be 8-byte aligned. */
+        x = vreinterpretq_u64_u8(vld1q_u8(p));
+        seed_lane[0] = c;
+        seed_lane[1] = 0;
+        x = veorq_u64(x, vld1q_u64(seed_lane));
+
+        k_lanes[0] = k3_pmull;
+        k_lanes[1] = k4_pmull;
+        k = vld1q_u64(k_lanes);
+
+        for (off = 16; off + 16 <= len; off += 16) {
+                uint64x2_t d;
+
+                d = vreinterpretq_u64_u8(vld1q_u8(p + off));
+                x = veorq_u64(fold16_pmull(x, k), d);
+        }
+
+        vst1q_u8(post, vreinterpretq_u8_u64(x));
+
+        /* Reduce the folded state, then any tail (<= 15 bytes). */
+        c = crc64_nvme_step(0, post, 16);
+        c = crc64_nvme_step(c, p + off, len - off);
+
+        *crc = c ^ UINT64_MAX;
+}
+#endif /* HAVE_PMULL */
+
+void crc64_nvme(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+#ifdef HAVE_PCLMUL
+ crc64_nvme_clmul(crc, buf, len); /* PCLMUL folding path */
+#elif defined(HAVE_PMULL)
+ crc64_nvme_pmull(crc, buf, len); /* PMULL folding path */
+#else
+ crc64_nvme_table(crc, buf, len); /* byte-table fallback */
+#endif
+}
diff --git a/src/lib/crc/crc8.c b/src/lib/crc/crc8.c
new file mode 100644
index 00000000..20976b29
--- /dev/null
+++ b/src/lib/crc/crc8.c
@@ -0,0 +1,62 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 8-bit Cyclic Redundancy Check (AUTOSAR variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-8/AUTOSAR (reveng catalog):
+ * poly = 0x2f
+ * init = 0xff
+ * refin = false
+ * refout = false
+ * xorout = 0xff
+ * check = crc8_autosar("123456789") == 0xdf
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc8.h>
+
+
+/* CRC-8/AUTOSAR: shift one bit at a time, MSB first. */
+void crc8_autosar(uint8_t *    crc,
+                  const void * buf,
+                  size_t       len)
+{
+        const uint8_t * byte;
+        uint8_t         reg;
+        uint8_t         top;
+        int             bit;
+
+        byte = (const uint8_t *) buf;
+        reg  = *crc ^ 0xff; /* undo the output inversion of the seed */
+
+        while (len-- > 0) {
+                reg ^= *byte++;
+                for (bit = 8; bit > 0; bit--) {
+                        top = reg & 0x80;
+                        reg = (uint8_t) (reg << 1);
+                        if (top != 0)
+                                reg ^= 0x2f;
+                }
+        }
+
+        *crc = reg ^ 0xff;
+}
diff --git a/src/lib/crc/tests/CMakeLists.txt b/src/lib/crc/tests/CMakeLists.txt
new file mode 100644
index 00000000..11daca5a
--- /dev/null
+++ b/src/lib/crc/tests/CMakeLists.txt
@@ -0,0 +1,21 @@
+get_filename_component(PARENT_PATH ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+get_filename_component(PARENT_DIR ${PARENT_PATH} NAME)
+
+compute_test_prefix()
+
+create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
+ # Add new tests here; foo_test.c must define int foo_test(int, char **)
+ crc8_test.c
+ crc16_test.c
+ crc32_test.c
+ crc64_test.c
+ )
+
+add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests})
+
+disable_test_logging_for_target(${PARENT_DIR}_test)
+target_link_libraries(${PARENT_DIR}_test ouroboros-common)
+
+add_dependencies(build_tests ${PARENT_DIR}_test)
+
+ouroboros_register_tests(TARGET ${PARENT_DIR}_test TESTS ${${PARENT_DIR}_tests})
diff --git a/src/lib/crc/tests/crc16_test.c b/src/lib/crc/tests/crc16_test.c
new file mode 100644
index 00000000..03a5b504
--- /dev/null
+++ b/src/lib/crc/tests/crc16_test.c
@@ -0,0 +1,67 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-16/CCITT-FALSE function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc16.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/*
+ * Smoke test: the empty message and the reveng catalogue "check" string.
+ */
+static int test_crc16_ccitt_false_basic(void)
+{
+        uint16_t empty = 0;
+        uint16_t check = 0;
+
+        TEST_START();
+
+        crc16_ccitt_false(&empty, "", 0);
+        if (empty == 0xffff) { /* the preset comes back unchanged */
+                /* The catalogue's check value. */
+                crc16_ccitt_false(&check, "123456789", 9);
+                if (check == 0x29b1) {
+                        TEST_SUCCESS();
+                        return TEST_RC_SUCCESS;
+                }
+        }
+
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+int crc16_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc; /* the arguments are not used */
+ (void) argv;
+
+ ret |= test_crc16_ccitt_false_basic(); /* OR in the case's result */
+ return ret;
+}
diff --git a/src/lib/tests/crc32_test.c b/src/lib/crc/tests/crc32_test.c
index 5a1ddd87..5a1ddd87 100644
--- a/src/lib/tests/crc32_test.c
+++ b/src/lib/crc/tests/crc32_test.c
diff --git a/src/lib/crc/tests/crc64_test.c b/src/lib/crc/tests/crc64_test.c
new file mode 100644
index 00000000..cf3f5ca3
--- /dev/null
+++ b/src/lib/crc/tests/crc64_test.c
@@ -0,0 +1,126 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-64/NVMe function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc64.h>
+#include <ouroboros/random.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* Reference impl, internal to libouroboros-common. */
+extern void crc64_nvme_table(uint64_t * crc,
+ const void * buf,
+ size_t len);
+
+/* reveng catalogue vectors plus one input exactly 16 bytes long. */
+static int test_crc64_nvme_basic(void)
+{
+        static const struct {
+                const char * msg;
+                size_t       len;
+                uint64_t     want;
+        } vec[] = {
+                { "",                  0, 0x0000000000000000ULL },
+                { "123456789",         9, 0xae8b14860a799888ULL },
+                { "0123456789abcdef", 16, 0x091485ca7018730eULL }
+        };
+        size_t i;
+
+        TEST_START();
+
+        for (i = 0; i < sizeof(vec) / sizeof(vec[0]); i++) {
+                uint64_t crc = 0;
+                crc64_nvme(&crc, vec[i].msg, vec[i].len);
+                if (crc != vec[i].want) {
+                        TEST_FAIL();
+                        return TEST_RC_FAIL;
+                }
+        }
+
+        TEST_SUCCESS();
+        return TEST_RC_SUCCESS;
+}
+
+#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL)
+/* Cross-check the accelerated dispatcher path against the byte-table. */
+static int test_crc64_nvme_random(void)
+{
+        static const size_t lens[] = {
+                0, 1, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+                129, 255, 256, 257, 1023, 1024, 1025, 4096
+        };
+        uint8_t  buf[4096];
+        size_t   i;
+        uint64_t ref;
+        uint64_t got;
+
+        TEST_START();
+
+        if (random_buffer(buf, sizeof(buf)) < 0) {
+                printf("Failed to generate random data.\n");
+                goto fail;
+        }
+
+        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
+                ref = 0;
+                crc64_nvme_table(&ref, buf, lens[i]);
+
+                got = 0;
+                crc64_nvme(&got, buf, lens[i]);
+
+                if (ref == got)
+                        continue;
+                /* long long: unsigned long is only 32 bits on ILP32. */
+                printf("Mismatch at len=%zu: "
+                       "table=0x%016llx disp=0x%016llx\n", lens[i],
+                       (unsigned long long) ref,
+                       (unsigned long long) got);
+                goto fail;
+        }
+
+        TEST_SUCCESS();
+        return TEST_RC_SUCCESS;
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+#endif /* HAVE_PCLMUL || HAVE_PMULL */
+
+int crc64_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc; /* the arguments are not used */
+ (void) argv;
+
+ ret |= test_crc64_nvme_basic();
+#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL)
+ ret |= test_crc64_nvme_random(); /* only built with an accelerated path */
+#endif
+ return ret;
+}
diff --git a/src/lib/crc/tests/crc8_test.c b/src/lib/crc/tests/crc8_test.c
new file mode 100644
index 00000000..f7bb33b8
--- /dev/null
+++ b/src/lib/crc/tests/crc8_test.c
@@ -0,0 +1,67 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-8/AUTOSAR function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc8.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/*
+ * Smoke test: the empty message and the reveng catalogue "check" string.
+ */
+static int test_crc8_autosar_basic(void)
+{
+        uint8_t empty = 0;
+        uint8_t check = 0;
+
+        TEST_START();
+
+        crc8_autosar(&empty, "", 0);
+        if (empty == 0x00) { /* input and output xor cancel out */
+                /* The catalogue's check value. */
+                crc8_autosar(&check, "123456789", 9);
+                if (check == 0xdf) {
+                        TEST_SUCCESS();
+                        return TEST_RC_SUCCESS;
+                }
+        }
+
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+int crc8_test(int argc,
+ char ** argv)
+{
+ int ret = 0;
+
+ (void) argc; /* the arguments are not used */
+ (void) argv;
+
+ ret |= test_crc8_autosar_basic(); /* OR in the case's result */
+ return ret;
+}
diff --git a/src/lib/crypt.c b/src/lib/crypt.c
index 71197f6e..5da9d392 100644
--- a/src/lib/crypt.c
+++ b/src/lib/crypt.c
@@ -27,10 +27,14 @@
#include <config.h>
#include <ouroboros/errno.h>
+#include <ouroboros/pthread.h>
#include <ouroboros/random.h>
#include <ouroboros/crypt.h>
+#include "crypt/keyrot.h"
+
#ifdef HAVE_OPENSSL
+#include <openssl/crypto.h>
#include <openssl/evp.h>
#include "crypt/openssl.h"
#endif
@@ -50,18 +54,12 @@ static const struct nid_map cipher_nid_map[] = {
{NID_aes_192_gcm, "aes-192-gcm"},
{NID_aes_256_gcm, "aes-256-gcm"},
{NID_chacha20_poly1305, "chacha20-poly1305"},
- {NID_aes_128_ctr, "aes-128-ctr"},
- {NID_aes_192_ctr, "aes-192-ctr"},
- {NID_aes_256_ctr, "aes-256-ctr"},
{NID_undef, NULL}
};
/* Ordered in strength preference, lowest first */
const uint16_t crypt_supported_nids[] = {
#ifdef HAVE_OPENSSL
- NID_aes_128_ctr,
- NID_aes_192_ctr,
- NID_aes_256_ctr,
NID_aes_128_gcm,
NID_aes_192_gcm,
NID_aes_256_gcm,
@@ -87,23 +85,23 @@ static const struct nid_map kex_nid_map[] = {
{NID_undef, NULL}
};
-/* Ordered in strength preference, lowest first */
+/* Ordered in strength preference, lowest first (NIST SP 800-57 levels) */
const uint16_t kex_supported_nids[] = {
#ifdef HAVE_OPENSSL
- NID_ffdhe2048,
- NID_X9_62_prime256v1,
- NID_X25519,
- NID_ffdhe3072,
- NID_secp384r1,
- NID_ffdhe4096,
- NID_X448,
- NID_secp521r1,
+ NID_ffdhe2048, /* FFDHE-2048, ~112-bit */
+ NID_X9_62_prime256v1, /* ECDH P-256, 128-bit */
+ NID_X25519, /* ECDH X25519, 128-bit */
+ NID_ffdhe3072, /* FFDHE-3072, ~128-bit */
+ NID_ffdhe4096, /* FFDHE-4096, ~152-bit */
+ NID_secp384r1, /* ECDH P-384, 192-bit */
+ NID_X448, /* ECDH X448, 224-bit */
+ NID_secp521r1, /* ECDH P-521, 256-bit */
#ifdef HAVE_OPENSSL_ML_KEM
- NID_MLKEM512,
- NID_MLKEM768,
- NID_MLKEM1024,
- NID_X25519MLKEM768,
- NID_X448MLKEM1024,
+ NID_MLKEM512, /* ML-KEM-512, PQC L1 (~AES-128) */
+ NID_MLKEM768, /* ML-KEM-768, PQC L3 (~AES-192) */
+ NID_MLKEM1024, /* ML-KEM-1024, PQC L5 (~AES-256) */
+ NID_X25519MLKEM768, /* X25519 + ML-KEM-768, PQC L3 */
+ NID_X448MLKEM1024, /* X448 + ML-KEM-1024, PQC L5 */
#endif
#endif
NID_undef
@@ -137,11 +135,13 @@ const uint16_t md_supported_nids[] = {
};
struct crypt_ctx {
- void * ctx; /* Encryption context */
+ struct keyrot * kr; /* backend-independent key rotation */
+ void * cipher; /* backend AEAD cipher context */
};
struct auth_ctx {
- void * store;
+ void * store; /* trusted anchors */
+ void * chain; /* untrusted build-only interm */
};
static int parse_kex_value(const char * value,
@@ -162,6 +162,7 @@ int parse_sec_config(struct sec_config * cfg,
char * equals;
char * key;
char * value;
+ bool no_enc = false;
assert(cfg != NULL);
assert(fp != NULL);
@@ -172,6 +173,7 @@ int parse_sec_config(struct sec_config * cfg,
SET_KEX_KDF_NID(cfg, NID_sha256);
SET_KEX_CIPHER_NID(cfg, NID_aes_256_gcm);
SET_KEX_DIGEST_NID(cfg, NID_sha256);
+ /* a.req is seeded per-role by the caller; only auth= overrides it */
while (fgets(line, sizeof(line), fp) != NULL) {
char * trimmed;
@@ -180,12 +182,10 @@ int parse_sec_config(struct sec_config * cfg,
if (line[0] == '#' || line[0] == '\n')
continue;
- /* Check for 'none' keyword */
+ /* Bare 'none' keyword replaced by encryption=none */
trimmed = trim_whitespace(line);
- if (strcmp(trimmed, "none") == 0) {
- memset(cfg, 0, sizeof(*cfg));
- return 0;
- }
+ if (strcmp(trimmed, "none") == 0)
+ return -EINVAL;
/* Find the = separator */
equals = strchr(line, '=');
@@ -221,9 +221,34 @@ int parse_sec_config(struct sec_config * cfg,
} else {
return -EINVAL;
}
+ } else if (strcmp(key, "auth") == 0) {
+ if (strcmp(value, "required") == 0) {
+ cfg->a.req = true;
+ } else if (strcmp(value, "optional") == 0) {
+ cfg->a.req = false;
+ } else {
+ return -EINVAL;
+ }
+ } else if (strcmp(key, "cacert") == 0) {
+ if (strlen(value) >= sizeof(cfg->a.cacert))
+ return -EINVAL;
+ strcpy(cfg->a.cacert, value);
+ } else if (strcmp(key, "encryption") == 0) {
+ if (strcmp(value, "none") != 0)
+ return -EINVAL;
+ no_enc = true;
+ } else {
+ return -EINVAL;
}
}
+ if (no_enc) {
+ /* Digest stays: it belongs to the auth axis */
+ CLEAR_KEX_ALGO(cfg);
+ CLEAR_KEX_KDF(cfg);
+ CLEAR_KEX_CIPHER(cfg);
+ }
+
return 0;
}
@@ -239,12 +264,17 @@ int load_sec_config_file(struct sec_config * cfg,
fp = fopen(path, "r");
if (fp == NULL) {
- /* File doesn't exist - disable encryption */
- CLEAR_KEX_ALGO(cfg);
- return 0;
+ /* Absent config disables encryption; other errors fail */
+ if (errno == ENOENT) {
+ CLEAR_KEX_ALGO(cfg);
+ return 0;
+ }
+ return -errno;
}
+ pthread_cleanup_push(__cleanup_fclose, fp);
ret = parse_sec_config(cfg, fp);
+ pthread_cleanup_pop(0);
fclose(fp);
@@ -592,19 +622,71 @@ int crypt_kex_rank(int nid)
return -1;
}
-/* Hash length now returned by md_digest() */
+/* AEAD primitive: 1:1 backend wrappers used by the data path below. */
+static int crypt_seal(void * cipher,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ uint8_t * out,
+ uint8_t * tag)
+{
+#ifdef HAVE_OPENSSL
+ return openssl_seal(cipher, key, nonce, aad, in, out, tag);
+#else
+ (void) cipher;
+ (void) key;
+ (void) nonce;
+ (void) aad;
+ (void) in;
+ (void) out;
+ (void) tag;
-int crypt_encrypt(struct crypt_ctx * ctx,
- buffer_t in,
- buffer_t * out)
+ return -ECRYPT;
+#endif
+}
+
+static int crypt_open(void * cipher,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ const uint8_t * tag,
+ buffer_t * out)
{
- assert(ctx != NULL);
- assert(ctx->ctx != NULL);
+#ifdef HAVE_OPENSSL
+ return openssl_open(cipher, key, nonce, aad, in, tag, out);
+#else
+ (void) cipher;
+ (void) key;
+ (void) nonce;
+ (void) aad;
+ (void) in;
+ (void) tag;
+ (void) out;
+
+ return -ECRYPT;
+#endif
+}
+
+int crypt_oneshot_seal(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out)
+{
+ assert(key != NULL);
+ assert(nonce != NULL);
+ assert(out != NULL);
#ifdef HAVE_OPENSSL
- return openssl_encrypt(ctx->ctx, in, out);
+ return openssl_oneshot_seal(nid, key, nonce, aad, in, out);
#else
- (void) ctx;
+ (void) nid;
+ (void) key;
+ (void) nonce;
+ (void) aad;
(void) in;
(void) out;
@@ -612,17 +694,24 @@ int crypt_encrypt(struct crypt_ctx * ctx,
#endif
}
-int crypt_decrypt(struct crypt_ctx * ctx,
- buffer_t in,
- buffer_t * out)
+int crypt_oneshot_open(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out)
{
- assert(ctx != NULL);
- assert(ctx->ctx != NULL);
+ assert(key != NULL);
+ assert(nonce != NULL);
+ assert(out != NULL);
#ifdef HAVE_OPENSSL
- return openssl_decrypt(ctx->ctx, in, out);
+ return openssl_oneshot_open(nid, key, nonce, aad, in, out);
#else
- (void) ctx;
+ (void) nid;
+ (void) key;
+ (void) nonce;
+ (void) aad;
(void) in;
(void) out;
@@ -630,6 +719,115 @@ int crypt_decrypt(struct crypt_ctx * ctx,
#endif
}
+/*
+ * Data-path encrypt: rotate the key, frame selector ‖ ct ‖ tag, seal.
+ * Backend-agnostic: composed from keyrot_*, crypt_seal and crypt_get_tagsz.
+ */
+int crypt_encrypt(struct crypt_ctx * ctx,
+ buffer_t in,
+ buffer_t * out)
+{
+ uint8_t nonce[KR_NONCE_LEN];
+ const uint8_t * key;
+ uint8_t * ct;
+ buffer_t aad;
+ int tagsz;
+ int out_sz;
+
+ assert(ctx != NULL);
+ assert(ctx->kr != NULL);
+
+ tagsz = crypt_get_tagsz(ctx);
+ if (tagsz < 0)
+ return -ECRYPT; /* *out is not touched on this path */
+
+ out->data = malloc(KR_SELECTOR_LEN + in.len + (size_t) tagsz);
+ if (out->data == NULL)
+ goto fail_malloc;
+
+ ct = out->data + KR_SELECTOR_LEN; /* ciphertext follows the selector */
+
+ /* keyrot writes the selector into the wire header (== AAD). */
+ if (keyrot_tx_next(ctx->kr, out->data, &key, nonce) != 0)
+ goto fail_encrypt;
+
+ aad.data = out->data;
+ aad.len = KR_SELECTOR_LEN;
+
+ out_sz = crypt_seal(ctx->cipher, key, nonce, aad, in, ct, ct + in.len);
+ if (out_sz < 0)
+ goto fail_encrypt;
+
+ out->len = KR_SELECTOR_LEN + (size_t) out_sz + (size_t) tagsz;
+
+ return 0; /* out->data stays allocated; it is not freed here */
+ fail_encrypt:
+ free(out->data);
+ fail_malloc:
+ clrbuf(*out);
+ return -ECRYPT;
+}
+
+/*
+ * Data-path decrypt: look up the rotated key from the selector, open, and
+ * commit the replay window only after the tag verifies.
+ */
+int crypt_decrypt(struct crypt_ctx * ctx,
+ buffer_t in,
+ buffer_t * out)
+{
+ uint8_t nonce[KR_NONCE_LEN];
+ const uint8_t * key;
+ const uint8_t * tag;
+ struct kr_rx rx;
+ buffer_t aad;
+ buffer_t ct;
+ int tagsz;
+ int in_sz;
+
+ assert(ctx != NULL);
+ assert(ctx->kr != NULL);
+
+ tagsz = crypt_get_tagsz(ctx);
+ if (tagsz < 0)
+ return -ECRYPT;
+
+ if (in.len < (size_t) (KR_SELECTOR_LEN + tagsz))
+ return -ECRYPT; /* too short to hold selector and tag */
+
+ if (keyrot_rx_lookup(ctx->kr, in.data, &key, nonce, &rx) != 0)
+ return -ECRYPT;
+
+ in_sz = (int) in.len - KR_SELECTOR_LEN - tagsz;
+
+ /* +1 keeps malloc(0) defined for an empty (zero-length) frame. */
+ out->data = malloc((size_t) in_sz + 1);
+ if (out->data == NULL)
+ goto fail_malloc;
+
+ aad.data = in.data;
+ aad.len = KR_SELECTOR_LEN;
+
+ ct.data = in.data + KR_SELECTOR_LEN;
+ ct.len = (size_t) in_sz;
+
+ tag = in.data + KR_SELECTOR_LEN + in_sz; /* tag trails the ciphertext */
+
+ if (crypt_open(ctx->cipher, key, nonce, aad, ct, tag, out) < 0)
+ goto fail_decrypt;
+
+ /* Commit replay state only after the tag verifies. */
+ if (keyrot_rx_commit(ctx->kr, &rx) != 0)
+ goto fail_decrypt;
+
+ return 0; /* out->data stays allocated; it is not freed here */
+ fail_decrypt:
+ free(out->data);
+ fail_malloc:
+ clrbuf(*out);
+ return -ECRYPT;
+}
+
struct crypt_ctx * crypt_create_ctx(struct crypt_sk * sk)
{
struct crypt_ctx * crypt;
@@ -643,16 +841,23 @@ struct crypt_ctx * crypt_create_ctx(struct crypt_sk * sk)
memset(crypt, 0, sizeof(*crypt));
+ crypt->kr = keyrot_create(sk->key, sk->epoch, sk->role);
+ if (crypt->kr == NULL)
+ goto fail_kr;
+
#ifdef HAVE_OPENSSL
- crypt->ctx = openssl_crypt_create_ctx(sk);
- if (crypt->ctx == NULL)
- goto fail_ctx;
+ crypt->cipher = openssl_crypt_create_ctx(sk);
+ if (crypt->cipher == NULL)
+ goto fail_cipher;
#endif
return crypt;
+
#ifdef HAVE_OPENSSL
- fail_ctx:
- free(crypt);
+ fail_cipher:
+ keyrot_destroy(crypt->kr);
#endif
+ fail_kr:
+ free(crypt);
fail_crypt:
return NULL;
}
@@ -662,43 +867,70 @@ void crypt_destroy_ctx(struct crypt_ctx * crypt)
if (crypt == NULL)
return;
+ keyrot_destroy(crypt->kr);
#ifdef HAVE_OPENSSL
- assert(crypt->ctx != NULL);
- openssl_crypt_destroy_ctx(crypt->ctx);
-#else
- assert(crypt->ctx == NULL);
+ openssl_crypt_destroy_ctx(crypt->cipher);
#endif
free(crypt);
}
-int crypt_get_ivsz(struct crypt_ctx * ctx)
+int crypt_get_headsz(struct crypt_ctx * ctx)
{
- if (ctx == NULL)
- return -EINVAL;
+ assert(ctx != NULL);
+ assert(ctx->kr != NULL);
-#ifdef HAVE_OPENSSL
- assert(ctx->ctx != NULL);
- return openssl_crypt_get_ivsz(ctx->ctx);
-#else
- assert(ctx->ctx == NULL);
- return -ENOTSUP;
-#endif
+ (void) ctx; /* validated only; header size is a constant */
+
+ return KR_SELECTOR_LEN;
+}
+
+int crypt_rekey(struct crypt_ctx * ctx,
+ struct crypt_sk * sk)
+{
+ assert(ctx != NULL);
+ assert(sk != NULL);
+ assert(ctx->kr != NULL);
+ /* Any non-zero keyrot_rekey() result is reported as -ECRYPT. */
+ return keyrot_rekey(ctx->kr, sk->key, sk->epoch) == 0 ? 0 : -ECRYPT;
+}
int crypt_get_tagsz(struct crypt_ctx * ctx)
{
- if (ctx == NULL)
- return -EINVAL;
+ assert(ctx != NULL);
+ assert(ctx->cipher != NULL);
#ifdef HAVE_OPENSSL
- assert(ctx->ctx != NULL);
- return openssl_crypt_get_tagsz(ctx->ctx);
+ return openssl_crypt_get_tagsz(ctx->cipher);
#else
- assert(ctx->ctx == NULL);
+ (void) ctx;
return -ENOTSUP;
#endif
}
+/* Report keyrot_tx_nodes_left() for this context's key rotation, as an int. */
+int crypt_nodes_left(struct crypt_ctx * ctx)
+{
+ assert(ctx != NULL);
+ assert(ctx->kr != NULL);
+
+ return (int) keyrot_tx_nodes_left(ctx->kr);
+}
+
+/* Return 1 if keyrot_peer_switched() reports true for this context, else 0. */
+int crypt_peer_synced(struct crypt_ctx * ctx)
+{
+ assert(ctx != NULL);
+ assert(ctx->kr != NULL);
+
+ return keyrot_peer_switched(ctx->kr) ? 1 : 0;
+}
+
+/* Forward to keyrot_tx_promote() on this context's key rotation state. */
+void crypt_tx_promote(struct crypt_ctx * ctx)
+{
+ assert(ctx != NULL);
+ assert(ctx->kr != NULL);
+
+ keyrot_tx_promote(ctx->kr);
+}
+
int crypt_load_privkey_file(const char * path,
void ** key)
{
@@ -801,6 +1033,25 @@ int crypt_load_privkey_raw_file(const char * path,
#endif
}
+/*
+ * Compare two len-byte buffers without an early exit on the first
+ * mismatch. Returns 0 when all bytes match, non-zero otherwise.
+ * With OpenSSL the work is delegated to CRYPTO_memcmp(); the fallback
+ * ORs the XOR of every byte pair into an accumulator.
+ */
+int crypt_ct_cmp(const void * a,
+                 const void * b,
+                 size_t       len)
+{
+#ifdef HAVE_OPENSSL
+        return CRYPTO_memcmp(a, b, len);
+#else
+        const volatile uint8_t * lhs = a;
+        const volatile uint8_t * rhs = b;
+        uint8_t                  acc = 0;
+
+        /* Every byte is visited regardless of earlier differences. */
+        while (len-- > 0)
+                acc |= *lhs++ ^ *rhs++;
+
+        return acc != 0;
+#endif
+}
+
int crypt_cmp_key(const void * key1,
const void * key2)
{
@@ -967,9 +1218,15 @@ struct auth_ctx * auth_create_ctx(void)
ctx->store = openssl_auth_create_store();
if (ctx->store == NULL)
goto fail_store;
+
+ ctx->chain = openssl_auth_create_chain();
+ if (ctx->chain == NULL)
+ goto fail_chain;
#endif
return ctx;
#ifdef HAVE_OPENSSL
+ fail_chain:
+ openssl_auth_destroy_store(ctx->store);
fail_store:
free(ctx);
#endif
@@ -982,6 +1239,7 @@ void auth_destroy_ctx(struct auth_ctx * ctx)
if (ctx == NULL)
return;
#ifdef HAVE_OPENSSL
+ openssl_auth_destroy_chain(ctx->chain);
openssl_auth_destroy_store(ctx->store);
#endif
free(ctx);
@@ -1003,11 +1261,27 @@ int auth_add_crt_to_store(struct auth_ctx * ctx,
#endif
}
+/*
+ * Add certificate crt to the context's chain via
+ * openssl_auth_add_crt_to_chain(). Builds without OpenSSL ignore both
+ * arguments and return 0.
+ */
+int auth_add_crt_to_chain(struct auth_ctx * ctx,
+ void * crt)
+{
+ assert(ctx != NULL);
+ assert(crt != NULL);
+
+#ifdef HAVE_OPENSSL
+ return openssl_auth_add_crt_to_chain(ctx->chain, crt);
+#else
+ (void) ctx;
+ (void) crt;
+
+ return 0;
+#endif
+}
+
int auth_verify_crt(struct auth_ctx * ctx,
void * crt)
{
#ifdef HAVE_OPENSSL
- return openssl_verify_crt(ctx->store, crt);
+ return openssl_verify_crt(ctx->store, ctx->chain, crt);
#else
(void) ctx;
(void) crt;
@@ -1016,6 +1290,32 @@ int auth_verify_crt(struct auth_ctx * ctx,
#endif
}
+/*
+ * Verify crt against the context's store and chain together with pin, by
+ * forwarding to openssl_verify_crt_pin().
+ * NOTE(review): builds without OpenSSL return 0 without checking anything;
+ * confirm callers treat that configuration as unauthenticated.
+ */
+int auth_verify_crt_pin(struct auth_ctx * ctx,
+ void * crt,
+ void * pin)
+{
+#ifdef HAVE_OPENSSL
+ return openssl_verify_crt_pin(ctx->store, ctx->chain, crt, pin);
+#else
+ (void) ctx;
+ (void) crt;
+ (void) pin;
+
+ return 0;
+#endif
+}
+
+/*
+ * Forward to openssl_pk_requires_md(), treating pk as an EVP_PKEY.
+ * Builds without OpenSSL always answer false.
+ */
+bool crypt_pk_requires_md(const void * pk)
+{
+#ifdef HAVE_OPENSSL
+ return openssl_pk_requires_md((const EVP_PKEY *) pk);
+#else
+ (void) pk;
+
+ return false;
+#endif
+}
+
int auth_sign(void * pkp,
int md_nid,
buffer_t msg,
@@ -1077,10 +1377,25 @@ ssize_t md_len(int md_nid)
#endif
}
+/*
+ * Expand key with info into out through openssl_hkdf_expand().
+ * Returns 0 on success and -ECRYPT on any failure; builds without
+ * OpenSSL always return -ECRYPT.
+ */
+int crypt_hkdf_expand(buffer_t key,
+ buffer_t info,
+ buffer_t out)
+{
+#ifdef HAVE_OPENSSL
+ return openssl_hkdf_expand(key, info, out) == 0 ? 0 : -ECRYPT;
+#else
+ (void) key;
+ (void) info;
+ (void) out;
+
+ return -ECRYPT;
+#endif
+}
+
int crypt_secure_malloc_init(size_t max)
{
#ifdef HAVE_OPENSSL
- return openssl_secure_malloc_init(max, SECMEM_GUARD);
+ return openssl_secure_malloc_init(max, SECMEM_MINSIZE);
#else
(void) max;
return 0;
diff --git a/src/lib/crypt/keyrot.c b/src/lib/crypt/keyrot.c
new file mode 100644
index 00000000..8b0d9429
--- /dev/null
+++ b/src/lib/crypt/keyrot.c
@@ -0,0 +1,741 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Data-plane key-rotation schedule (node/leaf keys, selector)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <config.h>
+
+#include <ouroboros/atomics.h>
+#include <ouroboros/crypt.h>
+#include <ouroboros/pthread.h>
+#include <ouroboros/rcu.h>
+
+#include "crypt/keyrot.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Per-flow keys are addressed by (epoch, node, leaf) and derived as:
+ * root = per-batch HKDF PRK from the OAP exchange, wiped once expanded
+ * nodes = HKDF-Expand(root, "o7s-keyrot-node") -> KEY_NODE_COUNT keys
+ * leaf = HKDF-Expand(node, "o7s-keyrot-leaf"|dir|leaf) -> AEAD key
+ * The epoch is a small wrapping counter, carried in the selector, that picks
+ * the live batch; a Tier-2 OAP re-key advances it. The "dir" byte forks the
+ * leaf keys per direction.
+ *
+ * Concurrency: cur/prev batch pointers are published by a re-key and read on
+ * the data path under an rcu_guard (lock-free RCU with liburcu, else a per-
+ * keyrot rwlock). The per-batch TX counter is atomic, so the (epoch, counter)
+ * nonce is unique without serialising TX. Leaf caches are THREAD-LOCAL (an app
+ * writer and the FRCT retransmit timer never share cache state), keyed on a
+ * global batch id and direct-mapped.
+ */
+
+#define KR_WITHIN_BITS (KEY_LEAF_BITS + KEY_NODE_BITS)
+#define KR_WITHIN_MASK (((uint64_t) 1 << KR_WITHIN_BITS) - 1)
+#define KR_N (KEY_NODE_COUNT)
+#define KR_LEAVES (1u << KEY_NODE_BITS)
+#define KR_BATCH_MAX ((uint64_t) KR_N << KR_WITHIN_BITS)
+#define KR_NODES_SZ ((size_t) KR_N * SYMMKEYSZ)
+#define KR_TCACHE_WAYS 16 /* per-thread cache slots per direction (pow2) */
+#define KR_EPOCHS 16 /* 4-bit wire epoch: gens before wrap */
+
+#define KR_RP_WORDS (KEY_REPLAY_WINDOW / 64) /* pow2; RFC 6479 bitmap */
+#define KR_RP_SHIFT 6
+#define KR_RP_MASK 63
+#define KR_RP_WINDOW (KEY_REPLAY_WINDOW - 64) /* reserve 1 slack word */
+
+static const char kr_node_label[] = "o7s-keyrot-node";
+static const char kr_leaf_label[] = "o7s-keyrot-leaf";
+
+struct kr_batch {
+ uint64_t id; /* process-global, unique; cache key (no ABA) */
+ uint8_t epoch; /* 4-bit wire selector */
+ uint8_t * nodes; /* KR_NODES_SZ in secure heap; NULL if empty */
+ uint64_t tx_ctr; /* atomic; per-batch so nonces never collide */
+
+ struct { /* RFC 6479-like anti-replay window */
+ uint64_t last; /* highest accepted ctr + 1 */
+ uint64_t bits[KR_RP_WORDS];
+ pthread_mutex_t mtx;
+ } rp;
+};
+
+struct kr_keycache {
+ uint8_t * key; /* SYMMKEYSZ, points into the per-thread slab */
+ uint64_t id; /* batch the cached key belongs to */
+ uint16_t node;
+ uint8_t leaf;
+ uint8_t dir;
+ bool valid;
+};
+
+struct keyrot {
+ struct kr_batch * cur; /* published; read on data path */
+ struct kr_batch * prev; /* NULL = none */
+ struct rcu_guard guard; /* re-key vs readers */
+ uint8_t role;
+ uint8_t tx_epoch; /* epoch TX currently stamps */
+ bool peer_switched; /* peer is on the cur epoch */
+};
+
+/* Per-thread leaf-key caches, freed by the thread-exit destructor. */
+struct kr_tcache {
+ struct kr_keycache tx[KR_TCACHE_WAYS];
+ struct kr_keycache rx[KR_TCACHE_WAYS];
+ uint8_t * slab; /* 2*KR_TCACHE_WAYS*SYMMKEYSZ secure heap */
+};
+
+static struct {
+        uint64_t       next_id;     /* batch-id allocator (atomic)        */
+        pthread_key_t  tcache_key;  /* per-thread leaf-key caches         */
+        pthread_once_t tcache_once; /* runs kr_tcache_init exactly once   */
+        bool           tcache_ok;   /* tcache_key was actually created    */
+} kr_g = { 0, 0, PTHREAD_ONCE_INIT, false };
+
+/* Thread-specific-data destructor: release one thread's cache and slab. */
+static void kr_tcache_free(void * p)
+{
+        struct kr_tcache * t = p;
+
+        if (t == NULL)
+                return;
+
+        crypt_secure_free(t->slab, 2 * KR_TCACHE_WAYS * SYMMKEYSZ);
+        free(t);
+}
+
+/*
+ * One-time creation of the TSD key. The result is recorded so that a
+ * failed creation is never followed by get/setspecific calls on the
+ * zero-initialised (and possibly foreign) key value.
+ */
+static void kr_tcache_init(void)
+{
+        kr_g.tcache_ok =
+                pthread_key_create(&kr_g.tcache_key, kr_tcache_free) == 0;
+}
+
+/*
+ * Return the calling thread's leaf-key cache, allocating it on first use.
+ * Each tx/rx slot is pointed at its own SYMMKEYSZ slice of the slab.
+ * Returns NULL if the TSD key could not be created or on allocation
+ * failure.
+ */
+static struct kr_tcache * kr_tcache_get(void)
+{
+        struct kr_tcache * t;
+        size_t             i;
+
+        pthread_once(&kr_g.tcache_once, kr_tcache_init);
+
+        if (!kr_g.tcache_ok)
+                return NULL;
+
+        t = pthread_getspecific(kr_g.tcache_key);
+        if (t != NULL)
+                return t;
+
+        t = malloc(sizeof(*t));
+        if (t == NULL)
+                goto fail_alloc;
+
+        /* Zeroing marks every slot invalid. */
+        memset(t, 0, sizeof(*t));
+
+        t->slab = crypt_secure_malloc(2 * KR_TCACHE_WAYS * SYMMKEYSZ);
+        if (t->slab == NULL)
+                goto fail_slab;
+
+        /* First half of the slab backs tx slots, second half rx slots. */
+        for (i = 0; i < KR_TCACHE_WAYS; i++) {
+                t->tx[i].key = t->slab + i * SYMMKEYSZ;
+                t->rx[i].key = t->slab + (KR_TCACHE_WAYS + i) * SYMMKEYSZ;
+        }
+
+        if (pthread_setspecific(kr_g.tcache_key, t) != 0)
+                goto fail_set;
+
+        return t;
+
+ fail_set:
+        crypt_secure_free(t->slab, 2 * KR_TCACHE_WAYS * SYMMKEYSZ);
+ fail_slab:
+        free(t);
+ fail_alloc:
+        return NULL;
+}
+
+/*
+ * Expand root (SYMMKEYSZ bytes) into KR_NODES_SZ bytes of node keys using
+ * crypt_hkdf_expand() with the node label (without its NUL) as info.
+ * Returns a crypt_secure_malloc'd buffer the caller must release with
+ * crypt_secure_free(..., KR_NODES_SZ), or NULL on failure.
+ */
+static uint8_t * kr_expand_nodes(const uint8_t * root)
+{
+ uint8_t * nodes;
+ buffer_t prk;
+ buffer_t info;
+ buffer_t okm;
+
+ nodes = crypt_secure_malloc(KR_NODES_SZ);
+ if (nodes == NULL)
+ return NULL;
+
+ prk.len = SYMMKEYSZ;
+ prk.data = (uint8_t *) root;
+ info.len = sizeof(kr_node_label) - 1;
+ info.data = (uint8_t *) kr_node_label;
+ okm.len = KR_NODES_SZ;
+ okm.data = nodes;
+
+ if (crypt_hkdf_expand(prk, info, okm) != 0)
+ goto fail_expand;
+
+ return nodes;
+
+ fail_expand:
+ crypt_secure_free(nodes, KR_NODES_SZ);
+ return NULL;
+}
+
+/*
+ * Derive one leaf key into out (SYMMKEYSZ bytes) from a node key.
+ * The HKDF info string is label | dir | leaf, label without its NUL.
+ * Returns whatever crypt_hkdf_expand() returns.
+ */
+static int kr_leaf_key(const uint8_t * node,
+                       uint8_t         leaf,
+                       uint8_t         dir,
+                       uint8_t *       out)
+{
+        const size_t lbl_len = sizeof(kr_leaf_label) - 1;
+        uint8_t      ctx_info[sizeof(kr_leaf_label) - 1 + 2];
+        buffer_t     prk;
+        buffer_t     info;
+        buffer_t     okm;
+
+        memcpy(ctx_info, kr_leaf_label, lbl_len);
+        ctx_info[lbl_len]     = dir;
+        ctx_info[lbl_len + 1] = leaf;
+
+        okm.data  = out;
+        okm.len   = SYMMKEYSZ;
+        info.data = ctx_info;
+        info.len  = sizeof(ctx_info);
+        prk.data  = (uint8_t *) node;
+        prk.len   = SYMMKEYSZ;
+
+        return crypt_hkdf_expand(prk, info, okm);
+}
+
+/* True when slot kc is valid and tagged with exactly (b->id, node, leaf, dir). */
+static __inline__ bool kr_kc_hit(const struct kr_keycache * kc,
+                                 const struct kr_batch *    b,
+                                 uint16_t                   node,
+                                 uint8_t                    leaf,
+                                 uint8_t                    dir)
+{
+        return kc->valid
+            && kc->id   == b->id
+            && kc->node == node
+            && kc->leaf == leaf
+            && kc->dir  == dir;
+}
+
+/*
+ * Fetch the leaf key for (b, node, leaf, dir) from a direct-mapped cache.
+ * The slot is selected by the low bits of the batch id. On a miss the key
+ * is derived in place into the slot's buffer and the slot is re-tagged.
+ * Returns the slot's key buffer, or NULL if the derivation fails.
+ */
+static const uint8_t * kr_kc_get(struct kr_keycache *    cache,
+                                 const struct kr_batch * b,
+                                 uint16_t                node,
+                                 uint8_t                 leaf,
+                                 uint8_t                 dir)
+{
+        struct kr_keycache * kc;
+        uint8_t *            nkey;
+
+        kc = &cache[b->id & (KR_TCACHE_WAYS - 1)];
+
+        if (kr_kc_hit(kc, b, node, leaf, dir))
+                return kc->key;
+
+        /*
+         * The buffer is about to be overwritten. Drop the old tag first so
+         * a failed derivation cannot leave a slot that still reports a hit
+         * for its previous (id, node, leaf, dir) over clobbered key bytes.
+         */
+        kc->valid = false;
+
+        nkey = b->nodes + (size_t) node * SYMMKEYSZ;
+        if (kr_leaf_key(nkey, leaf, dir, kc->key) != 0)
+                return NULL;
+
+        kc->id    = b->id;
+        kc->node  = node;
+        kc->leaf  = leaf;
+        kc->dir   = dir;
+        kc->valid = true;
+
+        return kc->key;
+}
+
+/*
+ * Pack the selector: byte 0 = epoch in the high nibble and bits 11..8 of
+ * node in the low nibble, byte 1 = low byte of node, bytes 2..5 = seq in
+ * big-endian order.
+ */
+static void kr_sel_enc(uint8_t  epoch,
+                       uint16_t node,
+                       uint32_t seq,
+                       uint8_t  sel[KR_SELECTOR_LEN])
+{
+        uint8_t node_hi = (uint8_t) ((node >> 8) & 0x0F);
+        size_t  i;
+
+        sel[0] = (uint8_t) ((epoch << 4) | node_hi);
+        sel[1] = (uint8_t) (node & 0xFF);
+
+        for (i = 0; i < 4; i++)
+                sel[2 + i] = (uint8_t) (seq >> (24 - 8 * i));
+}
+
+/*
+ * Unpack a selector: epoch from the high nibble of byte 0, a 12-bit node
+ * from the low nibble of byte 0 plus byte 1, and a big-endian 32-bit seq
+ * from bytes 2..5.
+ */
+static void kr_sel_dec(const uint8_t sel[KR_SELECTOR_LEN],
+                       uint8_t *     epoch,
+                       uint16_t *    node,
+                       uint32_t *    seq)
+{
+        uint32_t acc = 0;
+        size_t   i;
+
+        *epoch = (uint8_t) (sel[0] >> 4);
+        *node  = (uint16_t) (((sel[0] & 0x0F) << 8) | sel[1]);
+
+        for (i = 0; i < 4; i++)
+                acc = (acc << 8) | (uint32_t) sel[2 + i];
+
+        *seq = acc;
+}
+
+/*
+ * Rebuild the per-batch counter from a selector: node occupies the bits
+ * above KR_WITHIN_BITS, seq (masked to KR_WITHIN_BITS) the bits below.
+ */
+static uint64_t kr_ctr(uint16_t node,
+ uint32_t seq)
+{
+ return ((uint64_t) node << KR_WITHIN_BITS) |
+ ((uint64_t) seq & KR_WITHIN_MASK);
+}
+
+/*
+ * Build the KR_NONCE_LEN-byte nonce for a counter: ctr is stored
+ * big-endian in nonce[0..7]; every remaining byte is zero.
+ */
+static void kr_nonce(uint64_t  ctr,
+                     uint8_t * nonce)
+{
+        size_t i;
+
+        memset(nonce, 0, KR_NONCE_LEN);
+
+        /* Walk from nonce[7] down to nonce[0], peeling off the low byte. */
+        for (i = 8; i-- > 0; ctr >>= 8)
+                nonce[i] = (uint8_t) ctr;
+}
+
+/*
+ * Allocate a batch for epoch: expand root into the node keys, take a fresh
+ * process-global id, and start with a zero TX counter and an empty replay
+ * window. Returns NULL on any failure, with nothing left allocated.
+ */
+static struct kr_batch * kr_batch_create(uint8_t         epoch,
+                                         const uint8_t * root)
+{
+        struct kr_batch * batch;
+
+        batch = malloc(sizeof(*batch));
+        if (batch == NULL)
+                goto fail_alloc;
+
+        batch->nodes = kr_expand_nodes(root);
+        if (batch->nodes == NULL)
+                goto fail_nodes;
+
+        batch->id     = FETCH_ADD_RELAXED(&kr_g.next_id, 1);
+        batch->epoch  = epoch;
+        batch->tx_ctr = 0;
+
+        if (pthread_mutex_init(&batch->rp.mtx, NULL) != 0)
+                goto fail_lock;
+
+        batch->rp.last = 0;
+        memset(batch->rp.bits, 0, sizeof(batch->rp.bits));
+
+        return batch;
+
+ fail_lock:
+        crypt_secure_free(batch->nodes, KR_NODES_SZ);
+ fail_nodes:
+        free(batch);
+ fail_alloc:
+        return NULL;
+}
+
+/* Destroy a batch: replay mutex, node keys, then the struct. NULL is a no-op. */
+static void kr_batch_free(struct kr_batch * b)
+{
+ if (b == NULL)
+ return;
+
+ pthread_mutex_destroy(&b->rp.mtx);
+ crypt_secure_free(b->nodes, KR_NODES_SZ);
+ free(b);
+}
+
+/*
+ * Anti-replay check in the style of RFC 6479, on the per-batch counter.
+ * The window works on seq = ctr + 1, so rp.last == 0 means that nothing
+ * has been accepted yet. A fresh packet is recorded and yields 0; a
+ * replayed one, or one at least KR_RP_WINDOW behind rp.last, yields -1.
+ */
+static int kr_rp_commit(struct kr_batch * b,
+                        uint64_t          ctr)
+{
+        uint64_t   seq  = ctr + 1;
+        uint64_t   idx  = seq >> KR_RP_SHIFT;
+        uint64_t   bit  = (uint64_t) 1 << (seq & KR_RP_MASK);
+        uint64_t * word = &b->rp.bits[idx & (KR_RP_WORDS - 1)];
+        uint64_t   cur;
+        uint64_t   diff;
+        int        rc   = 0;
+
+        pthread_mutex_lock(&b->rp.mtx);
+
+        if (seq > b->rp.last) {
+                /* Window moves forward: wipe the words it slides onto. */
+                cur  = b->rp.last >> KR_RP_SHIFT;
+                diff = idx - cur;
+                if (diff > KR_RP_WORDS)
+                        diff = KR_RP_WORDS;
+
+                for (; diff > 0; diff--) {
+                        cur++;
+                        b->rp.bits[cur & (KR_RP_WORDS - 1)] = 0;
+                }
+
+                *word |= bit;
+                b->rp.last = seq;
+        } else if (b->rp.last - seq >= KR_RP_WINDOW || (*word & bit) != 0) {
+                /* Too far behind the window, or already seen. */
+                rc = -1;
+        } else {
+                *word |= bit;
+        }
+
+        pthread_mutex_unlock(&b->rp.mtx);
+
+        return rc;
+}
+
+/*
+ * Create a key-rotation state holding a single batch expanded from root.
+ * TX starts on that batch's epoch and the peer is considered switched.
+ * role must be 0 or 1 (asserted). Returns NULL if epoch >= KR_EPOCHS or
+ * if any allocation or initialisation step fails.
+ */
+struct keyrot * keyrot_create(const uint8_t * root,
+ uint8_t epoch,
+ uint8_t role)
+{
+ struct keyrot * kr;
+
+ assert(root != NULL);
+ assert(role <= 1);
+
+ if (epoch >= KR_EPOCHS)
+ goto fail_kr;
+
+ kr = malloc(sizeof(*kr));
+ if (kr == NULL)
+ goto fail_kr;
+
+ memset(kr, 0, sizeof(*kr));
+
+ kr->role = role;
+ kr->tx_epoch = epoch;
+ kr->peer_switched = true;
+ kr->prev = NULL;
+
+ kr->cur = kr_batch_create(epoch, root);
+ if (kr->cur == NULL)
+ goto fail_cur;
+
+ if (rcu_guard_init(&kr->guard))
+ goto fail_guard;
+
+ return kr;
+
+ fail_guard:
+ kr_batch_free(kr->cur);
+ fail_cur:
+ free(kr);
+ fail_kr:
+ return NULL;
+}
+
+/*
+ * Tear down a key-rotation state: drain the guard, free both batches,
+ * finalise the guard and release the struct. NULL is a no-op.
+ */
+void keyrot_destroy(struct keyrot * kr)
+{
+ if (kr == NULL)
+ return;
+
+ /* Wait out any in-flight reader before freeing batches. */
+ rcu_drain(&kr->guard);
+
+ kr_batch_free(kr->cur);
+ kr_batch_free(kr->prev);
+
+ rcu_guard_fini(&kr->guard);
+
+ free(kr);
+}
+
+/*
+ * Install a new batch expanded from root as the current one. The former
+ * current batch becomes prev and the former prev is freed. peer_switched
+ * is cleared; tx_epoch is left untouched (see keyrot_tx_promote).
+ * Returns 0 on success, -1 if epoch >= KR_EPOCHS or the batch cannot be
+ * created (in which case the state is unchanged).
+ * NOTE(review): an epoch equal to the current batch's epoch is accepted;
+ * cur and prev would then share an epoch and lookups always pick cur.
+ * Confirm callers always advance the epoch.
+ */
+int keyrot_rekey(struct keyrot * kr,
+ const uint8_t * root,
+ uint8_t epoch)
+{
+ struct kr_batch * nb;
+ struct kr_batch * old_prev;
+
+ assert(kr != NULL);
+ assert(root != NULL);
+
+ if (epoch >= KR_EPOCHS)
+ return -1;
+
+ nb = kr_batch_create(epoch, root);
+ if (nb == NULL)
+ return -1;
+
+ rcu_wrlock(&kr->guard);
+
+ old_prev = kr->prev;
+ rcu_assign(kr->prev, kr->cur);
+ rcu_publish(nb);
+ rcu_assign(kr->cur, nb);
+
+ /* TX keeps the old epoch until the peer is seen on the new one. */
+ STORE_RELEASE(&kr->peer_switched, false);
+
+ rcu_wrunlock(&kr->guard);
+
+ /* old_prev is unreachable now; reclaim past any live reader. */
+ rcu_reclaim(&kr->guard);
+ kr_batch_free(old_prev);
+
+ return 0;
+}
+
+/* Point tx_epoch at the epoch of the currently installed batch. */
+void keyrot_tx_promote(struct keyrot * kr)
+{
+ assert(kr != NULL);
+
+ /* Serialise with keyrot_rekey so tx_epoch tracks a consistent cur. */
+ rcu_wrlock(&kr->guard);
+ STORE_RELAXED(&kr->tx_epoch, rcu_deref(kr->cur)->epoch);
+ rcu_wrunlock(&kr->guard);
+}
+
+/*
+ * Reserve the next TX counter on the batch matching tx_epoch and hand
+ * back everything needed to seal one packet:
+ *   sel   - wire selector (epoch, node, seq) for that counter
+ *   key   - leaf key, pointing into the calling thread's cache
+ *   nonce - nonce derived from the counter
+ * Returns 0 on success. Returns -1 if the thread cache is unavailable,
+ * no resident batch carries tx_epoch, the batch is exhausted
+ * (counter >= KR_BATCH_MAX) or the leaf key cannot be derived; the
+ * outputs are not written in that case.
+ */
+int keyrot_tx_next(struct keyrot *  kr,
+                   uint8_t          sel[KR_SELECTOR_LEN],
+                   const uint8_t ** key,
+                   uint8_t          nonce[KR_NONCE_LEN])
+{
+        struct kr_tcache * tc;
+        struct kr_batch *  cur;
+        struct kr_batch *  prev;
+        struct kr_batch *  b;
+        uint64_t           ctr;
+        uint16_t           node;
+        uint8_t            leaf;
+        uint8_t            txe;
+        uint8_t            epoch;
+        uint32_t           seq;
+        const uint8_t *    k;
+
+        assert(kr != NULL);
+        assert(sel != NULL);
+        assert(key != NULL);
+        assert(nonce != NULL);
+
+        tc = kr_tcache_get();
+        if (tc == NULL)
+                return -1;
+
+        rcu_rdlock(&kr->guard);
+
+        cur  = rcu_deref(kr->cur);
+        prev = rcu_deref(kr->prev);
+        rcu_consume(cur);
+        rcu_consume(prev);
+        txe = LOAD_RELAXED(&kr->tx_epoch);
+
+        if (cur->epoch == txe)
+                b = cur;
+        else if (prev != NULL && prev->epoch == txe)
+                b = prev;
+        else
+                b = NULL;
+
+        if (b == NULL) {
+                rcu_rdunlock(&kr->guard);
+                return -1; /* tx_epoch batch gone; next promote resyncs */
+        }
+
+        /* Slot reserved even if exhausted; tx_nodes_left clamps the count. */
+        ctr = FETCH_ADD_RELAXED(&b->tx_ctr, 1);
+        if (ctr >= KR_BATCH_MAX) {
+                rcu_rdunlock(&kr->guard);
+                return -1; /* batch exhausted */
+        }
+
+        /* Split the counter into its node, leaf and wire-seq parts. */
+        node  = (uint16_t) (ctr >> KR_WITHIN_BITS);
+        leaf  = (uint8_t) ((ctr >> KEY_LEAF_BITS) & (KR_LEAVES - 1));
+        seq   = (uint32_t) (ctr & KR_WITHIN_MASK);
+        epoch = b->epoch;
+
+        /* Own direction: leaf keys are forked on the local role. */
+        k = kr_kc_get(tc->tx, b, node, leaf, kr->role);
+
+        rcu_rdunlock(&kr->guard);
+
+        if (k == NULL)
+                return -1;
+
+        kr_sel_enc(epoch, node, seq, sel);
+        kr_nonce(ctr, nonce);
+
+        *key = k;
+
+        return 0;
+}
+
+/*
+ * Resolve a received selector to the key and nonce needed to open the
+ * packet. The batch is chosen by the selector's epoch among cur and
+ * prev. On success *key points into the calling thread's cache, nonce is
+ * filled and rx records the batch id and counter for a later
+ * keyrot_rx_commit(). The replay window is not touched here.
+ * Returns 0 on success; -1 if node >= KR_N, the thread cache is
+ * unavailable, the epoch matches no resident batch or the leaf key
+ * cannot be derived.
+ */
+int keyrot_rx_lookup(struct keyrot *  kr,
+                     const uint8_t    sel[KR_SELECTOR_LEN],
+                     const uint8_t ** key,
+                     uint8_t          nonce[KR_NONCE_LEN],
+                     struct kr_rx *   rx)
+{
+        struct kr_tcache * tc;
+        struct kr_batch *  cur;
+        struct kr_batch *  prev;
+        struct kr_batch *  b;
+        uint8_t            epoch;
+        uint16_t           node;
+        uint32_t           seq;
+        uint64_t           ctr;
+        uint8_t            leaf;
+        const uint8_t *    k;
+
+        assert(kr != NULL);
+        assert(sel != NULL);
+        assert(key != NULL);
+        assert(nonce != NULL);
+        assert(rx != NULL);
+
+        kr_sel_dec(sel, &epoch, &node, &seq);
+
+        /* Reject node indices beyond the expanded node keys. */
+        if (node >= KR_N)
+                return -1;
+
+        tc = kr_tcache_get();
+        if (tc == NULL)
+                return -1;
+
+        rcu_rdlock(&kr->guard);
+
+        cur  = rcu_deref(kr->cur);
+        prev = rcu_deref(kr->prev);
+        rcu_consume(cur);
+        rcu_consume(prev);
+
+        if (epoch == cur->epoch) {
+                b = cur;
+        } else if (prev != NULL && epoch == prev->epoch) {
+                b = prev;
+        } else {
+                rcu_rdunlock(&kr->guard);
+                return -1; /* unknown epoch */
+        }
+
+        ctr  = kr_ctr(node, seq);
+        leaf = (uint8_t) ((ctr >> KEY_LEAF_BITS) & (KR_LEAVES - 1));
+
+        /* peer's tx direction */
+        k = kr_kc_get(tc->rx, b, node, leaf, (uint8_t) (kr->role ^ 1));
+
+        rx->id  = b->id;
+        rx->ctr = ctr;
+
+        rcu_rdunlock(&kr->guard);
+
+        if (k == NULL)
+                return -1;
+
+        kr_nonce(ctr, nonce);
+
+        *key = k;
+
+        return 0;
+}
+
+/*
+ * Record a packet that authenticated under the batch selected by
+ * keyrot_rx_lookup(). The batch is located again by id, since a re-key
+ * may have happened in between. If it is still resident, the replay
+ * window is advanced, and a fresh packet on the current batch marks the
+ * peer as switched. Intended to run only after the AEAD check, so that
+ * forged or replayed packets change no receiver state.
+ * Returns -1 on a replay, 0 otherwise (including when the batch has
+ * already been evicted).
+ */
+int keyrot_rx_commit(struct keyrot *      kr,
+                     const struct kr_rx * rx)
+{
+        struct kr_batch * cur;
+        struct kr_batch * prev;
+        struct kr_batch * hit;
+        int               rc;
+
+        assert(kr != NULL);
+        assert(rx != NULL);
+
+        rcu_rdlock(&kr->guard);
+
+        cur  = rcu_deref(kr->cur);
+        prev = rcu_deref(kr->prev);
+        rcu_consume(cur);
+        rcu_consume(prev);
+
+        if (cur->id == rx->id) {
+                hit = cur;
+        } else if (prev != NULL && prev->id == rx->id) {
+                hit = prev;
+        } else {
+                /* batch evicted post-auth; nothing to protect */
+                rcu_rdunlock(&kr->guard);
+                return 0;
+        }
+
+        rc = kr_rp_commit(hit, rx->ctr);
+        if (rc == 0 && hit == cur)
+                STORE_RELEASE(&kr->peer_switched, true);
+
+        rcu_rdunlock(&kr->guard);
+
+        return rc;
+}
+
+/* Acquire-load of the peer_switched flag (set by rx commit, cleared by rekey). */
+bool keyrot_peer_switched(const struct keyrot * kr)
+{
+ assert(kr != NULL);
+
+ return LOAD_ACQUIRE(&kr->peer_switched);
+}
+
+/*
+ * Number of nodes still ahead of the TX counter on the batch matching
+ * tx_epoch: KR_N minus the node index the counter currently points at.
+ * Returns 0 when the counter has run past the batch or when no resident
+ * batch carries tx_epoch.
+ */
+unsigned keyrot_tx_nodes_left(struct keyrot * kr)
+{
+ struct kr_batch * cur;
+ struct kr_batch * prev;
+ struct kr_batch * b;
+ uint64_t ctr;
+ unsigned used;
+ uint8_t txe;
+
+ assert(kr != NULL);
+
+ rcu_rdlock(&kr->guard);
+ cur = rcu_deref(kr->cur);
+ prev = rcu_deref(kr->prev);
+ rcu_consume(cur);
+ rcu_consume(prev);
+ txe = LOAD_RELAXED(&kr->tx_epoch);
+
+ if (cur->epoch == txe)
+ b = cur;
+ else if (prev != NULL && prev->epoch == txe)
+ b = prev;
+ else
+ b = NULL;
+
+ ctr = b != NULL ? LOAD_RELAXED(&b->tx_ctr) : KR_BATCH_MAX;
+ rcu_rdunlock(&kr->guard);
+
+ used = (unsigned) (ctr >> KR_WITHIN_BITS);
+ if (used >= KR_N)
+ return 0;
+
+ return KR_N - used;
+}
diff --git a/src/lib/crypt/keyrot.h b/src/lib/crypt/keyrot.h
new file mode 100644
index 00000000..6a598f76
--- /dev/null
+++ b/src/lib/crypt/keyrot.h
@@ -0,0 +1,74 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Data-plane key-rotation schedule (node/leaf keys, selector)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#ifndef OUROBOROS_LIB_CRYPT_KEYROT_H
+#define OUROBOROS_LIB_CRYPT_KEYROT_H
+
+#include <ouroboros/crypt.h> /* SYMMKEYSZ, NONCESZ */
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#define KR_SELECTOR_LEN 6
+#define KR_NONCE_LEN NONCESZ
+
+struct keyrot;
+
+struct kr_rx {
+ uint64_t id; /* batch id of the matched epoch */
+ uint64_t ctr; /* packet counter for replay check */
+};
+
+/*
+ * Create a rotation state with one batch expanded from root.
+ * role must be 0 or 1. Returns NULL on failure or out-of-range epoch.
+ */
+struct keyrot * keyrot_create(const uint8_t * root,
+ uint8_t epoch,
+ uint8_t role);
+
+/* Free a rotation state and its batches; NULL is a no-op. */
+void keyrot_destroy(struct keyrot * kr);
+
+/*
+ * Install a new current batch from root; the former current batch is kept
+ * as the previous one. Returns 0 on success, -1 on failure.
+ */
+int keyrot_rekey(struct keyrot * kr,
+ const uint8_t * root,
+ uint8_t epoch);
+
+/* Promote TX to the installed (new) batch once the peer is on it. */
+void keyrot_tx_promote(struct keyrot * kr);
+
+/*
+ * Reserve the next TX counter and fill sel, *key and nonce for it.
+ * *key points into a per-thread cache. Returns 0 on success, -1 on failure.
+ */
+int keyrot_tx_next(struct keyrot * kr,
+ uint8_t sel[KR_SELECTOR_LEN],
+ const uint8_t ** key,
+ uint8_t nonce[KR_NONCE_LEN]);
+
+/*
+ * Resolve a received selector to *key and nonce, and fill rx for a later
+ * keyrot_rx_commit(). Returns 0 on success, -1 on failure.
+ */
+int keyrot_rx_lookup(struct keyrot * kr,
+ const uint8_t sel[KR_SELECTOR_LEN],
+ const uint8_t ** key,
+ uint8_t nonce[KR_NONCE_LEN],
+ struct kr_rx * rx);
+
+/* Commit an authenticated packet: replay window + peer-switched. */
+int keyrot_rx_commit(struct keyrot * kr,
+ const struct kr_rx * rx);
+
+/* True once an RX packet under the current batch has been observed. */
+bool keyrot_peer_switched(const struct keyrot * kr);
+
+/* Nodes not yet reached by the TX counter of the batch TX stamps; 0 if none. */
+unsigned keyrot_tx_nodes_left(struct keyrot * kr);
+
+#endif /* OUROBOROS_LIB_CRYPT_KEYROT_H */
diff --git a/src/lib/crypt/openssl.c b/src/lib/crypt/openssl.c
index 5916e3cb..d5d9adf5 100644
--- a/src/lib/crypt/openssl.c
+++ b/src/lib/crypt/openssl.c
@@ -30,6 +30,8 @@
#include <ouroboros/errno.h>
#include <ouroboros/crypt.h>
#include <ouroboros/hash.h>
+#include <ouroboros/name.h>
+#include <ouroboros/pthread.h>
#include <ouroboros/random.h>
#include <ouroboros/utils.h>
@@ -52,27 +54,14 @@
#define HKDF_INFO_DHE "o7s-ossl-dhe"
#define HKDF_INFO_ENCAP "o7s-ossl-encap"
-#define HKDF_INFO_ROTATION "o7s-key-rotation"
#define HKDF_SALT_LEN 32 /* SHA-256 output size */
+#define AEAD_NONCE_LEN 12 /* 96-bit deterministic IV (SP 800-38D) */
+#define AEAD_TAG_LEN 16 /* 128-bit AEAD authentication tag */
struct ossl_crypt_ctx {
EVP_CIPHER_CTX * evp_ctx;
const EVP_CIPHER * cipher;
- int ivsz;
int tagsz;
-
- struct {
- uint8_t * cur; /* current key */
- uint8_t * prv; /* rotated key */
- } keys;
-
- struct {
- uint32_t cntr; /* counter */
- uint32_t mask; /* phase mask */
- uint32_t age; /* counter within epoch */
- uint8_t phase; /* current key phase */
- uint8_t salt[HKDF_SALT_LEN];
- } rot; /* rotation logic */
};
struct kdf_info {
@@ -83,17 +72,6 @@ struct kdf_info {
buffer_t key;
};
-/* Key rotation macros */
-#define HAS_PHASE_BIT_TOGGLED(ctx) \
- (((ctx)->rot.cntr & (ctx)->rot.mask) != \
- (((ctx)->rot.cntr - 1) & (ctx)->rot.mask))
-
-#define HAS_GRACE_EXPIRED(ctx) \
- ((ctx)->rot.age >= ((ctx)->rot.mask >> 1))
-
-#define ROTATION_TOO_RECENT(ctx) \
- ((ctx)->rot.age < ((ctx)->rot.mask - ((ctx)->rot.mask >> 2)))
-
/* Convert hash NID to OpenSSL digest name string for HKDF */
static const char * hash_nid_to_digest_name(int nid)
{
@@ -102,11 +80,11 @@ static const char * hash_nid_to_digest_name(int nid)
md = EVP_get_digestbynid(nid);
if (md == NULL)
- return "SHA256"; /* fallback to SHA-256 */
+ return NULL;
name = EVP_MD_get0_name(md);
if (name == NULL)
- return "SHA256"; /* fallback to SHA-256 */
+ return NULL;
return name;
}
@@ -144,21 +122,20 @@ static int get_pk_bytes_from_key(EVP_PKEY * key,
}
/* Derive salt from public key bytes by hashing them */
-static int derive_salt_from_pk_bytes(buffer_t pk,
- uint8_t * salt,
- size_t salt_len)
+static int derive_salt_from_pk_bytes(buffer_t pk,
+ buffer_t salt)
{
uint8_t hash[EVP_MAX_MD_SIZE];
unsigned hash_len;
assert(pk.data != NULL);
- assert(salt != NULL);
+ assert(salt.data != NULL);
if (EVP_Digest(pk.data, pk.len, hash, &hash_len,
EVP_sha256(), NULL) != 1)
goto fail_digest;
- memcpy(salt, hash, salt_len < hash_len ? salt_len : hash_len);
+ memcpy(salt.data, hash, salt.len < hash_len ? salt.len : hash_len);
return 0;
fail_digest:
@@ -166,10 +143,9 @@ static int derive_salt_from_pk_bytes(buffer_t pk,
}
/* Derive salt from two public key byte buffers (DHE) in canonical order */
-static int derive_salt_from_pk_bytes_dhe(buffer_t local,
- buffer_t remote,
- uint8_t * salt,
- size_t salt_len)
+static int derive_salt_from_pk_bytes_dhe(buffer_t local,
+ buffer_t remote,
+ buffer_t salt)
{
uint8_t * concat;
size_t concat_len;
@@ -180,7 +156,7 @@ static int derive_salt_from_pk_bytes_dhe(buffer_t local,
assert(local.data != NULL);
assert(remote.data != NULL);
- assert(salt != NULL);
+ assert(salt.data != NULL);
concat_len = local.len + remote.len;
concat = OPENSSL_malloc(concat_len);
@@ -204,7 +180,7 @@ static int derive_salt_from_pk_bytes_dhe(buffer_t local,
OPENSSL_free(concat);
- memcpy(salt, hash, salt_len < hash_len ? salt_len : hash_len);
+ memcpy(salt.data, hash, salt.len < hash_len ? salt.len : hash_len);
return 0;
fail_digest:
@@ -225,6 +201,8 @@ static int derive_key_hkdf(struct kdf_info * ki)
int idx;
digest = hash_nid_to_digest_name(ki->nid);
+ if (digest == NULL)
+ goto fail_fetch;
kdf = EVP_KDF_fetch(NULL, "HKDF", NULL);
if (kdf == NULL)
@@ -258,117 +236,144 @@ static int derive_key_hkdf(struct kdf_info * ki)
return -ECRYPT;
}
-/* Key rotation helper functions implementation */
-static int should_rotate_key_rx(struct ossl_crypt_ctx * ctx,
- uint8_t rx_phase)
+int openssl_hkdf_expand(buffer_t key,
+ buffer_t info,
+ buffer_t out)
{
- assert(ctx != NULL);
+ EVP_KDF * kdf;
+ EVP_KDF_CTX * kctx;
+ OSSL_PARAM params[5];
+ int mode = EVP_KDF_HKDF_MODE_EXPAND_ONLY;
+ int idx = 0;
+ int ret = -1;
- /* Phase must have changed */
- if (rx_phase == ctx->rot.phase)
- return 0;
+ kdf = EVP_KDF_fetch(NULL, "HKDF", NULL);
+ if (kdf == NULL)
+ goto fail_fetch;
- if (ROTATION_TOO_RECENT(ctx))
- return 0;
+ kctx = EVP_KDF_CTX_new(kdf);
+ if (kctx == NULL)
+ goto fail_ctx;
+
+ params[idx++] = OSSL_PARAM_construct_utf8_string(
+ "digest", (char *) "SHA256", 0);
+ params[idx++] = OSSL_PARAM_construct_int("mode", &mode);
+ params[idx++] = OSSL_PARAM_construct_octet_string(
+ "key", key.data, key.len);
+ params[idx++] = OSSL_PARAM_construct_octet_string(
+ "info", info.data, info.len);
+ params[idx] = OSSL_PARAM_construct_end();
- return 1;
+ if (EVP_KDF_derive(kctx, out.data, out.len, params) == 1)
+ ret = 0;
+
+ EVP_KDF_CTX_free(kctx);
+ fail_ctx:
+ EVP_KDF_free(kdf);
+ fail_fetch:
+ return ret;
}
-static int rotate_key(struct ossl_crypt_ctx * ctx)
+/* AEAD seal: encrypt in with key/nonce, bind aad, append tag */
+int openssl_seal(struct ossl_crypt_ctx * ctx,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ uint8_t * out,
+ uint8_t * tag)
{
- struct kdf_info ki;
- uint8_t * tmp;
+ int out_sz;
+ int tmp_sz;
assert(ctx != NULL);
+ assert(ctx->tagsz > 0); /* AEAD mandated at ctx creation */
- /* Swap keys - move current to prev */
- tmp = ctx->keys.prv;
- ctx->keys.prv = ctx->keys.cur;
+ EVP_CIPHER_CTX_reset(ctx->evp_ctx);
- if (tmp != NULL) {
- /* Reuse old prev_key memory for new key */
- ctx->keys.cur = tmp;
- } else {
- /* First rotation - allocate new memory */
- ctx->keys.cur = OPENSSL_secure_malloc(SYMMKEYSZ);
- if (ctx->keys.cur == NULL)
- return -ECRYPT;
- }
+ if (EVP_EncryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL,
+ NULL, NULL) != 1)
+ return -1;
- /* Derive new key from previous key using HKDF */
- ki.secret.data = ctx->keys.prv;
- ki.secret.len = SYMMKEYSZ;
- ki.nid = NID_sha256;
- ki.salt.data = ctx->rot.salt;
- ki.salt.len = HKDF_SALT_LEN;
- ki.info.data = (uint8_t *) HKDF_INFO_ROTATION;
- ki.info.len = strlen(HKDF_INFO_ROTATION);
- ki.key.data = ctx->keys.cur;
- ki.key.len = SYMMKEYSZ;
+ /* Pin the AEAD nonce to 96 bits (SP 800-38D deterministic IV). */
+ if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_IVLEN,
+ AEAD_NONCE_LEN, NULL) != 1)
+ return -1;
- if (derive_key_hkdf(&ki) != 0)
- return -ECRYPT;
+ if (EVP_EncryptInit_ex(ctx->evp_ctx, NULL, NULL,
+ key, nonce) != 1)
+ return -1;
- ctx->rot.age = 0;
- ctx->rot.phase = !ctx->rot.phase;
+ if (EVP_EncryptUpdate(ctx->evp_ctx, NULL, &tmp_sz,
+ aad.data, (int) aad.len) != 1)
+ return -1;
- return 0;
-}
+ if (EVP_EncryptUpdate(ctx->evp_ctx, out, &out_sz,
+ in.data, (int) in.len) != 1)
+ return -1;
-static void cleanup_old_key(struct ossl_crypt_ctx * ctx)
-{
- assert(ctx != NULL);
+ if (EVP_EncryptFinal_ex(ctx->evp_ctx, out + out_sz, &tmp_sz) != 1)
+ return -1;
- if (ctx->keys.prv == NULL)
- return;
+ out_sz += tmp_sz;
- if (!HAS_GRACE_EXPIRED(ctx))
- return;
+ if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_GET_TAG,
+ ctx->tagsz, tag) != 1)
+ return -1;
- OPENSSL_secure_clear_free(ctx->keys.prv, SYMMKEYSZ);
- ctx->keys.prv = NULL;
+ return out_sz;
}
-static int try_decrypt(struct ossl_crypt_ctx * ctx,
- uint8_t * key,
- uint8_t * iv,
- uint8_t * input,
- int in_sz,
- uint8_t * out,
- int * out_sz)
+/* AEAD open: decrypt in with key/nonce, verify aad and tag */
+int openssl_open(struct ossl_crypt_ctx * ctx,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ const uint8_t * tag,
+ buffer_t * out)
{
- uint8_t * tag;
- int tmp_sz;
- int ret;
+ int out_sz;
+ int tmp_sz;
- tag = input + in_sz;
+ assert(ctx != NULL);
+ assert(ctx->tagsz > 0); /* AEAD mandated at ctx creation */
EVP_CIPHER_CTX_reset(ctx->evp_ctx);
- ret = EVP_DecryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL, key, iv);
- if (ret != 1)
+ if (EVP_DecryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL,
+ NULL, NULL) != 1)
return -1;
- if (ctx->tagsz > 0) {
- ret = EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_TAG,
- ctx->tagsz, tag);
- if (ret != 1)
- return -1;
- }
+ /* Pin the AEAD nonce to 96 bits (SP 800-38D deterministic IV). */
+ if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_IVLEN,
+ AEAD_NONCE_LEN, NULL) != 1)
+ return -1;
- ret = EVP_DecryptUpdate(ctx->evp_ctx, out, &tmp_sz, input, in_sz);
- if (ret != 1)
+ if (EVP_DecryptInit_ex(ctx->evp_ctx, NULL, NULL, key, nonce) != 1)
return -1;
- *out_sz = tmp_sz;
+ if (EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_SET_TAG,
+ ctx->tagsz, (void *) tag) != 1)
+ return -1;
- ret = EVP_DecryptFinal_ex(ctx->evp_ctx, out + tmp_sz, &tmp_sz);
- if (ret != 1)
+ if (EVP_DecryptUpdate(ctx->evp_ctx, NULL, &tmp_sz,
+ aad.data, (int) aad.len) != 1)
return -1;
- *out_sz += tmp_sz;
+ if (EVP_DecryptUpdate(ctx->evp_ctx, out->data, &out_sz,
+ in.data, (int) in.len) != 1)
+ return -1;
- return 0;
+ if (EVP_DecryptFinal_ex(ctx->evp_ctx, out->data + out_sz,
+ &tmp_sz) != 1)
+ return -1;
+
+ out_sz += tmp_sz;
+
+ out->len = (size_t) out_sz;
+
+ return out_sz;
}
/*
@@ -396,11 +401,14 @@ static int __openssl_dhe_derive(EVP_PKEY * pkp,
ret = i2d_PUBKEY(pkp, &local_pk.data);
if (ret <= 0)
goto fail_local;
+
local_pk.len = (size_t) ret;
+ ki.salt.len = HKDF_SALT_LEN;
+ ki.salt.data = salt_buf;
+
/* Derive salt from both public keys */
- if (derive_salt_from_pk_bytes_dhe(local_pk, remote_pk, salt_buf,
- HKDF_SALT_LEN) < 0)
+ if (derive_salt_from_pk_bytes_dhe(local_pk, remote_pk, ki.salt) < 0)
goto fail_salt;
ctx = EVP_PKEY_CTX_new(pkp, NULL);
@@ -437,13 +445,11 @@ static int __openssl_dhe_derive(EVP_PKEY * pkp,
ki.info.data = (uint8_t *) HKDF_INFO_DHE;
ki.key.len = SYMMKEYSZ;
ki.key.data = s;
- ki.salt.len = HKDF_SALT_LEN;
- ki.salt.data = salt_buf;
/* Derive symmetric key from shared secret using HKDF */
ret = derive_key_hkdf(&ki);
- OPENSSL_free(secret);
+ OPENSSL_clear_free(secret, secret_len);
EVP_PKEY_CTX_free(ctx);
OPENSSL_free(local_pk.data);
@@ -452,7 +458,7 @@ static int __openssl_dhe_derive(EVP_PKEY * pkp,
return 0;
fail_derive:
- OPENSSL_free(secret);
+ OPENSSL_clear_free(secret, secret_len);
fail_ctx:
EVP_PKEY_CTX_free(ctx);
fail_salt:
@@ -624,14 +630,22 @@ ssize_t openssl_pkp_create(const char * algo,
if (raw.len == 0)
goto fail_pubkey;
+ if (raw.len > CRYPT_KEY_BUFSZ) {
+ OPENSSL_free(raw.data);
+ goto fail_pubkey;
+ }
+
memcpy(pk, raw.data, raw.len);
OPENSSL_free(raw.data);
return (ssize_t) raw.len;
} else { /* DER encode standard algorithms */
+ len = i2d_PUBKEY(*pkp, NULL); /* pre-flight length */
+ if (len < 0 || len > CRYPT_KEY_BUFSZ)
+ goto fail_pubkey;
+
pos = pk; /* i2d_PUBKEY increments the ptr, don't use pk! */
- len = i2d_PUBKEY(*pkp, &pos);
- if (len < 0)
+ if (i2d_PUBKEY(*pkp, &pos) < 0)
goto fail_pubkey;
return len;
@@ -692,7 +706,7 @@ static ssize_t __openssl_kem_encap(EVP_PKEY * pub,
/* Derive symmetric key from shared secret using HKDF */
ret = derive_key_hkdf(&ki);
- OPENSSL_free(secret);
+ OPENSSL_clear_free(secret, secret_len);
EVP_PKEY_CTX_free(ctx);
if (ret != 0)
@@ -701,7 +715,7 @@ static ssize_t __openssl_kem_encap(EVP_PKEY * pub,
return (ssize_t) ct_len;
fail_secret:
- OPENSSL_free(secret);
+ OPENSSL_clear_free(secret, secret_len);
fail_encap:
EVP_PKEY_CTX_free(ctx);
fail_ctx:
@@ -717,13 +731,17 @@ ssize_t openssl_kem_encap(buffer_t pk,
EVP_PKEY * pub;
uint8_t * pos;
uint8_t salt[HKDF_SALT_LEN];
+ buffer_t salt_b;
ssize_t ret;
assert(pk.data != NULL);
assert(ct != NULL);
assert(s != NULL);
- if (derive_salt_from_pk_bytes(pk, salt, HKDF_SALT_LEN) < 0)
+ salt_b.len = HKDF_SALT_LEN;
+ salt_b.data = salt;
+
+ if (derive_salt_from_pk_bytes(pk, salt_b) < 0)
goto fail_salt;
pos = pk.data;
@@ -749,13 +767,17 @@ ssize_t openssl_kem_encap_raw(buffer_t pk,
EVP_PKEY * pub;
const char * algo;
uint8_t salt[HKDF_SALT_LEN];
+ buffer_t salt_b;
ssize_t ret;
assert(pk.data != NULL);
assert(ct != NULL);
assert(s != NULL);
- if (derive_salt_from_pk_bytes(pk, salt, HKDF_SALT_LEN) < 0)
+ salt_b.len = HKDF_SALT_LEN;
+ salt_b.data = salt;
+
+ if (derive_salt_from_pk_bytes(pk, salt_b) < 0)
goto fail_salt;
algo = __openssl_hybrid_algo_from_len(pk.len);
@@ -789,12 +811,16 @@ int openssl_kem_decap(EVP_PKEY * priv,
size_t secret_len;
int ret;
uint8_t salt[HKDF_SALT_LEN];
+ buffer_t salt_b;
/* Extract public key bytes from private key */
if (get_pk_bytes_from_key(priv, &pk) < 0)
goto fail_pk;
- if (derive_salt_from_pk_bytes(pk, salt, HKDF_SALT_LEN) < 0)
+ salt_b.len = HKDF_SALT_LEN;
+ salt_b.data = salt;
+
+ if (derive_salt_from_pk_bytes(pk, salt_b) < 0)
goto fail_salt;
ctx = EVP_PKEY_CTX_new(priv, NULL);
@@ -833,7 +859,7 @@ int openssl_kem_decap(EVP_PKEY * priv,
/* Derive symmetric key from shared secret using HKDF */
ret = derive_key_hkdf(&ki);
- OPENSSL_free(secret);
+ OPENSSL_clear_free(secret, secret_len);
EVP_PKEY_CTX_free(ctx);
OPENSSL_free(pk.data);
@@ -843,7 +869,7 @@ int openssl_kem_decap(EVP_PKEY * priv,
return 0;
fail_secret:
- OPENSSL_free(secret);
+ OPENSSL_clear_free(secret, secret_len);
fail_ctx:
EVP_PKEY_CTX_free(ctx);
fail_salt:
@@ -857,13 +883,14 @@ void openssl_pkp_destroy(EVP_PKEY * pkp)
EVP_PKEY_free(pkp);
}
-int __openssl_get_curve(EVP_PKEY * pub,
- char * algo)
+static int openssl_get_curve(EVP_PKEY * pub,
+ char * algo)
{
int ret;
size_t len = KEX_ALGO_BUFSZ;
ret = EVP_PKEY_get_utf8_string_param(pub, "group", algo, len, &len);
+
return ret == 1 ? 0 : -ECRYPT;
}
@@ -888,9 +915,10 @@ int openssl_get_algo_from_pk_der(buffer_t pk,
strcpy(algo, type_str);
- if ((IS_EC_GROUP(algo) || IS_DH_GROUP(algo)) &&
- __openssl_get_curve(pub, algo) < 0)
- goto fail_pub;
+ if (IS_EC_GROUP(algo) || IS_DH_GROUP(algo)) {
+ if (openssl_get_curve(pub, algo) < 0)
+ goto fail_pub;
+ }
EVP_PKEY_free(pub);
return 0;
@@ -948,141 +976,122 @@ int openssl_dhe_derive(EVP_PKEY * pkp,
return -ECRYPT;
}
-int openssl_encrypt(struct ossl_crypt_ctx * ctx,
- buffer_t in,
- buffer_t * out)
+/* Set up a fresh AEAD cipher ctx for nid: reject non-AEAD / oversized IV. */
+static int ossl_cipher_ctx_init(struct ossl_crypt_ctx * ctx,
+ int nid)
{
- uint8_t * ptr;
- uint8_t * iv;
- int in_sz;
- int out_sz;
- int tmp_sz;
- int ret;
-
- assert(ctx != NULL);
-
- in_sz = (int) in.len;
-
- out->data = malloc(in.len + EVP_MAX_BLOCK_LENGTH + \
- ctx->ivsz + ctx->tagsz);
- if (out->data == NULL)
- goto fail_malloc;
-
- iv = out->data;
- ptr = out->data + ctx->ivsz;
+ ctx->cipher = EVP_get_cipherbynid(nid);
+ if (ctx->cipher == NULL)
+ return -1;
- if (random_buffer(iv, ctx->ivsz) < 0)
- goto fail_encrypt;
+ /* IV must fit the NONCESZ nonce buffer. */
+ if (EVP_CIPHER_get_iv_length(ctx->cipher) > NONCESZ)
+ return -1;
- /* Set IV bit 7 to current key phase (KEY_ROTATION_BIT of counter) */
- if (ctx->rot.cntr & ctx->rot.mask)
- iv[0] |= 0x80;
- else
- iv[0] &= 0x7F;
+ /* Authenticated encryption is mandatory; reject non-AEAD ciphers. */
+ if ((EVP_CIPHER_flags(ctx->cipher) & EVP_CIPH_FLAG_AEAD_CIPHER) == 0)
+ return -1;
- EVP_CIPHER_CTX_reset(ctx->evp_ctx);
+ ctx->tagsz = AEAD_TAG_LEN;
- ret = EVP_EncryptInit_ex(ctx->evp_ctx, ctx->cipher, NULL,
- ctx->keys.cur, iv);
- if (ret != 1)
- goto fail_encrypt;
+ ctx->evp_ctx = EVP_CIPHER_CTX_new();
+ if (ctx->evp_ctx == NULL)
+ return -1;
- ret = EVP_EncryptUpdate(ctx->evp_ctx, ptr, &tmp_sz, in.data, in_sz);
- if (ret != 1)
- goto fail_encrypt;
+ return 0;
+}
- out_sz = tmp_sz;
- ret = EVP_EncryptFinal_ex(ctx->evp_ctx, ptr + tmp_sz, &tmp_sz);
- if (ret != 1)
- goto fail_encrypt;
+/* One-shot AEAD seal over an explicit key/nonce (no keyrot). out = ct ‖ tag. */
+int openssl_oneshot_seal(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out)
+{
+ struct ossl_crypt_ctx ctx;
+ int out_sz;
- out_sz += tmp_sz;
+ assert(key != NULL);
+ assert(nonce != NULL);
+ assert(out != NULL);
- /* For AEAD ciphers, get and append the authentication tag */
- if (ctx->tagsz > 0) {
- ret = EVP_CIPHER_CTX_ctrl(ctx->evp_ctx, EVP_CTRL_AEAD_GET_TAG,
- ctx->tagsz, ptr + out_sz);
- if (ret != 1)
- goto fail_encrypt;
- out_sz += ctx->tagsz;
- }
+ memset(&ctx, 0, sizeof(ctx));
- assert(out_sz >= in_sz);
+ if (ossl_cipher_ctx_init(&ctx, nid) < 0)
+ goto fail_cipher;
- out->len = (size_t) out_sz + ctx->ivsz;
+ out->data = malloc(in.len + EVP_MAX_BLOCK_LENGTH + ctx.tagsz);
+ if (out->data == NULL)
+ goto fail_ctx;
- /* Increment packet counter and check for key rotation */
- ctx->rot.cntr++;
- ctx->rot.age++;
+ out_sz = openssl_seal(&ctx, key, nonce, aad, in,
+ out->data, out->data + in.len);
+ if (out_sz < 0)
+ goto fail_seal;
- if (HAS_PHASE_BIT_TOGGLED(ctx)) {
- if (rotate_key(ctx) != 0)
- goto fail_encrypt;
- }
+ out->len = (size_t) out_sz + ctx.tagsz;
- cleanup_old_key(ctx);
+ EVP_CIPHER_CTX_free(ctx.evp_ctx);
return 0;
- fail_encrypt:
+
+ fail_seal:
free(out->data);
- fail_malloc:
+ fail_ctx:
+ EVP_CIPHER_CTX_free(ctx.evp_ctx);
+ fail_cipher:
clrbuf(*out);
return -ECRYPT;
}
-int openssl_decrypt(struct ossl_crypt_ctx * ctx,
- buffer_t in,
- buffer_t * out)
+/* One-shot AEAD open; in = ct ‖ tag, verifies aad and tag. */
+int openssl_oneshot_open(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out)
{
- uint8_t * iv;
- uint8_t * input;
- uint8_t rx_phase;
- int out_sz;
- int in_sz;
-
- assert(ctx != NULL);
+ struct ossl_crypt_ctx ctx;
+ buffer_t ct;
+ const uint8_t * tag;
+ int in_sz;
- in_sz = (int) in.len - ctx->ivsz;
- if (in_sz < ctx->tagsz)
- return -ECRYPT;
-
- in_sz -= ctx->tagsz;
+ assert(key != NULL);
+ assert(nonce != NULL);
+ assert(out != NULL);
- out->data = malloc(in_sz + EVP_MAX_BLOCK_LENGTH);
- if (out->data == NULL)
- goto fail_malloc;
+ memset(&ctx, 0, sizeof(ctx));
- iv = in.data;
- input = in.data + ctx->ivsz;
+ if (ossl_cipher_ctx_init(&ctx, nid) < 0)
+ goto fail_cipher;
- /* Extract phase from IV bit 7 and check for key rotation */
- rx_phase = (iv[0] & 0x80) ? 1 : 0;
+ if (in.len < (size_t) ctx.tagsz)
+ goto fail_ctx;
- if (should_rotate_key_rx(ctx, rx_phase)) {
- if (rotate_key(ctx) != 0)
- goto fail_decrypt;
- }
+ in_sz = (int) in.len - ctx.tagsz;
- ctx->rot.cntr++;
- ctx->rot.age++;
+ out->data = malloc((size_t) in_sz + EVP_MAX_BLOCK_LENGTH);
+ if (out->data == NULL)
+ goto fail_ctx;
- if (try_decrypt(ctx, ctx->keys.cur, iv, input, in_sz, out->data,
- &out_sz) != 0) {
- if (ctx->keys.prv == NULL)
- goto fail_decrypt;
- if (try_decrypt(ctx, ctx->keys.prv, iv, input, in_sz,
- out->data, &out_sz) != 0)
- goto fail_decrypt;
- }
+ ct.data = in.data;
+ ct.len = (size_t) in_sz;
+ tag = in.data + in_sz;
- assert(out_sz <= in_sz);
+ if (openssl_open(&ctx, key, nonce, aad, ct, tag, out) < 0)
+ goto fail_open;
- out->len = (size_t) out_sz;
+ EVP_CIPHER_CTX_free(ctx.evp_ctx);
return 0;
- fail_decrypt:
+
+ fail_open:
free(out->data);
- fail_malloc:
+ fail_ctx:
+ EVP_CIPHER_CTX_free(ctx.evp_ctx);
+ fail_cipher:
clrbuf(*out);
return -ECRYPT;
}
@@ -1093,51 +1102,19 @@ struct ossl_crypt_ctx * openssl_crypt_create_ctx(struct crypt_sk * sk)
assert(sk != NULL);
assert(sk->key != NULL);
- assert(sk->rot_bit > 0 && sk->rot_bit < 32);
ctx = malloc(sizeof(*ctx));
if (ctx == NULL)
- goto fail_malloc;
+ goto fail_malloc;
memset(ctx, 0, sizeof(*ctx));
- ctx->keys.cur = OPENSSL_secure_malloc(SYMMKEYSZ);
- if (ctx->keys.cur == NULL)
- goto fail_key;
-
- memcpy(ctx->keys.cur, sk->key, SYMMKEYSZ);
-
- ctx->keys.prv = NULL;
-
- /* Derive rotation salt from initial shared secret */
- if (EVP_Digest(sk->key, SYMMKEYSZ, ctx->rot.salt, NULL,
- EVP_sha256(), NULL) != 1)
- goto fail_cipher;
-
- ctx->cipher = EVP_get_cipherbynid(sk->nid);
- if (ctx->cipher == NULL)
- goto fail_cipher;
-
- ctx->ivsz = EVP_CIPHER_iv_length(ctx->cipher);
-
- /* Set tag size for AEAD ciphers (GCM, CCM, OCB, ChaCha20-Poly1305) */
- if (EVP_CIPHER_flags(ctx->cipher) & EVP_CIPH_FLAG_AEAD_CIPHER)
- ctx->tagsz = 16; /* Standard AEAD tag length (128 bits) */
-
- ctx->rot.cntr = 0;
- ctx->rot.mask = (1U << sk->rot_bit);
- ctx->rot.age = 0;
- ctx->rot.phase = 0;
-
- ctx->evp_ctx = EVP_CIPHER_CTX_new();
- if (ctx->evp_ctx == NULL)
+ if (ossl_cipher_ctx_init(ctx, sk->nid) < 0)
goto fail_cipher;
return ctx;
fail_cipher:
- OPENSSL_secure_clear_free(ctx->keys.cur, SYMMKEYSZ);
- fail_key:
free(ctx);
fail_malloc:
return NULL;
@@ -1148,23 +1125,10 @@ void openssl_crypt_destroy_ctx(struct ossl_crypt_ctx * ctx)
if (ctx == NULL)
return;
- if (ctx->keys.cur != NULL)
- OPENSSL_secure_clear_free(ctx->keys.cur, SYMMKEYSZ);
-
- if (ctx->keys.prv != NULL)
- OPENSSL_secure_clear_free(ctx->keys.prv, SYMMKEYSZ);
-
EVP_CIPHER_CTX_free(ctx->evp_ctx);
free(ctx);
}
-int openssl_crypt_get_ivsz(struct ossl_crypt_ctx * ctx)
-{
- assert(ctx != NULL);
-
- return ctx->ivsz;
-}
-
int openssl_crypt_get_tagsz(struct ossl_crypt_ctx * ctx)
{
assert(ctx != NULL);
@@ -1184,7 +1148,12 @@ int openssl_load_crt_file(const char * path,
if (fp == NULL)
goto fail_file;
+ pthread_cleanup_push(__cleanup_fclose, fp);
+
xcrt = PEM_read_X509(fp, NULL, NULL, NULL);
+
+ pthread_cleanup_pop(false);
+
if (xcrt == NULL)
goto fail_crt;
@@ -1200,35 +1169,58 @@ int openssl_load_crt_file(const char * path,
return -1;
}
-int openssl_load_crt_str(const char * str,
- void ** crt)
+static void * rd_crt_bio(BIO * bio)
+{
+ return PEM_read_bio_X509(bio, NULL, NULL, NULL);
+}
+
+static void * rd_privkey_bio(BIO * bio)
+{
+ return PEM_read_bio_PrivateKey(bio, NULL, NULL, "");
+}
+
+static void * rd_pubkey_bio(BIO * bio)
+{
+ return PEM_read_bio_PUBKEY(bio, NULL, NULL, NULL);
+}
+
+/* Decode a PEM object from an in-memory string via rd. */
+static int load_pem_str(const char * str,
+ void * (* rd)(BIO *),
+ void ** out)
{
BIO * bio;
- X509 * xcrt;
+ void * obj;
bio = BIO_new(BIO_s_mem());
if (bio == NULL)
goto fail_bio;
if (BIO_write(bio, str, strlen(str)) < 0)
- goto fail_crt;
+ goto fail_obj;
- xcrt = PEM_read_bio_X509(bio, NULL, NULL, NULL);
- if (xcrt == NULL)
- goto fail_crt;
+ obj = rd(bio);
+ if (obj == NULL)
+ goto fail_obj;
BIO_free(bio);
- *crt = (void *) xcrt;
+ *out = obj;
return 0;
- fail_crt:
+ fail_obj:
BIO_free(bio);
fail_bio:
- *crt = NULL;
+ *out = NULL;
return -1;
}
+int openssl_load_crt_str(const char * str,
+ void ** crt)
+{
+ return load_pem_str(str, rd_crt_bio, crt);
+}
+
int openssl_load_crt_der(buffer_t buf,
void ** crt)
{
@@ -1288,7 +1280,12 @@ int openssl_load_privkey_file(const char * path,
if (fp == NULL)
goto fail_file;
+ pthread_cleanup_push(__cleanup_fclose, fp);
+
pkey = PEM_read_PrivateKey(fp, NULL, NULL, "");
+
+ pthread_cleanup_pop(false);
+
if (pkey == NULL)
goto fail_key;
@@ -1307,30 +1304,7 @@ int openssl_load_privkey_file(const char * path,
int openssl_load_privkey_str(const char * str,
void ** key)
{
- BIO * bio;
- EVP_PKEY * pkey;
-
- bio = BIO_new(BIO_s_mem());
- if (bio == NULL)
- goto fail_bio;
-
- if (BIO_write(bio, str, strlen(str)) < 0)
- goto fail_key;
-
- pkey = PEM_read_bio_PrivateKey(bio, NULL, NULL, NULL);
- if (pkey == NULL)
- goto fail_key;
-
- BIO_free(bio);
-
- *key = (void *) pkey;
-
- return 0;
- fail_key:
- BIO_free(bio);
- fail_bio:
- *key = NULL;
- return -1;
+ return load_pem_str(str, rd_privkey_bio, key);
}
int openssl_load_pubkey_file(const char * path,
@@ -1343,7 +1317,12 @@ int openssl_load_pubkey_file(const char * path,
if (fp == NULL)
goto fail_file;
+ pthread_cleanup_push(__cleanup_fclose, fp);
+
pkey = PEM_read_PUBKEY(fp, NULL, NULL, NULL);
+
+ pthread_cleanup_pop(false);
+
if (pkey == NULL)
goto fail_key;
@@ -1375,7 +1354,12 @@ int openssl_load_pubkey_file_to_der(const char * path,
if (fp == NULL)
goto fail_file;
+ pthread_cleanup_push(__cleanup_fclose, fp);
+
pkey = PEM_read_PUBKEY(fp, NULL, NULL, NULL);
+
+ pthread_cleanup_pop(false);
+
if (pkey == NULL)
goto fail_key;
@@ -1402,30 +1386,7 @@ int openssl_load_pubkey_file_to_der(const char * path,
int openssl_load_pubkey_str(const char * str,
void ** key)
{
- BIO * bio;
- EVP_PKEY * pkey;
-
- bio = BIO_new(BIO_s_mem());
- if (bio == NULL)
- goto fail_bio;
-
- if (BIO_write(bio, str, strlen(str)) < 0)
- goto fail_key;
-
- pkey = PEM_read_bio_PUBKEY(bio, NULL, NULL, NULL);
- if (pkey == NULL)
- goto fail_key;
-
- BIO_free(bio);
-
- *key = (void *) pkey;
-
- return 0;
- fail_key:
- BIO_free(bio);
- fail_bio:
- *key = NULL;
- return -1;
+ return load_pem_str(str, rd_pubkey_bio, key);
}
int openssl_load_pubkey_raw_file(const char * path,
@@ -1443,7 +1404,12 @@ int openssl_load_pubkey_raw_file(const char * path,
if (fp == NULL)
goto fail_file;
+ pthread_cleanup_push(__cleanup_fclose, fp);
+
bytes_read = fread(tmp_buf, 1, CRYPT_KEY_BUFSZ, fp);
+
+ pthread_cleanup_pop(false);
+
if (bytes_read == 0)
goto fail_read;
@@ -1485,11 +1451,17 @@ static const char * __openssl_hybrid_algo_from_sk_len(size_t len)
return NULL;
}
+/* Wipe the raw-key staging buffer if a cancel aborts the read. */
+static void __cleanse_key_buf(void * o)
+{
+ OPENSSL_cleanse(o, CRYPT_KEY_BUFSZ);
+}
+
int openssl_load_privkey_raw_file(const char * path,
void ** key)
{
FILE * fp;
- uint8_t tmp_buf[4096];
+ uint8_t tmp_buf[CRYPT_KEY_BUFSZ];
size_t bytes_read;
const char * algo;
EVP_PKEY * pkey;
@@ -1501,7 +1473,14 @@ int openssl_load_privkey_raw_file(const char * path,
if (fp == NULL)
goto fail_file;
+ pthread_cleanup_push(__cleanup_fclose, fp);
+ pthread_cleanup_push(__cleanse_key_buf, tmp_buf);
+
bytes_read = fread(tmp_buf, 1, sizeof(tmp_buf), fp);
+
+ pthread_cleanup_pop(false);
+ pthread_cleanup_pop(false);
+
if (bytes_read == 0)
goto fail_read;
@@ -1552,65 +1531,71 @@ void openssl_free_key(EVP_PKEY * key)
int openssl_check_crt_name(void * crt,
const char * name)
{
- char * subj;
- char * cn;
- X509 * xcrt;
+ const unsigned char * cn;
+ ASN1_STRING * val;
+ X509_NAME * nm;
+ int idx;
+ int len;
- xcrt = (X509 *) crt;
+ nm = X509_get_subject_name((X509 *) crt);
+ if (nm == NULL)
+ return -1;
+
+ idx = X509_NAME_get_index_by_NID(nm, NID_commonName, -1);
+ if (idx < 0)
+ return -1;
- subj = X509_NAME_oneline(X509_get_subject_name(xcrt), NULL, 0);
- if (subj == NULL)
- goto fail_subj;
+ val = X509_NAME_ENTRY_get_data(X509_NAME_get_entry(nm, idx));
+ cn = ASN1_STRING_get0_data(val);
+ len = ASN1_STRING_length(val);
- cn = strstr(subj, "CN=");
- if (cn == NULL)
- goto fail_cn;
+ if (len < 0 || (size_t) len != strlen(name))
+ return -1;
- if (strcmp(cn + 3, name) != 0)
- goto fail_cn;
+ if (memchr(cn, '\0', (size_t) len) != NULL)
+ return -1;
- free(subj);
+ if (memcmp(cn, name, (size_t) len) != 0)
+ return -1;
return 0;
- fail_cn:
- free(subj);
- fail_subj:
- return -1;
}
int openssl_get_crt_name(void * crt,
char * name)
{
- char * subj;
- char * cn;
- char * end;
- X509 * xcrt;
+ const unsigned char * cn;
+ ASN1_STRING * val;
+ X509_NAME * nm;
+ int idx;
+ int len;
- xcrt = (X509 *) crt;
+ nm = X509_get_subject_name((X509 *) crt);
+ if (nm == NULL)
+ return -1;
- subj = X509_NAME_oneline(X509_get_subject_name(xcrt), NULL, 0);
- if (subj == NULL)
- goto fail_subj;
+ idx = X509_NAME_get_index_by_NID(nm, NID_commonName, -1);
+ if (idx < 0)
+ return -1;
- cn = strstr(subj, "CN=");
- if (cn == NULL)
- goto fail_cn;
+ val = X509_NAME_ENTRY_get_data(X509_NAME_get_entry(nm, idx));
+ cn = ASN1_STRING_get0_data(val);
+ len = ASN1_STRING_length(val);
- cn += 3; /* Skip "CN=" */
+ if (len < 0)
+ return -1;
- /* Find end of CN (comma or slash for next field) */
- end = strpbrk(cn, ",/");
- if (end != NULL)
- *end = '\0';
+ if ((size_t) len > NAME_SIZE)
+ return -ENAME;
- strcpy(name, cn);
- free(subj);
+ /* Reject an embedded NUL that would truncate the parsed name. */
+ if (memchr(cn, '\0', (size_t) len) != NULL)
+ return -1;
+
+ memcpy(name, cn, (size_t) len);
+ name[len] = '\0';
return 0;
- fail_cn:
- free(subj);
- fail_subj:
- return -1;
}
int openssl_crt_str(const void * crt,
@@ -1695,12 +1680,43 @@ int openssl_auth_add_crt_to_store(void * store,
return ret == 1 ? 0 : -1;
}
-int openssl_verify_crt(void * store,
- void * crt)
+void * openssl_auth_create_chain(void)
+{
+ return sk_X509_new_null();
+}
+
+void openssl_auth_destroy_chain(void * chain)
+{
+ sk_X509_pop_free((STACK_OF(X509) *) chain, X509_free);
+}
+
+int openssl_auth_add_crt_to_chain(void * chain,
+ void * crt)
+{
+ if (X509_up_ref((X509 *) crt) != 1)
+ goto fail_ref;
+
+ if (sk_X509_push((STACK_OF(X509) *) chain, (X509 *) crt) == 0)
+ goto fail_push;
+
+ return 0;
+ fail_push:
+ X509_free((X509 *) crt);
+ fail_ref:
+ return -1;
+}
+
+int openssl_verify_crt_pin(void * store,
+ void * untrusted,
+ void * crt,
+ void * pin)
{
X509_STORE_CTX * ctx;
X509_STORE * _store;
X509* _crt;
+ STACK_OF(X509) * chain;
+ int i;
+ int n;
int ret;
_store = (X509_STORE *) store;
@@ -1710,7 +1726,8 @@ int openssl_verify_crt(void * store,
if (ctx == NULL)
goto fail_store_ctx;
- ret = X509_STORE_CTX_init(ctx, _store, _crt, NULL);
+ ret = X509_STORE_CTX_init(ctx, _store, _crt,
+ (STACK_OF(X509) *) untrusted);
if (ret != 1)
goto fail_ca;
@@ -1718,13 +1735,39 @@ int openssl_verify_crt(void * store,
if (ret != 1)
goto fail_ca;
+ /* Peer cert only verifies a signature; gate on sig KU, not role. */
+ if ((X509_get_key_usage(_crt) & KU_DIGITAL_SIGNATURE) == 0)
+ goto fail_ca;
+
+ if (pin != NULL) {
+ chain = X509_STORE_CTX_get0_chain(ctx);
+ if (chain == NULL)
+ goto fail_ca;
+ n = sk_X509_num(chain);
+ for (i = 1; i < n; i++) /* Skip the leaf */
+ if (X509_cmp(sk_X509_value(chain, i), pin) == 0)
+ break;
+ if (i == n)
+ goto fail_pin;
+ }
+
X509_STORE_CTX_free(ctx);
return 0;
+ fail_pin:
+ X509_STORE_CTX_free(ctx);
+ return -ENOENT;
fail_ca:
X509_STORE_CTX_free(ctx);
fail_store_ctx:
- return -1;
+ return -EAUTH;
+}
+
+int openssl_verify_crt(void * store,
+ void * untrusted,
+ void * crt)
+{
+ return openssl_verify_crt_pin(store, untrusted, crt, NULL);
}
static const EVP_MD * select_md(EVP_PKEY * pkey,
@@ -1739,6 +1782,12 @@ static const EVP_MD * select_md(EVP_PKEY * pkey,
return EVP_get_digestbynid(nid);
}
+bool openssl_pk_requires_md(const EVP_PKEY * pk)
+{
+ /* Provider-based (PQC) signatures have an intrinsic digest */
+ return EVP_PKEY_get_id(pk) >= 0;
+}
+
int openssl_sign(EVP_PKEY * pkp,
int nid,
buffer_t msg,
@@ -1866,9 +1915,10 @@ void * openssl_secure_malloc(size_t size)
return OPENSSL_secure_malloc(size);
}
-void openssl_secure_free(void * ptr)
+void openssl_secure_free(void * ptr,
+ size_t size)
{
- OPENSSL_secure_free(ptr);
+ OPENSSL_secure_clear_free(ptr, size);
}
void openssl_secure_clear(void * ptr,
@@ -1876,6 +1926,7 @@ void openssl_secure_clear(void * ptr,
{
OPENSSL_cleanse(ptr, size);
}
+
void openssl_cleanup(void)
{
OPENSSL_cleanup();
diff --git a/src/lib/crypt/openssl.h b/src/lib/crypt/openssl.h
index af285232..e5cc35f7 100644
--- a/src/lib/crypt/openssl.h
+++ b/src/lib/crypt/openssl.h
@@ -61,20 +61,44 @@ int openssl_get_algo_from_pk_der(buffer_t pk,
int openssl_get_algo_from_pk_raw(buffer_t pk,
char * algo);
-int openssl_encrypt(struct ossl_crypt_ctx * ctx,
- buffer_t in,
- buffer_t * out);
-
-int openssl_decrypt(struct ossl_crypt_ctx * ctx,
- buffer_t in,
- buffer_t * out);
+int openssl_seal(struct ossl_crypt_ctx * ctx,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ uint8_t * out,
+ uint8_t * tag);
+
+int openssl_open(struct ossl_crypt_ctx * ctx,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ const uint8_t * tag,
+ buffer_t * out);
+
+int openssl_oneshot_seal(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out);
+
+int openssl_oneshot_open(int nid,
+ const uint8_t * key,
+ const uint8_t * nonce,
+ buffer_t aad,
+ buffer_t in,
+ buffer_t * out);
+
+int openssl_hkdf_expand(buffer_t key,
+ buffer_t info,
+ buffer_t out);
struct ossl_crypt_ctx * openssl_crypt_create_ctx(struct crypt_sk * sk);
void openssl_crypt_destroy_ctx(struct ossl_crypt_ctx * ctx);
-int openssl_crypt_get_ivsz(struct ossl_crypt_ctx * ctx);
-
int openssl_crypt_get_tagsz(struct ossl_crypt_ctx * ctx);
/* AUTHENTICATION */
@@ -136,9 +160,24 @@ void openssl_auth_destroy_store(void * store);
int openssl_auth_add_crt_to_store(void * store,
void * crt);
+void * openssl_auth_create_chain(void);
+
+void openssl_auth_destroy_chain(void * chain);
+
+int openssl_auth_add_crt_to_chain(void * chain,
+ void * crt);
+
int openssl_verify_crt(void * store,
+ void * untrusted,
void * crt);
+int openssl_verify_crt_pin(void * store,
+ void * untrusted,
+ void * crt,
+ void * pin);
+
+bool openssl_pk_requires_md(const EVP_PKEY * pk);
+
int openssl_sign(EVP_PKEY * pkp,
int md_nid,
buffer_t msg,
diff --git a/src/lib/dev.c b/src/lib/dev.c
index 9cfc24ee..d0997273 100644
--- a/src/lib/dev.c
+++ b/src/lib/dev.c
@@ -29,10 +29,13 @@
#include "config.h"
#include "ssm.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/bitmap.h>
#include <ouroboros/cep.h>
+#include <ouroboros/crc16.h>
#include <ouroboros/crypt.h>
#include <ouroboros/dev.h>
+#include <ouroboros/endian.h>
#include <ouroboros/errno.h>
#include <ouroboros/fccntl.h>
#include <ouroboros/flow.h>
@@ -45,32 +48,33 @@
#include <ouroboros/np1_flow.h>
#include <ouroboros/pthread.h>
#include <ouroboros/random.h>
+#ifdef PROC_FLOW_STATS
+#include <ouroboros/rib.h>
+#endif
#include <ouroboros/serdes-irm.h>
+#include <ouroboros/sockets.h>
#include <ouroboros/ssm_flow_set.h>
#include <ouroboros/ssm_pool.h>
#include <ouroboros/ssm_rbuff.h>
-#include <ouroboros/sockets.h>
+#include <ouroboros/tw.h>
#include <ouroboros/utils.h>
-#ifdef PROC_FLOW_STATS
-#include <ouroboros/rib.h>
-#endif
+#include <assert.h>
#ifdef HAVE_LIBGCRYPT
#include <gcrypt.h>
#endif
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
#include <stdarg.h>
#include <stdbool.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include <sys/types.h>
#ifndef CLOCK_REALTIME_COARSE
#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
#endif
-/* Partial read information. */
#define NO_PART -1
#define DONE_PART -2
@@ -78,19 +82,12 @@
#define SECMEMSZ 16384
#define MSGBUFSZ 2048
-/* map flow_ids to flow descriptors; track state of the flow */
struct fmap {
int fd;
- /* TODO: use actual flow state */
enum flow_state state;
};
-#define frcti_to_flow(frcti) \
- ((struct flow *)((uint8_t *) frcti - offsetof(struct flow, frcti)))
-
struct flow {
- struct list_head next;
-
struct flow_info info;
struct ssm_rbuff * rx_rb;
@@ -101,8 +98,14 @@ struct flow {
ssize_t part_idx;
struct crypt_ctx * crypt;
- int headsz; /* IV */
- int tailsz; /* Tag + CRC */
+ int headsz; /* Selector */
+ int tailsz; /* Tag + CRC */
+
+ struct timespec rk_grace; /* TX-promote deadline (0 = none) */
+ struct timespec rk_attempt; /* Last re-key attempt (backoff) */
+ bool rk_wm_inflight; /* Re-key trigger in flight */
+ uint32_t rk_wm_ctr; /* Throttles the consult */
+ bool rk_initiator; /* OAP initiator this re-key */
struct timespec snd_act;
struct timespec rcv_act;
@@ -135,16 +138,10 @@ struct {
struct flow * flows;
struct fmap * id_to_fd;
- struct list_head flow_list;
pthread_mutex_t mtx;
pthread_cond_t cond;
- pthread_t tx;
- pthread_t rx;
- size_t n_frcti;
- fset_t * frct_set;
-
pthread_rwlock_t lock;
} proc;
@@ -243,7 +240,7 @@ static int proc_announce(const struct proc_info * proc)
return irm__irm_result_des(&msg);
}
-/* IRMd will clean up the mess if this fails */
+/* IRMd cleans up on failure. */
static void proc_exit(void)
{
uint8_t buf[SOCK_BUF_SIZE];
@@ -264,7 +261,7 @@ static int spb_encrypt(struct flow * flow,
uint8_t * tail;
if (flow->crypt == NULL)
- return 0; /* No encryption */
+ return 0;
in.data = ssm_pk_buff_head(spb);
in.len = ssm_pk_buff_len(spb);
@@ -272,11 +269,11 @@ static int spb_encrypt(struct flow * flow,
if (crypt_encrypt(flow->crypt, in, &out) < 0)
goto fail_encrypt;
- head = ssm_pk_buff_head_alloc(spb, flow->headsz);
+ head = ssm_pk_buff_push(spb, flow->headsz);
if (head == NULL)
goto fail_alloc;
- tail = ssm_pk_buff_tail_alloc(spb, flow->tailsz);
+ tail = ssm_pk_buff_push_tail(spb, flow->tailsz);
if (tail == NULL)
goto fail_alloc;
@@ -299,17 +296,16 @@ static int spb_decrypt(struct flow * flow,
uint8_t * head;
if (flow->crypt == NULL)
- return 0; /* No decryption */
+ return 0;
in.data = ssm_pk_buff_head(spb);
in.len = ssm_pk_buff_len(spb);
if (crypt_decrypt(flow->crypt, in, &out) < 0)
- return -ENOMEM;
-
+ return -ECRYPT;
- head = ssm_pk_buff_head_release(spb, flow->headsz) + flow->headsz;
- ssm_pk_buff_tail_release(spb, flow->tailsz);
+ head = ssm_pk_buff_pop(spb, flow->headsz) + flow->headsz;
+ ssm_pk_buff_pop_tail(spb, flow->tailsz);
memcpy(head, out.data, out.len);
@@ -318,130 +314,357 @@ static int spb_decrypt(struct flow * flow,
return 0;
}
-#include "frct.c"
+/* tw_move under proc.lock rdlock; gates teardown vs in-flight fires. */
+static void tw_move_safe(void)
+{
+ pthread_rwlock_rdlock(&proc.lock);
+
+ pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
+
+ tw_move();
+
+ pthread_cleanup_pop(1);
+}
-void * flow_tx(void * o)
+static int crc_add(struct ssm_pk_buff * spb,
+ size_t head_skip)
{
- struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
+ uint8_t * head;
+ uint8_t * tail;
- (void) o;
+ tail = ssm_pk_buff_push_tail(spb, CRCLEN);
+ if (tail == NULL)
+ return -ENOMEM;
- while (true) {
- timerwheel_move();
+ head = ssm_pk_buff_head(spb) + head_skip;
- nanosleep(&tic, NULL);
- }
+ mem_hash(HASH_CRC32, tail, head, tail - head);
- return (void *) 0;
+ return 0;
}
-static void flow_send_keepalive(struct flow * flow,
- struct timespec now)
+static int crc_check(struct ssm_pk_buff * spb,
+ size_t head_skip)
{
- struct ssm_pk_buff * spb;
- ssize_t idx;
- uint8_t * ptr;
+ uint32_t crc;
+ uint8_t * head;
+ uint8_t * tail;
- idx = ssm_pool_alloc(proc.pool, 0, &ptr, &spb);
- if (idx < 0)
- return;
+ if (ssm_pk_buff_len(spb) < head_skip + CRCLEN)
+ return 1;
- pthread_rwlock_wrlock(&proc.lock);
+ head = ssm_pk_buff_head(spb) + head_skip;
+ tail = ssm_pk_buff_pop_tail(spb, CRCLEN);
- flow->snd_act = now;
+ mem_hash(HASH_CRC32, &crc, head, tail - head);
- if (ssm_rbuff_write(flow->tx_rb, idx))
- ssm_pool_remove(proc.pool, idx);
+ return !(crc == *((uint32_t *) tail));
+}
+
+/* FRCT included here so it can use proc and dev.c statics directly. */
+#include "frct.c"
+
+/* Decrypt before any check so the plaintext is authoritative. */
+static bool invalid_pkt(struct flow * flow,
+ struct ssm_pk_buff * spb)
+{
+ const struct frct_pci * pci;
+ uint16_t flags;
+ size_t pci_total;
+
+ if (spb == NULL || ssm_pk_buff_len(spb) == 0)
+ return true;
+
+ if (spb_decrypt(flow, spb) < 0)
+ return true;
+
+ if (flow->frcti == NULL) {
+ if (flow->info.qs.ber == 0 && crc_check(spb, 0) != 0)
+ return true;
+ return false;
+ }
+
+ if (ssm_pk_buff_len(spb) < FRCT_PCILEN)
+ return true;
+
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ flags = ntoh16(pci->flags);
+
+ /* Untrusted flag read; mismatch on HCS will drop on corrupt. */
+ if (flags & FRCT_DATA)
+ pci_total = frcti_data_hdr_len(flow->frcti);
else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ pci_total = frcti_ctrl_hdr_len(flow->frcti);
- pthread_rwlock_unlock(&proc.lock);
+ if (ssm_pk_buff_len(spb) < pci_total)
+ return true;
+
+ if (frct_hcs_check(pci, flow->frcti) != 0)
+ return true;
+
+ /* HCS valid: CRC32 on SACK; or on DATA if ber = 0. */
+ if (flags & FRCT_SACK) {
+ if (crc_check(spb, pci_total) != 0)
+ return true;
+
+ } else if ((flags & FRCT_DATA) && flow->info.qs.ber == 0) {
+ if (crc_check(spb, pci_total) != 0)
+ return true;
+ }
+
+ return false;
}
-/* Needs rdlock on proc. */
-static void _flow_keepalive(struct flow * flow)
+static bool deadline_passed(const struct timespec * abs)
{
- struct timespec now;
- struct timespec s_act;
- struct timespec r_act;
- int flow_id;
- time_t timeo;
- uint32_t acl;
+ struct timespec now;
- s_act = flow->snd_act;
- r_act = flow->rcv_act;
+ if (abs == NULL)
+ return false;
- flow_id = flow->info.id;
- timeo = flow->info.qs.timeout;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
- acl = ssm_rbuff_get_acl(flow->rx_rb);
- if (timeo == 0 || acl & (ACL_FLOWPEER | ACL_FLOWDOWN))
- return;
+ return ts_diff_ns(&now, abs) >= 0;
+}
+
+/* Clamp the wait by min(dl, next tw expiry, now + TICTIME). */
+static void compute_wait_deadline(const struct timespec * dl,
+ struct timespec * out)
+{
+ struct timespec now;
+ struct timespec cap;
+ struct timespec expiry;
+ struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &tic, &cap);
- if (ts_diff_ns(&now, &r_act) > (int64_t) timeo * MILLION) {
- ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWPEER);
- ssm_flow_set_notify(proc.fqset, flow_id, FLOW_PEER);
- return;
- }
+ tw_next_expiry(&expiry);
- if (ts_diff_ns(&now, &s_act) > (int64_t) timeo * (MILLION >> 2)) {
- pthread_rwlock_unlock(&proc.lock);
+ *out = (ts_diff_ns(&cap, &expiry) < 0) ? expiry : cap;
+ if (dl != NULL && ts_diff_ns(out, dl) > 0)
+ *out = *dl;
+}
- flow_send_keepalive(flow, now);
+/*
+ * Drain the rx ring of flow without blocking.
+ *
+ * Every iteration takes proc.lock (rd), pulls one index with
+ * ssm_rbuff_read, handles the packet and releases the lock again
+ * before tw_move_safe runs. The function returns as soon as rx_rb is
+ * NULL or ssm_rbuff_read yields a negative value.
+ *
+ * Packets rejected by invalid_pkt are removed from proc.pool. Accepted
+ * packets are handed to FRCTI_RCV; when the flow has no frcti they are
+ * removed from proc.pool instead.
+ */
+static void flow_drain_rx_nb(struct flow * flow)
+{
+ ssize_t idx;
+ struct ssm_pk_buff * spb;
+ struct ssm_rbuff * rx_rb;
+ struct frcti * frcti;
+#ifdef PROC_FLOW_STATS
+ struct timespec t_a;
+ struct timespec t_b;
+#endif
+
+ /* NOTE(review): frcti is read here before proc.lock is taken - confirm. */
+ if (flow->frcti != NULL)
+ STAT_BUMP(flow->frcti, drain_calls);
+ while (true) {
pthread_rwlock_rdlock(&proc.lock);
+
+ /* Re-read rx_rb under the lock on every pass. */
+ rx_rb = flow->rx_rb;
+ if (rx_rb == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
+
+ /* Negative result: stop draining. */
+ idx = ssm_rbuff_read(rx_rb);
+ if (idx < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
+
+ spb = ssm_pool_get(proc.pool, idx);
+ if (invalid_pkt(flow, spb)) {
+ ssm_pool_remove(proc.pool, idx);
+ pthread_rwlock_unlock(&proc.lock);
+ continue;
+ }
+
+ frcti = flow->frcti;
+ if (frcti != NULL) {
+#ifdef PROC_FLOW_STATS
+ /* Time FRCTI_RCV and pass the elapsed ns to rcv_proc_ns. */
+ clock_gettime(CLOCK_MONOTONIC, &t_a);
+ FRCTI_RCV(frcti, spb);
+ clock_gettime(CLOCK_MONOTONIC, &t_b);
+ STAT_ADD(frcti, rcv_proc_ns,
+ (size_t) ts_diff_ns(&t_b, &t_a));
+#else
+ FRCTI_RCV(frcti, spb);
+#endif
+ } else {
+ /* No frcti to take the packet: drop it from the pool. */
+ ssm_pool_remove(proc.pool, idx);
+ }
+
+ pthread_rwlock_unlock(&proc.lock);
+
+ /* Per-packet so the delayed-ACK fires on time in a burst. */
+#ifdef PROC_FLOW_STATS
+ /* Time tw_move_safe and pass the elapsed ns to tw_move_ns. */
+ clock_gettime(CLOCK_MONOTONIC, &t_a);
+ tw_move_safe();
+ clock_gettime(CLOCK_MONOTONIC, &t_b);
+ if (frcti != NULL)
+ STAT_ADD(frcti, tw_move_ns,
+ (size_t) ts_diff_ns(&t_b, &t_a));
+#else
+ tw_move_safe();
+#endif
}
}
-static void handle_keepalives(void)
+/* TX-promotion grace when the peer's install latency is unknown (raw). */
+#define REKEY_GRACE_MS 1000
+
+/* Last-resort promote within N node-keys of exhaustion (< watermark). */
+#define REKEY_PROMOTE_FLOOR 1
+
+/* Throttle re-key retries so a failed attempt can't storm the IRMd. */
+#define REKEY_BACKOFF_NS (250 * MILLION)
+
+/* proc.lock (rd) only guards teardown; crypt_rekey self-synchronises. */
+static void flow_rekey(struct flow * flow)
{
- struct list_head * p;
- struct list_head * h;
+ struct flow_info info;
+ struct crypt_sk sk;
+ struct timespec now;
+ struct timespec intv;
+ time_t ms;
+ uint8_t key[SYMMKEYSZ];
+ uint8_t buf[SOCK_BUF_SIZE];
+ buffer_t msg = {SOCK_BUF_SIZE, buf};
+ bool has_key;
+ bool initiator = false;
pthread_rwlock_rdlock(&proc.lock);
+ if (flow->info.id < 0 || flow->crypt == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
- list_for_each_safe(p, h, &proc.flow_list) {
- struct flow * flow;
- flow = list_entry(p, struct flow, next);
- _flow_keepalive(flow);
+ /* Back off so a failed attempt can't storm the IRMd per syscall. */
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (ts_diff_ns(&now, &flow->rk_attempt) < REKEY_BACKOFF_NS) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
}
+ flow->rk_attempt = now;
+ info = flow->info;
pthread_rwlock_unlock(&proc.lock);
-}
-static void __cleanup_fqueue_destroy(void * fq)
-{
- fqueue_destroy((fqueue_t *) fq);
+ if (flow_update__irm_req_ser(&msg, &info, false) < 0)
+ return;
+
+ if (send_recv_msg(&msg) < 0)
+ return;
+
+ sk.key = key;
+ if (flow_rekey__irm_result_des(&msg, &sk, &has_key, &initiator) < 0)
+ return;
+
+ if (!has_key)
+ return;
+
+ pthread_rwlock_rdlock(&proc.lock);
+ if (flow->info.id == info.id && flow->crypt != NULL) {
+ if (crypt_rekey(flow->crypt, &sk) == 0) {
+ flow->rk_initiator = initiator;
+ /* Hold TX on the old epoch until the peer installs. */
+ ms = flow->info.mpl > 0 ? flow->info.mpl * 3
+ : REKEY_GRACE_MS;
+ intv.tv_sec = ms / 1000;
+ intv.tv_nsec = (ms % 1000) * MILLION;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &intv, &flow->rk_grace);
+ }
+ /* Re-arm the watermark even if the install was a no-op. */
+ STORE_RELAXED(&flow->rk_wm_inflight, false);
+ }
+ pthread_rwlock_unlock(&proc.lock);
+
+ crypt_secure_clear(key, SYMMKEYSZ);
}
-void * flow_rx(void * o)
+/* A clamp-timeout means tw work is due, not the caller deadline. */
+static int flow_rx_one(struct flow * flow,
+ struct timespec * abs)
{
- struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
- int ret;
- struct fqueue * fq;
+ struct timespec wait_abs;
+ struct ssm_pk_buff * spb;
+ struct ssm_rbuff * rx_rb;
+ ssize_t idx;
- (void) o;
+ while (true) {
+ compute_wait_deadline(abs, &wait_abs);
- fq = fqueue_create();
+ /* rdlock gates flow_fini; FLOWDOWN preempts the block. */
+ pthread_rwlock_rdlock(&proc.lock);
- pthread_cleanup_push(__cleanup_fqueue_destroy, fq);
+ rx_rb = flow->rx_rb;
+ if (rx_rb == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return -EFLOWDOWN;
+ }
- /* fevent will filter all FRCT packets for us */
- while ((ret = fevent(proc.frct_set, fq, &tic)) != 0) {
- if (ret == -ETIMEDOUT) {
- handle_keepalives();
+ /* Pull a parked re-key before re-blocking (idle reader). */
+ if (flow->crypt != NULL
+ && (ssm_rbuff_get_flags(rx_rb) & RB_REKEY)) {
+ pthread_rwlock_unlock(&proc.lock);
+ flow_rekey(flow);
continue;
}
- while (fqueue_next(fq) >= 0)
- ; /* no need to act */
+ idx = ssm_rbuff_read_b(rx_rb, &wait_abs);
+ if (idx == -ETIMEDOUT) {
+ pthread_rwlock_unlock(&proc.lock);
+ if (deadline_passed(abs))
+ return -ETIMEDOUT;
+ tw_move_safe();
+ continue;
+ }
+ if (idx < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ return idx;
+ }
+
+ spb = ssm_pool_get(proc.pool, idx);
+ if (invalid_pkt(flow, spb)) {
+ ssm_pool_remove(proc.pool, idx);
+ pthread_rwlock_unlock(&proc.lock);
+ continue;
+ }
+
+ if (flow->frcti != NULL)
+ FRCTI_RCV(flow->frcti, spb);
+ else
+ ssm_pool_remove(proc.pool, idx);
+
+ pthread_rwlock_unlock(&proc.lock);
+
+ tw_move_safe();
+ return 0;
}
+}
- pthread_cleanup_pop(true);
+/* 0 = window open; -EAGAIN = !block and would block; else flow_rx_one rc. */
+static __inline__ int flow_wait_window(struct flow * flow,
+ size_t n,
+ bool block,
+ struct timespec * dl)
+{
+ int rc;
- return (void *) 0;
+ while (true) {
+ flow_drain_rx_nb(flow);
+ if (FRCTI_IS_WINDOW_OPEN_N(flow->frcti, n))
+ return 0;
+ if (!block)
+ return -EAGAIN;
+ rc = flow_rx_one(flow, dl);
+ if (rc < 0)
+ return rc;
+ }
}
static void flow_clear(int fd)
@@ -451,36 +674,36 @@ static void flow_clear(int fd)
proc.flows[fd].info.id = -1;
}
-static void __flow_fini(int fd)
+/* Order before flow_fini's wrlock, which blocks on rdlock holders. */
+static void flow_quiesce(int fd)
{
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ struct ssm_rbuff * rx_rb = proc.flows[fd].rx_rb;
+ struct ssm_rbuff * tx_rb = proc.flows[fd].tx_rb;
- if (proc.flows[fd].frcti != NULL) {
- proc.n_frcti--;
- if (proc.n_frcti == 0) {
- pthread_cancel(proc.tx);
- pthread_join(proc.tx, NULL);
- }
+ if (rx_rb != NULL)
+ ssm_rbuff_set_bits(rx_rb, RB_FLOWDOWN);
+
+ if (tx_rb != NULL)
+ ssm_rbuff_set_bits(tx_rb, RB_FLOWDOWN);
+}
- ssm_flow_set_del(proc.fqset, 0, proc.flows[fd].info.id);
+static void do_flow_fini(int fd)
+{
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
+ if (proc.flows[fd].frcti != NULL)
frcti_destroy(proc.flows[fd].frcti);
- }
if (proc.flows[fd].info.id != -1) {
flow_destroy(&proc.id_to_fd[proc.flows[fd].info.id]);
bmp_release(proc.fds, fd);
}
- if (proc.flows[fd].rx_rb != NULL) {
- ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN);
+ if (proc.flows[fd].rx_rb != NULL)
ssm_rbuff_close(proc.flows[fd].rx_rb);
- }
- if (proc.flows[fd].tx_rb != NULL) {
- ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN);
+ if (proc.flows[fd].tx_rb != NULL)
ssm_rbuff_close(proc.flows[fd].tx_rb);
- }
if (proc.flows[fd].set != NULL) {
ssm_flow_set_notify(proc.flows[fd].set,
@@ -491,24 +714,40 @@ static void __flow_fini(int fd)
crypt_destroy_ctx(proc.flows[fd].crypt);
- list_del(&proc.flows[fd].next);
-
flow_clear(fd);
}
static void flow_fini(int fd)
{
+ flow_quiesce(fd);
+
pthread_rwlock_wrlock(&proc.lock);
- __flow_fini(fd);
+ do_flow_fini(fd);
pthread_rwlock_unlock(&proc.lock);
}
#define IS_ENCRYPTED(crypt) ((crypt)->nid != NID_undef)
-#define IS_ORDERED(flow) (flow.qs.in_order != 0)
+#define IS_ORDERED(info) ((info)->qs.service != SVC_RAW)
+#define IS_STREAM(info) ((info)->qs.service == SVC_STREAM)
+
+/*
+ * Raw MTU minus the wrapping (IV/Tag + optional CRC) dev.c adds.
+ *
+ * The overhead is headsz + tailsz, plus CRCLEN when qs.ber == 0 and
+ * the flow has no crypt context. Returns 0 when raw does not exceed
+ * the overhead, so the subtraction cannot wrap.
+ *
+ * NOTE(review): flow_tx_spb calls crc_add whenever qs.ber == 0 and
+ * does not look at flow->crypt; confirm that CRCLEN may be left out
+ * here for encrypted flows.
+ */
+static __inline__ size_t flow_user_mtu(const struct flow * flow,
+ size_t raw)
+{
+ size_t hdr;
+
+ hdr = flow->headsz + flow->tailsz;
+ if (flow->info.qs.ber == 0 && flow->crypt == NULL)
+ hdr += CRCLEN;
+
+ /* Clamp at 0 instead of letting the unsigned difference wrap. */
+ return raw > hdr ? raw - hdr : 0;
+}
+
static int flow_init(struct flow_info * info,
- struct crypt_sk * sk)
+ struct crypt_sk * sk,
+ time_t rtt_hint)
{
struct timespec now;
struct flow * flow;
@@ -550,33 +789,25 @@ static int flow_init(struct flow_info * info,
flow->tailsz = 0;
if (IS_ENCRYPTED(sk)) {
- /* Set to lower value in tests, should we make configurable? */
- sk->rot_bit = KEY_ROTATION_BIT;
flow->crypt = crypt_create_ctx(sk);
if (flow->crypt == NULL)
goto fail_crypt;
- flow->headsz = crypt_get_ivsz(flow->crypt);
+ flow->headsz = crypt_get_headsz(flow->crypt);
flow->tailsz = crypt_get_tagsz(flow->crypt);
}
assert(flow->frcti == NULL);
- if (IS_ORDERED(flow->info)) {
- flow->frcti = frcti_create(fd, DELT_A, DELT_R, info->mpl);
+ if (IS_ORDERED(&flow->info)) {
+ uint32_t frct_mtu = flow_user_mtu(flow, info->mtu);
+
+ flow->frcti = frcti_create(fd, DELT_A, DELT_R,
+ info->mpl, rtt_hint,
+ info->qs, frct_mtu);
if (flow->frcti == NULL)
goto fail_frcti;
-
- if (ssm_flow_set_add(proc.fqset, 0, info->id))
- goto fail_flow_set_add;
-
- ++proc.n_frcti;
- if (proc.n_frcti == 1 &&
- pthread_create(&proc.tx, NULL, flow_tx, NULL) < 0)
- goto fail_tx_thread;
}
- list_add_tail(&flow->next, &proc.flow_list);
-
proc.id_to_fd[info->id].fd = fd;
flow_set_state(&proc.id_to_fd[info->id], FLOW_ALLOCATED);
@@ -585,10 +816,6 @@ static int flow_init(struct flow_info * info,
return fd;
- fail_tx_thread:
- ssm_flow_set_del(proc.fqset, 0, info->id);
- fail_flow_set_add:
- frcti_destroy(flow->frcti);
fail_frcti:
crypt_destroy_ctx(flow->crypt);
fail_crypt:
@@ -655,13 +882,13 @@ static void init(int argc,
gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);
}
#endif
- proc.fds = bmp_create(PROG_MAX_FLOWS - PROG_RES_FDS, PROG_RES_FDS);
+ proc.fds = bmp_create(PROC_MAX_FLOWS - PROC_RES_FDS, PROC_RES_FDS);
if (proc.fds == NULL) {
fprintf(stderr, "FATAL: Could not create fd bitmap.\n");
goto fail_fds;
}
- proc.fqueues = bmp_create(PROG_MAX_FQUEUES, 0);
+ proc.fqueues = bmp_create(PROC_MAX_FQUEUES, 0);
if (proc.fqueues == NULL) {
fprintf(stderr, "FATAL: Could not create fqueue bitmap.\n");
goto fail_fqueues;
@@ -677,13 +904,13 @@ static void init(int argc,
goto fail_rdrb;
}
- proc.flows = malloc(sizeof(*proc.flows) * PROG_MAX_FLOWS);
+ proc.flows = malloc(sizeof(*proc.flows) * PROC_MAX_FLOWS);
if (proc.flows == NULL) {
fprintf(stderr, "FATAL: Could not malloc flows.\n");
goto fail_flows;
}
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
flow_clear(i);
proc.id_to_fd = malloc(sizeof(*proc.id_to_fd) * SYS_MAX_FLOWS);
@@ -716,20 +943,14 @@ static void init(int argc,
goto fail_fqset;
}
- proc.frct_set = fset_create();
- if (proc.frct_set == NULL || proc.frct_set->idx != 0) {
- fprintf(stderr, "FATAL: Could not create FRCT set.\n");
- goto fail_frct_set;
- }
-
- if (timerwheel_init() < 0) {
+ if (tw_init() < 0) {
fprintf(stderr, "FATAL: Could not initialize timerwheel.\n");
goto fail_timerwheel;
}
if (crypt_secure_malloc_init(PROC_SECMEM_MAX) < 0) {
fprintf(stderr, "FATAL: Could not init secure malloc.\n");
- goto fail_timerwheel;
+ goto fail_secmem;
}
#if defined PROC_FLOW_STATS
@@ -741,24 +962,15 @@ static void init(int argc,
}
}
#endif
- if (pthread_create(&proc.rx, NULL, flow_rx, NULL) < 0) {
- fprintf(stderr, "FATAL: Could not start monitor thread.\n");
- goto fail_monitor;
- }
-
- list_head_init(&proc.flow_list);
-
return;
- fail_monitor:
#if defined PROC_FLOW_STATS
- rib_fini();
fail_rib_init:
+ crypt_secure_malloc_fini();
#endif
- timerwheel_fini();
+ fail_secmem:
+ tw_fini();
fail_timerwheel:
- fset_destroy(proc.frct_set);
- fail_frct_set:
ssm_flow_set_close(proc.fqset);
fail_fqset:
pthread_rwlock_destroy(&proc.lock);
@@ -789,19 +1001,20 @@ static void fini(void)
if (proc.fds == NULL)
return;
- pthread_cancel(proc.rx);
- pthread_join(proc.rx, NULL);
+ /* Wake all in-flight readers/writers BEFORE wrlock acquire. */
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
+ if (proc.flows[i].info.id != -1)
+ flow_quiesce(i);
pthread_rwlock_wrlock(&proc.lock);
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
struct flow * flow = &proc.flows[i];
if (flow->info.id != -1) {
ssize_t idx;
- ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWDOWN);
while ((idx = ssm_rbuff_read(flow->rx_rb)) >= 0)
ssm_pool_remove(proc.pool, idx);
- __flow_fini(i);
+ do_flow_fini(i);
}
}
@@ -813,9 +1026,9 @@ static void fini(void)
#ifdef PROC_FLOW_STATS
rib_fini();
#endif
- timerwheel_fini();
+ crypt_secure_malloc_fini();
- fset_destroy(proc.frct_set);
+ tw_fini();
ssm_flow_set_close(proc.fqset);
@@ -860,6 +1073,10 @@ int flow_accept(qosspec_t * qs,
if (qs != NULL)
qs->ber = 1;
#endif
+ /* STREAM cannot tolerate loss: drops create silent gaps. */
+ if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0)
+ return -EINVAL;
+
memset(&flow, 0, sizeof(flow));
flow.n_pid = getpid();
@@ -872,13 +1089,16 @@ int flow_accept(qosspec_t * qs,
if (err < 0)
return err;
- crypt.key = key;
+ crypt.key = key;
+ crypt.epoch = 0;
+ crypt.role = CRYPT_ROLE_RESP;
err = flow__irm_result_des(&msg, &flow, &crypt);
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ /* No RTT in accept; rtt_hint=0 bootstraps from first ACK. */
+ fd = flow_init(&flow, &crypt, 0);
crypt_secure_clear(key, SYMMKEYSZ);
@@ -899,11 +1119,16 @@ int flow_alloc(const char * dst,
uint8_t key[SYMMKEYSZ];
int fd;
int err;
+ struct timespec t0;
+ struct timespec t1;
#ifdef QOS_DISABLE_CRC
if (qs != NULL)
qs->ber = 1;
#endif
+ /* STREAM cannot tolerate loss: drops create silent gaps. */
+ if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0)
+ return -EINVAL;
memset(&flow, 0, sizeof(flow));
@@ -913,19 +1138,23 @@ int flow_alloc(const char * dst,
if (flow_alloc__irm_req_ser(&msg, &flow, dst, timeo))
return -ENOMEM;
+ clock_gettime(PTHREAD_COND_CLOCK, &t0);
+
err = send_recv_msg(&msg);
- if (err < 0) {
- printf("send_recv_msg error %d\n", err);
+ if (err < 0)
return err;
- }
- crypt.key = key;
+ clock_gettime(PTHREAD_COND_CLOCK, &t1);
+
+ crypt.key = key;
+ crypt.epoch = 0;
+ crypt.role = CRYPT_ROLE_INIT;
err = flow__irm_result_des(&msg, &flow, &crypt);
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ fd = flow_init(&flow, &crypt, ts_diff_ns(&t1, &t0));
crypt_secure_clear(key, SYMMKEYSZ);
@@ -958,13 +1187,15 @@ int flow_join(const char * dst,
if (err < 0)
return err;
- crypt.key = key;
+ crypt.key = key;
+ crypt.epoch = 0;
+ crypt.role = CRYPT_ROLE_INIT;
err = flow__irm_result_des(&msg, &flow, &crypt);
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ fd = flow_init(&flow, &crypt, 0);
crypt_secure_clear(key, SYMMKEYSZ);
@@ -983,10 +1214,10 @@ int flow_dealloc(int fd)
struct flow * flow;
int err;
- if (fd < 0 || fd >= SYS_MAX_FLOWS )
+ if (fd < 0 || fd >= PROC_MAX_FLOWS )
return -EINVAL;
- memset(&info, 0, sizeof(flow));
+ memset(&info, 0, sizeof(info));
flow = &proc.flows[fd];
@@ -1008,9 +1239,8 @@ int flow_dealloc(int fd)
pthread_rwlock_rdlock(&proc.lock);
- timeo.tv_sec = frcti_dealloc(flow->frcti);
- while (timeo.tv_sec < 0) { /* keep the flow active for rtx */
- ssize_t ret;
+ while (FRCTI_LINGERING(flow->frcti)) {
+ ssize_t ret;
pthread_rwlock_unlock(&proc.lock);
@@ -1018,12 +1248,12 @@ int flow_dealloc(int fd)
pthread_rwlock_rdlock(&proc.lock);
- timeo.tv_sec = frcti_dealloc(flow->frcti);
-
- if (ret == -EFLOWDOWN && timeo.tv_sec < 0)
- timeo.tv_sec = -timeo.tv_sec;
+ if (ret == -EFLOWDOWN)
+ break;
}
+ timeo.tv_sec = FRCTI_DEALLOC(flow->frcti);
+
pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
ssm_rbuff_fini(flow->tx_rb);
@@ -1033,15 +1263,18 @@ int flow_dealloc(int fd)
info.id = flow->info.id;
info.n_pid = getpid();
- if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0)
- return -ENOMEM;
+ if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
err = send_recv_msg(&msg);
if (err < 0)
- return err;
+ goto out;
err = irm__irm_result_des(&msg);
+ out:
flow_fini(fd);
return err;
@@ -1055,12 +1288,12 @@ int ipcp_flow_dealloc(int fd)
struct flow * flow;
int err;
- if (fd < 0 || fd >= SYS_MAX_FLOWS )
+ if (fd < 0 || fd >= PROC_MAX_FLOWS )
return -EINVAL;
flow = &proc.flows[fd];
- memset(&info, 0, sizeof(flow));
+ memset(&info, 0, sizeof(info));
pthread_rwlock_rdlock(&proc.lock);
@@ -1074,15 +1307,18 @@ int ipcp_flow_dealloc(int fd)
pthread_rwlock_unlock(&proc.lock);
- if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0)
- return -ENOMEM;
+ if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
err = send_recv_msg(&msg);
if (err < 0)
- return err;
+ goto out;
err = irm__irm_result_des(&msg);
+ out:
flow_fini(fd);
return err;
@@ -1098,12 +1334,20 @@ int fccntl(int fd,
va_list l;
struct timespec * timeo;
qosspec_t * qs;
- uint32_t rx_acl;
- uint32_t tx_acl;
size_t * qlen;
struct flow * flow;
-
- if (fd < 0 || fd >= SYS_MAX_FLOWS)
+ uint16_t old_acc;
+ uint16_t new_acc;
+ size_t max;
+ size_t * maxp;
+ size_t rsz;
+ size_t * rszp;
+ time_t rto;
+ time_t * rtop;
+ int rc;
+ bool emit_eos = false;
+
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
@@ -1167,36 +1411,44 @@ int fccntl(int fd,
qlen = va_arg(l, size_t *);
*qlen = ssm_rbuff_queued(flow->tx_rb);
break;
+ case FLOWGMTU:
+ maxp = va_arg(l, size_t *);
+ if (maxp == NULL)
+ goto einval;
+ *maxp = flow_user_mtu(flow, flow->info.mtu);
+ break;
case FLOWSFLAGS:
+ old_acc = flow->oflags & FLOWFACCMODE;
flow->oflags = va_arg(l, uint32_t);
- rx_acl = ssm_rbuff_get_acl(flow->rx_rb);
- tx_acl = ssm_rbuff_get_acl(flow->rx_rb);
- /*
- * Making our own flow write only means making the
- * the other side of the flow read only.
- */
+ new_acc = flow->oflags & FLOWFACCMODE;
+
+ /* Defer EOS emit until after proc.lock is dropped: */
+ /* frcti_fin_snd may block on shm-pool/tx-rb. */
+ if (new_acc == FLOWFRDONLY
+ && old_acc != FLOWFRDONLY
+ && flow->frcti != NULL)
+ emit_eos = true;
+
+ /* Our flow write-only -> peer's read-only; restore on RDWR. */
if (flow->oflags & FLOWFWRONLY)
- rx_acl |= ACL_RDONLY;
- if (flow->oflags & FLOWFRDWR)
- rx_acl |= ACL_RDWR;
+ ssm_rbuff_clr_bits(flow->rx_rb, RB_WR);
+ else
+ ssm_rbuff_set_bits(flow->rx_rb, RB_WR);
if (flow->oflags & FLOWFDOWN) {
- rx_acl |= ACL_FLOWDOWN;
- tx_acl |= ACL_FLOWDOWN;
+ ssm_rbuff_set_bits(flow->rx_rb, RB_FLOWDOWN);
+ ssm_rbuff_set_bits(flow->tx_rb, RB_FLOWDOWN);
ssm_flow_set_notify(flow->set,
flow->info.id,
FLOW_DOWN);
} else {
- rx_acl &= ~ACL_FLOWDOWN;
- tx_acl &= ~ACL_FLOWDOWN;
+ ssm_rbuff_clr_bits(flow->rx_rb, RB_FLOWDOWN);
+ ssm_rbuff_clr_bits(flow->tx_rb, RB_FLOWDOWN);
ssm_flow_set_notify(flow->set,
flow->info.id,
FLOW_UP);
}
- ssm_rbuff_set_acl(flow->rx_rb, rx_acl);
- ssm_rbuff_set_acl(flow->tx_rb, tx_acl);
-
break;
case FLOWGFLAGS:
fflags = va_arg(l, uint32_t *);
@@ -1218,6 +1470,59 @@ int fccntl(int fd,
goto eperm;
*cflags = frcti_getflags(flow->frcti);
break;
+ case FRCTSMAXSDU:
+ max = va_arg(l, size_t);
+ if (flow->frcti == NULL)
+ goto eperm;
+ if (frcti_set_max_rcv_sdu(flow->frcti, max) < 0)
+ goto einval;
+ break;
+ case FRCTGMAXSDU:
+ maxp = va_arg(l, size_t *);
+ if (maxp == NULL)
+ goto einval;
+ if (flow->frcti == NULL)
+ goto eperm;
+ *maxp = frcti_get_max_rcv_sdu(flow->frcti);
+ break;
+ case FRCTSRRINGSZ:
+ rsz = va_arg(l, size_t);
+ if (flow->frcti == NULL)
+ goto eperm;
+ rc = frcti_set_rcv_ring_sz(flow->frcti, rsz);
+ if (rc < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ va_end(l);
+ return rc;
+ }
+ break;
+ case FRCTGRRINGSZ:
+ rszp = va_arg(l, size_t *);
+ if (rszp == NULL)
+ goto einval;
+ if (flow->frcti == NULL)
+ goto eperm;
+ *rszp = frcti_get_rcv_ring_sz(flow->frcti);
+ break;
+ case FRCTSRTOMIN:
+ if (flow->frcti == NULL)
+ goto eperm;
+ rto = va_arg(l, time_t);
+ rc = frcti_set_rto_min(flow->frcti, rto);
+ if (rc < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ va_end(l);
+ return rc;
+ }
+ break;
+ case FRCTGRTOMIN:
+ if (flow->frcti == NULL)
+ goto eperm;
+ rtop = va_arg(l, time_t *);
+ if (rtop == NULL)
+ goto einval;
+ *rtop = frcti_get_rto_min(flow->frcti);
+ break;
default:
pthread_rwlock_unlock(&proc.lock);
va_end(l);
@@ -1227,6 +1532,9 @@ int fccntl(int fd,
pthread_rwlock_unlock(&proc.lock);
+ if (emit_eos)
+ frcti_fin_snd(flow->frcti);
+
va_end(l);
return 0;
@@ -1241,86 +1549,275 @@ int fccntl(int fd,
return -EPERM;
}
-static int chk_crc(struct ssm_pk_buff * spb)
-{
- uint32_t crc;
- uint8_t * head = ssm_pk_buff_head(spb);
- uint8_t * tail = ssm_pk_buff_tail_release(spb, CRCLEN);
-
- mem_hash(HASH_CRC32, &crc, head, tail - head);
-
- return !(crc == *((uint32_t *) tail));
-}
-
-static int add_crc(struct ssm_pk_buff * spb)
-{
- uint8_t * head;
- uint8_t * tail;
-
- tail = ssm_pk_buff_tail_alloc(spb, CRCLEN);
- if (tail == NULL)
- return -ENOMEM;
-
- head = ssm_pk_buff_head(spb);
- mem_hash(HASH_CRC32, tail, head, tail - head);
-
- return 0;
-}
-
static int flow_tx_spb(struct flow * flow,
struct ssm_pk_buff * spb,
+ uint16_t flags,
bool block,
struct timespec * abstime)
{
struct timespec now;
ssize_t idx;
+ size_t pci_total;
int ret;
clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_wrlock(&proc.lock);
-
flow->snd_act = now;
- pthread_rwlock_unlock(&proc.lock);
-
- idx = ssm_pk_buff_get_idx(spb);
-
- pthread_rwlock_rdlock(&proc.lock);
+ idx = ssm_pk_buff_get_off(spb);
if (ssm_pk_buff_len(spb) > 0) {
- if (frcti_snd(flow->frcti, spb) < 0)
+ if (FRCTI_SND(flow->frcti, spb, flags) < 0)
goto enomem;
- if (spb_encrypt(flow, spb) < 0)
- goto enomem;
+ if (flow->info.qs.ber == 0) {
+ pci_total = flow->frcti != NULL
+ ? frcti_data_hdr_len(flow->frcti) : 0;
+ if (crc_add(spb, pci_total) != 0)
+ goto enomem;
+ }
- if (flow->info.qs.ber == 0 && add_crc(spb) != 0)
+ if (spb_encrypt(flow, spb) < 0)
goto enomem;
}
- pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
-
if (!block)
ret = ssm_rbuff_write(flow->tx_rb, idx);
else
ret = ssm_rbuff_write_b(flow->tx_rb, idx, abstime);
if (ret < 0)
- ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
-
- pthread_cleanup_pop(true);
+ return ret;
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
return 0;
-enomem:
- pthread_rwlock_unlock(&proc.lock);
- ssm_pool_remove(proc.pool, idx);
+ enomem:
return -ENOMEM;
}
+/*
+ * Per-fragment role for fragment i out of n; n == 1 yields SOLE.
+ *
+ * For n > 1 the first fragment (i == 0) is FRCT_FR_FIRST, the last
+ * (i + 1 == n) is FRCT_FR_LAST and every other one is FRCT_FR_MID.
+ * flow_write_frag passes the result to flow_tx_spb as its flags.
+ */
+static __inline__ uint16_t flow_frag_role(size_t i, size_t n)
+{
+ if (n == 1)
+ return FRCT_FR_SOLE;
+
+ if (i == 0)
+ return FRCT_FR_FIRST;
+
+ if (i + 1 == n)
+ return FRCT_FR_LAST;
+
+ return FRCT_FR_MID;
+}
+
+/*
+ * Write count bytes from buf on a stream flow, one packet per chunk.
+ *
+ * The data is cut into chunks of at most FRCTI_PAYLOAD_CAP bytes. For
+ * every chunk the function waits for one window slot, allocates a
+ * packet buffer from proc.pool, copies the chunk in and sends it with
+ * flow_tx_spb (flags 0). Calls block unless FLOWFWNOBLOCK is set in
+ * oflags; dl is the absolute deadline handed to the blocking calls.
+ *
+ * Returns count when everything was sent. When a step fails after some
+ * bytes went out, the number of bytes already sent is returned; when
+ * nothing was sent yet, the negative result of the failing step.
+ * Returns -EMSGSIZE when FRCTI_IS_FRTX is false for the flow or when
+ * its payload capacity is 0.
+ */
+static ssize_t flow_write_stream(struct flow * flow,
+ const void * buf,
+ size_t count,
+ int oflags,
+ struct timespec * dl)
+{
+ const uint8_t * src = buf;
+ size_t payload;
+ size_t off = 0;
+ bool block = !(oflags & FLOWFWNOBLOCK);
+
+ if (!FRCTI_IS_FRTX(flow->frcti))
+ return -EMSGSIZE;
+
+ payload = FRCTI_PAYLOAD_CAP(flow->frcti);
+
+ /* With zero capacity clen stays 0 and off would never advance. */
+ if (payload == 0)
+ return -EMSGSIZE;
+
+ while (off < count) {
+ struct ssm_pk_buff * spb;
+ uint8_t * ptr;
+ ssize_t idx;
+ size_t clen;
+ int ret;
+
+ ret = flow_wait_window(flow, 1, block, dl);
+ if (ret < 0)
+ return off > 0 ? (ssize_t) off : (ssize_t) ret;
+
+ clen = MIN(count - off, payload);
+
+ if (block)
+ idx = ssm_pool_alloc_b(proc.pool, clen, &ptr,
+ &spb, dl);
+ else
+ idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb);
+ if (idx < 0)
+ return off > 0 ? (ssize_t) off : idx;
+
+ memcpy(ptr, src + off, clen);
+
+ ret = flow_tx_spb(flow, spb, 0, block, dl);
+ if (ret < 0) {
+ /* The buffer was not queued: give it back to the pool. */
+ ssm_pool_remove(proc.pool, idx);
+ return off > 0 ? (ssize_t) off : (ssize_t) ret;
+ }
+
+ off += clen;
+ }
+
+ return (ssize_t) count;
+}
+
+/*
+ * Per-fragment flow_tx_spb loop. Raw flows refuse; FRCT splits the SDU.
+ *
+ * The SDU of count bytes is cut into n fragments of at most
+ * FRCTI_PAYLOAD_CAP bytes. The function waits once for n window slots,
+ * then allocates, fills and sends each fragment with the role given by
+ * flow_frag_role. Calls block unless FLOWFWNOBLOCK is set in oflags; dl
+ * is the absolute deadline handed to the blocking calls.
+ *
+ * Returns count when all fragments were sent. When an allocation or
+ * send fails after some fragments went out, the number of bytes already
+ * sent is returned; when nothing was sent yet, the negative result of
+ * the failing step. Returns -EMSGSIZE when the flow has no frcti, when
+ * the payload capacity is 0, when the fragment count cannot be computed
+ * without overflow, or when it exceeds RQ_SIZE.
+ */
+static ssize_t flow_write_frag(struct flow * flow,
+ const void * buf,
+ size_t count,
+ int oflags,
+ struct timespec * dl)
+{
+ const uint8_t * src = buf;
+ size_t frag_payload;
+ size_t n;
+ size_t off = 0;
+ size_t i;
+ int ret;
+ bool block = !(oflags & FLOWFWNOBLOCK);
+
+ /* Raw flows carry no PCI; cannot fragment. */
+ if (flow->frcti == NULL)
+ return -EMSGSIZE;
+
+ frag_payload = FRCTI_PAYLOAD_CAP(flow->frcti);
+
+ /* Zero capacity would wrap the guard below and divide by zero. */
+ if (frag_payload == 0)
+ return -EMSGSIZE;
+
+ /* Guard the ceil-divide against size_t overflow. */
+ if (count > SIZE_MAX - frag_payload + 1)
+ return -EMSGSIZE;
+
+ n = (count + frag_payload - 1) / frag_payload;
+
+ /* SDU larger than the FC window can ever offer would deadlock. */
+ if (n > RQ_SIZE)
+ return -EMSGSIZE;
+
+ /* SDU-atomic FC: wait for n seqnos to avoid overshoot mid-SDU. */
+ ret = flow_wait_window(flow, n, block, dl);
+ if (ret < 0)
+ return (ssize_t) ret;
+
+ STAT_BUMP(flow->frcti, sdu_snd_frag);
+
+ for (i = 0; i < n; ++i) {
+ struct ssm_pk_buff * spb;
+ uint8_t * ptr;
+ ssize_t idx;
+ size_t clen;
+
+ /* Only the last fragment may be shorter than frag_payload. */
+ clen = (i + 1 == n) ? (count - off) : frag_payload;
+
+ if (block)
+ idx = ssm_pool_alloc_b(proc.pool, clen, &ptr,
+ &spb, dl);
+ else
+ idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb);
+ if (idx < 0) {
+ if (off > 0)
+ STAT_BUMP(flow->frcti, sdu_snd_alloc);
+ return off > 0 ? (ssize_t) off : idx;
+ }
+
+ memcpy(ptr, src + off, clen);
+
+ ret = flow_tx_spb(flow, spb, flow_frag_role(i, n),
+ block, dl);
+ if (ret < 0) {
+ /* The buffer was not queued: give it back to the pool. */
+ ssm_pool_remove(proc.pool, idx);
+ if (off > 0)
+ STAT_BUMP(flow->frcti, sdu_snd_tx);
+ return off > 0 ? (ssize_t) off : (ssize_t) ret;
+ }
+
+ off += clen;
+ }
+
+ return (ssize_t) count;
+}
+
+/*
+ * Initiator promotes on the install grace (it holds the key-confirm
+ * tag); responder waits for peer_synced, with a near-exhaustion floor.
+ *
+ * Does nothing when the flow has no crypt context or when rk_grace is
+ * all zero. flow_rekey sets rk_grace after a successful crypt_rekey and
+ * this function zeroes it again once crypt_tx_promote has been called,
+ * so a non-zero rk_grace marks a promotion that is still pending.
+ */
+static void flow_tx_promote(struct flow * flow)
+{
+ struct timespec now;
+ int nodes_left;
+ bool promote;
+
+ if (flow->crypt == NULL)
+ return;
+
+ /* Zero rk_grace: no promotion pending. */
+ if (flow->rk_grace.tv_sec == 0 && flow->rk_grace.tv_nsec == 0)
+ return;
+
+ promote = crypt_peer_synced(flow->crypt);
+
+ /* Initiator: promote once the current time has reached rk_grace. */
+ if (!promote && flow->rk_initiator) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ promote = ts_diff_ns(&now, &flow->rk_grace) >= 0;
+ }
+
+ /* Responder: promote anyway within REKEY_PROMOTE_FLOOR nodes left. */
+ if (!promote && !flow->rk_initiator) {
+ nodes_left = crypt_nodes_left(flow->crypt);
+ promote = nodes_left >= 0 && nodes_left <= REKEY_PROMOTE_FLOOR;
+ }
+
+ if (!promote)
+ return;
+
+ crypt_tx_promote(flow->crypt);
+ /* Clear the marker so later calls return early. */
+ flow->rk_grace.tv_sec = 0;
+ flow->rk_grace.tv_nsec = 0;
+}
+
+/*
+ * Send a flow_update request for flow with the last argument of
+ * flow_update__irm_req_ser set to true (flow_rekey passes false).
+ *
+ * The reply carries no key; the seed arrives later over RB_REKEY.
+ *
+ * Returns 0 when the request was serialised and send_recv_msg
+ * succeeded, -1 when the flow has no id or no crypt context, or when
+ * serialising or the exchange failed.
+ */
+static int flow_rekey_trigger(struct flow * flow)
+{
+ struct flow_info info;
+ uint8_t buf[SOCK_BUF_SIZE];
+ buffer_t msg = {SOCK_BUF_SIZE, buf};
+
+ /* Copy flow->info under proc.lock (rd); the exchange runs unlocked. */
+ pthread_rwlock_rdlock(&proc.lock);
+ if (flow->info.id < 0 || flow->crypt == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return -1;
+ }
+ info = flow->info;
+ pthread_rwlock_unlock(&proc.lock);
+
+ if (flow_update__irm_req_ser(&msg, &info, true) < 0)
+ return -1;
+
+ /* The reply left in msg is not parsed here. */
+ if (send_recv_msg(&msg) < 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Tell whether flow_write should trigger a watermark re-key now.
+ *
+ * Returns false when KEY_REKEY_WATERMARK is 0, when the flow has no
+ * crypt context, when rk_wm_inflight is set, on every call whose
+ * rk_wm_ctr tick has any of the low FLOW_WM_CHECK - 1 bits set, and
+ * when rx_rb already carries RB_REKEY. Otherwise returns whether
+ * crypt_nodes_left is at or below KEY_REKEY_WATERMARK.
+ *
+ * Side effect: rk_wm_ctr is incremented on every call that gets past
+ * the first three checks.
+ */
+static bool flow_wm_due(struct flow * flow)
+{
+ uint32_t tick;
+
+ if (KEY_REKEY_WATERMARK == 0)
+ return false;
+
+ if (flow->crypt == NULL)
+ return false;
+
+ if (LOAD_RELAXED(&flow->rk_wm_inflight))
+ return false;
+
+ /* NOTE(review): mask assumes FLOW_WM_CHECK is a power of two - confirm. */
+ tick = FETCH_ADD_RELAXED(&flow->rk_wm_ctr, 1);
+ if ((tick & (FLOW_WM_CHECK - 1)) != 0)
+ return false;
+
+ if (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY)
+ return false;
+
+ /*
+ * NOTE(review): flow_tx_promote ignores a negative crypt_nodes_left;
+ * here a negative value is not filtered out - confirm the intent.
+ */
+ return crypt_nodes_left(flow->crypt) <= KEY_REKEY_WATERMARK;
+}
+
ssize_t flow_write(int fd,
const void * buf,
size_t count)
@@ -1330,74 +1827,90 @@ ssize_t flow_write(int fd,
int ret;
int flags;
struct timespec abs;
- struct timespec * abstime = NULL;
+ struct timespec now;
+ struct timespec * dl = NULL;
struct ssm_pk_buff * spb;
uint8_t * ptr;
if (buf == NULL && count != 0)
return -EINVAL;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
- clock_gettime(PTHREAD_COND_CLOCK, &abs);
-
- pthread_rwlock_wrlock(&proc.lock);
+ pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
return -ENOTALLOC;
}
+ flags = flow->oflags;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
if (flow->snd_timesout) {
- ts_add(&abs, &flow->snd_timeo, &abs);
- abstime = &abs;
+ ts_add(&now, &flow->snd_timeo, &abs);
+ dl = &abs;
}
- flags = flow->oflags;
-
pthread_rwlock_unlock(&proc.lock);
if ((flags & FLOWFACCMODE) == FLOWFRDONLY)
return -EPERM;
- if (flags & FLOWFWNOBLOCK) {
- if (!frcti_is_window_open(flow->frcti))
- return -EAGAIN;
- idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb);
- } else {
- ret = frcti_window_wait(flow->frcti, abstime);
+ if (flow->crypt != NULL
+ && (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY))
+ flow_rekey(flow);
+
+ flow_tx_promote(flow);
+
+ /* Pre-empt TX key exhaustion; the timer is the backstop. */
+ if (flow_wm_due(flow)) {
+ STORE_RELAXED(&flow->rk_wm_inflight, true);
+ if (flow_rekey_trigger(flow) < 0)
+ STORE_RELAXED(&flow->rk_wm_inflight, false);
+ }
+
+ tw_move_safe();
+
+ if (flow->frcti != NULL) {
+ /* Pump rx_rb so a pure-writer processes ACKs. */
+ ret = flow_wait_window(flow, 1, !(flags & FLOWFWNOBLOCK), dl);
if (ret < 0)
return ret;
- idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, abstime);
+
+ if (count > 0 && FRCTI_IS_STREAM(flow->frcti))
+ return flow_write_stream(flow, buf, count, flags, dl);
+
+ if (FRCTI_NEEDS_FRAG(flow->frcti, count))
+ return flow_write_frag(flow, buf, count, flags, dl);
+ } else if (flow->info.mtu > 0
+ && count > flow_user_mtu(flow, flow->info.mtu)) {
+ /* Raw flows carry no PCI; refuse anything > one n-1 frame. */
+ return -EMSGSIZE;
}
+ if (flags & FLOWFWNOBLOCK)
+ idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb);
+ else
+ idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, dl);
if (idx < 0)
return idx;
if (count > 0)
memcpy(ptr, buf, count);
- ret = flow_tx_spb(flow, spb, !(flags & FLOWFWNOBLOCK), abstime);
-
- return ret < 0 ? (ssize_t) ret : (ssize_t) count;
-}
-
-static bool invalid_pkt(struct flow * flow,
- struct ssm_pk_buff * spb)
-{
- if (spb == NULL || ssm_pk_buff_len(spb) == 0)
- return true;
-
- if (flow->info.qs.ber == 0 && chk_crc(spb) != 0)
- return true;
-
- if (spb_decrypt(flow, spb) < 0)
- return true;
+ ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE,
+ !(flags & FLOWFWNOBLOCK), dl);
+ if (ret < 0) {
+ ssm_pool_remove(proc.pool, idx);
+ return (ssize_t) ret;
+ }
- return false;
+ return (ssize_t) count;
}
static ssize_t flow_rx_spb(struct flow * flow,
@@ -1408,19 +1921,14 @@ static ssize_t flow_rx_spb(struct flow * flow,
ssize_t idx;
struct timespec now;
- idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime) :
- ssm_rbuff_read(flow->rx_rb);
+ idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime)
+ : ssm_rbuff_read(flow->rx_rb);
if (idx < 0)
return idx;
clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_wrlock(&proc.lock);
-
flow->rcv_act = now;
- pthread_rwlock_unlock(&proc.lock);
-
*spb = ssm_pool_get(proc.pool, idx);
if (invalid_pkt(flow, *spb)) {
@@ -1431,28 +1939,128 @@ static ssize_t flow_rx_spb(struct flow * flow,
return idx;
}
+/*
+ * Fetch the pool index of the next valid packet on flow's rx ring.
+ *
+ * flow_read calls this when no partial read is pending and the flow has
+ * no frcti. Before every read attempt a pending RB_REKEY on an
+ * encrypted flow is served with flow_rekey.
+ *
+ * Non-blocking (block == false): one ssm_rbuff_read per pass; a
+ * negative result, or a packet rejected by invalid_pkt, gives -EAGAIN.
+ * Blocking: waits with ssm_rbuff_read_b until the deadline produced by
+ * compute_wait_deadline; -ETIMEDOUT is returned only when
+ * deadline_passed(dl) holds, other negative results are returned
+ * unchanged, and rejected packets are removed from the pool and the
+ * next one is tried.
+ *
+ * NOTE(review): the non-blocking path maps every negative
+ * ssm_rbuff_read result to -EAGAIN; confirm that no other error has to
+ * reach the caller.
+ */
+static ssize_t raw_flow_read_pkt(struct flow * flow,
+ bool block,
+ struct timespec * dl)
+{
+ struct ssm_pk_buff * spb;
+ struct timespec wait_abs;
+ ssize_t idx;
+
+ while (true) {
+ if (flow->crypt != NULL
+ && (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY))
+ flow_rekey(flow);
+
+ if (!block) {
+ idx = ssm_rbuff_read(flow->rx_rb);
+ if (idx < 0)
+ return -EAGAIN;
+ } else {
+ compute_wait_deadline(dl, &wait_abs);
+ idx = ssm_rbuff_read_b(flow->rx_rb, &wait_abs);
+ if (idx == -ETIMEDOUT) {
+ /* Clamped wait expired; only dl ends the read. */
+ if (deadline_passed(dl))
+ return -ETIMEDOUT;
+ continue;
+ }
+ if (idx < 0)
+ return idx;
+ }
+
+ spb = ssm_pool_get(proc.pool, idx);
+ if (!invalid_pkt(flow, spb))
+ return idx;
+
+ /* Rejected packet: drop it from the pool. */
+ ssm_pool_remove(proc.pool, idx);
+ if (!block)
+ return -EAGAIN;
+ }
+}
+
+/*
+ * Copy the payload of spb into buf and update flow->part_idx.
+ *
+ * n is the packet length. When it fits in count bytes, the whole
+ * packet is copied and released; part_idx becomes DONE_PART when
+ * partrd is set and n == count (flow_read then returns 0 on its next
+ * call), NO_PART otherwise, and n is returned.
+ *
+ * When the packet is larger than count: with partrd set, count bytes
+ * are copied, ssm_pk_buff_pop is called, part_idx is set to idx and
+ * count is returned; without partrd the packet is released and
+ * -EMSGSIZE is returned.
+ */
+static ssize_t deliver_pkt(struct flow * flow,
+ struct ssm_pk_buff * spb,
+ ssize_t idx,
+ void * buf,
+ size_t count,
+ bool partrd)
+{
+ uint8_t * packet = ssm_pk_buff_head(spb);
+ ssize_t n = ssm_pk_buff_len(spb);
+
+ assert(n >= 0);
+
+ if (n <= (ssize_t) count) {
+ memcpy(buf, packet, n);
+ ipcp_spb_release(spb);
+ if (partrd && n == (ssize_t) count)
+ flow->part_idx = DONE_PART;
+ else
+ flow->part_idx = NO_PART;
+
+ return n;
+ }
+
+ if (partrd) {
+ memcpy(buf, packet, count);
+ /*
+ * NOTE(review): pops n (the full length) although only count
+ * bytes were copied - confirm against ssm_pk_buff_pop.
+ */
+ ssm_pk_buff_pop(spb, n);
+ flow->part_idx = idx;
+ return count;
+ }
+
+ ipcp_spb_release(spb);
+ return -EMSGSIZE;
+}
+
+/*
+ * Drive frcti_consume until it delivers or errors.
+ *
+ * Each pass drains the rx ring with flow_drain_rx_nb and then calls
+ * FRCTI_CONSUME. A result >= 0 ends the loop; a negative result other
+ * than -EAGAIN is returned as is. On -EAGAIN a non-blocking caller
+ * gets -EAGAIN, a blocking caller goes through flow_rx_one with
+ * deadline dl, and a negative flow_rx_one result is returned.
+ *
+ * On success flow->rcv_act is set to the current PTHREAD_COND_CLOCK
+ * time and the FRCTI_CONSUME result is returned.
+ */
+static ssize_t flow_read_frcti(struct flow * flow,
+ void * buf,
+ size_t count,
+ bool block,
+ struct timespec * dl)
+{
+ struct timespec now;
+ ssize_t bytes;
+ int rc;
+
+ while (true) {
+ flow_drain_rx_nb(flow);
+ bytes = FRCTI_CONSUME(flow->frcti, buf, count);
+ if (bytes >= 0)
+ break;
+ if (bytes != -EAGAIN)
+ return bytes;
+ if (!block)
+ return -EAGAIN;
+ rc = flow_rx_one(flow, dl);
+ if (rc < 0)
+ return rc;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ flow->rcv_act = now;
+
+ return bytes;
+}
+
ssize_t flow_read(int fd,
void * buf,
size_t count)
{
- ssize_t idx;
- ssize_t n;
- uint8_t * packet;
+ struct flow * flow;
struct ssm_pk_buff * spb;
struct timespec abs;
struct timespec now;
- struct timespec * abstime = NULL;
- struct flow * flow;
+ struct timespec * dl = NULL;
+ ssize_t idx;
bool block;
bool partrd;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
@@ -1461,8 +2069,8 @@ ssize_t flow_read(int fd,
}
if (flow->part_idx == DONE_PART) {
- pthread_rwlock_unlock(&proc.lock);
flow->part_idx = NO_PART;
+ pthread_rwlock_unlock(&proc.lock);
return 0;
}
@@ -1470,75 +2078,40 @@ ssize_t flow_read(int fd,
partrd = !(flow->oflags & FLOWFRNOPART);
if (flow->rcv_timesout) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
ts_add(&now, &flow->rcv_timeo, &abs);
- abstime = &abs;
- }
-
- idx = flow->part_idx;
- if (idx < 0) {
- while ((idx = frcti_queued_pdu(flow->frcti)) < 0) {
- pthread_rwlock_unlock(&proc.lock);
-
- idx = flow_rx_spb(flow, &spb, block, abstime);
- if (idx < 0) {
- if (block && idx != -EAGAIN)
- return idx;
- if (!block)
- return idx;
-
- pthread_rwlock_rdlock(&proc.lock);
- continue;
- }
-
- pthread_rwlock_rdlock(&proc.lock);
-
- frcti_rcv(flow->frcti, spb);
- }
+ dl = &abs;
}
- spb = ssm_pool_get(proc.pool, idx);
-
pthread_rwlock_unlock(&proc.lock);
- packet = ssm_pk_buff_head(spb);
+ if (flow->crypt != NULL
+ && (ssm_rbuff_get_flags(flow->rx_rb) & RB_REKEY))
+ flow_rekey(flow);
- n = ssm_pk_buff_len(spb);
+ /* Advance TX off a stale epoch even on recv-mostly (ACK-only) flows. */
+ flow_tx_promote(flow);
- assert(n >= 0);
+ tw_move_safe();
- if (n <= (ssize_t) count) {
- memcpy(buf, packet, n);
- ipcp_spb_release(spb);
-
- pthread_rwlock_wrlock(&proc.lock);
-
- flow->part_idx = (partrd && n == (ssize_t) count) ?
- DONE_PART : NO_PART;
+ idx = flow->part_idx;
+ if (idx < 0 && flow->frcti != NULL)
+ return flow_read_frcti(flow, buf, count, block, dl);
- flow->rcv_act = now;
+ if (idx < 0) {
+ idx = raw_flow_read_pkt(flow, block, dl);
+ if (idx < 0)
+ return idx;
+ }
- pthread_rwlock_unlock(&proc.lock);
- return n;
- } else {
- if (partrd) {
- memcpy(buf, packet, count);
- ssm_pk_buff_head_release(spb, n);
- pthread_rwlock_wrlock(&proc.lock);
- flow->part_idx = idx;
+ spb = ssm_pool_get(proc.pool, idx);
- flow->rcv_act = now;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ flow->rcv_act = now;
- pthread_rwlock_unlock(&proc.lock);
- return count;
- } else {
- ipcp_spb_release(spb);
- return -EMSGSIZE;
- }
- }
+ return deliver_pkt(flow, spb, idx, buf, count, partrd);
}
-/* fqueue functions. */
-
struct flow_set * fset_create(void)
{
struct flow_set * set;
@@ -1614,7 +2187,7 @@ int fset_add(struct flow_set * set,
struct flow * flow;
int ret;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return -EINVAL;
flow = &proc.flows[fd];
@@ -1650,7 +2223,7 @@ void fset_del(struct flow_set * set,
{
struct flow * flow;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return;
flow = &proc.flows[fd];
@@ -1661,7 +2234,7 @@ void fset_del(struct flow_set * set,
ssm_flow_set_del(proc.fqset, set->idx, flow->info.id);
if (flow->frcti != NULL)
- ssm_flow_set_add(proc.fqset, 0, proc.flows[fd].info.id);
+ ssm_flow_set_add(proc.fqset, 0, flow->info.id);
pthread_rwlock_unlock(&proc.lock);
}
@@ -1672,7 +2245,7 @@ bool fset_has(const struct flow_set * set,
struct flow * flow;
bool ret;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return false;
flow = &proc.flows[fd];
@@ -1691,61 +2264,71 @@ bool fset_has(const struct flow_set * set,
return ret;
}
-/* Filter fqueue events for non-data packets */
static int fqueue_filter(struct fqueue * fq)
{
struct ssm_pk_buff * spb;
int fd;
ssize_t idx;
struct frcti * frcti;
+ int ret = 0;
+
+ /* proc.lock rdlock gates frcti_destroy via flow_fini wrlock. */
+ pthread_rwlock_rdlock(&proc.lock);
while (fq->next < fq->fqsize) {
- if (fq->fqueue[fq->next].event != FLOW_PKT)
- return 1;
+ if (fq->fqueue[fq->next].event == FLOW_UPD) {
+ /* Re-key doorbell: pull internally, never surface. */
+ fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd;
+ ++fq->next;
+ if (fd >= 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ flow_rekey(&proc.flows[fd]);
+ pthread_rwlock_rdlock(&proc.lock);
+ }
+ continue;
+ }
- pthread_rwlock_rdlock(&proc.lock);
+ if (fq->fqueue[fq->next].event != FLOW_PKT) {
+ ret = 1;
+ goto out;
+ }
fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd;
if (fd < 0) {
++fq->next;
- pthread_rwlock_unlock(&proc.lock);
continue;
}
frcti = proc.flows[fd].frcti;
if (frcti == NULL) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ ret = 1;
+ goto out;
}
- if (__frcti_pdu_ready(frcti) >= 0) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ if (FRCTI_PDU_READY(frcti)) {
+ ret = 1;
+ goto out;
}
- pthread_rwlock_unlock(&proc.lock);
-
idx = flow_rx_spb(&proc.flows[fd], &spb, false, NULL);
if (idx < 0)
- return 0;
-
- pthread_rwlock_rdlock(&proc.lock);
+ goto out;
spb = ssm_pool_get(proc.pool, idx);
- __frcti_rcv(frcti, spb);
+ FRCTI_RCV(frcti, spb);
- if (__frcti_pdu_ready(frcti) >= 0) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ if (FRCTI_PDU_READY(frcti)) {
+ ret = 1;
+ goto out;
}
- pthread_rwlock_unlock(&proc.lock);
-
++fq->next;
}
- return 0;
+ out:
+ pthread_rwlock_unlock(&proc.lock);
+ return ret;
}
int fqueue_next(struct fqueue * fq)
@@ -1792,7 +2375,8 @@ ssize_t fevent(struct flow_set * set,
{
ssize_t ret = 0;
struct timespec abs;
- struct timespec * t = NULL;
+ struct timespec * dl = NULL;
+ struct timespec wait_abs;
if (set == NULL || fq == NULL)
return -EINVAL;
@@ -1800,17 +2384,26 @@ ssize_t fevent(struct flow_set * set,
if (fq->fqsize > 0 && fq->next != fq->fqsize)
return 1;
- clock_gettime(PTHREAD_COND_CLOCK, &abs);
-
if (timeo != NULL) {
- ts_add(&abs, timeo, &abs);
- t = &abs;
+ struct timespec now;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, timeo, &abs);
+ dl = &abs;
}
while (ret == 0) {
- ret = ssm_flow_set_wait(proc.fqset, set->idx, fq->fqueue, t);
- if (ret == -ETIMEDOUT)
- return -ETIMEDOUT;
+ tw_move_safe();
+
+ compute_wait_deadline(dl, &wait_abs);
+
+ ret = ssm_flow_set_wait(proc.fqset, set->idx,
+ fq->fqueue, &wait_abs);
+ if (ret == -ETIMEDOUT) {
+ if (deadline_passed(dl))
+ return -ETIMEDOUT;
+ ret = 0;
+ continue;
+ }
fq->fqsize = ret;
fq->next = 0;
@@ -1823,13 +2416,12 @@ ssize_t fevent(struct flow_set * set,
return 1;
}
-/* ipcp-dev functions. */
-
int np1_flow_alloc(pid_t n_pid,
int flow_id)
{
struct flow_info flow;
- struct crypt_sk crypt = { .nid = NID_undef, .key = NULL };
+ struct crypt_sk crypt = { .nid = NID_undef, .key = NULL,
+ .epoch = 0, .role = CRYPT_ROLE_INIT };
memset(&flow, 0, sizeof(flow));
@@ -1837,9 +2429,10 @@ int np1_flow_alloc(pid_t n_pid,
flow.n_pid = getpid();
flow.qs = qos_np1;
flow.mpl = 0;
- flow.n_1_pid = n_pid; /* This "flow" is upside-down! */
+ /* np1 flow: n_1_pid is the upper. */
+ flow.n_1_pid = n_pid;
- return flow_init(&flow, &crypt);
+ return flow_init(&flow, &crypt, 0);
}
int np1_flow_dealloc(int flow_id,
@@ -1847,12 +2440,7 @@ int np1_flow_dealloc(int flow_id,
{
int fd;
- /*
- * TODO: Don't pass timeo to the IPCP but wait in IRMd.
- * This will need async ops, waiting until we bootstrap
- * the IRMd over ouroboros.
- */
-
+ /* TODO: wait in IRMd, not here; needs async ops. */
sleep(timeo);
pthread_rwlock_rdlock(&proc.lock);
@@ -1881,6 +2469,38 @@ int np1_flow_resp(int flow_id,
return fd;
}
+/*
+ * Look up the fd stored in proc.id_to_fd for flow_id.
+ * Returns -1 if flow_id is outside [0, SYS_MAX_FLOWS); otherwise the
+ * stored fd, read under the proc.lock read lock.
+ */
+int np1_flow_fd(int flow_id)
+{
+        int fd = -1;
+
+        if (flow_id >= 0 && flow_id < SYS_MAX_FLOWS) {
+                pthread_rwlock_rdlock(&proc.lock);
+                fd = proc.id_to_fd[flow_id].fd;
+                pthread_rwlock_unlock(&proc.lock);
+        }
+
+        return fd;
+}
+
+/*
+ * Look up the flow id stored in proc.flows for fd.
+ * Returns -1 if fd is outside [0, PROC_MAX_FLOWS); otherwise the
+ * stored info.id, read under the proc.lock read lock.
+ */
+int np1_flow_id(int fd)
+{
+        int id = -1;
+
+        if (fd >= 0 && fd < PROC_MAX_FLOWS) {
+                pthread_rwlock_rdlock(&proc.lock);
+                id = proc.flows[fd].info.id;
+                pthread_rwlock_unlock(&proc.lock);
+        }
+
+        return id;
+}
+
int ipcp_create_r(const struct ipcp_info * info)
{
uint8_t buf[SOCK_BUF_SIZE];
@@ -1900,6 +2520,7 @@ int ipcp_create_r(const struct ipcp_info * info)
int ipcp_flow_req_arr(const buffer_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct flow_info flow;
@@ -1916,6 +2537,7 @@ int ipcp_flow_req_arr(const buffer_t * dst,
flow.n_1_pid = getpid();
flow.qs = qs;
flow.mpl = mpl;
+ flow.mtu = mtu;
if (ipcp_flow_req_arr__irm_req_ser(&msg, dst, &flow, data) < 0)
return -ENOMEM;
@@ -1924,28 +2546,56 @@ int ipcp_flow_req_arr(const buffer_t * dst,
if (err < 0)
return err;
- crypt.key = key;
+ crypt.key = key;
+ crypt.epoch = 0;
+ crypt.role = CRYPT_ROLE_INIT;
err = flow__irm_result_des(&msg, &flow, &crypt);
if (err < 0)
return err;
- assert(crypt.nid == NID_undef); /* np1 flows are not encrypted */
+ /* np1 flows are not encrypted. */
+ assert(crypt.nid == NID_undef);
- /* inverted for np1_flow */
+ /* Inverted for np1_flow. */
flow.n_1_pid = flow.n_pid;
flow.n_pid = getpid();
flow.mpl = 0;
+ flow.mtu = 0;
flow.qs = qos_np1;
crypt.nid = NID_undef;
- return flow_init(&flow, &crypt);
+ return flow_init(&flow, &crypt, 0);
+}
+
+/*
+ * Serialize a flow-update request for flow_id carrying data, exchange
+ * it via send_recv_msg() and return the deserialized result.
+ * Returns -ENOMEM if serialization fails, the send_recv_msg() error if
+ * the exchange fails, else the value of irm__irm_result_des().
+ */
+int ipcp_flow_update_arr(int              flow_id,
+                         const buffer_t * data)
+{
+        uint8_t          buf[SOCK_BUF_SIZE];
+        buffer_t         msg = {SOCK_BUF_SIZE, buf};
+        struct flow_info info;
+        int              ret;
+
+        /* Only the id and this process' pid are filled in. */
+        memset(&info, 0, sizeof(info));
+        info.id      = flow_id;
+        info.n_1_pid = getpid();
+
+        if (ipcp_flow_update_arr__irm_req_ser(&msg, &info, data) < 0)
+                return -ENOMEM;
+
+        ret = send_recv_msg(&msg);
+        if (ret < 0)
+                return ret;
+
+        return irm__irm_result_des(&msg);
+}
int ipcp_flow_alloc_reply(int fd,
int response,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct flow_info flow;
@@ -1953,7 +2603,7 @@ int ipcp_flow_alloc_reply(int fd,
buffer_t msg = {SOCK_BUF_SIZE, buf};
int err;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
pthread_rwlock_rdlock(&proc.lock);
@@ -1962,6 +2612,7 @@ int ipcp_flow_alloc_reply(int fd,
pthread_rwlock_unlock(&proc.lock);
flow.mpl = mpl;
+ flow.mtu = mtu;
if (ipcp_flow_alloc_reply__irm_msg_ser(&msg, &flow, response, data) < 0)
return -ENOMEM;
@@ -1979,7 +2630,7 @@ int ipcp_flow_read(int fd,
struct flow * flow;
ssize_t idx = -1;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -1988,7 +2639,14 @@ int ipcp_flow_read(int fd,
assert(flow->info.id >= 0);
- while (frcti_queued_pdu(flow->frcti) < 0) {
+ /* Raw flow: deliver the popped pkt directly (no FRCT rq). */
+ if (flow->frcti == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ idx = flow_rx_spb(flow, spb, false, NULL);
+ return idx < 0 ? (int) idx : 0;
+ }
+
+ while (!FRCTI_PDU_READY(flow->frcti)) {
pthread_rwlock_unlock(&proc.lock);
idx = flow_rx_spb(flow, spb, false, NULL);
@@ -1997,7 +2655,7 @@ int ipcp_flow_read(int fd,
pthread_rwlock_rdlock(&proc.lock);
- frcti_rcv(flow->frcti, *spb);
+ FRCTI_RCV(flow->frcti, *spb);
}
pthread_rwlock_unlock(&proc.lock);
@@ -2011,12 +2669,12 @@ int ipcp_flow_write(int fd,
struct flow * flow;
int ret;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
- pthread_rwlock_wrlock(&proc.lock);
+ pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
@@ -2030,30 +2688,28 @@ int ipcp_flow_write(int fd,
pthread_rwlock_unlock(&proc.lock);
- ret = flow_tx_spb(flow, spb, true, NULL);
+ ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, true, NULL);
return ret;
}
-static int pool_copy_spb(struct ssm_pool * src_pool,
- ssize_t src_idx,
- struct ssm_pool * dst_pool,
- struct ssm_pk_buff ** dst_spb)
+/* Copy src into dst_pool without consuming src. Caller owns both halves. */
+static int pool_dup_spb(struct ssm_pool * src_pool,
+ size_t src_off,
+ struct ssm_pool * dst_pool,
+ struct ssm_pk_buff ** dst_spb)
{
struct ssm_pk_buff * src;
uint8_t * ptr;
size_t len;
- src = ssm_pool_get(src_pool, src_idx);
+ src = ssm_pool_get(src_pool, src_off);
len = ssm_pk_buff_len(src);
- if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) {
- ssm_pool_remove(src_pool, src_idx);
+ if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0)
return -ENOMEM;
- }
memcpy(ptr, ssm_pk_buff_head(src), len);
- ssm_pool_remove(src_pool, src_idx);
return 0;
}
@@ -2063,9 +2719,9 @@ int np1_flow_read(int fd,
struct ssm_pool * pool)
{
struct flow * flow;
- ssize_t idx = -1;
+ ssize_t off;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -2074,20 +2730,23 @@ int np1_flow_read(int fd,
pthread_rwlock_rdlock(&proc.lock);
- idx = ssm_rbuff_read(flow->rx_rb);
- if (idx < 0) {
+ off = ssm_rbuff_read(flow->rx_rb);
+ if (off < 0) {
pthread_rwlock_unlock(&proc.lock);
- return idx;
+ return off;
}
pthread_rwlock_unlock(&proc.lock);
if (pool == NULL) {
- *spb = ssm_pool_get(proc.pool, idx);
+ *spb = ssm_pool_get(proc.pool, off);
} else {
/* Cross-pool copy: PUP -> GSPP */
- if (pool_copy_spb(pool, idx, proc.pool, spb) < 0)
+ if (pool_dup_spb(pool, off, proc.pool, spb) < 0) {
+ ssm_pool_remove(pool, off);
return -ENOMEM;
+ }
+ ssm_pool_remove(pool, off);
}
return 0;
@@ -2100,9 +2759,10 @@ int np1_flow_write(int fd,
struct flow * flow;
struct ssm_pk_buff * dst;
int ret;
- ssize_t idx;
+ size_t off;
+ size_t dst_off;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -2121,45 +2781,47 @@ int np1_flow_write(int fd,
pthread_rwlock_unlock(&proc.lock);
- idx = ssm_pk_buff_get_idx(spb);
+ off = ssm_pk_buff_get_off(spb);
if (pool == NULL) {
- ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL);
+ ret = ssm_rbuff_write_b(flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ return ret;
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
} else {
- /* Cross-pool copy: GSPP -> PUP */
- if (pool_copy_spb(proc.pool, idx, pool, &dst) < 0)
+ /* Cross-pool copy: GSPP -> PUP. Src kept on error. */
+ if (pool_dup_spb(proc.pool, off, pool, &dst) < 0)
return -ENOMEM;
- idx = ssm_pk_buff_get_idx(dst);
- ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL);
- if (ret < 0)
- ssm_pool_remove(pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ dst_off = ssm_pk_buff_get_off(dst);
+ ret = ssm_rbuff_write_b(flow->tx_rb, dst_off, NULL);
+ if (ret < 0) {
+ ssm_pool_remove(pool, dst_off);
+ return ret;
+ }
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ ssm_pool_remove(proc.pool, off);
}
- return ret;
+ return 0;
}
int ipcp_spb_reserve(struct ssm_pk_buff ** spb,
size_t len)
{
- return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 ? -1 : 0;
+ return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0
+ ? -1 : 0;
}
void ipcp_spb_release(struct ssm_pk_buff * spb)
{
- ssm_pool_remove(proc.pool, ssm_pk_buff_get_idx(spb));
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
}
int ipcp_flow_fini(int fd)
{
struct ssm_rbuff * rx_rb;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
pthread_rwlock_rdlock(&proc.lock);
@@ -2168,8 +2830,8 @@ int ipcp_flow_fini(int fd)
return -1;
}
- ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN);
- ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN);
+ ssm_rbuff_set_bits(proc.flows[fd].rx_rb, RB_FLOWDOWN);
+ ssm_rbuff_set_bits(proc.flows[fd].tx_rb, RB_FLOWDOWN);
ssm_flow_set_notify(proc.flows[fd].set,
proc.flows[fd].info.id,
@@ -2188,7 +2850,7 @@ int ipcp_flow_fini(int fd)
int ipcp_flow_get_qoscube(int fd,
qoscube_t * cube)
{
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(cube);
pthread_rwlock_rdlock(&proc.lock);
@@ -2227,7 +2889,7 @@ int local_flow_transfer(int src_fd,
struct ssm_pk_buff * dst_spb;
struct ssm_pool * sp;
struct ssm_pool * dp;
- ssize_t idx;
+ ssize_t off;
int ret;
assert(src_fd >= 0);
@@ -2241,15 +2903,15 @@ int local_flow_transfer(int src_fd,
pthread_rwlock_rdlock(&proc.lock);
- idx = ssm_rbuff_read(src_flow->rx_rb);
- if (idx < 0) {
+ off = ssm_rbuff_read(src_flow->rx_rb);
+ if (off < 0) {
pthread_rwlock_unlock(&proc.lock);
- return idx;
+ return off;
}
if (dst_flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
- ssm_pool_remove(sp, idx);
+ ssm_pool_remove(sp, off);
return -ENOTALLOC;
}
@@ -2257,21 +2919,24 @@ int local_flow_transfer(int src_fd,
if (sp == dp) {
/* Same pool: zero-copy */
- ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL);
+ ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(sp, idx);
+ ssm_pool_remove(sp, off);
else
ssm_flow_set_notify(dst_flow->set,
dst_flow->info.id, FLOW_PKT);
} else {
/* Different pools: single copy */
- if (pool_copy_spb(sp, idx, dp, &dst_spb) < 0)
+ if (pool_dup_spb(sp, off, dp, &dst_spb) < 0) {
+ ssm_pool_remove(sp, off);
return -ENOMEM;
+ }
- idx = ssm_pk_buff_get_idx(dst_spb);
- ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL);
+ ssm_pool_remove(sp, off);
+ off = ssm_pk_buff_get_off(dst_spb);
+ ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(dp, idx);
+ ssm_pool_remove(dp, off);
else
ssm_flow_set_notify(dst_flow->set,
dst_flow->info.id, FLOW_PKT);
diff --git a/src/lib/frct.c b/src/lib/frct.c
index fad2cf69..c055433d 100644
--- a/src/lib/frct.c
+++ b/src/lib/frct.c
@@ -1,7 +1,7 @@
/*
* Ouroboros - Copyright (C) 2016 - 2026
*
- * Flow and Retransmission Control
+ * Flow and Retransmission Control Task (FRCT)
*
* Dimitri Staessens <dimitri@ouroboros.rocks>
* Sander Vrijders <sander@ouroboros.rocks>
@@ -20,97 +20,416 @@
* Foundation, Inc., http://www.fsf.org/about/contact/.
*/
-#include <ouroboros/endian.h>
+/* Included by dev.c; uses dev.c statics (proc, spb_encrypt, ...). */
#define DELT_RDV (100 * MILLION) /* ns */
-#define MAX_RDV (1 * BILLION) /* ns */
+#define MAX_RDV (1 * BILLION) /* ns */
+
+#define MAX_RTO_MUL 8 /* caps the RTO backoff shift */
+#define MAX_TLP_PER_EP 2 /* RFC 8985 §7.3: up to 2 TLPs */
+#define INITIAL_RTO (1 * BILLION) /* RFC 6298 §2.1: 1 s default */
+#define RTT_BOOT_NS (10 * MILLION) /* rtt_hint floor + initial mdev */
+#define SRTT_FLOOR_NS 1000L /* 1 us; smoothed RTT floor */
+#define MDEV_FLOOR_NS 100L /* 100 ns; mdev sanity floor */
+#define RTT_CLAMP_MUL 16 /* probe sample cap = N * srtt */
+#define MIN_RTT_WIN_NS (300ULL * BILLION) /* 5 min, Linux tcp default */
+#define NACK_COOLDOWN_NS (100 * MILLION) /* pre-DRF NACK cooldown */
+#define FRCT_TX_TIMEO_NS (250 * 1000) /* tx ring write deadline */
+#define ACK_DELAY_NS (2ULL * TICTIME) /* delayed-ACK fire delay */
#define FRCT "frct"
#define FRCT_PCILEN (sizeof(struct frct_pci))
#define FRCT_NAME_STRLEN 32
-struct frct_cr {
- uint32_t lwe; /* Left window edge */
- uint32_t rwe; /* Right window edge */
+/* Wire-protocol cap on SACK blocks per packet; binds both peers. */
+#define SACK_MAX_BLOCKS 2048
+#define SACK_BLOCK_SIZE (2 * sizeof(uint32_t))
+/* 2B count + 2B pad to 4-byte align the block list. */
+#define SACK_HDR_SIZE (sizeof(uint32_t))
+#define SACK_MIN_GAP_NS (250u * 1000u) /* 250 us SACK gap */
+#define MIN_REORDER_NS (250u * 1000u) /* 250 us RACK floor */
+#define SACK_RXM_MAX 32 /* Cap on retransmits staged from single SACK.*/
+#define DUP_THRESH 3 /* RFC 8985 §6.2 step 2.2 SACK count gate. */
+
+/* RFC 8985 §7.2 RACK reorder-window scaling cap. */
+#define REO_WND_MULT_MAX 20
+/* RFC 8985 §7.2 step 5: round trips of no DSACK before halving. */
+#define REO_DECAY_PKTS 16
+/* DSACK seqno sanity: reject reports older/farther than one rcv window. */
+#define MAX_DSACK_LAG RQ_SIZE
+
+/*
+ * Signed ns elapsed; negative under concurrent update (no underflow).
+ *
+ * The difference now_ns - then_ns is taken in uint64_t and then
+ * converted to int64_t, so a then_ns that is ahead of now_ns comes out
+ * as a negative age rather than a huge positive one (relies on the
+ * usual two's-complement conversion).
+ */
+static __inline__ int64_t ts_age_ns(uint64_t now_ns,
+ uint64_t then_ns)
+{
+ return (int64_t)(now_ns - then_ns);
+}
- uint8_t cflags;
- uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */
+/*
+ * True iff strictly more than thr_ns elapsed since then_ns.
+ *
+ * Compares the signed age from ts_age_ns() against thr_ns cast to
+ * int64_t, so a negative age (then_ns ahead of now_ns) is never
+ * reported as aged for thresholds that fit in int64_t.
+ */
+static __inline__ bool ts_aged_ns(uint64_t now_ns,
+ uint64_t then_ns,
+ uint64_t thr_ns)
+{
+ return ts_age_ns(now_ns, then_ns) > (int64_t) thr_ns;
+}
- struct timespec act; /* Last seen activity */
- time_t inact; /* Inactivity (s) */
-};
+/* FRCT r-timer: do not retransmit packet older than t_r (from first send). */
+#define RXM_AGED_OUT(t0, now_ns, t_r) \
+ ts_aged_ns((now_ns), (t0), (uint64_t)(t_r))
-struct frcti {
- int fd;
+/* FRCT a-timer: do not (re)transmit ACK after t_a from last data receive. */
+#define ACK_AGED_OUT(act, now_ns, t_a) \
+ ts_aged_ns((now_ns), (act), (uint64_t)(t_a))
- time_t mpl;
- time_t a;
- time_t r;
- time_t rdv;
-
- time_t srtt; /* Smoothed rtt */
- time_t mdev; /* Deviation */
- time_t rto; /* Retransmission timeout */
- uint32_t rttseq;
- struct timespec t_probe; /* Probe time */
- bool probe; /* Probe active */
-#ifdef PROC_FLOW_STATS
- size_t n_rtx; /* Number of rxm packets */
- size_t n_prb; /* Number of rtt probes */
- size_t n_rtt; /* Number of estimates */
- size_t n_dup; /* Duplicates received */
- size_t n_dak; /* Delayed ACKs received */
- size_t n_rdv; /* Number of rdv packets */
- size_t n_out; /* Packets out of window */
- size_t n_rqo; /* Packets out of rqueue */
-#endif
- struct frct_cr snd_cr;
- struct frct_cr rcv_cr;
+struct sack_args {
+ uint16_t n;
+ bool dsack; /* RFC 2883: block[0] is a DSACK report */
+ uint32_t ack;
+ uint32_t rwe;
+ uint32_t blocks[][2]; /* flexible — sized at alloc time */
+};
+/* NewReno-careful (RFC 6582) exit pad; gates RTT samples post-signal. */
+#define RTT_QUARANTINE 32
+#define RTTP_NONCE_LEN 16
- ssize_t rq[RQ_SIZE];
- pthread_rwlock_t lock;
+/* RTT-probe wire payload (after the FRCT PCI). */
+struct frct_rttp {
+ uint32_t probe_id; /* sender counter; 0 on reply */
+ uint32_t echo_id; /* peer's probe_id; 0 outbound */
+ uint8_t nonce[RTTP_NONCE_LEN]; /* random; echoed verbatim */
+} __attribute__((packed));
- bool open; /* Window open/closed */
- struct timespec t_wnd; /* Window closed time */
- struct timespec t_rdvs; /* Last rendez-vous sent */
- pthread_cond_t cond;
- pthread_mutex_t mtx;
-};
+#define RTTP_PAYLOAD sizeof(struct frct_rttp)
+#define RTTP_POS(id) ((id) & (RTTP_RING - 1))
+/*
+ * Flag values are assigned MSB-first on the wire (RFC convention):
+ * bit 0 = 0x8000 occupies wire-position 0 of the 16-bit flags
+ * field, bit 12 = 0x0008 is the last assigned bit, and the three
+ * LSBs (0x0007) are reserved.
+ */
enum frct_flags {
- FRCT_DATA = 0x01, /* PDU carries data */
- FRCT_DRF = 0x02, /* Data run flag */
- FRCT_ACK = 0x04, /* ACK field valid */
- FRCT_FC = 0x08, /* FC window valid */
- FRCT_RDVS = 0x10, /* Rendez-vous */
- FRCT_FFGM = 0x20, /* First Fragment */
- FRCT_MFGM = 0x40, /* More fragments */
+ FRCT_DATA = 0x8000, /* PDU carries data */
+ FRCT_DRF = 0x4000, /* Data run flag */
+ FRCT_ACK = 0x2000, /* ACK field valid */
+ FRCT_NACK = 0x1000, /* Neg-ACK: pci->seqno is arrival_seqno - 1 */
+ FRCT_FC = 0x0800, /* FC window valid */
+ FRCT_RDVS = 0x0400, /* Rendez-vous */
+ FRCT_FFGM = 0x0200, /* First fragment (begin) */
+ FRCT_LFGM = 0x0100, /* Last fragment (end) */
+ FRCT_RXM = 0x0080, /* Retransmission */
+ FRCT_SACK = 0x0040, /* SACK block list follows */
+ FRCT_RTTP = 0x0020, /* RTT probe / echo */
+ FRCT_KA = 0x0010, /* Keepalive */
+ FRCT_FIN = 0x0008, /* End of stream */
};
-struct frct_pci {
- uint8_t flags;
+/*
+ * DATA-packet fragment role (FFGM = begin, LFGM = end), SCTP-style:
+ * 1 1 = sole / un-fragmented SDU (begin AND end)
+ * 1 0 = first fragment of a multi-fragment SDU
+ * 0 0 = middle fragment
+ * 0 1 = last fragment
+ */
+#define FRCT_FR_MASK (FRCT_FFGM | FRCT_LFGM)
+#define FRCT_FR_SOLE (FRCT_FFGM | FRCT_LFGM)
+#define FRCT_FR_FIRST (FRCT_FFGM)
+#define FRCT_FR_MID (0)
+#define FRCT_FR_LAST (FRCT_LFGM)
+
+/* Default cap on a single reassembled SDU. App can raise via FRCTSMAXSDU. */
+#define FRCT_MAX_SDU (1U << 20)
+
+/* Stream-mode PCI extension: [start, end) byte range on every DATA pkt. */
+struct frct_pci_stream {
+ uint32_t start;
+ uint32_t end;
+} __attribute__((packed));
+
+#define FRCT_PCI_STREAM_LEN (sizeof(struct frct_pci_stream))
- uint8_t pad; /* 24 bit window! */
- uint16_t window;
+/* Bytes following PCI: SACK list / RTTP nonce / control payload. */
+#define FRCT_BODY(pci) ((uint8_t *) (pci) + FRCT_PCILEN)
+/* Typed access to the stream PCI extension on stream DATA packets. */
+#define FRCT_SPCI(pci) \
+ ((struct frct_pci_stream *) ((uint8_t *) (pci) + FRCT_PCILEN))
+/* Push the FRCT header onto spb's head. */
+#define FRCT_HDR_PUSH(spb, frcti) \
+ ((struct frct_pci *) ssm_pk_buff_push((spb), \
+ frcti_data_hdr_len(frcti)))
+
+/* Pop a fixed-size header off spb's head; cast to type *. */
+#define FRCT_HDR_POP(spb, type) \
+ ((struct type *) ssm_pk_buff_pop((spb), sizeof(struct type)))
+
+/* Default / max per-flow stream rx ring (pow2); min N * per_pkt. */
+#define FRCT_STREAM_RING_MIN_PKTS 4
+#define FRCT_STREAM_RING_SZ (1U << 20) /* 1 MiB default */
+#define FRCT_STREAM_RING_SZ_MAX (1U << 27) /* 128 MiB */
+
+struct frct_pci { /* Packed FRCT header; FRCT_PCILEN is its size. */
+ uint16_t flags; /* enum frct_flags bits */
+ uint16_t hcs; /* NOTE(review): presumably a header check sequence; confirm */
+
+ uint32_t window; /* NOTE(review): presumably the window flagged by FRCT_FC; confirm */
 uint32_t seqno;
 uint32_t ackno; /* NOTE(review): presumably valid when FRCT_ACK is set; confirm */
 } __attribute__((packed));
+/* Stat counters; fold to no-ops without PROC_FLOW_STATS. */
+#ifdef PROC_FLOW_STATS
+struct frcti_stat {
+ size_t rxm_rto; /* RTO-timer driven retransmits */
+ size_t rxm_rcv; /* RXM packets received (all) */
+ size_t rxm_dup_rcv; /* RXM dups (peer already had it) */
+ size_t rxm_sack; /* SACK-mechanism retransmits */
+ size_t rxm_rack; /* RACK-driven retransmits */
+ size_t rxm_dupthresh; /* DupThresh-driven retransmits */
+ size_t rxm_nack; /* NACK-pulled retransmits */
+ size_t rxm_due_count; /* rxm_due entries (pre-bail) */
+ size_t rxm_due_acked; /* bail: seqno < snd_lwe */
+ size_t rxm_due_unowned; /* bail: slot.rxm replaced */
+ size_t rxm_due_aged; /* bail: r->t0 + t_r < now */
+ size_t rxm_due_defer; /* bail: non-HoL, deferred to HoL */
+ size_t rxm_arm_fail; /* rxm_arm: malloc failed */
+ size_t rxm_cancel; /* entries cancelled at teardown */
+ size_t rxm_tx_dead; /* RXM tx into terminal flow */
+ size_t tx_drop; /* frct_tx fail (any cause) */
+ size_t tx_drop_ack; /* bare ACK dropped */
+ size_t tx_drop_sack; /* SACK dropped */
+ size_t tx_drop_ka; /* keepalive dropped */
+ size_t tx_drop_rttp; /* RTT probe/echo dropped */
+ size_t tx_drop_nack; /* pre-DRF NACK dropped */
+ size_t tx_drop_rdv; /* rendez-vous dropped */
+ size_t tx_drop_other; /* anything not matched above */
+ size_t ack_snd; /* ACK packets sent (bare + SACK) */
+ size_t ack_fire; /* delayed-ACK timer fires */
+ size_t ack_supp_seqno; /* fire suppressed: seqno */
+ size_t ack_supp_inact; /* fire suppressed: inact */
+ size_t ack_supp_rate; /* fire suppressed: rate */
+ size_t ack_rcv; /* ACK packets received */
+ size_t ack_rtt; /* ACKs that fed RTT estimator */
+ size_t ack_dup_rcv; /* ACK packet wire dups dropped */
+ size_t dup_rcv; /* duplicates received */
+ size_t out_rcv; /* pkts out of window */
+ size_t rqo_rcv; /* pkts out of rqueue */
+ size_t ooo_rcv; /* OOO arrivals */
+ size_t sack_snd; /* SACK packets sent */
+ size_t sack_rcv; /* SACK packets received */
+ size_t dsack_snd; /* SACK pkts carrying a DSACK */
+ size_t dsack_rcv; /* DSACK blocks parsed */
+ size_t dsack_drop; /* DSACK blocks past MAX_DSACK_LAG */
+ size_t nack_snd; /* pre-DRF NACKs sent */
+ size_t nack_rcv; /* pre-DRF NACKs received */
+ size_t tlp_snd; /* tail loss probes sent */
+ size_t inact_drop; /* inactivity drop (NACK on cd) */
+ size_t drf_rebase; /* DRF-triggered window rebase */
+ size_t rq_released; /* slots cleared by release_rq */
+ size_t rttp_snd; /* RTT probes sent */
+ size_t rttp_rcv; /* RTT probe replies rcvd */
+ size_t rtt_smpl; /* RTT estimator samples */
+ size_t rdv_snd; /* rendez-vous packets sent */
+ size_t rdv_rcv; /* rendez-vous packets rcvd */
+ size_t ka_snd; /* keepalives sent */
+ size_t ka_rcv; /* keepalives received */
+ size_t sdu_snd_frag; /* writes that fragmented */
+ size_t sdu_snd_alloc; /* alloc fail truncated SDU send */
+ size_t sdu_snd_tx; /* tx fail truncated SDU send */
+ size_t frag_snd; /* fragments sent: FIRST/MID/LAST */
+ size_t frag_rcv; /* fragments stashed in rq[] */
+ size_t sdu_reasm; /* SDUs delivered reassembled */
+ size_t sdu_sole; /* SOLE SDUs delivered (n==1) */
+ size_t frag_drop; /* dropped at malformed run */
+ size_t strm_snd_byte; /* bytes sent on stream */
+ size_t strm_rcv_byte; /* bytes copied to ring */
+ size_t strm_dlv_byte; /* bytes delivered to reader */
+ size_t strm_drop; /* stream rcvs dropped */
+ size_t strm_fin_drop; /* stream FIN packets rejected */
+ /* Profiling instrumentation. */
+ size_t rcv_proc_ns; /* time inside FRCTI_RCV (ns) */
+ size_t tw_move_ns; /* time inside tw_move (ns) */
+ size_t drain_calls; /* flow_drain_rx_nb invocations */
+};
+
+#define STAT_BUMP(frcti, field) FETCH_ADD_RELAXED(&(frcti)->stat.field, 1)
+#define STAT_ADD(frcti, field, v) FETCH_ADD_RELAXED(&(frcti)->stat.field, (v))
+#define STAT_LOAD(frcti, field) LOAD_RELAXED(&(frcti)->stat.field)
+#else
+#define STAT_BUMP(frcti, field) ((void) (frcti))
+#define STAT_ADD(frcti, field, v) ((void) (frcti))
+#define STAT_LOAD(frcti, field) ((void) (frcti), (size_t) 0)
+#endif
+
+#define frcti_to_flow(f) (&proc.flows[(f)->fd])
+
+#define RTTP_RING 8
+#define RTTP_COLD_NS (100 * MILLION) /* cold-probe cadence */
+#define RQ_SLOT(seqno) ((seqno) & (RQ_SIZE - 1))
+
+struct rxm_entry;
+
+enum snd_slot_flags {
+ SND_RTX = 0x01, /* Any retransmit; Karn skips next RTT sample. */
+ SND_FAST_RXM = 0x02, /* Fast-retx one-shot gate per loss event. */
+ SND_TLP = 0x04, /* Tail loss probe; ACK resets rto_mul. */
+};
+
+struct snd_slot {
+ struct rxm_entry * rxm; /* RXM entry, NULL if none. */
+ uint64_t time; /* ts_to_ns of last send (any kind). */
+ uint8_t flags; /* SND_* bits above. */
+};
+
+/* Per-seqno reorder slot (FRTX) and stream-mode byte/FIN metadata. */
+struct rcv_slot {
+ ssize_t idx; /* spb idx; -1 = empty */
+ uint32_t start; /* stream byte start */
+ uint32_t end; /* stream byte end */
+ uint8_t fin; /* stream FIN bit */
+};
+
+struct frct_cr {
+ uint32_t lwe; /* Left window edge */
+ uint32_t rwe; /* Right window edge */
+
+ uint8_t cflags;
+ uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */
+ uint32_t ackno; /* snd: ACK-pkt seqno; rcv: dedup */
+
+ uint64_t act; /* ts_to_ns of last activity */
+ uint64_t inact; /* Inactivity threshold (ns) */
+};
+
+struct frcti {
+ /* IMM: set once in frcti_create; read-only thereafter. */
+ int fd;
+ uint64_t t_mpl; /* MPL (ns) */
+ uint64_t t_a; /* a-timer (ns) */
+ uint64_t t_r; /* r-timer (ns) */
+ uint64_t t_rdv; /* RDV cooldown (ns) */
+ time_t ber; /* cached qs.ber */
+ bool lossy; /* qs.loss != 0 */
+ time_t qs_timeout; /* cached qs.timeout (ms) */
+ size_t frag_mtu; /* max FRCT pkt: PCI + payload */
+ uint16_t sack_n_max; /* SACK blocks that fit MTU */
+ bool stream;
+
+ /* All fields below are protected by lock (rwlock/LOAD_ACQUIRE). */
+ struct {
+ struct frct_cr snd_cr;
+ struct frct_cr rcv_cr;
+
+ /* RTT/RACK estimator */
+ time_t srtt; /* smoothed RTT */
+ time_t mdev; /* mean deviation */
+ time_t min_rtt; /* RACK base, ns */
+ uint64_t t_min_rtt; /* min_rtt last set */
+ time_t rto; /* retransmit TO */
+ time_t rto_min; /* RTO floor (ns) */
+ uint8_t rto_mul; /* RTO backoff bits */
+ uint32_t rtt_lwe; /* RTT-sample fence */
+ uint64_t t_rcv_rtt; /* last RTT feed */
+ uint64_t t_snd_probe; /* last probe sent */
+ uint64_t t_latest_ack; /* RACK.fack snd-ts */
+ uint32_t probe_id_next;
+ struct {
+ uint32_t id;
+ uint64_t ts; /* ts_to_ns send */
+ uint8_t nonce[RTTP_NONCE_LEN]; /* echoed back */
+ } probes[RTTP_RING];
+
+ /* rcv reassembly */
+ size_t max_rcv_sdu; /* max reasm bytes */
+ uint8_t * rcv_ring; /* lazy alloc */
+ size_t rcv_ring_sz; /* power of 2 */
+ uint32_t ring_seq_cap; /* ring/per_pkt */
+
+ uint32_t snd_byte_next;
+ bool snd_fin_sent;
+ uint32_t snd_fin_seqno;
+ uint32_t rcv_byte_next;
+ uint32_t rcv_byte_high; /* contiguous high */
+ uint32_t rcv_byte_fin; /* set when FIN */
+ bool rcv_fin_seen;
+
+ struct rcv_slot rcv_slots[RQ_SIZE];
+ struct snd_slot snd_slots[RQ_SIZE]; /* .rxm is ATOM */
+
+ /* rcv SACK dedup */
+ uint64_t t_snd_sack;
+ uint32_t sack_lwe; /* rcv lwe at SACK */
+ uint16_t sack_n; /* SACK block count */
+
+ /* RFC 2883 D-SACK: pending report (single-slot, latest). */
+ uint32_t dsack_seqno;
+ bool dsack_valid;
+
+ /* RFC 8985 §7.2 RACK reorder-window scaling. */
+ uint8_t reo_wnd_mult; /* REO_WND_MULT_MAX */
+ uint32_t dsack_lwe_snap; /* lwe @ last DSACK */
+ uint64_t t_last_reo_widen; /* once-per-RTT */
+
+ uint32_t dup_thresh; /* RFC 8985 */
+ uint32_t tlp_high_seq; /* §7.3: 0 = none */
+ uint8_t tlp_count; /* §7.3 per-episode */
+ uint64_t t_nack;
+ bool open; /* FC window state */
+ bool in_recovery;
+ uint32_t recovery_high; /* seqno @ entry */
+ uint32_t rack_fired_lwe; /* lwe @ last RACK */
+ struct timespec t_wnd; /* window-closed ts */
+ struct timespec t_last_rdv; /* last RDV sent */
+ struct list_head rxm_list; /* live rxm entries */
+
+ pthread_rwlock_t lock;
+ };
+
+ /* Read/written via __atomic without holding lock. */
+ uint64_t t_ka_rcv; /* ts_to_ns of last KA rx */
+ uint8_t ack_pending; /* delayed-ACK dedup */
+ uint8_t tlp_pending; /* TLP arm dedup (lazy) */
+
+ /* Timer entries; ownership belongs to the tw module. */
+ struct tw_entry ack_tw; /* delayed-ACK timer */
+ struct tw_entry ka_tw; /* keepalive timer */
+ struct tw_entry tlp_tw; /* tail-loss probe timer */
+
+#ifdef PROC_FLOW_STATS
+ /* STAT: lock-free relaxed atomic counters. */
+ struct frcti_stat stat;
+#endif
+};
+
#ifdef PROC_FLOW_STATS
+__attribute__((cold))
static int frct_rib_read(const char * path,
char * buf,
size_t len)
{
+ struct frcti * frcti;
struct timespec now;
+ uint64_t now_ns;
char * entry;
- struct flow * flow;
- struct frcti * frcti;
int fd;
-
- (void) len;
+ int written;
+ /* Snapshot under the locks; format outside (pure userspace). */
+ struct {
+ uint64_t t_mpl;
+ uint64_t t_a;
+ uint64_t t_r;
+ time_t srtt;
+ time_t mdev;
+ time_t rto;
+ time_t min_rtt;
+ struct frct_cr snd_cr;
+ struct frct_cr rcv_cr;
+ size_t rx_q_now;
+ size_t tx_q_now;
+ struct frcti_stat stat;
+ } s;
entry = strstr(path, RIB_SEPARATOR);
assert(entry);
@@ -118,23 +437,50 @@ static int frct_rib_read(const char * path,
fd = atoi(path);
- flow = &proc.flows[fd];
-
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
+ return 0;
pthread_rwlock_rdlock(&proc.lock);
- frcti = flow->frcti;
+ frcti = proc.flows[fd].frcti;
+ if (frcti == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return 0;
+ }
+
+ s.t_mpl = frcti->t_mpl;
+ s.t_a = frcti->t_a;
+ s.t_r = frcti->t_r;
+
+ s.rx_q_now = proc.flows[fd].rx_rb != NULL
+ ? ssm_rbuff_queued(proc.flows[fd].rx_rb) : 0;
+ s.tx_q_now = proc.flows[fd].tx_rb != NULL
+ ? ssm_rbuff_queued(proc.flows[fd].tx_rb) : 0;
pthread_rwlock_rdlock(&frcti->lock);
- sprintf(buf,
- "Maximum packet lifetime (ns): %20ld\n"
- "Max time to Ack (ns): %20ld\n"
- "Max time to Retransmit (ns): %20ld\n"
+ s.srtt = frcti->srtt;
+ s.mdev = frcti->mdev;
+ s.rto = frcti->rto;
+ s.min_rtt = frcti->min_rtt;
+ s.snd_cr = frcti->snd_cr;
+ s.rcv_cr = frcti->rcv_cr;
+ s.stat = frcti->stat;
+
+ pthread_rwlock_unlock(&frcti->lock);
+ pthread_rwlock_unlock(&proc.lock);
+
+ written = snprintf(buf, len,
+ "Maximum packet lifetime (ns): %20" PRIu64 "\n"
+ "Max time to Ack (ns): %20" PRIu64 "\n"
+ "Max time to Retransmit (ns): %20" PRIu64 "\n"
"Smoothed rtt (ns): %20ld\n"
"RTT standard deviation (ns): %20ld\n"
"Retransmit timeout RTO (ns): %20ld\n"
+ "Minimum RTT (RACK base, ns): %20ld\n"
"Sender left window edge: %20u\n"
"Sender right window edge: %20u\n"
"Sender inactive (ns): %20lld\n"
@@ -143,44 +489,132 @@ static int frct_rib_read(const char * path,
"Receiver right window edge: %20u\n"
"Receiver inactive (ns): %20lld\n"
"Receiver last ack: %20u\n"
- "Number of pkt retransmissions: %20zu\n"
- "Number of rtt probes: %20zu\n"
- "Number of rtt estimates: %20zu\n"
- "Number of duplicates received: %20zu\n"
- "Number of delayed acks received: %20zu\n"
- "Number of rendez-vous sent: %20zu\n"
- "Number of packets out of window: %20zu\n"
- "Number of packets out of rqueue: %20zu\n",
- frcti->mpl,
- frcti->a,
- frcti->r,
- frcti->srtt,
- frcti->mdev,
- frcti->rto,
- frcti->snd_cr.lwe,
- frcti->snd_cr.rwe,
- ts_diff_ns(&now, &frcti->snd_cr.act),
- frcti->snd_cr.seqno,
- frcti->rcv_cr.lwe,
- frcti->rcv_cr.rwe,
- ts_diff_ns(&now, &frcti->rcv_cr.act),
- frcti->rcv_cr.seqno,
- frcti->n_rtx,
- frcti->n_prb,
- frcti->n_rtt,
- frcti->n_dup,
- frcti->n_dak,
- frcti->n_rdv,
- frcti->n_out,
- frcti->n_rqo);
-
- pthread_rwlock_unlock(&flow->frcti->lock);
+ "RXM (RTO-driven) sent: %20zu\n"
+ "RXM packets received: %20zu\n"
+ " duplicates received: %20zu\n"
+ "RXM (SACK mechanism) sent: %20zu\n"
+ "RXM (RACK-driven) sent: %20zu\n"
+ "RXM (DupThresh-driven) sent: %20zu\n"
+ "RXM (NACK-driven) sent: %20zu\n"
+ "ACK packets sent: %20zu\n"
+ "Delayed-ACK timer fires: %20zu\n"
+ " suppressed (seqno): %20zu\n"
+ " suppressed (inact): %20zu\n"
+ " suppressed (rate): %20zu\n"
+ "ACK packets received: %20zu\n"
+ " fed RTT estimator: %20zu\n"
+ " wire dups dropped: %20zu\n"
+ "Duplicates received: %20zu\n"
+ "Out-of-window pkts received: %20zu\n"
+ "Out-of-rqueue pkts received: %20zu\n"
+ "OOO arrivals: %20zu\n"
+ "SACKs sent: %20zu\n"
+ "SACKs received: %20zu\n"
+ "D-SACKs sent: %20zu\n"
+ "D-SACKs received: %20zu\n"
+ "D-SACK out-of-range dropped: %20zu\n"
+ "Pre-DRF NACKs sent: %20zu\n"
+ "Pre-DRF NACKs received: %20zu\n"
+ "Tail loss probes sent: %20zu\n"
+ "Inactivity drops (silent): %20zu\n"
+ "DRF window rebases: %20zu\n"
+ "rq slots cleared by release_rq: %20zu\n"
+ "RTT probes sent: %20zu\n"
+ "RTT probe replies received: %20zu\n"
+ "RTT estimator samples: %20zu\n"
+ "Rendez-vous packets sent: %20zu\n"
+ "Rendez-vous packets received: %20zu\n"
+ "Keepalives sent: %20zu\n"
+ "Keepalives received: %20zu\n"
+ "SDU writes fragmented: %20zu\n"
+ " alloc fail mid-SDU: %20zu\n"
+ " tx fail mid-SDU: %20zu\n"
+ "Fragments sent: %20zu\n"
+ "Fragments received: %20zu\n"
+ "SDUs delivered reassembled: %20zu\n"
+ "SDUs delivered (SOLE): %20zu\n"
+ "Fragments dropped (malformed): %20zu\n"
+ "Stream bytes sent: %20zu\n"
+ "Stream bytes received: %20zu\n"
+ "Stream bytes delivered: %20zu\n"
+ "Stream packets dropped: %20zu\n"
+ "Stream FINs dropped: %20zu\n"
+ "FRCTI_RCV time (ns): %20zu\n"
+ "tw_move time (ns): %20zu\n"
+ "drain_rx_nb calls: %20zu\n"
+ "RX rbuff queued: %20zu\n"
+ "TX rbuff queued: %20zu\n"
+ "RXM-due entries: %20zu\n"
+ " bail (acked): %20zu\n"
+ " bail (unowned): %20zu\n"
+ " bail (aged): %20zu\n"
+ " bail (defer): %20zu\n"
+ "RXM-arm malloc failures: %20zu\n"
+ "RXM cancels (teardown): %20zu\n"
+ "RXM tx into dead flow: %20zu\n"
+ "Tx ring drops (any cause): %20zu\n"
+ " ack: %20zu\n"
+ " sack: %20zu\n"
+ " ka: %20zu\n"
+ " rttp: %20zu\n"
+ " nack: %20zu\n"
+ " rdv: %20zu\n"
+ " other: %20zu\n",
+ /* Check getattr size below when adding stats. */
+ s.t_mpl, s.t_a, s.t_r,
+ s.srtt, s.mdev, s.rto, s.min_rtt,
+ s.snd_cr.lwe, s.snd_cr.rwe,
+ (long long)(now_ns - s.snd_cr.act),
+ s.snd_cr.seqno,
+ s.rcv_cr.lwe, s.rcv_cr.rwe,
+ (long long)(now_ns - s.rcv_cr.act),
+ s.rcv_cr.seqno,
+ s.stat.rxm_rto, s.stat.rxm_rcv, s.stat.rxm_dup_rcv,
+ s.stat.rxm_sack, s.stat.rxm_rack, s.stat.rxm_dupthresh,
+ s.stat.rxm_nack,
+ s.stat.ack_snd, s.stat.ack_fire,
+ s.stat.ack_supp_seqno, s.stat.ack_supp_inact,
+ s.stat.ack_supp_rate,
+ s.stat.ack_rcv, s.stat.ack_rtt, s.stat.ack_dup_rcv,
+ s.stat.dup_rcv, s.stat.out_rcv, s.stat.rqo_rcv,
+ s.stat.ooo_rcv,
+ s.stat.sack_snd, s.stat.sack_rcv,
+ s.stat.dsack_snd, s.stat.dsack_rcv, s.stat.dsack_drop,
+ s.stat.nack_snd, s.stat.nack_rcv, s.stat.tlp_snd,
+ s.stat.inact_drop, s.stat.drf_rebase, s.stat.rq_released,
+ s.stat.rttp_snd, s.stat.rttp_rcv, s.stat.rtt_smpl,
+ s.stat.rdv_snd, s.stat.rdv_rcv,
+ s.stat.ka_snd, s.stat.ka_rcv,
+ s.stat.sdu_snd_frag, s.stat.sdu_snd_alloc, s.stat.sdu_snd_tx,
+ s.stat.frag_snd, s.stat.frag_rcv,
+ s.stat.sdu_reasm, s.stat.sdu_sole, s.stat.frag_drop,
+ s.stat.strm_snd_byte, s.stat.strm_rcv_byte,
+ s.stat.strm_dlv_byte,
+ s.stat.strm_drop, s.stat.strm_fin_drop,
+ s.stat.rcv_proc_ns, s.stat.tw_move_ns,
+ s.stat.drain_calls,
+ s.rx_q_now, s.tx_q_now,
+ s.stat.rxm_due_count,
+ s.stat.rxm_due_acked, s.stat.rxm_due_unowned,
+ s.stat.rxm_due_aged, s.stat.rxm_due_defer,
+ s.stat.rxm_arm_fail,
+ s.stat.rxm_cancel,
+ s.stat.rxm_tx_dead, s.stat.tx_drop,
+ s.stat.tx_drop_ack, s.stat.tx_drop_sack,
+ s.stat.tx_drop_ka, s.stat.tx_drop_rttp,
+ s.stat.tx_drop_nack, s.stat.tx_drop_rdv,
+ s.stat.tx_drop_other);
+
+ if (written < 0)
+ return 0;
- pthread_rwlock_unlock(&proc.lock);
+ if ((size_t) written >= len)
+ return (int) (len - 1);
- return strlen(buf);
+ return written;
}
+__attribute__((cold))
static int frct_rib_readdir(char *** buf)
{
*buf = malloc(sizeof(**buf));
@@ -199,13 +633,14 @@ static int frct_rib_readdir(char *** buf)
return -ENOMEM;
}
+__attribute__((cold))
static int frct_rib_getattr(const char * path,
struct rib_attr * attr)
{
(void) path;
- (void) attr;
- attr->size = 1189;
+ /* Must be >= the snprintf output in frct_rib_read. */
+ attr->size = 8192;
attr->mtime = 0;
return 0;
@@ -220,128 +655,1168 @@ static struct rib_ops r_ops = {
#endif /* PROC_FLOW_STATS */
-static bool before(uint32_t seq1,
- uint32_t seq2)
+static __inline__ bool before(uint32_t s1, uint32_t s2)
{
- return (int32_t)(seq1 - seq2) < 0;
+ return (int32_t)(s1 - s2) < 0; /* true when the wrapped difference s1 - s2 is negative as int32_t */
}
-static bool after(uint32_t seq1,
- uint32_t seq2)
+static __inline__ bool after(uint32_t s1, uint32_t s2)
{
- return (int32_t)(seq2 - seq1) < 0;
+ return (int32_t)(s2 - s1) < 0; /* mirror of before(): true when s2 - s1, viewed as int32_t, is negative */
}
-static void __send_frct_pkt(int fd,
- uint8_t flags,
- uint32_t ackno,
- uint32_t rwe)
+static __inline__ bool within(uint32_t seq, uint32_t lo, uint32_t hi)
{
- struct ssm_pk_buff * spb;
- struct frct_pci * pci;
- ssize_t idx;
- struct flow * f;
+ return after(seq, lo) && !after(seq, hi); /* seq is after lo and not after hi, i.e. in (lo, hi] */
+}
- /* Raw calls needed to bypass frcti. */
-#ifdef RXM_BLOCKING
- idx = ssm_pool_alloc_b(proc.pool, sizeof(*pci), NULL, &spb, NULL);
-#else
- idx = ssm_pool_alloc(proc.pool, sizeof(*pci), NULL, &spb);
-#endif
- if (idx < 0)
+/* True when seq lies in the half-open window [cr->lwe, cr->rwe). */
+static __inline__ bool in_window(uint32_t seq, const struct frct_cr * cr)
+{
+        if (before(seq, cr->lwe))
+                return false;
+
+        return before(seq, cr->rwe);
+}
+
+/*
+ * Tell whether a DRF arrival stays within the current receive epoch.
+ * An empty window (lwe == rwe) never matches; otherwise the packet
+ * matches when it carries FRCT_RXM or its seqno is inside the window.
+ */
+static __inline__ bool same_epoch_drf(uint32_t               seq,
+                                      uint16_t               flags,
+                                      const struct frct_cr * cr)
+{
+        bool is_rxm = (flags & FRCT_RXM) != 0;
+
+        return cr->lwe != cr->rwe && (is_rxm || in_window(seq, cr));
+}
+
+/*
+ * RACK reorder window R (RFC 8985 §6.2):
+ *   R = MIN(reo_wnd_mult * RACK.min_RTT / 4, SRTT)
+ * reo_wnd_mult scales on D-SACK evidence of under-tolerance (§7.2);
+ * a multiplier of 0 is treated as 1. srtt stands in for min_rtt while
+ * no min_rtt sample exists. The product is raised to MIN_REORDER_NS
+ * first and capped at srtt afterwards.
+ * NOTE(review): because the cap runs last, the result drops below
+ * MIN_REORDER_NS whenever srtt is smaller (0 when srtt == 0) - confirm
+ * this ordering is intended.
+ */
+static __inline__ uint64_t rack_reorder_window(struct frcti * frcti)
+{
+        uint64_t srtt  = (uint64_t) frcti->srtt;
+        uint64_t scale = 1;
+        uint64_t rtt   = srtt;
+        uint64_t wnd;
+
+        if (frcti->reo_wnd_mult > 0)
+                scale = frcti->reo_wnd_mult;
+
+        if (frcti->min_rtt > 0)
+                rtt = (uint64_t) frcti->min_rtt;
+
+        wnd = scale * (rtt / 4);
+
+        if (wnd < (uint64_t) MIN_REORDER_NS)
+                wnd = (uint64_t) MIN_REORDER_NS;
+
+        if (wnd > srtt)
+                wnd = srtt;
+
+        return wnd;
+}
+
+/*
+ * Reserve len bytes from proc.pool through ssm_pool_alloc_b.
+ * Returns 0 on success, else the negative value the pool returned.
+ */
+static __inline__ int frct_spb_reserve(size_t                len,
+                                       struct ssm_pk_buff ** spb)
+{
+        ssize_t ret;
+
+        ret = ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL);
+        if (ret < 0)
+                return (int) ret;
+
+        return 0;
+}
+
+static __inline__ void frct_spb_release(struct ssm_pk_buff * spb)
+{
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb)); /* removes spb from proc.pool by its offset */
+}
+
+static __inline__ void frct_spb_release_idx(size_t idx)
+{
+ ssm_pool_remove(proc.pool, idx); /* same as frct_spb_release, for callers that only hold the index */
+}
+
+/* Look up the packet buffer whose pool index is stored in seqno's rq slot. */
+static __inline__ struct ssm_pk_buff * rq_frag(const struct frcti * frcti,
+                                               uint32_t             seqno)
+{
+        const struct rcv_slot * slot = &frcti->rcv_slots[RQ_SLOT(seqno)];
+
+        return ssm_pool_get(proc.pool, slot->idx);
+}
+
+/* DATA header size: FRCT_PCILEN, plus FRCT_PCI_STREAM_LEN on stream flows. */
+static __inline__ size_t frcti_data_hdr_len(const struct frcti * frcti)
+{
+        size_t len = FRCT_PCILEN;
+
+        if (frcti->stream)
+                len += FRCT_PCI_STREAM_LEN;
+
+        return len;
+}
+
+static __inline__ size_t frcti_ctrl_hdr_len(const struct frcti * frcti)
+{
+ (void) frcti; /* unused: the control header length is FRCT_PCILEN for every flow */
+
+ return FRCT_PCILEN;
+}
+
+/*
+ * Compute and store the header checksum (HCS, offset 2 inside PCI).
+ * The CRC runs over the flags field and then over everything from
+ * pci->window to the end of the PCI, skipping the hcs field itself;
+ * with stream set it extends FRCT_PCI_STREAM_LEN bytes further to
+ * cover the stream header (SPCI). Stored in network byte order.
+ */
+static void frct_hcs_set(struct frct_pci * pci,
+                         bool              stream)
+{
+        size_t   covered = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs);
+        uint16_t crc = 0;
+
+        if (stream)
+                covered += FRCT_PCI_STREAM_LEN;
+
+        crc16_ccitt_false(&crc, pci, sizeof(pci->flags));
+        crc16_ccitt_false(&crc, &pci->window, covered);
+
+        pci->hcs = hton16(crc);
+}
+
+/*
+ * Recompute the header checksum of a received PCI and compare it with
+ * the stored one. Returns 0 on match, 1 on mismatch. The stream header
+ * is included only for FRCT_DATA packets on stream flows.
+ */
+static int frct_hcs_check(const struct frct_pci * pci,
+                          const struct frcti *    frcti)
+{
+        size_t   covered = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs);
+        uint16_t crc = 0;
+        uint16_t flags;
+
+        /* flags are not yet verified; a corrupt value fails the compare. */
+        flags = ntoh16(pci->flags);
+
+        if (frcti->stream && (flags & FRCT_DATA))
+                covered += FRCT_PCI_STREAM_LEN;
+
+        crc16_ccitt_false(&crc, pci, sizeof(pci->flags));
+        crc16_ccitt_false(&crc, &pci->window, covered);
+
+        if (crc == ntoh16(pci->hcs))
+                return 0;
+
+        return 1;
+}
+
+/*
+ * Bump tx_drop plus the per-frame-type counter matching `flags`.
+ * Exactly one per-type counter is bumped: the first match in the
+ * order SACK, KA, RTTP, NACK, RDVS, ACK wins, and tx_drop_other is
+ * used when none of those bits is set. SACK is tested before ACK, so
+ * a frame carrying both bits counts as a SACK drop.
+ */
+static void frct_tx_drop_bump(struct frcti * frcti,
+ uint16_t flags)
+{
+ STAT_BUMP(frcti, tx_drop);
+
+ if (flags & FRCT_SACK) {
+ STAT_BUMP(frcti, tx_drop_sack);
return;
+ }
- pci = (struct frct_pci *) ssm_pk_buff_head(spb);
- memset(pci, 0, sizeof(*pci));
+ if (flags & FRCT_KA) {
+ STAT_BUMP(frcti, tx_drop_ka);
+ return;
+ }
- *((uint32_t *) pci) = hton32(rwe);
+ if (flags & FRCT_RTTP) {
+ STAT_BUMP(frcti, tx_drop_rttp);
+ return;
+ }
- pci->flags = flags;
- pci->ackno = hton32(ackno);
+ if (flags & FRCT_NACK) {
+ STAT_BUMP(frcti, tx_drop_nack);
+ return;
+ }
+
+ if (flags & FRCT_RDVS) {
+ STAT_BUMP(frcti, tx_drop_rdv);
+ return;
+ }
+
+ if (flags & FRCT_ACK) {
+ STAT_BUMP(frcti, tx_drop_ack);
+ return;
+ }
+
+ STAT_BUMP(frcti, tx_drop_other);
+}
- f = &proc.flows[fd];
+static int frct_tx(struct frcti * frcti, struct ssm_pk_buff * spb) /* returns 0 on success, negative on failure; spb is removed from the pool on failure */
+{
+ struct flow * f = frcti_to_flow(frcti);
+ const struct frct_pci * pci;
+ const struct timespec * dl = NULL;
+ struct timespec now;
+ struct timespec intv = TIMESPEC_INIT_NS(FRCT_TX_TIMEO_NS);
+ struct timespec deadline;
+ uint16_t flags;
+ ssize_t idx;
+ int ret = -ENOMEM; /* returned when crc_add or spb_encrypt fails */
+
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ flags = ntoh16(pci->flags);
+
+ /*
+ * CRC32 covers plaintext body; PCI is in HCS. Pre-encrypt.
+ * SACK frames always get one; DATA only when qs.ber == 0.
+ */
+ if (flags & FRCT_SACK) {
+ if (crc_add(spb, frcti_ctrl_hdr_len(frcti)) != 0)
+ goto fail;
+ } else if ((flags & FRCT_DATA) && f->info.qs.ber == 0) {
+ if (crc_add(spb, frcti_data_hdr_len(frcti)) != 0)
+ goto fail;
+ }
if (spb_encrypt(f, spb) < 0)
goto fail;
-#ifdef RXM_BLOCKING
- if (ssm_rbuff_write_b(f->tx_rb, idx, NULL))
-#else
- if (ssm_rbuff_write(f->tx_rb, idx))
-#endif
+ idx = ssm_pk_buff_get_off(spb);
+
+ /*
+ * DATA blocks; control times out so a full ring can't stall wheel.
+ * dl stays NULL for DATA; other frames get now + FRCT_TX_TIMEO_NS.
+ */
+ if (!(flags & FRCT_DATA)) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &intv, &deadline);
+ dl = &deadline;
+ }
+
+ ret = ssm_rbuff_write_b(f->tx_rb, idx, dl);
+ if (ret < 0)
goto fail;
ssm_flow_set_notify(f->set, f->info.id, FLOW_PKT);
- return;
+ return 0;
fail:
- ipcp_spb_release(spb);
- return;
+ frct_tx_drop_bump(frcti, flags); /* count the drop by frame type */
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
+ return ret;
+}
+
+/* Set RB_FLOWDOWN on whichever of the flow's rx/tx ring buffers exist. */
+__attribute__((cold))
+static void frct_mark_flow_down(struct frcti * frcti)
+{
+        struct flow * flow = frcti_to_flow(frcti);
+
+        if (flow->rx_rb != NULL)
+                ssm_rbuff_set_bits(flow->rx_rb, RB_FLOWDOWN);
+
+        if (flow->tx_rb != NULL)
+                ssm_rbuff_set_bits(flow->tx_rb, RB_FLOWDOWN);
+}
+
+/*
+ * Set RB_FLOWPEER on the rx ring buffer (if present) and, when the
+ * process has an fqset, post a FLOW_PEER event for this flow on it.
+ */
+__attribute__((cold))
+static void frct_mark_peer_dead(struct frcti * frcti)
+{
+        struct flow * flow = frcti_to_flow(frcti);
+
+        if (flow->rx_rb != NULL)
+                ssm_rbuff_set_bits(flow->rx_rb, RB_FLOWPEER);
+
+        if (proc.fqset == NULL)
+                return;
+
+        ssm_flow_set_notify(proc.fqset, flow->info.id, FLOW_PEER);
+}
+
+/*
+ * Reserve a buffer for a control frame of FRCT_PCILEN + payload_len
+ * bytes and zero its PCI. On success *spb and *pci are set and 0 is
+ * returned; -1 when the reservation fails.
+ */
+static __inline__ int frct_ctrl_alloc(struct ssm_pk_buff ** spb,
+                                      struct frct_pci **    pci,
+                                      size_t                payload_len)
+{
+        struct frct_pci * hdr;
+
+        if (frct_spb_reserve(FRCT_PCILEN + payload_len, spb) < 0)
+                return -1;
+
+        hdr = (struct frct_pci *) ssm_pk_buff_head(*spb);
+        memset(hdr, 0, FRCT_PCILEN);
+        *pci = hdr;
+
+        return 0;
+}
+
+/*
+ * Right window edge to advertise. Non-stream flows advertise
+ * rcv_cr.rwe as is; stream flows advertise the earlier of rcv_cr.rwe
+ * and lwe + ring_seq_cap so the byte-equivalent fits the rx ring.
+ * Caller holds at least the rdlock.
+ */
+static __inline__ uint32_t frcti_advert_rwe(struct frcti * frcti)
+{
+        uint32_t edge = frcti->rcv_cr.rwe;
+        uint32_t limit;
+
+        if (frcti->stream) {
+                limit = frcti->rcv_cr.lwe + frcti->ring_seq_cap;
+                if (before(limit, edge))
+                        edge = limit;
+        }
+
+        return edge;
+}
+
+/*
+ * Build and transmit a payload-less control frame with the given
+ * flags, ackno and window. Frames with FRCT_ACK set also carry, in the
+ * seqno field, the post-increment value of snd_cr.ackno. Allocation
+ * and transmit failures are not reported to the caller.
+ */
+static void frcti_pkt_snd(struct frcti * frcti,
+                          uint16_t       flags,
+                          uint32_t       ackno,
+                          uint32_t       rwe)
+{
+        struct ssm_pk_buff * spb;
+        struct frct_pci *    pci;
+        uint32_t             ack_seq;
+
+        if (frct_ctrl_alloc(&spb, &pci, 0) < 0)
+                return;
+
+        pci->flags  = hton16(flags);
+        pci->window = hton32(rwe);
+        pci->ackno  = hton32(ackno);
+
+        if (flags & FRCT_ACK) {
+                /* Delayed ACKs are numbered from their own counter. */
+                ack_seq = FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1);
+                pci->seqno = hton32(ack_seq + 1);
+        }
+
+        frct_hcs_set(pci, false);
+
+        frct_tx(frcti, spb);
+}
+
+/*
+ * Seed the RTT estimator and clear the RTO backoff.
+ * Without a hint (rtt_hint <= 0) srtt and min_rtt start at 0, mdev at
+ * RTT_BOOT_NS and rto at MAX(INITIAL_RTO, rto_min), so the first ACK
+ * boots the estimator. With a hint, the hint (raised to RTT_BOOT_NS)
+ * seeds srtt and min_rtt, mdev is hint / 8, and rto is the larger of
+ * MAX(rto_min, 2 * srtt) and hint + (mdev << MDEV_MUL).
+ */
+static void rtt_init(struct frcti * frcti,
+                     time_t         rtt_hint)
+{
+        time_t lo;
+
+        frcti->rto_mul = 0;
+
+        if (rtt_hint <= 0) {
+                frcti->srtt    = 0;
+                frcti->mdev    = RTT_BOOT_NS;
+                frcti->rto     = MAX((time_t) INITIAL_RTO, frcti->rto_min);
+                frcti->min_rtt = 0;
+                return;
+        }
+
+        rtt_hint = MAX(rtt_hint, (time_t) RTT_BOOT_NS);
+
+        frcti->srtt    = rtt_hint;
+        frcti->mdev    = rtt_hint >> 3;
+        lo             = MAX(frcti->rto_min, 2 * frcti->srtt);
+        frcti->rto     = MAX(lo, rtt_hint + (frcti->mdev << MDEV_MUL));
+        frcti->min_rtt = rtt_hint;
+}
+
+/*
+ * RFC 8985 §6.2: min_RTT must be replaced when it is unset (0), when
+ * the new sample mrtt is smaller, or when the stored value is older
+ * than MIN_RTT_WIN_NS.
+ */
+static __inline__ bool min_rtt_stale(struct frcti * frcti,
+                                     time_t         mrtt,
+                                     uint64_t       now_ns)
+{
+        return frcti->min_rtt == 0
+            || mrtt < frcti->min_rtt
+            || ts_aged_ns(now_ns, frcti->t_min_rtt, MIN_RTT_WIN_NS);
+}
+
+/*
+ * Linux-style windowed-min refresh of RACK.min_RTT: when the stored
+ * minimum is stale, take mrtt as the new minimum and stamp it now_ns.
+ */
+static __inline__ void min_rtt_update(struct frcti * frcti,
+                                      time_t         mrtt,
+                                      uint64_t       now_ns)
+{
+        if (min_rtt_stale(frcti, mrtt, now_ns)) {
+                frcti->min_rtt   = mrtt;
+                frcti->t_min_rtt = now_ns;
+        }
+}
+
+/*
+ * Fold one RTT sample (mrtt, ns) into the estimator.
+ * The first sample sets srtt = mrtt and mdev = mrtt / 2; later ones
+ * use the RFC 6298 gains (1/8 on srtt, 1/4 on mdev). Results are
+ * floored at SRTT_FLOOR_NS / MDEV_FLOOR_NS, min_rtt is refreshed, and
+ * rto becomes the larger of MAX(rto_min, 2 * srtt) and
+ * srtt + (mdev << MDEV_MUL). rto is published and rto_mul reset with
+ * release stores.
+ */
+static void rtt_update(struct frcti * frcti,
+                       time_t         mrtt,
+                       uint64_t       now_ns)
+{
+        time_t est = frcti->srtt;
+        time_t var = frcti->mdev;
+        time_t err;
+        time_t adj;
+        time_t lo;
+        time_t rto;
+
+        if (est != 0) {
+                err  = mrtt - est;
+                est += (err >> 3);
+                adj  = (ABS(err) - var) >> 2;
+#ifdef FRCT_LINUX_RTT_ESTIMATOR
+                /* Let the deviation shrink more slowly than it grows. */
+                if (adj < 0)
+                        adj >>= 3;
+#endif
+                var += adj;
+        } else {
+                est = mrtt;
+                var = mrtt >> 1;
+        }
+
+        STAT_BUMP(frcti, rtt_smpl);
+
+        frcti->srtt = MAX(SRTT_FLOOR_NS, est);
+        frcti->mdev = MAX(MDEV_FLOOR_NS, var);
+
+        min_rtt_update(frcti, mrtt, now_ns);
+
+        lo  = MAX(frcti->rto_min, 2 * frcti->srtt);
+        rto = MAX(lo, frcti->srtt + (frcti->mdev << MDEV_MUL));
+
+        STORE_RELEASE(&frcti->rto, rto);
+        STORE_RELEASE(&frcti->rto_mul, 0);
+}
+
+/*
+ * Draw a fresh nonce, take the next non-zero probe id and record
+ * id, timestamp and nonce in probes[RTTP_POS(id)]; t_snd_probe is set
+ * to now_ns. Returns the probe id, or 0 when random_buffer fails (no
+ * state is touched in that case). Caller holds the wrlock.
+ */
+static uint32_t rttp_alloc_probe(struct frcti * frcti,
+                                 uint64_t       now_ns,
+                                 uint8_t        nonce[RTTP_NONCE_LEN])
+{
+        uint32_t id;
+        size_t   slot;
+
+        if (random_buffer(nonce, RTTP_NONCE_LEN) < 0)
+                return 0;
+
+        /* Id 0 means "no probe"; skip it when the counter wraps. */
+        do {
+                id = frcti->probe_id_next++;
+        } while (id == 0);
+
+        slot = RTTP_POS(id);
+
+        frcti->probes[slot].id = id;
+        frcti->probes[slot].ts = now_ns;
+        memcpy(frcti->probes[slot].nonce, nonce, RTTP_NONCE_LEN);
+
+        frcti->t_snd_probe = now_ns;
+
+        STAT_BUMP(frcti, rttp_snd);
+
+        return id;
+}
+
+/*
+ * Decide whether an RTT probe should go out now and, if so, allocate
+ * it. A probe is armed only when an srtt exists, snd_cr.seqno is after
+ * snd_cr.lwe, t_rcv_rtt is at least 2 * srtt old and t_snd_probe is at
+ * least srtt old. Caller holds the wrlock; *probe_id and nonce are
+ * valid only on true (caller emits post-unlock).
+ */
+static bool rtt_probe_arm(struct frcti * frcti,
+                          uint64_t       now_ns,
+                          uint32_t *     probe_id,
+                          uint8_t        nonce[RTTP_NONCE_LEN])
+{
+        uint64_t srtt;
+
+        if (frcti->srtt == 0)
+                return false;
+
+        if (!after(frcti->snd_cr.seqno, frcti->snd_cr.lwe))
+                return false;
+
+        srtt = (uint64_t) frcti->srtt;
+
+        if (!ts_aged_ns(now_ns, frcti->t_rcv_rtt, 2u * srtt))
+                return false;
+
+        if (!ts_aged_ns(now_ns, frcti->t_snd_probe, srtt))
+                return false;
+
+        *probe_id = rttp_alloc_probe(frcti, now_ns, nonce);
+        if (*probe_id == 0)
+                return false;
+
+        return true;
+}
+
+/*
+ * Send an RTT probe frame: a PCI flagged FRCT_RTTP followed by a
+ * struct frct_rttp body holding probe_id, echo_id (both in network
+ * byte order) and the nonce. Failures are not reported to the caller.
+ */
+static void frcti_rttp_snd(struct frcti *  frcti,
+                           uint32_t        probe_id,
+                           uint32_t        echo_id,
+                           const uint8_t * nonce)
+{
+        struct ssm_pk_buff * spb;
+        struct frct_pci *    pci;
+        struct frct_rttp *   body;
+
+        if (frct_ctrl_alloc(&spb, &pci, RTTP_PAYLOAD) < 0)
+                return;
+
+        pci->flags = hton16(FRCT_RTTP);
+
+        frct_hcs_set(pci, false);
+
+        body = (struct frct_rttp *) FRCT_BODY(pci);
+
+        body->probe_id = hton32(probe_id);
+        body->echo_id  = hton32(echo_id);
+        memcpy(body->nonce, nonce, sizeof(body->nonce));
+
+        frct_tx(frcti, spb);
+}
+
+struct rxm_entry {
+ struct tw_entry tw; /* timer handle passed to tw_post/tw_cancel */
+ struct list_head next; /* in frcti->rxm_list */
+ struct frcti * frcti; /* owning flow state, set in rxm_alloc */
+ uint32_t seqno; /* sequence number of the stored packet */
+ uint64_t t0; /* ns timestamp taken when the entry was armed */
+ size_t len; /* number of valid bytes in pkt */
+ uint8_t pkt[]; /* flexible — sized at alloc time */
+};
+
+static void rxm_entry_destroy(struct rxm_entry * r)
+{
+ free(r); /* pkt[] lives in the same allocation, so one free releases the whole entry */
+}
+
+/* True when send slot pos still points at r (acquire load of .rxm). */
+static bool rxm_still_owned(struct frcti *     frcti,
+                            size_t             pos,
+                            struct rxm_entry * r)
+{
+        struct rxm_entry * cur;
+
+        cur = LOAD_ACQUIRE(&frcti->snd_slots[pos].rxm);
+
+        return cur == r;
+}
+
+/*
+ * Next retransmit deadline: now_ns + (rto << rto_mul).
+ * All in-flight slots share the HoL backoff; otherwise non-HoL timers
+ * cycle at base RTO and storm the wire while HoL is still backing off.
+ */
+static uint64_t rxm_next_deadline(struct frcti * frcti,
+                                  uint64_t       now_ns)
+{
+        time_t   base  = LOAD_RELAXED(&frcti->rto);
+        uint8_t  shift = LOAD_RELAXED(&frcti->rto_mul);
+        uint64_t backoff;
+
+        backoff = (uint64_t) base << shift;
+
+        return now_ns + backoff;
+}
+
+/*
+ * Build a retransmission from a stored packet image: copy len bytes
+ * of pkt into a fresh pool buffer, set FRCT_RXM in the flags, replace
+ * ackno with rcv_lwe and re-seal the HCS. Returns the new buffer, or
+ * NULL when no buffer could be reserved.
+ */
+static struct ssm_pk_buff * rxm_pkt_prepare(const void * pkt,
+                                            size_t       len,
+                                            uint32_t     rcv_lwe,
+                                            bool         stream)
+{
+        struct ssm_pk_buff * copy;
+        struct frct_pci *    hdr;
+        uint16_t             fl;
+
+        if (frct_spb_reserve(len, &copy) < 0)
+                return NULL;
+
+        hdr = (struct frct_pci *) ssm_pk_buff_head(copy);
+        memcpy(hdr, pkt, len);
+
+        fl = ntoh16(hdr->flags) | FRCT_RXM;
+
+        hdr->flags = hton16(fl);
+        hdr->ackno = hton32(rcv_lwe);
+
+        frct_hcs_set(hdr, stream);
+
+        return copy;
+}
+
+/*
+ * Caller must NOT hold frcti->lock.
+ * Retransmit the stored copy of seqno. Under the wrlock: stamp the
+ * send slot, mark it SND_RTX, end any TLP episode and, for the HoL
+ * seqno only, step the RTO backoff and reset reo_wnd_mult. After the
+ * unlock: send a copy of pkt with FRCT_RXM set. Allocation failure of
+ * the copy is silent.
+ */
+static void rxm_snd(struct frcti * frcti,
+ uint32_t seqno,
+ const void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ struct timespec now;
+ struct snd_slot * slot;
+ uint32_t snd_lwe;
+ uint32_t rcv_lwe;
+ size_t pos;
+ int ret;
+
+ snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe); /* both edges are snapshots taken before the lock */
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ pos = RQ_SLOT(seqno);
+ slot = &frcti->snd_slots[pos];
+
+ slot->time = TS_TO_UINT64(now);
+ /* RTO supersedes any pending TLP/fast-rxm on this slot. */
+ slot->flags = (slot->flags & ~(SND_FAST_RXM | SND_TLP)) | SND_RTX;
+ /* §7.3: RTO supersedes TLP probes and ends the probe episode. */
+ frcti->tlp_high_seq = 0;
+ frcti->tlp_count = 0;
+
+ frcti->rtt_lwe = seqno + 1;
+
+ /* Only the HoL retransmit bumps the global RTO backoff. */
+ if (seqno == snd_lwe && frcti->rto_mul < MAX_RTO_MUL)
+ STORE_RELEASE(&frcti->rto_mul, frcti->rto_mul + 1);
+
+ /* RFC 8985 §7.2 step 4: RTO on HoL resets RACK reo scaling. */
+ if (seqno == snd_lwe)
+ frcti->reo_wnd_mult = 1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ STAT_BUMP(frcti, rxm_rto);
+
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
+ return;
+
+ /* ETIMEDOUT/ENOMEM: let r-timer drive teardown. */
+ ret = frct_tx(frcti, spb); /* frct_tx releases spb itself on failure */
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+}
+
+static void rxm_due(void * arg) /* timer callback; arg is the struct rxm_entry handed to tw_post */
+{
+ struct rxm_entry * r = arg;
+ struct frcti * frcti = r->frcti;
+ struct timespec now;
+ uint64_t now_ns;
+ uint32_t snd_lwe;
+ size_t pos = RQ_SLOT(r->seqno);
+
+ STAT_BUMP(frcti, rxm_due_count);
+
+ snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe);
+
+ /* Already ACK'd: expected for the steady-state majority. */
+ if (before(r->seqno, snd_lwe)) {
+ STAT_BUMP(frcti, rxm_due_acked);
+ goto cleanup;
+ }
+
+ /* SACK/RACK-cleared the slot (caller NULL'd snd_slots[pos].rxm). */
+ if (!rxm_still_owned(frcti, pos, r)) {
+ STAT_BUMP(frcti, rxm_due_unowned);
+ goto cleanup;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ /* R-timer expired: peer unreachable. */
+ if (RXM_AGED_OUT(r->t0, now_ns, frcti->t_r)) {
+ STAT_BUMP(frcti, rxm_due_aged);
+ frct_mark_flow_down(frcti);
+ goto cleanup;
+ }
+
+ /* HoL-only retx; defer at base rto so HoL transitions react. */
+ if (r->seqno != snd_lwe) {
+ STAT_BUMP(frcti, rxm_due_defer);
+ tw_post(&r->tw, now_ns + LOAD_RELAXED(&frcti->rto),
+ rxm_due, r);
+ return;
+ }
+
+ rxm_snd(frcti, r->seqno, r->pkt, r->len); /* takes frcti->lock itself, so it is called unlocked */
+
+ /* Re-check ownership: fire path may have replaced our entry. */
+ if (rxm_still_owned(frcti, pos, r)) {
+ uint64_t anchor;
+
+ /* Per-slot anchor breaks co-fire re-bin. */
+ anchor = frcti->snd_slots[pos].time;
+ tw_post(&r->tw, rxm_next_deadline(frcti, anchor), rxm_due, r);
+ return;
+ }
+
+ cleanup: /* under the wrlock: clear the slot if it is still ours and unlink from rxm_list; then free */
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (rxm_still_owned(frcti, pos, r))
+ STORE_RELEASE(&frcti->snd_slots[pos].rxm, NULL);
+
+ list_del(&r->next);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ rxm_entry_destroy(r);
+}
+
+/*
+ * Pre-allocate an rxm entry with room for pkt_len packet bytes so
+ * frcti_snd can fail before committing a seqno. Only the frcti back
+ * pointer and the timer entry are initialised here. Returns NULL
+ * (and bumps rxm_arm_fail) when malloc fails.
+ */
+static struct rxm_entry * rxm_alloc(struct frcti * frcti,
+                                    size_t         pkt_len)
+{
+        struct rxm_entry * entry;
+
+        entry = malloc(sizeof(*entry) + pkt_len);
+        if (entry != NULL) {
+                entry->frcti = frcti;
+                tw_init_entry(&entry->tw);
+                return entry;
+        }
+
+        STAT_BUMP(frcti, rxm_arm_fail);
+
+        return NULL;
+}
+
+/*
+ * Arm a pre-allocated entry for seqno: store a copy of spb in it,
+ * stamp t0, link it into frcti->rxm_list and publish it in the send
+ * slot (both under the wrlock), then post its timer for
+ * t0 + (rto << rto_mul). The function takes the wrlock itself.
+ */
+static void rxm_arm(struct frcti *             frcti,
+                    uint32_t                   seqno,
+                    struct rxm_entry *         r,
+                    const struct ssm_pk_buff * spb)
+{
+        struct timespec ts;
+        time_t          base;
+        uint8_t         shift;
+        uint64_t        due;
+        size_t          n = ssm_pk_buff_len(spb);
+
+        memcpy(r->pkt, ssm_pk_buff_head(spb), n);
+        r->len   = n;
+        r->seqno = seqno;
+
+        clock_gettime(PTHREAD_COND_CLOCK, &ts);
+        r->t0 = TS_TO_UINT64(ts);
+
+        base  = LOAD_RELAXED(&frcti->rto);
+        shift = LOAD_RELAXED(&frcti->rto_mul);
+        due   = r->t0 + ((uint64_t) base << shift);
+
+        pthread_rwlock_wrlock(&frcti->lock);
+
+        assert(before(seqno, frcti->snd_cr.lwe + RQ_SIZE));
+
+        list_add_tail(&r->next, &frcti->rxm_list);
+        STORE_RELEASE(&frcti->snd_slots[RQ_SLOT(seqno)].rxm, r);
+
+        pthread_rwlock_unlock(&frcti->lock);
+
+        /* Post only after the entry is visible in the slot. */
+        tw_post(&r->tw, due, rxm_due, r);
+}
+
+/*
+ * Cancel and free every entry on frcti->rxm_list, bumping rxm_cancel
+ * once per entry. No lock is taken here.
+ * NOTE(review): presumably runs at teardown with exclusive access -
+ * confirm against the caller.
+ */
+static void rxm_cancel_all(struct frcti * frcti)
+{
+        struct list_head * pos;
+        struct list_head * nxt;
+        struct rxm_entry * e;
+
+        list_for_each_safe(pos, nxt, &frcti->rxm_list) {
+                e = list_entry(pos, struct rxm_entry, next);
+                list_del(&e->next);
+                tw_cancel(&e->tw);
+                rxm_entry_destroy(e);
+                STAT_BUMP(frcti, rxm_cancel);
+        }
+}
+
+/*
+ * Write SACK block i (start s, end e) into payload in network byte
+ * order. Block i sits at SACK_HDR_SIZE + i * SACK_BLOCK_SIZE.
+ */
+static __inline__ void sack_block_put(uint8_t * payload,
+                                      uint16_t  i,
+                                      uint32_t  s,
+                                      uint32_t  e)
+{
+        uint8_t *  at   = payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE;
+        uint32_t * pair = (uint32_t *) at;
+
+        pair[0] = hton32(s);
+        pair[1] = hton32(e);
+}
+
+/*
+ * Read SACK block i from payload into *s and *e, converting from
+ * network byte order. Block i sits at SACK_HDR_SIZE + i * SACK_BLOCK_SIZE.
+ */
+static __inline__ void sack_block_get(const uint8_t * payload,
+                                      uint16_t        i,
+                                      uint32_t *      s,
+                                      uint32_t *      e)
+{
+        const uint8_t *  at   = payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE;
+        const uint32_t * pair = (const uint32_t *) at;
+
+        *s = ntoh32(pair[0]);
+        *e = ntoh32(pair[1]);
}
-static void send_frct_pkt(struct frcti * frcti)
+/*
+ * Build SACK blocks for ranges *above* rcv_cr.lwe. Wire invariant
+ * (see doc/frct.txt §1.3): every block produced here satisfies
+ * blocks[i].start > rcv_cr.lwe = ackno, which makes the "first block
+ * below ackno" convention used to mark a D-SACK (RFC 2883 §4 case 1)
+ * unambiguous. The scan covers (lwe, MIN(lwe + RQ_SIZE, rwe)); each
+ * block is [first filled seqno, one past the last filled seqno) of a
+ * run of slots whose idx != -1. Returns the number of blocks written,
+ * at most max_n. Caller holds frcti->lock.
+ */
+static uint16_t sack_blocks_build(struct frcti * frcti,
+                                  uint32_t       blocks[][2],
+                                  uint16_t       max_n)
+{
+        const struct rcv_slot * rq = frcti->rcv_slots;
+        uint32_t                seq;
+        uint32_t                limit;
+        uint16_t                cnt = 0;
+
+        seq   = frcti->rcv_cr.lwe + 1;
+        limit = frcti->rcv_cr.lwe + RQ_SIZE;
+        if (after(limit, frcti->rcv_cr.rwe))
+                limit = frcti->rcv_cr.rwe;
+
+        while (cnt < max_n && before(seq, limit)) {
+                /* Skip the hole in front of the next run. */
+                for (; before(seq, limit); ++seq) {
+                        if (rq[RQ_SLOT(seq)].idx != -1)
+                                break;
+                }
+
+                if (!before(seq, limit))
+                        break;
+
+                blocks[cnt][0] = seq;
+
+                /* Walk to one past the last filled slot of the run. */
+                for (; before(seq, limit); ++seq) {
+                        if (rq[RQ_SLOT(seq)].idx == -1)
+                                break;
+                }
+
+                blocks[cnt][1] = seq;
+                ++cnt;
+        }
+
+        return cnt;
+}
+
+/*
+ * Emit the pending D-SACK report, if any, as blocks[0] covering
+ * [dsack_seqno, dsack_seqno + 1) and clear the pending flag. Nothing
+ * is emitted (and the flag is left alone) when no report is pending
+ * or sack_n_max is 0. Returns the number of slots consumed at the
+ * head (0 or 1). Caller holds wrlock.
+ */
+static __inline__ uint16_t dsack_consume(struct frcti * frcti,
+                                         uint32_t       blocks[][2])
+{
+        if (frcti->dsack_valid && frcti->sack_n_max != 0) {
+                blocks[0][0] = frcti->dsack_seqno;
+                blocks[0][1] = frcti->dsack_seqno + 1;
+                frcti->dsack_valid = false;
+                return 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Send an ACK frame carrying the SACK blocks described by sa.
+ * The PCI is flagged FRCT_ACK | FRCT_FC | FRCT_SACK with sa->rwe as
+ * window and sa->ack as ackno; seqno is the post-increment value of
+ * snd_cr.ackno. The body is a zeroed SACK header whose first 16 bits
+ * hold the block count, followed by the blocks. Failures are not
+ * reported. Caller must NOT hold frcti->lock.
+ */
+static void frcti_sack_snd(struct frcti *           frcti,
+                           const struct sack_args * sa)
+{
+        struct ssm_pk_buff * spb;
+        struct frct_pci *    pci;
+        buffer_t             body;
+        uint16_t             blk;
+
+        assert(sa->n <= SACK_MAX_BLOCKS);
+
+        body.len = SACK_HDR_SIZE + sa->n * SACK_BLOCK_SIZE;
+
+        if (frct_ctrl_alloc(&spb, &pci, body.len) < 0)
+                return;
+
+        pci->flags  = hton16(FRCT_ACK | FRCT_FC | FRCT_SACK);
+        pci->window = hton32(sa->rwe);
+        pci->ackno  = hton32(sa->ack);
+        pci->seqno  = hton32(FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1) + 1);
+
+        frct_hcs_set(pci, false);
+
+        body.data = FRCT_BODY(pci);
+        memset(body.data, 0, SACK_HDR_SIZE);
+        *(uint16_t *) body.data = hton16(sa->n);
+
+        for (blk = 0; blk < sa->n; ++blk)
+                sack_block_put(body.data, blk,
+                               sa->blocks[blk][0], sa->blocks[blk][1]);
+
+        frct_tx(frcti, spb);
+}
+
+static void ack_snd(struct frcti * frcti,
+ bool with_sack)
{
struct timespec now;
+ uint64_t now_ns;
time_t diff;
uint32_t ackno;
uint32_t rwe;
- int fd;
+ struct sack_args * sa = NULL;
+ size_t sa_sz;
+ bool sacking = false;
assert(frcti);
+ STAT_BUMP(frcti, ack_fire);
+
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (with_sack && frcti->sack_n_max > 0) {
+ sa_sz = sizeof(*sa) + frcti->sack_n_max * sizeof(sa->blocks[0]);
+ sa = malloc(sa_sz);
+ /* If alloc fails, fall through and send a bare cum-ACK. */
+ }
pthread_rwlock_wrlock(&frcti->lock);
- if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)) {
+ /* D-SACK rides through cum-ACK freshness; signal is the duplicate. */
+ if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)
+ && !frcti->dsack_valid) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_seqno);
+ goto out;
}
- fd = frcti->fd;
ackno = frcti->rcv_cr.lwe;
- rwe = frcti->rcv_cr.rwe;
+ rwe = frcti_advert_rwe(frcti);
- diff = ts_diff_ns(&now, &frcti->rcv_cr.act);
- if (diff > frcti->a) {
+ if (ACK_AGED_OUT(frcti->rcv_cr.act, now_ns, frcti->t_a)) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_inact);
+ goto out;
}
- diff = ts_diff_ns(&now, &frcti->snd_cr.act);
- if (diff < TICTIME) {
+ diff = (time_t) ts_age_ns(now_ns, frcti->snd_cr.act);
+ if (diff < TICTIME && !frcti->dsack_valid) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_rate);
+ goto out;
}
+ /* RFC 2018: piggyback SACK on timer ACK; dedup unchanged board. */
+ if (sa == NULL || (frcti->sack_n == 0 && !frcti->dsack_valid))
+ goto no_sack;
+
+ sa->dsack = false;
+ sa->n = dsack_consume(frcti, sa->blocks);
+ if (sa->n == 1)
+ sa->dsack = true;
+
+ sa->n += sack_blocks_build(frcti, sa->blocks + sa->n,
+ frcti->sack_n_max - sa->n);
+ if (sa->n == 0)
+ goto no_sack;
+
+ if (!sa->dsack && ackno == frcti->sack_lwe && sa->n == frcti->sack_n)
+ goto no_sack;
+
+ sa->ack = ackno;
+ sa->rwe = rwe;
+ frcti->sack_lwe = ackno;
+ frcti->sack_n = sa->n;
+ frcti->t_snd_sack = now_ns;
+ sacking = true;
+
+ no_sack:
frcti->rcv_cr.seqno = frcti->rcv_cr.lwe;
pthread_rwlock_unlock(&frcti->lock);
- __send_frct_pkt(fd, FRCT_ACK | FRCT_FC, ackno, rwe);
+ STAT_BUMP(frcti, ack_snd);
+
+ if (sacking) {
+ STAT_BUMP(frcti, sack_snd);
+ if (sa->dsack)
+ STAT_BUMP(frcti, dsack_snd);
+ frcti_sack_snd(frcti, sa);
+ } else {
+ frcti_pkt_snd(frcti, FRCT_ACK | FRCT_FC, ackno, rwe);
+ }
+
+ out:
+ free(sa);
}
-static void __send_rdv(int fd)
+/* Delayed-ACK timer: per-flow, dedup'd via atomic test-and-set. */
+static void ack_due(void * arg)
{
- __send_frct_pkt(fd, FRCT_RDVS, 0, 0);
+ struct frcti * frcti = arg;
+
+ __atomic_clear(&frcti->ack_pending, __ATOMIC_RELAXED);
+
+ ack_snd(frcti, true);
}
-static struct frcti * frcti_create(int fd,
- time_t a,
- time_t r,
- time_t mpl)
+static int ack_arm(struct frcti * frcti)
{
- struct frcti * frcti;
- ssize_t idx;
- struct timespec now;
- pthread_condattr_t cattr;
+ struct timespec now;
+ uint64_t deadline;
+
+ if (__atomic_test_and_set(&frcti->ack_pending, __ATOMIC_RELAXED))
+ return 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ deadline = TS_TO_UINT64(now) + ACK_DELAY_NS;
+
+ tw_post(&frcti->ack_tw, deadline, ack_due, frcti);
+
+ return 0;
+}
+
+/* Forward decl breaks the keepalive cycle: ka_arm <-> ka_due. */
+static void ka_due(void * arg);
+
+/*
+ * (Re)post the keepalive timer on ka_tw with ka_due as callback. The
+ * deadline is the earlier of last send activity + timeout / 4 and last
+ * receive activity + timeout; when that is not in the future it becomes
+ * now + timeout / 4. The timeout is qs_timeout scaled by MILLION.
+ * Always returns 0.
+ */
+static int ka_arm(struct frcti * frcti)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t timeo_ns;
+ uint64_t snd_ns;
+ uint64_t rcv_ns;
+ uint64_t deadline;
+
+ timeo_ns = (uint64_t) frcti->qs_timeout * MILLION; /* IMM */
+ snd_ns = LOAD_RELAXED(&frcti->snd_cr.act) + timeo_ns / 4;
+ rcv_ns = LOAD_RELAXED(&frcti->rcv_cr.act) + timeo_ns;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+ deadline = MIN(snd_ns, rcv_ns);
+ /* Both candidate deadlines already passed: retry a quarter later. */
+ if (deadline <= now_ns)
+ deadline = now_ns + timeo_ns / 4;
+
+ tw_post(&frcti->ka_tw, deadline, ka_due, frcti);
+
+ return 0;
+}
+
+/*
+ * Keepalive work, run from ka_due. Receive idle time is measured from
+ * the later of rcv_cr.act and t_ka_rcv:
+ *  - idle longer than the timeout: mark the peer dead, do not re-arm;
+ *  - something was sent within timeout / 4: just re-arm;
+ *  - control packet allocation fails: just re-arm;
+ *  - otherwise send a KA | ACK carrying rcv_cr.lwe and re-arm.
+ */
+__attribute__((cold))
+static void ka_snd(struct frcti * frcti)
+{
+ struct ssm_pk_buff * spb;
+ struct frct_pci * pci;
+ struct timespec now;
+ uint64_t now_ns;
+ time_t timeo_ns;
+ uint64_t rcv_act;
+ uint64_t ka_rcv;
+ int64_t rcv_idle;
+ int64_t snd_idle;
+ uint32_t ackno;
+
+ assert(frcti);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ timeo_ns = (time_t)(frcti->qs_timeout) * MILLION; /* IMM */
+ rcv_act = LOAD_RELAXED(&frcti->rcv_cr.act);
+ ka_rcv = LOAD_RELAXED(&frcti->t_ka_rcv);
+ /* Age of the most recent of the two receive timestamps. */
+ rcv_idle = ts_age_ns(now_ns, rcv_act > ka_rcv ? rcv_act : ka_rcv);
+ snd_idle = ts_age_ns(now_ns, LOAD_RELAXED(&frcti->snd_cr.act));
+
+ /* Peer silent for too long: the timer is deliberately not re-armed. */
+ if (rcv_idle > timeo_ns) {
+ frct_mark_peer_dead(frcti);
+ return;
+ }
+
+ /* Recent outgoing traffic already serves as a liveness signal. */
+ if (snd_idle <= timeo_ns / 4) {
+ ka_arm(frcti);
+ return;
+ }
+
+ /* No buffer for the keepalive: skip this round, try again later. */
+ if (frct_ctrl_alloc(&spb, &pci, 0) < 0) {
+ ka_arm(frcti);
+ return;
+ }
+
+ ackno = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ pci->flags = hton16(FRCT_KA | FRCT_ACK);
+ pci->ackno = hton32(ackno);
+
+ frct_hcs_set(pci, false);
+
+ STAT_BUMP(frcti, ka_snd);
+ frct_tx(frcti, spb);
+
+ ka_arm(frcti);
+}
+
+/*
+ * Keepalive timer callback posted by ka_arm; arg is the struct frcti.
+ * ka_snd re-posts the timer itself on every path except when it marks
+ * the peer dead.
+ */
+static void ka_due(void * arg)
+{
+ ka_snd((struct frcti *) arg);
+}
+
+/* Send a control packet with only FRCT_RDVS set; ackno and window are 0. */
+static void frcti_rdv_snd(struct frcti * frcti)
+{
+ frcti_pkt_snd(frcti, FRCT_RDVS, 0, 0);
+}
+
+#define HAS_RESCNTL(cr) ((cr)->cflags & FRCTFRESCNTL)
+/*
+ * Returns true when the sender may transmit: resource control is off or
+ * seqno is before rwe. The first test reads rwe without the lock; the
+ * closed case is re-checked under the wrlock. While the window is
+ * closed the function returns false and maintains rendez-vous state:
+ *  - first closed observation: clear open, stamp t_wnd and t_last_rdv;
+ *  - closed for more than MAX_RDV: nothing more is sent;
+ *  - last rendez-vous older than t_rdv: send another one.
+ * The rendez-vous packet is sent with frcti->lock still held.
+ */
+static bool frcti_is_window_open(struct frcti * frcti)
+{
+ struct frct_cr * snd_cr = &frcti->snd_cr;
+ struct timespec now;
+ time_t diff;
+ bool ret = false;
+
+ if (!HAS_RESCNTL(snd_cr))
+ return true;
+
+ /* Lock-free fast path for the common open-window case. */
+ if (before(snd_cr->seqno, LOAD_RELAXED(&snd_cr->rwe)))
+ return true;
+
+ /* Window may be closed; wrlock for RDV state mutations. */
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ /* Re-check: rwe may have moved before the lock was taken. */
+ if (before(snd_cr->seqno, snd_cr->rwe)) {
+ ret = true;
+ goto unlock;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ /* Open -> closed transition: start both rendez-vous clocks. */
+ if (frcti->open) {
+ frcti->open = false;
+ frcti->t_wnd = now;
+ frcti->t_last_rdv = now;
+ goto unlock;
+ }
+
+ diff = ts_diff_ns(&now, &frcti->t_wnd);
+ if (diff > MAX_RDV)
+ goto unlock;
+
+ diff = ts_diff_ns(&now, &frcti->t_last_rdv);
+ if (diff > (time_t) frcti->t_rdv) {
+ frcti->t_last_rdv = now;
+ frcti_rdv_snd(frcti);
+ STAT_BUMP(frcti, rdv_snd);
+ }
+ unlock:
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ret;
+}
+
+/*
+ * n contiguous seqnos free? True when resource control is off. For
+ * n <= 1 this defers to frcti_is_window_open, which also drives the
+ * rendez-vous. For n > 1 it only tests, without taking the lock, that
+ * seqno + n - 1 is before rwe; no rendez-vous is sent on this path.
+ */
+static bool frcti_is_window_open_n(struct frcti * frcti,
+ size_t n)
+{
+ struct frct_cr * snd_cr = &frcti->snd_cr;
+
+ if (!HAS_RESCNTL(snd_cr))
+ return true;
+
+ if (n <= 1)
+ return frcti_is_window_open(frcti);
+
+ return before(snd_cr->seqno + (uint32_t)(n - 1),
+ LOAD_RELAXED(&snd_cr->rwe));
+}
+
+/*
+ * Empty every occupied receive slot. Non-stream flows release the
+ * packet buffer the slot refers to; stream slots are only reset. Each
+ * slot that was occupied bumps the rq_released counter.
+ */
+static void release_rq(struct frcti * frcti)
+{
+ size_t i;
+
+ for (i = 0; i < RQ_SIZE; ++i) {
+ if (frcti->rcv_slots[i].idx == -1)
+ continue;
+
+ /* Stream rq entries are sentinels (no spb owned). */
+ if (!frcti->stream)
+ frct_spb_release_idx(frcti->rcv_slots[i].idx);
+
+ frcti->rcv_slots[i].idx = -1;
+ STAT_BUMP(frcti, rq_released);
+ }
+}
+
+/*
+ * Validate a requested stream ring size n (bytes): at most
+ * FRCT_STREAM_RING_SZ_MAX, no more than one bit set, and at least
+ * FRCT_STREAM_RING_MIN_PKTS payloads, where one payload is frag_mtu
+ * minus the data header length.
+ */
+static __inline__ bool stream_ring_sz_ok(struct frcti * frcti,
+ size_t n)
+{
+ size_t per_pkt;
+
+ if (n > FRCT_STREAM_RING_SZ_MAX)
+ return false;
+
+ /* NOTE(review): n == 0 passes this test; only the minimum rejects it. */
+ if ((n & (n - 1)) != 0)
+ return false;
+
+ /* Assumes frag_mtu exceeds the header length - TODO confirm. */
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
+
+ return n >= FRCT_STREAM_RING_MIN_PKTS * per_pkt;
+}
+
+/*
+ * Default ring sized for full RQ_SIZE seqno window; pow2, capped.
+ * Starts at FRCT_STREAM_RING_SZ and doubles until the size covers
+ * RQ_SIZE * per_pkt bytes or is no longer below
+ * FRCT_STREAM_RING_SZ_MAX. The result is a power of two provided
+ * FRCT_STREAM_RING_SZ is one.
+ */
+static size_t default_stream_ring_sz(size_t per_pkt)
+{
+ size_t need;
+ size_t sz;
+
+ need = (size_t) RQ_SIZE * per_pkt;
+ sz = FRCT_STREAM_RING_SZ;
+
+ while (sz < need && sz < FRCT_STREAM_RING_SZ_MAX)
+ sz <<= 1;
+
+ return sz;
+}
+
+struct frcti * frcti_create(int fd,
+ uint64_t a,
+ uint64_t r,
+ uint64_t mpl,
+ time_t rtt_hint,
+ qosspec_t qs,
+ uint32_t mtu)
+{
+ struct frcti * frcti;
+ ssize_t idx;
+ struct timespec now;
+ uint64_t now_ns;
+ size_t bb;
+ size_t per_pkt;
#ifdef PROC_FLOW_STATS
- char frctstr[FRCT_NAME_STRLEN + 1];
+ char frctstr[FRCT_NAME_STRLEN + 1];
#endif
- mpl *= MILLION;
- a *= BILLION;
- r *= BILLION;
+ mpl *= MILLION; /* ms -> ns */
+ a *= MILLION; /* ms -> ns */
+ r *= MILLION; /* ms -> ns */
frcti = malloc(sizeof(*frcti));
if (frcti == NULL)
@@ -349,56 +1824,76 @@ static struct frcti * frcti_create(int fd,
memset(frcti, 0, sizeof(*frcti));
+ list_head_init(&frcti->rxm_list);
+
if (pthread_rwlock_init(&frcti->lock, NULL))
goto fail_lock;
- if (pthread_mutex_init(&frcti->mtx, NULL))
- goto fail_mutex;
-
- if (pthread_condattr_init(&cattr))
- goto fail_cattr;
-#ifndef __APPLE__
- pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK);
-#endif
- if (pthread_cond_init(&frcti->cond, &cattr))
- goto fail_cond;
-
#ifdef PROC_FLOW_STATS
sprintf(frctstr, "%d", fd);
if (rib_reg(frctstr, &r_ops))
goto fail_rib_reg;
#endif
- pthread_condattr_destroy(&cattr);
for (idx = 0; idx < RQ_SIZE; ++idx)
- frcti->rq[idx] = -1;
+ frcti->rcv_slots[idx].idx = -1;
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ frcti->t_mpl = mpl;
+ frcti->t_a = a;
+ frcti->t_r = r;
+ frcti->t_rdv = DELT_RDV;
+ frcti->fd = fd;
+ frcti->ber = (time_t) qs.ber;
+ frcti->lossy = (qs.loss != 0);
+ frcti->qs_timeout = (time_t) qs.timeout;
+
+ frcti->frag_mtu = (size_t) mtu;
+
+ /* Cap blocks per SACK at what fits in the per-flow frag_mtu. */
+ bb = (frcti->frag_mtu - FRCT_PCILEN - SACK_HDR_SIZE)
+ / SACK_BLOCK_SIZE;
+ if (bb > SACK_MAX_BLOCKS)
+ bb = SACK_MAX_BLOCKS;
+ frcti->sack_n_max = (uint16_t) bb;
+
+ frcti->max_rcv_sdu = FRCT_MAX_SDU;
+
+ frcti->stream = (qs.service == SVC_STREAM);
+ if (frcti->stream) {
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
+ frcti->rcv_ring_sz = default_stream_ring_sz(per_pkt);
+ frcti->ring_seq_cap =
+ (uint32_t) (frcti->rcv_ring_sz / per_pkt);
+ }
- frcti->mpl = mpl;
- frcti->a = a;
- frcti->r = r;
- frcti->rdv = DELT_RDV;
- frcti->fd = fd;
-
-
- frcti->rttseq = 0;
- frcti->probe = false;
-
- frcti->srtt = 0; /* Updated on first ACK */
- frcti->mdev = 10 * MILLION; /* Updated on first ACK */
- frcti->rto = BILLION; /* Initial rxm will be after 1 s */
-#ifdef PROC_FLOW_STATS
- frcti->n_rtx = 0;
- frcti->n_prb = 0;
- frcti->n_rtt = 0;
- frcti->n_dup = 0;
- frcti->n_dak = 0;
- frcti->n_rdv = 0;
- frcti->n_out = 0;
- frcti->n_rqo = 0;
-#endif
- if (proc.flows[fd].info.qs.loss == 0) {
+ frcti->rto_min = (time_t) MAX(RTO_MIN, 1ULL << RXMQ_RES);
+ rtt_init(frcti, rtt_hint);
+ frcti->t_min_rtt = now_ns;
+ frcti->probe_id_next = 1;
+ frcti->t_rcv_rtt = now_ns;
+ frcti->t_snd_probe = now_ns;
+ frcti->t_snd_sack = 0;
+ frcti->sack_lwe = 0;
+ frcti->sack_n = 0;
+ frcti->dsack_seqno = 0;
+ frcti->dsack_valid = false;
+ frcti->reo_wnd_mult = 1;
+ frcti->dsack_lwe_snap = 0;
+ frcti->t_last_reo_widen = 0;
+ /* So the first pre-DRF NACK fires without waiting cooldown. */
+ frcti->t_nack = now_ns - BILLION;
+ frcti->in_recovery = false;
+ frcti->recovery_high = 0;
+ frcti->rack_fired_lwe = 0;
+
+ tw_init_entry(&frcti->ack_tw);
+ tw_init_entry(&frcti->ka_tw);
+ tw_init_entry(&frcti->tlp_tw);
+
+ if (!frcti->lossy) {
frcti->snd_cr.cflags |= FRCTFRTX | FRCTFLINGER;
frcti->rcv_cr.cflags |= FRCTFRTX;
}
@@ -406,24 +1901,31 @@ static struct frcti * frcti_create(int fd,
frcti->snd_cr.cflags |= FRCTFRESCNTL;
frcti->snd_cr.rwe = START_WINDOW;
+ if (frcti->lossy)
+ frcti->snd_cr.rwe = RQ_SIZE;
+
+ frcti->snd_cr.inact = 3 * mpl + a + r + BILLION; /* ns */
+ frcti->snd_cr.act = now_ns - frcti->snd_cr.inact - BILLION;
- frcti->snd_cr.inact = (3 * mpl + a + r) / BILLION + 1; /* s */
- frcti->snd_cr.act.tv_sec = now.tv_sec - (frcti->snd_cr.inact + 1);
+ frcti->rcv_cr.inact = 2 * mpl + a + r + BILLION; /* ns */
+ frcti->rcv_cr.act = now_ns - frcti->rcv_cr.inact - BILLION;
- frcti->rcv_cr.inact = (2 * mpl + a + r) / BILLION + 1; /* s */
- frcti->rcv_cr.act.tv_sec = now.tv_sec - (frcti->rcv_cr.inact + 1);
+ frcti->t_ka_rcv = now_ns;
+
+ /* qs_timeout == 0: no KA, silent peer crash goes undetected. */
+ if (frcti->qs_timeout > 0) {
+ if (ka_arm(frcti) < 0)
+ goto fail_ka_arm;
+ }
return frcti;
+ fail_ka_arm:
#ifdef PROC_FLOW_STATS
+ sprintf(frctstr, "%d", fd);
+ rib_unreg(frctstr);
fail_rib_reg:
- pthread_cond_destroy(&frcti->cond);
#endif
- fail_cond:
- pthread_condattr_destroy(&cattr);
- fail_cattr:
- pthread_mutex_destroy(&frcti->mtx);
- fail_mutex:
pthread_rwlock_destroy(&frcti->lock);
fail_lock:
free(frcti);
@@ -431,21 +1933,55 @@ static struct frcti * frcti_create(int fd,
return NULL;
}
-static void frcti_destroy(struct frcti * frcti)
+void frcti_destroy(struct frcti * frcti)
{
#ifdef PROC_FLOW_STATS
char frctstr[FRCT_NAME_STRLEN + 1];
+#endif
+ /* Drop every wheel entry referencing frcti before freeing it. */
+ rxm_cancel_all(frcti);
+ tw_cancel(&frcti->ack_tw);
+ tw_cancel(&frcti->ka_tw);
+ tw_cancel(&frcti->tlp_tw);
+
+#if defined(PROC_FLOW_STATS) && defined(FRCT_DEBUG_STDOUT)
+ printf("[FRCT teardown] pid=%d fd=%d "
+ "sdu_snd=%zu sdu_reasm=%zu sdu_sole=%zu "
+ "frag_snd=%zu frag_rcv=%zu frag_drop=%zu "
+ "rxm_rto=%zu rxm_sack=%zu rxm_dup=%zu "
+ "rxm_due=%zu acked=%zu unowned=%zu aged=%zu defer=%zu "
+ "cancel=%zu arm_fail=%zu inflight=%u "
+ "nack_snd=%zu nack_rcv=%zu inact_drop=%zu "
+ "drf_rebase=%zu rq_released=%zu\n",
+ (int) getpid(), frcti->fd,
+ frcti->stat.sdu_snd_frag, frcti->stat.sdu_reasm,
+ frcti->stat.sdu_sole,
+ frcti->stat.frag_snd, frcti->stat.frag_rcv,
+ frcti->stat.frag_drop,
+ frcti->stat.rxm_rto, frcti->stat.rxm_sack,
+ frcti->stat.rxm_dupthresh,
+ frcti->stat.rxm_due_count, frcti->stat.rxm_due_acked,
+ frcti->stat.rxm_due_unowned, frcti->stat.rxm_due_aged,
+ frcti->stat.rxm_due_defer,
+ frcti->stat.rxm_cancel, frcti->stat.rxm_arm_fail,
+ frcti->snd_cr.seqno - frcti->snd_cr.lwe,
+ frcti->stat.nack_snd, frcti->stat.nack_rcv,
+ frcti->stat.inact_drop,
+ frcti->stat.drf_rebase, frcti->stat.rq_released);
+#endif
+
+ release_rq(frcti);
+ free(frcti->rcv_ring);
+#ifdef PROC_FLOW_STATS
sprintf(frctstr, "%d", frcti->fd);
rib_unreg(frctstr);
#endif
- pthread_cond_destroy(&frcti->cond);
- pthread_mutex_destroy(&frcti->mtx);
pthread_rwlock_destroy(&frcti->lock);
free(frcti);
}
-static uint16_t frcti_getflags(struct frcti * frcti)
+uint16_t frcti_getflags(struct frcti * frcti)
{
uint16_t ret;
@@ -453,89 +1989,91 @@ static uint16_t frcti_getflags(struct frcti * frcti)
pthread_rwlock_rdlock(&frcti->lock);
- ret = frcti->snd_cr.cflags;
+ ret = frcti->snd_cr.cflags & FRCTFMASK;
pthread_rwlock_unlock(&frcti->lock);
return ret;
}
-static void frcti_setflags(struct frcti * frcti,
- uint16_t flags)
+void frcti_setflags(struct frcti * frcti,
+ uint16_t flags)
{
- flags |= FRCTFRTX; /* Should not be set by command */
-
assert(frcti);
- pthread_rwlock_wrlock(&frcti->lock);
+ flags &= FRCTFSETMASK;
- frcti->snd_cr.cflags &= FRCTFRTX; /* Zero other flags */
+ pthread_rwlock_wrlock(&frcti->lock);
- frcti->snd_cr.cflags &= flags;
+ frcti->snd_cr.cflags = (frcti->snd_cr.cflags & ~FRCTFSETMASK) | flags;
pthread_rwlock_unlock(&frcti->lock);
}
-#define frcti_queued_pdu(frcti) \
- (frcti == NULL ? idx : __frcti_queued_pdu(frcti))
+size_t frcti_get_max_rcv_sdu(struct frcti * frcti)
+{
+ size_t ret;
-#define frcti_snd(frcti, spb) \
- (frcti == NULL ? 0 : __frcti_snd(frcti, spb))
+ assert(frcti);
+
+ pthread_rwlock_rdlock(&frcti->lock);
+ ret = frcti->max_rcv_sdu;
+ pthread_rwlock_unlock(&frcti->lock);
-#define frcti_rcv(frcti, spb) \
- (frcti == NULL ? 0 : __frcti_rcv(frcti, spb))
+ return ret;
+}
-#define frcti_dealloc(frcti) \
- (frcti == NULL ? 0 : __frcti_dealloc(frcti))
+int frcti_set_max_rcv_sdu(struct frcti * frcti,
+ size_t max)
+{
+ assert(frcti);
-#define frcti_is_window_open(frcti) \
- (frcti == NULL ? true : __frcti_is_window_open(frcti))
+ if (max == 0)
+ return -EINVAL;
-#define frcti_window_wait(frcti, abstime) \
- (frcti == NULL ? 0 : __frcti_window_wait(frcti, abstime))
+ pthread_rwlock_wrlock(&frcti->lock);
+ frcti->max_rcv_sdu = max;
+ pthread_rwlock_unlock(&frcti->lock);
+ return 0;
+}
-static bool __frcti_is_window_open(struct frcti * frcti)
+size_t frcti_get_rcv_ring_sz(struct frcti * frcti)
{
- struct frct_cr * snd_cr = &frcti->snd_cr;
- bool ret = true;
+ size_t ret;
+
+ assert(frcti);
pthread_rwlock_rdlock(&frcti->lock);
+ ret = frcti->rcv_ring_sz;
+ pthread_rwlock_unlock(&frcti->lock);
- if (snd_cr->cflags & FRCTFRESCNTL)
- ret = before(snd_cr->seqno, snd_cr->rwe);
+ return ret;
+}
- if (!ret) {
- struct timespec now;
+/* Set before any stream byte has been delivered; -EBUSY otherwise. */
+int frcti_set_rcv_ring_sz(struct frcti * frcti,
+ size_t n)
+{
+ int ret = 0;
+ size_t per_pkt;
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ assert(frcti);
- pthread_mutex_lock(&frcti->mtx);
- if (frcti->open) {
- frcti->open = false;
- frcti->t_wnd = now;
- frcti->t_rdvs = now;
- } else {
- time_t diff;
- diff = ts_diff_ns(&now, &frcti->t_wnd);
- if (diff > MAX_RDV) {
- pthread_mutex_unlock(&frcti->mtx);
- pthread_rwlock_unlock(&frcti->lock);
- return false;
- }
-
- diff = ts_diff_ns(&now, &frcti->t_rdvs);
- if (diff > frcti->rdv) {
- frcti->t_rdvs = now;
- __send_rdv(frcti->fd);
-#ifdef PROC_FLOW_STATS
- frcti->n_rdv++;
-#endif
+ if (!frcti->stream)
+ return -ENOTSUP;
+ if (!stream_ring_sz_ok(frcti, n))
+ return -EINVAL;
- }
- }
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
+
+ pthread_rwlock_wrlock(&frcti->lock);
- pthread_mutex_unlock(&frcti->mtx);
+ if (frcti->rcv_ring != NULL) {
+ ret = -EBUSY;
+ } else {
+ frcti->rcv_ring_sz = n;
+ frcti->ring_seq_cap = (uint32_t) (n / per_pkt);
}
pthread_rwlock_unlock(&frcti->lock);
@@ -543,392 +2081,2130 @@ static bool __frcti_is_window_open(struct frcti * frcti)
return ret;
}
-static int __frcti_window_wait(struct frcti * frcti,
- struct timespec * abstime)
+time_t frcti_get_rto_min(struct frcti * frcti)
{
- struct frct_cr * snd_cr = &frcti->snd_cr;
- int ret = 0;
+ time_t v;
+
+ assert(frcti);
pthread_rwlock_rdlock(&frcti->lock);
+ v = frcti->rto_min;
+ pthread_rwlock_unlock(&frcti->lock);
- if (!(snd_cr->cflags & FRCTFRESCNTL)) {
- pthread_rwlock_unlock(&frcti->lock);
+ return v;
+}
+
+/*
+ * Set the minimum retransmission timeout and re-derive rto from it.
+ * Floor at the timer-wheel resolution; finer granularity is
+ * unrepresentable. Returns -EINVAL when rto_min is below
+ * 1 << RXMQ_RES, 0 otherwise.
+ *
+ * With an srtt sample: rto = max(rto_min, 2 * srtt,
+ * srtt + (mdev << MDEV_MUL)). Without one, rto is only raised to
+ * rto_min when it is currently lower.
+ */
+int frcti_set_rto_min(struct frcti * frcti,
+ time_t rto_min)
+{
+ time_t floor = (time_t) (1ULL << RXMQ_RES);
+ time_t rto_floor;
+ time_t rto;
+
+ assert(frcti);
+
+ if (rto_min < floor)
+ return -EINVAL;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ frcti->rto_min = rto_min;
+ if (frcti->srtt > 0) {
+ rto_floor = MAX(rto_min, 2 * frcti->srtt);
+ rto = MAX(rto_floor,
+ frcti->srtt + (frcti->mdev << MDEV_MUL));
+ STORE_RELEASE(&frcti->rto, rto);
+ } else if (frcti->rto < rto_min) {
+ STORE_RELEASE(&frcti->rto, rto_min);
+ }
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return 0;
+}
+
+/*
+ * Retransmit pkt and re-arm a fresh rxm so a lost fast-retx still
+ * recovers via RTO. Nothing is sent when the packet cannot be prepared
+ * or no rxm entry can be allocated. The rxm is armed before the packet
+ * is transmitted.
+ */
+static void sack_rxm_snd(struct frcti * frcti,
+ void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ const struct frct_pci * pci;
+ struct rxm_entry * rxm;
+ uint32_t rcv_lwe;
+ uint32_t seqno;
+ int ret;
+
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
+ return;
+
+ /* The seqno comes from the PCI at the head of the prepared packet. */
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ seqno = ntoh32(pci->seqno);
+
+ rxm = rxm_alloc(frcti, ssm_pk_buff_len(spb));
+ if (rxm == NULL) {
+ /* No timer entry: give the prepared buffer back unsent. */
+ frct_spb_release(spb);
+ return;
+ }
+ rxm_arm(frcti, seqno, rxm, spb);
+
+ STAT_BUMP(frcti, rxm_sack);
+ ret = frct_tx(frcti, spb);
+ /* Only these two transmit errors are counted as a dead flow. */
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+}
+
+/* Additive HoL emit; original snd_slots[hp].rxm stays armed (NewReno). */
+static int fast_rxm_send(struct frcti * frcti,
+ void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ uint32_t rcv_lwe;
+
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
return 0;
+
+ return frct_tx(frcti, spb);
+}
+
+/*
+ * Return the fragment-role bits (FRCT_FR_MASK) of a received packet.
+ * PCI bytes survive head_release at receive; just rewind the pointer:
+ * the PCI is read FRCT_PCILEN bytes in front of the current head.
+ */
+static __inline__ uint16_t frag_role_peek(struct ssm_pk_buff * spb)
+{
+ const struct frct_pci * pci;
+
+ assert(ssm_pk_buff_head(spb) != NULL);
+
+ pci = (const struct frct_pci *) (ssm_pk_buff_head(spb) - FRCT_PCILEN);
+
+ return ntoh16(pci->flags) & FRCT_FR_MASK;
+}
+
+enum frag_state {
+ FRAG_NOT_READY, /* head missing / FIRST..LAST run incomplete */
+ FRAG_DELIVER, /* *count fragments form a deliverable SDU */
+ FRAG_DROP, /* *count fragments at lwe are malformed */
+};
+
+/*
+ * On a gap in the run: FRTX waits (NOT_READY); best-effort scans forward
+ * for the next FIRST/SOLE and returns DROP for the broken prefix. *count
+ * gets the offset from the trailing edge. NOT_READY if no later run is
+ * in window. Caller rdlock.
+ */
+static enum frag_state frag_inspect_gap(struct frcti * frcti,
+ size_t start,
+ size_t * count)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ struct ssm_pk_buff * spb;
+ uint32_t k;
+ uint16_t role;
+ size_t m;
+
+ if (frcti->rcv_cr.cflags & FRCTFRTX)
+ return FRAG_NOT_READY;
+
+ k = frcti->rcv_cr.rwe - RQ_SIZE;
+
+ for (m = start; m < RQ_SIZE; ++m) {
+ if (slots[RQ_SLOT(k + m)].idx == -1)
+ continue;
+
+ spb = rq_frag(frcti, k + m);
+ role = frag_role_peek(spb);
+
+ if (role == FRCT_FR_SOLE || role == FRCT_FR_FIRST) {
+ if (m == 0)
+ return FRAG_NOT_READY;
+
+ *count = m;
+ return FRAG_DROP;
+ }
}
- while (snd_cr->seqno == snd_cr->rwe && ret != -ETIMEDOUT) {
- struct timespec now;
- pthread_rwlock_unlock(&frcti->lock);
- pthread_mutex_lock(&frcti->mtx);
+ return FRAG_NOT_READY;
+}
+
+/*
+ * Inspect rq[lwe..]; set *count and return DELIVER/DROP/NOT_READY. DROP
+ * covers broken prefixes (mid/last at HoL, FIRST..[non-LAST]..new-FIRST).
+ * Non-FRTX flows skip past gaps to the next FIRST/SOLE. Caller rdlock.
+ */
+static enum frag_state frag_run_inspect(struct frcti * frcti,
+ size_t * count)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ struct ssm_pk_buff * spb;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+ uint16_t role;
+ size_t n = 0;
+
+ if (slots[RQ_SLOT(k)].idx == -1)
+ return frag_inspect_gap(frcti, 0, count);
+
+ spb = rq_frag(frcti, k);
+ role = frag_role_peek(spb);
+
+ if (role == FRCT_FR_SOLE) {
+ *count = 1;
+ return FRAG_DELIVER;
+ }
- if (frcti->open) {
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (role != FRCT_FR_FIRST) {
+ *count = 1;
+ return FRAG_DROP;
+ }
+
+ while (true) {
+ if (n == RQ_SIZE || slots[RQ_SLOT(k + n)].idx == -1)
+ return frag_inspect_gap(frcti, n, count);
+
+ spb = rq_frag(frcti, k + n);
+ role = frag_role_peek(spb);
+ ++n;
+
+ if (role == FRCT_FR_LAST) {
+ *count = n;
+ return FRAG_DELIVER;
+ }
- frcti->t_wnd = now;
- frcti->t_rdvs = now;
- frcti->open = false;
+ if (n > 1 && role != FRCT_FR_MID) {
+ /* SOLE or new FIRST mid-run: drop the prefix. */
+ *count = n - 1;
+ return FRAG_DROP;
}
+ }
+}
+
+/*
+ * Discard `count` sequence numbers starting at the delivery edge.
+ * Caller wrlock. Delivery edge is implicit: rwe - RQ_SIZE. Occupied
+ * slots in the range are released and cleared, empty ones are skipped,
+ * then rwe advances by count.
+ */
+static void frag_drop(struct frcti * frcti,
+ size_t count)
+{
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+ uint32_t edge;
+ size_t i;
+
+ for (i = 0; i < count; ++i) {
+ size_t pos = RQ_SLOT(k + i);
+
+ if (frcti->rcv_slots[pos].idx == -1)
+ continue;
+
+ frct_spb_release_idx(frcti->rcv_slots[pos].idx);
+ frcti->rcv_slots[pos].idx = -1;
+ }
+
+ frcti->rcv_cr.rwe += count;
+
+ /* Drop may span a gap; pull lwe up to preserve rwe - RQ_SIZE <= lwe. */
+ edge = frcti->rcv_cr.rwe - RQ_SIZE;
+ if (before(frcti->rcv_cr.lwe, edge))
+ STORE_RELEASE(&frcti->rcv_cr.lwe, edge);
+}
+
+/*
+ * Copy `count` fragments starting at the delivery edge (rwe - RQ_SIZE)
+ * into buf, release each one and clear its slot, then advance rwe by
+ * count; lwe is not touched here. Returns the number of bytes copied.
+ * No bound on buf is checked: the caller must have sized it for the
+ * whole run.
+ */
+static size_t frag_gather(struct frcti * frcti,
+ size_t count,
+ uint8_t * buf)
+{
+ struct ssm_pk_buff * frag;
+ size_t off = 0;
+ size_t i;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+
+ for (i = 0; i < count; ++i) {
+ size_t pos = RQ_SLOT(k + i);
+ size_t flen;
+
+ frag = rq_frag(frcti, k + i);
+ flen = ssm_pk_buff_len(frag);
+ memcpy(buf + off, ssm_pk_buff_head(frag), flen);
+ off += flen;
+ frct_spb_release_idx(frcti->rcv_slots[pos].idx);
+ frcti->rcv_slots[pos].idx = -1;
+ }
+
+ frcti->rcv_cr.rwe += count;
+
+ return off;
+}
+
+/* Caller holds lock. */
+static size_t frag_total_len(struct frcti * frcti,
+ size_t count,
+ bool * overflow)
+{
+ struct ssm_pk_buff * frag;
+ size_t total = 0;
+ size_t i;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
- pthread_cleanup_push(__cleanup_mutex_unlock, &frcti->mtx);
+ *overflow = false;
- ret = -__timedwait(&frcti->cond, &frcti->mtx, abstime);
+ for (i = 0; i < count; ++i) {
+ size_t flen;
- pthread_cleanup_pop(false);
+ frag = rq_frag(frcti, k + i);
+ flen = ssm_pk_buff_len(frag);
+ if (total + flen < total) {
+ *overflow = true;
+ return 0;
+ }
+ total += flen;
+ }
- if (ret == -ETIMEDOUT) {
- time_t diff;
+ return total;
+}
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+/*
+ * Process a delivered slot at lwe: latch FIN if acceptable,
+ * advance byte_high (clamped to byte_fin once latched).
+ */
+static __inline__ void stream_deliver_slot(struct frcti * frcti,
+ size_t lp)
+{
+ uint32_t end;
- diff = ts_diff_ns(&now, &frcti->t_wnd);
- if (diff > MAX_RDV) {
- pthread_mutex_unlock(&frcti->mtx);
- return -ECONNRESET; /* write fails! */
- }
+ end = frcti->rcv_slots[lp].end;
- diff = ts_diff_ns(&now, &frcti->t_rdvs);
- if (diff > frcti->rdv) {
- frcti->t_rdvs = now;
- __send_rdv(frcti->fd);
- }
+ if (frcti->rcv_slots[lp].fin) {
+ if (end == frcti->rcv_byte_high && !frcti->rcv_fin_seen) {
+ frcti->rcv_fin_seen = true;
+ frcti->rcv_byte_fin = end;
+ } else {
+ STAT_BUMP(frcti, strm_fin_drop);
}
+ }
+
+ if (frcti->rcv_fin_seen && after(end, frcti->rcv_byte_fin))
+ end = frcti->rcv_byte_fin;
+
+ frcti->rcv_byte_high = end;
+}
+
+/* Two-segment memcpy from buf into the rx ring at byte offset start. */
+static void stream_ring_write(struct frcti * frcti,
+ uint32_t start,
+ buffer_t buf)
+{
+ size_t mask = frcti->rcv_ring_sz - 1;
+ size_t off = start & mask;
- pthread_mutex_unlock(&frcti->mtx);
- pthread_rwlock_rdlock(&frcti->lock);
+ if (off + buf.len <= frcti->rcv_ring_sz) {
+ memcpy(frcti->rcv_ring + off, buf.data, buf.len);
+ } else {
+ size_t first = frcti->rcv_ring_sz - off;
+ memcpy(frcti->rcv_ring + off, buf.data, first);
+ memcpy(frcti->rcv_ring, buf.data + first, buf.len - first);
}
+}
+/*
+ * Two-segment memcpy from the rx ring at byte offset start into buf.
+ * The ring offset is start masked with rcv_ring_sz - 1, which relies on
+ * rcv_ring_sz being a power of two. A read that runs past the end of
+ * the ring continues at its beginning. buf.len is not checked against
+ * rcv_ring_sz here.
+ */
+static void stream_ring_read(struct frcti * frcti,
+ uint32_t start,
+ buffer_t buf)
+{
+ size_t mask = frcti->rcv_ring_sz - 1;
+ size_t off = start & mask;
+
+ if (off + buf.len <= frcti->rcv_ring_sz) {
+ memcpy(buf.data, frcti->rcv_ring + off, buf.len);
+ } else {
+ /* Wrap: tail of the ring first, then the remainder from its start. */
+ size_t first = frcti->rcv_ring_sz - off;
+ memcpy(buf.data, frcti->rcv_ring + off, first);
+ memcpy(buf.data + first, frcti->rcv_ring, buf.len - first);
+ }
+}
+
+/*
+ * Deliver-or-drop one stashed slot at lwe; advance lwe/rwe. Caller
+ * wrlock. A slot whose start does not equal rcv_byte_high is counted
+ * as strm_drop and not delivered. Either way the slot is cleared and
+ * both lwe and rwe move forward by one.
+ */
+static void stream_advance_lwe(struct frcti * frcti)
+{
+ size_t lp;
+
+ lp = RQ_SLOT(frcti->rcv_cr.lwe);
+
+ if (frcti->rcv_slots[lp].start != frcti->rcv_byte_high)
+ STAT_BUMP(frcti, strm_drop);
+ else
+ stream_deliver_slot(frcti, lp);
+
+ frcti->rcv_slots[lp].fin = 0;
+ frcti->rcv_slots[lp].idx = -1;
+ STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1);
+ frcti->rcv_cr.rwe++;
+}
+
+/*
+ * Validate a stream DATA packet before stashing. Returns 0 if the
+ * packet may be written into rcv_ring + rq[], -1 otherwise.
+ *
+ * start, end: byte range taken from the packet's stream header.
+ * plen:       payload length; must equal end - start.
+ * flags:      packet flags; only FRCT_FIN is examined here.
+ */
+static __inline__ int stream_stash_check(struct frcti * frcti,
+ uint32_t start,
+ uint32_t end,
+ size_t plen,
+ uint16_t flags)
+{
+ if (end - start != (uint32_t) plen)
+ return -1;
+
+ /* FIN MUST be 0-byte. */
+ if ((flags & FRCT_FIN) && plen != 0)
+ return -1;
+
+ /* Post-EOS: no further FIN once latched. */
+ if (frcti->rcv_fin_seen && (flags & FRCT_FIN))
+ return -1;
+
+ /* Post-EOS: reject data at or past byte_fin. */
+ if (frcti->rcv_fin_seen && !before(start, frcti->rcv_byte_fin))
+ return -1;
+
+ /* Stale: peer is behind the delivered edge. */
+ if (before(end, frcti->rcv_byte_next))
+ return -1;
+
+ /* Exact-edge: only an empty-stream FIN is meaningful. */
+ if (end == frcti->rcv_byte_next && !(flags & FRCT_FIN))
+ return -1;
+
+ /* Range past the read edge must not exceed the ring size. */
+ if (end - frcti->rcv_byte_next > frcti->rcv_ring_sz)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Stream-mode DATA receive: validate, stash payload in rcv_ring, mark
+ * rq[pos], advance lwe through any newly-contiguous run. Caller wrlock.
+ *
+ * Returns 0 (spb released) or a negative value, in which case spb is
+ * not released here (caller releases): -1 when the packet is too short
+ * or fails stream_stash_check, -ENOMEM when the ring cannot be
+ * allocated. The ring is allocated on the first packet that is long
+ * enough to carry a stream header. On a stash-check failure the stream
+ * header has already been popped from spb.
+ */
+static int frcti_stream_data_rcv(struct frcti * frcti,
+ struct ssm_pk_buff * spb,
+ size_t pos,
+ uint16_t flags)
+{
+ struct frct_pci_stream * spci;
+ uint32_t start;
+ uint32_t end;
+ buffer_t buf;
+ size_t skip;
+
+ if (ssm_pk_buff_len(spb) < FRCT_PCI_STREAM_LEN)
+ return -1;
+
+ if (frcti->rcv_ring == NULL) {
+ frcti->rcv_ring = calloc(1, frcti->rcv_ring_sz);
+ if (frcti->rcv_ring == NULL)
+ return -ENOMEM;
+ }
+
+ spci = FRCT_HDR_POP(spb, frct_pci_stream);
+ start = ntoh32(spci->start);
+ end = ntoh32(spci->end);
+
+ buf.data = ssm_pk_buff_head(spb);
+ buf.len = ssm_pk_buff_len(spb);
+
+ if (stream_stash_check(frcti, start, end, buf.len, flags) < 0)
+ return -1;
+
+ /* Trim front-overlap with already-delivered region. */
+ if (before(start, frcti->rcv_byte_next)) {
+ skip = frcti->rcv_byte_next - start;
+ buf.data += skip;
+ buf.len -= skip;
+ start = frcti->rcv_byte_next;
+ }
+
+ stream_ring_write(frcti, start, buf);
+ STAT_ADD(frcti, strm_rcv_byte, buf.len);
+
+ /* The payload lives in the ring; idx = 1 only marks the slot used. */
+ frcti->rcv_slots[pos].idx = 1;
+ frcti->rcv_slots[pos].start = start;
+ frcti->rcv_slots[pos].end = end;
+ frcti->rcv_slots[pos].fin = (flags & FRCT_FIN) ? 1 : 0;
+
+ while (frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1)
+ stream_advance_lwe(frcti);
+
+ frct_spb_release(spb);
+
+ return 0;
+}
+
+/*
+ * DATA receive: stash idx at rq[pos], advance lwe through any
+ * contiguous run. Caller wrlock. Counts frag_rcv for every packet
+ * whose fragment role is not SOLE. rwe is not modified here, and lwe
+ * never advances beyond rwe.
+ */
+static void frcti_data_stash(struct frcti * frcti,
+ ssize_t idx,
+ size_t pos,
+ uint16_t flags)
+{
+ frcti->rcv_slots[pos].idx = idx;
+
+ if ((flags & FRCT_FR_MASK) != FRCT_FR_SOLE)
+ STAT_BUMP(frcti, frag_rcv);
+
+ /* lwe = cum-ACK edge; advance per fragment through contiguous run. */
+ while (before(frcti->rcv_cr.lwe, frcti->rcv_cr.rwe)
+ && frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1)
+ STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1);
+}
+
+/* Stream consume: copy up to `count` contiguous bytes from ring into buf. */
+static ssize_t frcti_consume_stream(struct frcti * frcti,
+ uint8_t * buf,
+ size_t count)
+{
+ size_t avail;
+ size_t copy;
+ ssize_t ret;
+ buffer_t dst;
+
+ assert(frcti);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ avail = (size_t) (frcti->rcv_byte_high - frcti->rcv_byte_next);
+ if (avail == 0) {
+ /* EOS drained: signal EOF to the reader. */
+ if (frcti->rcv_fin_seen
+ && frcti->rcv_byte_next == frcti->rcv_byte_fin)
+ ret = 0;
+ else
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ copy = MIN(avail, count);
+
+ dst.data = buf;
+ dst.len = copy;
+ stream_ring_read(frcti, frcti->rcv_byte_next, dst);
+
+ frcti->rcv_byte_next += (uint32_t) copy;
+ STAT_ADD(frcti, strm_dlv_byte, copy);
+
+ ret = (ssize_t) copy;
+
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
return ret;
}
-static ssize_t __frcti_queued_pdu(struct frcti * frcti)
+/*
+ * FRTX consume: copy next ready PDU (full SDU or nothing). Returns bytes,
+ * -EAGAIN (no PDU), or -EMSGSIZE (oversize: run dropped to unblock flow).
+ * A run counts as oversize when its total length overflows, exceeds
+ * max_rcv_sdu, or does not fit in `count`. Runs that frag_run_inspect
+ * reports as FRAG_DROP are discarded and the scan continues with the
+ * next run. Holds frcti->lock (write) for the whole call.
+ */
+static ssize_t frcti_consume(struct frcti * frcti,
+ uint8_t * buf,
+ size_t count)
{
- ssize_t idx;
- size_t pos;
+ size_t n; /* run length as reported by frag_run_inspect */
+ size_t total;
+ bool overflow;
+ enum frag_state st;
+ ssize_t ret;
assert(frcti);
- /* See if we already have the next PDU. */
pthread_rwlock_wrlock(&frcti->lock);
- pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1);
-
- idx = frcti->rq[pos];
- if (idx != -1) {
- ++frcti->rcv_cr.lwe;
- ++frcti->rcv_cr.rwe;
- frcti->rq[pos] = -1;
+ while (true) {
+ st = frag_run_inspect(frcti, &n);
+ if (st == FRAG_NOT_READY) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ if (st == FRAG_DROP) {
+ STAT_ADD(frcti, frag_drop, n);
+ frag_drop(frcti, n);
+ continue; /* re-inspect what follows the dropped run */
+ }
+ /* FRAG_DELIVER */
+ total = frag_total_len(frcti, n, &overflow);
+ if (overflow || total > frcti->max_rcv_sdu || total > count) {
+ STAT_ADD(frcti, frag_drop, n);
+ frag_drop(frcti, n);
+ ret = -EMSGSIZE;
+ goto unlock;
+ }
+ ret = (ssize_t) frag_gather(frcti, n, buf);
+ if (n > 1)
+ STAT_BUMP(frcti, sdu_reasm);
+ else
+ STAT_BUMP(frcti, sdu_sole);
+ goto unlock;
}
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
- return idx;
+ return ret;
}
-static ssize_t __frcti_pdu_ready(struct frcti * frcti)
+static bool frcti_pdu_ready(struct frcti * frcti) /* readiness poll; takes the read lock */
{
- ssize_t idx;
- size_t pos;
+ size_t pos;
+ size_t count;
+ bool ready;
assert(frcti);
- /* See if we already have the next PDU. */
pthread_rwlock_rdlock(&frcti->lock);
- pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1);
- idx = frcti->rq[pos];
+ if (frcti->stream) { /* stream mode: ready iff unread bytes exist */
+ ready = frcti->rcv_byte_high != frcti->rcv_byte_next;
+ pthread_rwlock_unlock(&frcti->lock);
+ return ready;
+ }
+
+ if (frag_run_inspect(frcti, &count) != FRAG_DELIVER) {
+ /* Drop case: frcti_consume will handle it; not ready.
+ * Every state other than FRAG_DELIVER ends up here. */
+ pthread_rwlock_unlock(&frcti->lock);
+ return false;
+ }
+
+ pos = RQ_SLOT(frcti->rcv_cr.rwe - RQ_SIZE); /* NOTE(review): presumably the oldest undelivered seqno - confirm */
+ ready = frcti->rcv_slots[pos].idx != -1; /* -1 marks an empty slot */
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ready;
+}
+
+/*
+ * No srtt yet: probe at the cold-probe cadence to seed it.
+ * True when more than RTTP_COLD_NS has elapsed since t_snd_probe.
+ * NOTE(review): the subtraction looks unsigned; assumes
+ * now_ns >= t_snd_probe - confirm.
+ */
+#define PROBE_DUE_COLD(frcti, now_ns) \
+ ((now_ns) - (frcti)->t_snd_probe > (uint64_t) RTTP_COLD_NS)
+
+/*
+ * Have srtt: probe when peer quiet for > 2*srtt and last probe > srtt.
+ * Both arguments are expanded more than once; pass expressions without
+ * side effects.
+ */
+#define PROBE_DUE_WARM(frcti, now_ns) \
+ ((now_ns) - (frcti)->t_rcv_rtt > 2u * (uint64_t)(frcti)->srtt \
+ && (now_ns) - (frcti)->t_snd_probe > (uint64_t)(frcti)->srtt)
+
+/*
+ * Seeds srtt for receive-only sides so they don't fall back to 1 s RTO.
+ * Under the write lock, checks the cold (srtt == 0) or warm (srtt != 0)
+ * due-condition and returns without doing anything when it is not met.
+ * Otherwise a probe is allocated under the lock and sent after the lock
+ * is released; a probe_id of 0 from rttp_alloc_probe sends nothing.
+ */
+__attribute__((cold))
+static void frcti_rcv_probe(struct frcti * frcti,
+ uint64_t now_ns)
+{
+ uint32_t probe_id;
+ uint8_t nonce[RTTP_NONCE_LEN] = { 0 };
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (frcti->srtt == 0 && !PROBE_DUE_COLD(frcti, now_ns)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ if (frcti->srtt != 0 && !PROBE_DUE_WARM(frcti, now_ns)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ probe_id = rttp_alloc_probe(frcti, now_ns, nonce);
pthread_rwlock_unlock(&frcti->lock);
- return idx;
+ if (probe_id != 0)
+ frcti_rttp_snd(frcti, probe_id, 0, nonce); /* echo_id 0 marks a probe */
}
-#include <timerwheel.c>
+/*
+ * Validate an RTT echo against the probe stored at ring slot `pos`.
+ * The stored id must equal echo_id, the slot must still hold a send
+ * timestamp (ts != 0), and the nonce must match byte for byte.
+ */
+static __inline__ bool probe_echo_matches(struct frcti * frcti,
+                                          size_t         pos,
+                                          uint32_t       echo_id,
+                                          const uint8_t  nonce[RTTP_NONCE_LEN])
+{
+        bool id_ok;
+        bool live;
+
+        id_ok = frcti->probes[pos].id == echo_id;
+        live  = frcti->probes[pos].ts != 0;
+
+        if (!id_ok || !live)
+                return false;
+
+        return memcmp(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN) == 0;
+}
/*
- * Send a final ACK for everything that has not been ACK'd.
- * If the flow should be kept active for retransmission,
- * the returned time will be negative.
+ * RTT probe (echo_id == 0): bounce the nonce back to peer.
+ * RTT echo (echo_id != 0): verify nonce + feed sample.
+ * Packets shorter than RTTP_PAYLOAD, or with both ids zero, are ignored.
+ * A matching echo clears its probe slot (ts = 0) and refreshes t_rcv_rtt;
+ * a positive elapsed time is fed to rtt_update, clamped to
+ * RTT_CLAMP_MUL * srtt once srtt is non-zero.
*/
-static time_t __frcti_dealloc(struct frcti * frcti)
+static void frcti_rttp_rcv(struct frcti * frcti,
+ buffer_t pkt,
+ uint64_t now_ns)
{
- struct timespec now;
- time_t wait;
- int ackno;
- int fd = -1;
+ const struct frct_rttp * rttp;
+ uint32_t probe_id;
+ uint32_t echo_id;
+ uint8_t nonce[RTTP_NONCE_LEN];
+ size_t ring_pos;
+ int64_t elapsed;
+ uint64_t sample;
+
+ if (pkt.len < RTTP_PAYLOAD)
+ return;
+
+ rttp = (const struct frct_rttp *) pkt.data;
+ probe_id = ntoh32(rttp->probe_id);
+ echo_id = ntoh32(rttp->echo_id);
+
+ /* Forged/malformed: bouncing this would loop on echo_id == 0. */
+ if (probe_id == 0 && echo_id == 0)
+ return;
+
+ memcpy(nonce, rttp->nonce, sizeof(nonce));
+
+ if (echo_id == 0) {
+ /* Probe: echo back with same nonce so peer can verify. */
+ STAT_BUMP(frcti, rttp_rcv);
+ frcti_rttp_snd(frcti, 0, probe_id, nonce);
+ return;
+ }
+
+ ring_pos = RTTP_POS(echo_id);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (!probe_echo_matches(frcti, ring_pos, echo_id, nonce)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ elapsed = ts_age_ns(now_ns, frcti->probes[ring_pos].ts);
+ frcti->probes[ring_pos].ts = 0; /* consume the probe: a repeated echo no longer matches */
+ frcti->t_rcv_rtt = now_ns;
+
+ if (elapsed <= 0) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+ sample = (uint64_t) elapsed;
+
+ /* Clamp probe sample to RTT_CLAMP_MUL * srtt to avoid poisoning. */
+ if (frcti->srtt > 0)
+ sample = MIN(sample, (uint64_t) frcti->srtt * RTT_CLAMP_MUL);
+
+ rtt_update(frcti, sample, now_ns);
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+
+/*
+ * Keepalive receive: record the arrival time and count the event.
+ * When the keepalive carries FRCT_ACK, the piggybacked ackno advances
+ * snd_cr.lwe, provided it passes within(ackno, lwe, seqno).
+ */
+static void frcti_ka_rcv(struct frcti *          frcti,
+                         const struct frct_pci * pci,
+                         uint64_t                now_ns,
+                         uint16_t                flags)
+{
+        struct frct_cr * snd;
+        uint32_t         acked;
+
+        STORE_RELEASE(&frcti->t_ka_rcv, now_ns);
+        STAT_BUMP(frcti, ka_rcv);
+
+        if ((flags & FRCT_ACK) == 0)
+                return;
+
+        acked = ntoh32(pci->ackno);
+        snd   = &frcti->snd_cr;
+
+        pthread_rwlock_wrlock(&frcti->lock);
+
+        if (within(acked, snd->lwe, snd->seqno))
+                STORE_RELEASE(&snd->lwe, acked);
+
+        pthread_rwlock_unlock(&frcti->lock);
+}
+
+/*
+ * Additive HoL re-emit (carries DRF); runs before rcv_cr->act
+ * refresh so it doesn't pre-empt peer's first DRF.
+ * The HoL packet is copied under the write lock and the copy is sent
+ * after unlocking. Nothing is sent when no data is in flight, the HoL
+ * slot has no rxm entry, the entry aged out (t_r), or the copy cannot
+ * be allocated.
+ */
+__attribute__((cold))
+static void frcti_nack_rcv(struct frcti * frcti)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ size_t hp;
+ struct rxm_entry * rxm;
+ void * pkt_copy = NULL;
+ size_t pkt_len = 0;
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ STAT_BUMP(frcti, nack_rcv);
+
+ if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) { /* nothing unacknowledged */
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm);
+ if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ pkt_copy = malloc(rxm->len);
+ if (pkt_copy != NULL) {
+ memcpy(pkt_copy, rxm->pkt, rxm->len);
+ pkt_len = rxm->len;
+ /* Karn: suppress RTT sample. NACK supersedes pending TLP. */
+ frcti->snd_slots[hp].flags =
+ (frcti->snd_slots[hp].flags & ~SND_TLP)
+ | SND_RTX | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ STAT_BUMP(frcti, rxm_nack);
+ }
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ if (pkt_copy != NULL) {
+ int ret = fast_rxm_send(frcti, pkt_copy, pkt_len);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+ free(pkt_copy); /* the copy is owned here and always released */
+ }
+}
+/*
+ * RDV receive: read the window edge to advertise under the read lock,
+ * count the event, then reply with an FRCT_FC control packet that is
+ * given that edge as its last argument.
+ */
+__attribute__((cold))
+static void frcti_rdv_rcv(struct frcti * frcti)
+{
+ uint32_t rwe;
pthread_rwlock_rdlock(&frcti->lock);
- ackno = frcti->rcv_cr.lwe;
- if (frcti->rcv_cr.lwe != frcti->rcv_cr.seqno)
- fd = frcti->fd;
+ rwe = frcti_advert_rwe(frcti);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ STAT_BUMP(frcti, rdv_rcv);
+
+ frcti_pkt_snd(frcti, FRCT_FC, 0, rwe); /* sent outside the lock */
+}
+
+/*
+ * §7.2 probe timeout. With an RTT estimate (srtt > 0) this is
+ * 2*SRTT plus the maximum delayed-ACK delay; without one it falls
+ * back to NACK_COOLDOWN_NS.
+ */
+static __inline__ uint64_t tlp_pto(const struct frcti * frcti)
+{
+        uint64_t pto = NACK_COOLDOWN_NS;
+
+        if (frcti->srtt > 0)
+                pto = 2ULL * (uint64_t) frcti->srtt + ACK_DELAY_NS;
+
+        return pto;
+}
+
+/*
+ * RFC 8985 §7: lazy probe. Re-evaluate on fire — if sender was active
+ * within PTO, re-post; else probe HoL once and hand off to RTO.
+ * Timer callback posted through tw_post; `arg` is the struct frcti.
+ * The probe is a copy of the HoL packet taken under the write lock and
+ * sent after unlocking. Unless the timer is re-posted, tlp_pending is
+ * cleared on exit so that tlp_arm can post a new timer.
+ */
+__attribute__((cold))
+static void tlp_due(void * arg)
+{
+ struct frcti * frcti = arg;
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t pto;
+ uint64_t rto_at;
+ size_t hp;
+ struct rxm_entry * rxm;
+ void * pkt_copy = NULL;
+ size_t pkt_len = 0;
+ bool re_post = false;
+ uint64_t deadline = 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) /* nothing in flight */
+ goto unlock;
- wait = MAX(frcti->rcv_cr.inact - now.tv_sec + frcti->rcv_cr.act.tv_sec,
- frcti->snd_cr.inact - now.tv_sec + frcti->snd_cr.act.tv_sec);
- wait = MAX(wait, 0);
+ if (!before(frcti->snd_cr.seqno, frcti->snd_cr.rwe))
+ goto unlock; /* FC-blocked: RDV handles it. */
- if (frcti->snd_cr.cflags & FRCTFLINGER
- && before(frcti->snd_cr.lwe, frcti->snd_cr.seqno))
- wait = -wait;
+ /* RFC 8985 §7.3: one outstanding probe, MAX_TLP_PER_EP per ep. */
+ if (frcti->tlp_high_seq != 0)
+ goto unlock;
+ if (frcti->tlp_count >= MAX_TLP_PER_EP)
+ goto unlock;
+
+ pto = tlp_pto(frcti);
+
+ /* §7.2: anchor PTO on most recent send; defer if still active. */
+ if (now_ns < frcti->snd_cr.act + pto) {
+ deadline = frcti->snd_cr.act + pto;
+ re_post = true;
+ goto unlock;
+ }
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm);
+ if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+ goto unlock;
+
+ /* Cap: if HoL RTO is due, let rxm_due fire instead. */
+ rto_at = rxm->t0 + ((uint64_t) frcti->rto
+ << LOAD_RELAXED(&frcti->rto_mul));
+ if (rto_at <= now_ns)
+ goto unlock;
+
+ pkt_copy = malloc(rxm->len);
+ if (pkt_copy != NULL) {
+ memcpy(pkt_copy, rxm->pkt, rxm->len);
+ pkt_len = rxm->len;
+ frcti->snd_slots[hp].time = now_ns;
+ frcti->snd_slots[hp].flags |= SND_TLP | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ /* §7.3 outstanding-probe marker; ack_rcv/rxm_snd clear. */
+ frcti->tlp_high_seq = frcti->snd_cr.seqno;
+ frcti->tlp_count++;
+ STAT_BUMP(frcti, tlp_snd);
+ }
+
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
- if (fd != -1)
- __send_frct_pkt(fd, FRCT_ACK, ackno, 0);
+ if (pkt_copy != NULL) {
+ fast_rxm_send(frcti, pkt_copy, pkt_len); /* return value is not inspected here */
+ free(pkt_copy);
+ }
- return wait;
+ if (re_post)
+ tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti);
+ else
+ __atomic_clear(&frcti->tlp_pending, __ATOMIC_RELAXED);
}
-static int __frcti_snd(struct frcti * frcti,
- struct ssm_pk_buff * spb)
+/*
+ * §7.2 lazy: post once per quiet period. tlp_due re-evaluates on fire.
+ * Does nothing while a probe is outstanding (tlp_high_seq != 0), the
+ * per-episode cap is reached, or a timer is already pending (the
+ * tlp_pending test-and-set). The deadline is the last send time plus
+ * PTO, or now plus PTO if that already passed. Always returns 0.
+ */
+static int tlp_arm(struct frcti * frcti)
{
- struct frct_pci * pci;
- struct timespec now;
- struct frct_cr * snd_cr;
- struct frct_cr * rcv_cr;
- uint32_t seqno;
- bool rtx;
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t pto;
+ uint64_t deadline;
+
+ /* §7.3: one outstanding probe, MAX_TLP_PER_EP per recovery ep. */
+ if (LOAD_RELAXED(&frcti->tlp_high_seq) != 0)
+ return 0;
+ if (LOAD_RELAXED(&frcti->tlp_count) >= MAX_TLP_PER_EP)
+ return 0;
+ if (__atomic_test_and_set(&frcti->tlp_pending, __ATOMIC_RELAXED))
+ return 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pto = tlp_pto(frcti); /* reads srtt; frcti->lock is not taken in this function */
+
+ deadline = LOAD_RELAXED(&frcti->snd_cr.act) + pto;
+ if (deadline <= now_ns)
+ deadline = now_ns + pto;
+
+ tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti);
+
+ return 0;
+}
+
+/*
+ * Apply a flow-control window advertisement carried in `pci`.
+ * The advertised edge is capped at lwe + RQ_SIZE and is never allowed
+ * to move behind the current rwe (forged/stale FC); afterwards the
+ * window is marked open. Caller holds the write lock.
+ */
+static __inline__ void frcti_fc_rcv(struct frcti *          frcti,
+                                    const struct frct_pci * pci)
+{
+        struct frct_cr * snd  = &frcti->snd_cr;
+        uint32_t         edge = ntoh32(pci->window);
+        uint32_t         cap  = snd->lwe + RQ_SIZE;
+
+        /* Upper bound: at most RQ_SIZE beyond lwe. */
+        if (after(edge, cap))
+                edge = cap;
+
+        /* Lower bound: never shrink below what was already granted. */
+        if (before(edge, snd->rwe))
+                edge = snd->rwe;
+
+        STORE_RELAXED(&snd->rwe, edge);
+        frcti->open = true;
+}
+
+/* Packet copies captured under frcti->lock; emitted after release. */
+struct pending {
+ buffer_t fast_rxm; /* HoL fast-retransmit copy; data == NULL when none is staged */
+ buffer_t sack_rxm[SACK_RXM_MAX]; /* copies queued while processing a SACK */
+ size_t sack_rxm_cnt; /* number of filled sack_rxm entries */
+};
+
+/*
+ * RFC 6582 §3.2: on the first entry into recovery, seal recovery_high
+ * at snd_cr.seqno + RTT_QUARANTINE. Calls made while already in
+ * recovery leave it untouched, so new gaps do not extend it.
+ */
+static void recovery_enter(struct frcti * frcti)
+{
+        if (!frcti->in_recovery) {
+                frcti->in_recovery   = true;
+                frcti->recovery_high = frcti->snd_cr.seqno + RTT_QUARANTINE;
+        }
+}
+
+/*
+ * Recovery exit test. Only meaningful while in recovery: true when the
+ * cum-ACK is not before recovery_high, or when it equals snd_cr.seqno
+ * (everything in flight is ACKed).
+ */
+static bool recovery_exit_reached(struct frcti * frcti,
+                                  uint32_t       ackno)
+{
+        bool past_high;
+        bool drained;
+
+        if (!frcti->in_recovery)
+                return false;
+
+        past_high = !before(ackno, frcti->recovery_high);
+        drained   = ackno == frcti->snd_cr.seqno;
+
+        return past_high || drained;
+}
+
+/*
+ * RTT sample gate. A cum-ACK for slot `p` may feed the estimator only
+ * when the ACK is not flagged FRCT_RXM, the slot was neither
+ * retransmitted nor TLP-probed (Karn), the slot still holds its rxm
+ * entry (not consumed by a SACK), `lwe` is not before rtt_lwe, and
+ * srtt is already seeded (probes seed it, cum-ACKs do not).
+ */
+static bool rtt_sample_eligible(struct frcti * frcti,
+                                size_t         p,
+                                uint16_t       flags,
+                                uint32_t       lwe)
+{
+        if ((flags & FRCT_RXM) != 0
+            || (frcti->snd_slots[p].flags & (SND_RTX | SND_TLP)) != 0)
+                return false;
+
+        if (LOAD_ACQUIRE(&frcti->snd_slots[p].rxm) == NULL)
+                return false;
+
+        return !before(lwe, frcti->rtt_lwe) && frcti->srtt != 0;
+}
+
+#define RXM_SLOT_EMPTY(rxm) ((rxm) == NULL) /* slot holds no rxm entry */
+#define FAST_RXM_STAGED(pending) ((pending)->fast_rxm.data != NULL) /* a fast-rxm copy is already queued */
+#define RXM_FAST_DONE(flags) (((flags) & SND_FAST_RXM) != 0) /* SND_FAST_RXM bit is set */
+
+/*
+ * RACK fast retransmit on cum-ACK: HoL aged past R, not yet retransmitted.
+ * On success a copy of the HoL packet is staged in pending->fast_rxm
+ * for the caller to emit, and the slot is flagged SND_RTX | SND_FAST_RXM.
+ * recovery_enter runs before the copy is allocated, so an allocation
+ * failure still leaves the flow in recovery with nothing staged.
+ */
+static void fast_rxm_consider(struct frcti * frcti,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ struct rxm_entry * rxm;
+ struct snd_slot * slot;
+ size_t hp;
+ uint64_t R;
+ bool rack_ok;
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ slot = &frcti->snd_slots[hp];
+ rxm = LOAD_ACQUIRE(&slot->rxm);
+ R = rack_reorder_window(frcti);
+
+ if (RXM_SLOT_EMPTY(rxm))
+ return;
+
+ /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */
+ rack_ok = (int64_t)(frcti->t_latest_ack - slot->time) > (int64_t) R; /* signed: slot may be newer than the latest ACKed send */
+ if (!rack_ok && frcti->dup_thresh < DUP_THRESH)
+ return;
+
+ /* HoL aged past t_r; let rxm_due tear the flow down. */
+ if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+ return;
+
+ /* Already on it. */
+ if (FAST_RXM_STAGED(pending) || RXM_FAST_DONE(slot->flags))
+ return;
+
+ recovery_enter(frcti);
+
+ pending->fast_rxm.data = malloc(rxm->len);
+ if (pending->fast_rxm.data == NULL)
+ return;
+
+ pending->fast_rxm.len = rxm->len;
+ memcpy(pending->fast_rxm.data, rxm->pkt, rxm->len);
+ slot->flags |= SND_RTX | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1; /* rtt_sample_eligible rejects ACKs whose old lwe is before this */
+ if (rack_ok)
+ STAT_BUMP(frcti, rxm_rack);
+ else
+ STAT_BUMP(frcti, rxm_dupthresh);
+}
+
+/*
+ * Caller holds wrlock; RACK fast retransmit queued in pending.
+ * A duplicate cum-ACK (ackno == lwe) only runs fast_rxm_consider, at
+ * most once per lwe value. An ackno that fails
+ * within(ackno, lwe, seqno) is ignored. Otherwise lwe advances to
+ * ackno and the TLP, reordering-window, recovery and RTT state are
+ * updated from the slot of the previous lwe.
+ */
+__attribute__((hot))
+static void frcti_ack_rcv(struct frcti * frcti,
+ const struct frct_pci * pci,
+ uint16_t flags,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ uint32_t ackno;
+ uint32_t lwe;
+ size_t p;
+ size_t fresh;
+
+ if (!(flags & FRCT_DATA))
+ STAT_BUMP(frcti, ack_rcv);
+
+ ackno = ntoh32(pci->ackno);
+ if (ackno == frcti->snd_cr.lwe) {
+ /* RFC 8985 §6.2: only on scoreboard change. */
+ if (frcti->snd_cr.lwe != frcti->rack_fired_lwe) {
+ fast_rxm_consider(frcti, now_ns, pending);
+ frcti->rack_fired_lwe = frcti->snd_cr.lwe;
+ }
+ return;
+ }
+
+ if (!within(ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno))
+ return;
+
+ lwe = frcti->snd_cr.lwe; /* previous lwe, kept for the RTT gate below */
+ p = RQ_SLOT(lwe);
+
+ STORE_RELEASE(&frcti->snd_cr.lwe, ackno);
+
+ /* §7.3: cum-ACK past the probed seqno resolves the TLP. */
+ if (frcti->tlp_high_seq != 0
+ && !before(ackno, frcti->tlp_high_seq))
+ frcti->tlp_high_seq = 0;
+
+ /* §7.3: end the probe episode once inflight drains. */
+ if (ackno == frcti->snd_cr.seqno)
+ frcti->tlp_count = 0;
+
+ /* RFC 8985 §7.2: halve mult per REO_DECAY_PKTS fresh-ACK'd seqnos. */
+ fresh = ackno - frcti->dsack_lwe_snap;
+ if (frcti->reo_wnd_mult > 1 && fresh >= REO_DECAY_PKTS) {
+ uint8_t half = frcti->reo_wnd_mult >> 1;
+ frcti->reo_wnd_mult = half < 1 ? 1 : half; /* half >= 1 here because mult > 1 */
+ frcti->dsack_lwe_snap = ackno;
+ }
+
+ /* RFC 8985: latest cum-ACKed send-time (slot of ackno-1). */
+ frcti->t_latest_ack = frcti->snd_slots[RQ_SLOT(ackno - 1)].time;
+
+ /* RFC 8985: SACK-above-lwe count is per-recovery-episode. */
+ frcti->dup_thresh = 0;
+
+ /* Karn-skip on retx; TLP ACK clears rto_mul (no CC backoff). */
+ if ((frcti->snd_slots[p].flags & SND_RTX) == 0
+ || (frcti->snd_slots[p].flags & SND_TLP) != 0)
+ STORE_RELEASE(&frcti->rto_mul, 0);
+
+ if (recovery_exit_reached(frcti, ackno))
+ frcti->in_recovery = false;
+
+ if (rtt_sample_eligible(frcti, p, flags, lwe)) {
+ int64_t mrtt = ts_age_ns(now_ns, frcti->snd_slots[p].time);
+ if (mrtt > 0) {
+ if (!(flags & FRCT_DATA))
+ STAT_BUMP(frcti, ack_rtt);
+ rtt_update(frcti, (time_t) mrtt, now_ns);
+ frcti->t_rcv_rtt = now_ns;
+ }
+ }
+}
+
+/*
+ * Skip k == lwe under clamp: NULLing HoL from a stale SACK wedges it.
+ * Walks the `n` SACK blocks in `payload`. Blocks with start not before
+ * end are skipped; the rest are clamped to start no lower than lwe and
+ * end no higher than seqno. Every covered seqno that still has an rxm
+ * entry is released (rxm = NULL, flags = 0) and counted in
+ * *newly_marked. Returns the highest (clamped) block end seen, or lwe
+ * if none exceeds it.
+ */
+static uint32_t sack_mark_blocks(struct frcti * frcti,
+ const uint8_t * payload,
+ uint16_t n,
+ uint32_t * newly_marked)
+{
+ uint32_t hi_sacked = frcti->snd_cr.lwe;
+ uint32_t marked = 0;
+ uint16_t i;
+
+ for (i = 0; i < n; ++i) {
+ uint32_t s;
+ uint32_t e;
+ uint32_t k;
+ bool clamped;
+
+ sack_block_get(payload, i, &s, &e);
+
+ if (!before(s, e)) /* empty or inverted block */
+ continue;
+
+ clamped = before(s, frcti->snd_cr.lwe);
+ if (clamped)
+ s = frcti->snd_cr.lwe;
+ if (after(e, frcti->snd_cr.seqno))
+ e = frcti->snd_cr.seqno;
+
+ for (k = s; before(k, e); ++k) {
+ size_t kp = RQ_SLOT(k);
+ uint64_t t_k;
+ if (clamped && k == frcti->snd_cr.lwe)
+ continue;
+ if (LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm) == NULL)
+ continue;
+ STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL);
+ frcti->snd_slots[kp].flags = 0;
+ marked++;
+ /* RACK.fack: latest SACK-confirmed send-time. */
+ t_k = frcti->snd_slots[kp].time;
+ if (t_k > frcti->t_latest_ack)
+ frcti->t_latest_ack = t_k;
+ }
+
+ if (after(e, hi_sacked))
+ hi_sacked = e;
+ }
+
+ *newly_marked = marked;
+ return hi_sacked;
+}
+
+/*
+ * Queue once per loss event (SND_FAST_RXM gates). Emit after unlock.
+ *
+ * Scans seqnos in [lwe, hi_sacked) and stages a copy of every packet
+ * that still has an rxm entry, was not fast-retransmitted yet, has not
+ * aged out (t_r) and passes the RACK time test or the DupThresh count
+ * (RFC 8985 §6.2). Copies go into pending->sack_rxm; the scan stops
+ * when SACK_RXM_MAX copies are staged or an allocation fails.
+ *
+ * The RACK age is a signed 64-bit nanosecond difference, as in
+ * fast_rxm_consider: a size_t would truncate it and lose the sign on
+ * targets where size_t is 32 bits wide. The rack/dupthresh counters
+ * are bumped only once the copy is actually staged.
+ */
+static void sack_queue_rxm(struct frcti *   frcti,
+                           uint32_t         hi_sacked,
+                           uint64_t         now_ns,
+                           struct pending * pending)
+{
+        uint64_t R = rack_reorder_window(frcti);
+        uint32_t k;
+        bool     rack_ok;
+
+        for (k = frcti->snd_cr.lwe; before(k, hi_sacked); ++k) {
+                struct rxm_entry * rxm;
+                size_t             kp  = RQ_SLOT(k);
+                size_t             cnt = pending->sack_rxm_cnt;
+                int64_t            rack_age;
+
+                rxm = LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm);
+
+                if (cnt >= SACK_RXM_MAX)
+                        break;
+
+                if (rxm == NULL)
+                        continue;
+
+                if (frcti->snd_slots[kp].flags & SND_FAST_RXM)
+                        continue;
+
+                if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+                        continue;
+
+                /* Negative when the slot was sent after the latest ACK. */
+                rack_age = (int64_t) (frcti->t_latest_ack
+                                      - frcti->snd_slots[kp].time);
+                /* RFC 8985 §6.2: time-based RACK OR DupThresh count. */
+                rack_ok = rack_age > (int64_t) R;
+                if (!rack_ok && frcti->dup_thresh < DUP_THRESH)
+                        continue;
+
+                pending->sack_rxm[cnt].data = malloc(rxm->len);
+                if (pending->sack_rxm[cnt].data == NULL)
+                        break;
+
+                if (rack_ok)
+                        STAT_BUMP(frcti, rxm_rack);
+                else
+                        STAT_BUMP(frcti, rxm_dupthresh);
+
+                pending->sack_rxm[cnt].len = rxm->len;
+                memcpy(pending->sack_rxm[cnt].data, rxm->pkt, rxm->len);
+                pending->sack_rxm_cnt++;
+                /* NULL slot so the original timer self-cleans. */
+                STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL);
+                frcti->snd_slots[kp].time = now_ns;
+                frcti->snd_slots[kp].flags |= SND_RTX | SND_FAST_RXM;
+                frcti->rtt_lwe = k + 1;
+        }
+}
+
+/*
+ * RFC 2883 D-SACK detector: decides whether block[0] of a SACK is a
+ * duplicate report.
+ *   case 1: block[0] starts strictly below the cum-ACK (pkt_ackno);
+ *           accepted only if the distance is at most MAX_DSACK_LAG,
+ *           otherwise counted as dsack_drop and rejected.
+ *   case 2: block[0] lies inside some later block without being
+ *           identical to it (strict sub-range).
+ * Blocks whose start is not before their end are ignored.
+ */
+static bool sack_is_dsack(struct frcti *  frcti,
+                          const uint8_t * payload,
+                          uint16_t        n,
+                          uint32_t        pkt_ackno)
+{
+        uint32_t first_s;
+        uint32_t first_e;
+        uint16_t idx;
+
+        if (n == 0)
+                return false;
+
+        sack_block_get(payload, 0, &first_s, &first_e);
+        if (!before(first_s, first_e))
+                return false;
+
+        /* Case 1: below the cum-ACK, bounded by one rcv window. */
+        if (before(first_s, pkt_ackno)) {
+                if ((pkt_ackno - first_s) > (uint32_t) MAX_DSACK_LAG) {
+                        STAT_BUMP(frcti, dsack_drop);
+                        return false;
+                }
+
+                return true;
+        }
+
+        /* Case 2: strict sub-range of a later block. */
+        for (idx = 1; idx < n; ++idx) {
+                uint32_t s;
+                uint32_t e;
+                bool     inside;
+                bool     same;
+
+                sack_block_get(payload, idx, &s, &e);
+                if (!before(s, e))
+                        continue;
+
+                inside = !before(first_s, s) && !after(first_e, e);
+                same   = first_s == s && first_e == e;
+                if (inside && !same)
+                        return true;
+        }
+
+        return false;
+}
+
+/*
+ * RFC 8985 §7.2: a D-SACK widens the reordering window by one step,
+ * up to REO_WND_MULT_MAX, but not more often than once per srtt.
+ * The lwe snapshot is always refreshed because it drives the
+ * per-D-SACK decay clock.
+ */
+static __inline__ void reo_wnd_on_dsack(struct frcti * frcti,
+                                        uint64_t       now_ns)
+{
+        bool throttled;
+
+        frcti->dsack_lwe_snap = frcti->snd_cr.lwe;
+
+        throttled = frcti->srtt > 0
+                && now_ns - frcti->t_last_reo_widen <= (uint64_t) frcti->srtt;
+        if (throttled)
+                return;
+
+        if (frcti->reo_wnd_mult < REO_WND_MULT_MAX)
+                frcti->reo_wnd_mult++;
+
+        frcti->t_last_reo_widen = now_ns;
+}
+
+/*
+ * Caller holds wrlock; retransmits queued for post-unlock emission.
+ *
+ * Parses a SACK payload: a 16-bit block count in network byte order
+ * followed by that many blocks. Payloads shorter than the header,
+ * with more than SACK_MAX_BLOCKS blocks, or too short for the
+ * announced count are ignored. A D-SACK in block[0] widens the
+ * reordering window; any other block enters recovery. Newly SACKed
+ * seqnos add to dup_thresh, and retransmits are queued in `pending`
+ * when the highest SACKed edge lies above lwe.
+ *
+ * The block count is copied out with memcpy rather than read through
+ * a uint16_t pointer: the payload is a byte buffer whose alignment is
+ * not guaranteed.
+ */
+static void frcti_sack_rcv(struct frcti *   frcti,
+                           buffer_t         pkt,
+                           uint32_t         pkt_ackno,
+                           uint64_t         now_ns,
+                           struct pending * pending)
+{
+        uint32_t hi_sacked;
+        uint32_t marked;
+        uint16_t n_wire;
+        uint16_t n;
+        bool     dsack;
+        uint16_t n_real;
+
+        if (pkt.len < SACK_HDR_SIZE)
+                return;
+
+        memcpy(&n_wire, pkt.data, sizeof(n_wire));
+        n = ntoh16(n_wire);
+        if (n > SACK_MAX_BLOCKS)
+                return;
+
+        if (pkt.len < SACK_HDR_SIZE + (size_t) n * SACK_BLOCK_SIZE)
+                return;
+
+        STAT_BUMP(frcti, sack_rcv);
+
+        dsack = sack_is_dsack(frcti, pkt.data, n, pkt_ackno);
+        n_real = n - (dsack ? 1 : 0);
+
+        if (dsack) {
+                STAT_BUMP(frcti, dsack_rcv);
+                reo_wnd_on_dsack(frcti, now_ns);
+        }
+
+        /* DSACK-only carries no new gap; don't enter recovery. */
+        if (n_real > 0)
+                recovery_enter(frcti);
+
+        marked = 0;
+        hi_sacked = sack_mark_blocks(frcti, pkt.data, n, &marked);
+        frcti->dup_thresh += marked;
+
+        if (after(hi_sacked, frcti->snd_cr.lwe))
+                sack_queue_rxm(frcti, hi_sacked, now_ns, pending);
+}
+
+/*
+ * Send every packet copy staged in `pending` and release it: first the
+ * SACK-triggered retransmits in queue order, then the single fast
+ * retransmit if one is staged. A fast retransmit that fails with
+ * -EFLOWDOWN or -ENOTALLOC is counted as rxm_tx_dead.
+ */
+static void pending_flush(struct frcti *   frcti,
+                          struct pending * pending)
+{
+        buffer_t * fast = &pending->fast_rxm;
+        size_t     i    = 0;
+        int        ret;
+
+        while (i < pending->sack_rxm_cnt) {
+                buffer_t * b = &pending->sack_rxm[i++];
+
+                sack_rxm_snd(frcti, b->data, b->len);
+                free(b->data);
+        }
+
+        if (fast->data == NULL)
+                return;
+
+        ret = fast_rxm_send(frcti, fast->data, fast->len);
+        if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+                STAT_BUMP(frcti, rxm_tx_dead);
+
+        free(fast->data);
+}
+
+/*
+ * Pre-DRF NACK: build a control packet flagged FRCT_NACK asking the
+ * peer to retransmit its HoL packet, and transmit it. The seqno field
+ * is informational only. Silently does nothing if the control packet
+ * cannot be allocated.
+ */
+static void frcti_nack_snd(struct frcti * frcti,
+                           uint32_t       seqno_unseen)
+{
+        struct ssm_pk_buff * spb;
+        struct frct_pci *    pci;
+
+        if (frct_ctrl_alloc(&spb, &pci, 0) >= 0) {
+                pci->flags = hton16(FRCT_NACK);
+                pci->seqno = hton32(seqno_unseen);
+
+                frct_hcs_set(pci, false);
+
+                frct_tx(frcti, spb);
+        }
+}
+
+enum frct_act {
+ FRCT_ACTIVE, /* rcv_inact_check: no inactivity action required */
+ FRCT_INACT_NEED_NACK, /* inactive, DATA without DRF, NACK cooldown elapsed */
+ FRCT_INACT_DROP, /* inactive, DATA without DRF, still inside NACK cooldown */
+};
+
+/*
+ * Receiver inactivity handling. Caller holds the write lock.
+ *
+ * While the receiver is not idle (act younger than inact) nothing is
+ * done. Once idle, a DRF packet from a new epoch rebases the receive
+ * window on its seqno; a DRF from the same epoch is accepted as is.
+ * Idle DATA without DRF asks for a NACK, rate-limited to one per srtt
+ * (or NACK_COOLDOWN_NS while srtt is unseeded); inside the cooldown
+ * FRCT_INACT_DROP is returned instead. Non-DATA packets are never
+ * affected.
+ */
+static enum frct_act rcv_inact_check(struct frcti * frcti,
+                                     uint16_t       flags,
+                                     uint32_t       seqno,
+                                     uint64_t       now_ns)
+{
+        struct frct_cr * rcv_cr = &frcti->rcv_cr;
+        uint64_t         cooldown;
+
+        if (!ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact))
+                return FRCT_ACTIVE;
+
+        if ((flags & FRCT_DRF) != 0) {
+                if (!same_epoch_drf(seqno, flags, rcv_cr)) {
+                        /* Bootstrap or fresh epoch: rebase. */
+                        STAT_BUMP(frcti, drf_rebase);
+                        release_rq(frcti);
+                        STORE_RELEASE(&rcv_cr->lwe, seqno);
+                        rcv_cr->rwe   = seqno + RQ_SIZE;
+                        rcv_cr->seqno = seqno;
+                }
+
+                return FRCT_ACTIVE;
+        }
+
+        if ((flags & FRCT_DATA) == 0)
+                return FRCT_ACTIVE;
+
+        /* Pre-DRF: nudge sender with NACK (rate-limited). */
+        cooldown = NACK_COOLDOWN_NS;
+        if (frcti->srtt > 0)
+                cooldown = (uint64_t) frcti->srtt;
+
+        if (!ts_aged_ns(now_ns, frcti->t_nack, cooldown))
+                return FRCT_INACT_DROP;
+
+        frcti->t_nack = now_ns;
+        STAT_BUMP(frcti, nack_snd);
+
+        return FRCT_INACT_NEED_NACK;
+}
+
+/*
+ * Admission check for placing `seqno` into receive slot `pos`; used in
+ * both modes. Caller holds the write lock.
+ *
+ * Rejects (and counts) a seqno at or beyond rwe, a seqno at or beyond
+ * lwe + RQ_SIZE, and a seqno whose slot is already occupied. An
+ * occupied slot is an in-window duplicate (RFC 2883 §4 case 2): it is
+ * remembered in dsack_seqno / dsack_valid for the next D-SACK.
+ * Returns true when the slot is free and the seqno is in bounds.
+ */
+__attribute__((hot))
+static bool rq_accept(struct frcti * frcti,
+                      uint32_t       seqno,
+                      size_t         pos,
+                      uint16_t       flags)
+{
+        struct frct_cr * rcv_cr = &frcti->rcv_cr;
+
+        if (!before(seqno, rcv_cr->rwe)) {
+                STAT_BUMP(frcti, out_rcv);
+                return false;
+        }
+
+        if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) {
+                STAT_BUMP(frcti, rqo_rcv);
+                return false;
+        }
+
+        if (frcti->rcv_slots[pos].idx == -1)
+                return true;
+
+        if (flags & FRCT_RXM)
+                STAT_BUMP(frcti, rxm_dup_rcv);
+        else
+                STAT_BUMP(frcti, dup_rcv);
+
+        frcti->dsack_seqno = seqno;
+        frcti->dsack_valid = true;
+
+        return false;
+}
+
+/*
+ * OOO arrival; throttle by min_gap + scoreboard dedup.
+ * Returns true and fills `out` (blocks, n, ack, rwe, dsack) when a
+ * SACK should be sent. Returns false when seqno is not after lwe, the
+ * ACK window (t_a) has aged out, the previous SACK was sent less than
+ * SACK_MIN_GAP_NS ago, or nothing changed since the previous SACK
+ * (same lwe and block count, and no D-SACK block pending).
+ */
+static bool sack_check(struct frcti * frcti,
+ uint32_t seqno,
+ uint64_t now_ns,
+ struct sack_args * out)
+{
+ struct frct_cr * rcv_cr = &frcti->rcv_cr;
+ uint64_t min_gap;
+ uint16_t n;
+
+ if (!after(seqno, rcv_cr->lwe))
+ return false;
+
+ STAT_BUMP(frcti, ooo_rcv);
+
+ /* SACK carries cum-ACK; bound by t_a like any other ACK. */
+ if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a))
+ return false;
+
+ /* srtt/8 gate starved recovery under burst loss; floor to save CPU. */
+ min_gap = (uint64_t) SACK_MIN_GAP_NS;
+
+ if (!ts_aged_ns(now_ns, frcti->t_snd_sack, min_gap))
+ return false;
+
+ out->dsack = false;
+ n = dsack_consume(frcti, out->blocks); /* a pending D-SACK, if any, takes block 0 */
+ if (n == 1)
+ out->dsack = true;
+ n += sack_blocks_build(frcti, out->blocks + n,
+ frcti->sack_n_max - n);
+
+ if (!out->dsack
+ && rcv_cr->lwe == frcti->sack_lwe && n == frcti->sack_n)
+ return false;
+
+ out->n = n;
+ out->ack = rcv_cr->lwe;
+ out->rwe = frcti_advert_rwe(frcti);
+ frcti->t_snd_sack = now_ns;
+ frcti->sack_lwe = rcv_cr->lwe; /* remembered for the dedup test above */
+ frcti->sack_n = n;
+
+ return true;
+}
+
+/*
+ * Wire duplicate of fresh DATA: a packet flagged FRCT_DATA but not
+ * FRCT_RXM whose seqno lies before `lwe`, i.e. was already ACKed.
+ */
+static __inline__ bool is_dup_data(uint16_t flags,
+                                   uint32_t seqno,
+                                   uint32_t lwe)
+{
+        const bool fresh_data = (flags & FRCT_DATA) != 0
+                             && (flags & FRCT_RXM) == 0;
+
+        return fresh_data && before(seqno, lwe);
+}
+
+/*
+ * Wire duplicate of a pure ACK packet (FRCT_ACK set, FRCT_DATA clear):
+ * true when `seqno` equals the one recorded from the previous ACK, in
+ * which case the caller drops the packet. A fresh ACK updates the
+ * recorded value and yields false. Packets of any other kind yield
+ * false and leave the recorded value alone.
+ */
+static __inline__ bool is_dup_ack(struct frcti * frcti,
+                                  uint16_t       flags,
+                                  uint32_t       seqno)
+{
+        if ((flags & FRCT_DATA) != 0 || (flags & FRCT_ACK) == 0)
+                return false;
+
+        if (seqno != frcti->rcv_cr.ackno) {
+                frcti->rcv_cr.ackno = seqno;
+                return false;
+        }
+
+        return true;
+}
+
+/*
+ * Pick a fresh random send seqno after sender inactivity. Caller holds
+ * the write lock.
+ *
+ * Only rotates when the sender has been idle for longer than inact and
+ * nothing is in flight (seqno == lwe): being idle on the wire is not
+ * the same as idle end-to-end, and in-flight rxm must not be orphaned.
+ * The new seqno is redrawn until it falls outside in_window(), then
+ * lwe, rwe (START_WINDOW), rtt_lwe and the recovery state are rebased
+ * on it.
+ */
+__attribute__((cold))
+static void seqno_rotate(struct frcti * frcti,
+                         uint64_t       now_ns)
+{
+        struct frct_cr * snd_cr = &frcti->snd_cr;
+
+        if (!ts_aged_ns(now_ns, snd_cr->act, snd_cr->inact)
+            || snd_cr->seqno != snd_cr->lwe)
+                return;
+
+        /* Avoid colliding with peer's current rcv window. */
+        for (;;) {
+                random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno));
+                if (!in_window(snd_cr->seqno, snd_cr))
+                        break;
+        }
+
+        STORE_RELEASE(&snd_cr->lwe, snd_cr->seqno);
+        STORE_RELAXED(&snd_cr->rwe, snd_cr->lwe + START_WINDOW);
+
+        frcti->rtt_lwe       = snd_cr->seqno;
+        frcti->in_recovery   = false;
+        frcti->recovery_high = snd_cr->seqno;
+}
+/*
+ * Push and fill the FRCT header of an outgoing data packet in `spb`.
+ * Returns 0 on success, or -ENOMEM when the header cannot be pushed or,
+ * with FRCTFRTX set, when the rxm entry cannot be allocated (the pushed
+ * header is popped again in that case). Header fields are filled under
+ * the write lock. After unlocking, an RTT probe is sent if
+ * rtt_probe_arm asked for one, and with FRCTFRTX the packet is armed
+ * for retransmission and a TLP is armed.
+ */
+__attribute__((hot))
+static int frcti_snd(struct frcti * frcti,
+ struct ssm_pk_buff * spb,
+ uint16_t flags)
+{
+ struct frct_pci * pci;
+ struct frct_pci_stream * spci = NULL;
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ struct rxm_entry * rxm = NULL;
+ uint32_t seqno;
+ uint16_t pci_flags = 0;
+ bool rtx;
+ uint64_t now_ns;
+ int64_t rcv_idle;
+ uint32_t probe_id = 0;
+ uint8_t probe_nonce[RTTP_NONCE_LEN] = { 0 };
+ bool probe;
+ size_t payload_len = 0;
assert(frcti);
- assert(ssm_pk_buff_len(spb) != 0);
+ /* Stream mode permits 0-byte sends for the EOS marker. */
+ assert(ssm_pk_buff_len(spb) != 0 || frcti->stream);
snd_cr = &frcti->snd_cr;
rcv_cr = &frcti->rcv_cr;
- timerwheel_move();
+ tw_move_safe();
+
+ if (frcti->stream)
+ payload_len = ssm_pk_buff_len(spb); /* taken before the header is pushed */
- pci = (struct frct_pci *) ssm_pk_buff_head_alloc(spb, FRCT_PCILEN);
+ pci = FRCT_HDR_PUSH(spb, frcti);
if (pci == NULL)
return -ENOMEM;
- memset(pci, 0, sizeof(*pci));
+ /* Pre-allocate rxm so alloc fail can't orphan a seqno. */
+ if (snd_cr->cflags & FRCTFRTX) {
+ rxm = rxm_alloc(frcti, ssm_pk_buff_len(spb));
+ if (rxm == NULL) {
+ ssm_pk_buff_pop(spb, frcti_data_hdr_len(frcti));
+ return -ENOMEM;
+ }
+ }
+
+ memset(pci, 0, FRCT_PCILEN);
+
+ if (frcti->stream)
+ spci = FRCT_SPCI(pci);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
pthread_rwlock_wrlock(&frcti->lock);
rtx = snd_cr->cflags & FRCTFRTX;
- pci->flags |= FRCT_DATA;
+ pci_flags |= FRCT_DATA;
+ if (!frcti->stream)
+ pci_flags |= (flags & FRCT_FR_MASK);
- /* Set DRF if there are no unacknowledged packets. */
- if (snd_cr->seqno == snd_cr->lwe)
- pci->flags |= FRCT_DRF;
+ if (!frcti->stream && (flags & FRCT_FR_MASK) != FRCT_FR_SOLE)
+ STAT_BUMP(frcti, frag_snd);
- /* Choose a new sequence number if sender inactivity expired. */
- if (now.tv_sec - snd_cr->act.tv_sec > snd_cr->inact) {
- /* There are no unacknowledged packets. */
- assert(snd_cr->seqno == snd_cr->lwe);
- random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno));
- snd_cr->lwe = snd_cr->seqno;
- snd_cr->rwe = snd_cr->lwe + START_WINDOW;
+ if (frcti->stream) {
+ if (flags & FRCT_FIN)
+ pci_flags |= FRCT_FIN;
+
+ spci->start = hton32(frcti->snd_byte_next);
+ frcti->snd_byte_next += (uint32_t) payload_len;
+ spci->end = hton32(frcti->snd_byte_next);
+ STAT_ADD(frcti, strm_snd_byte, payload_len);
}
+ if (snd_cr->seqno == snd_cr->lwe)
+ pci_flags |= FRCT_DRF; /* no unacknowledged packets; decided before seqno_rotate */
+
+ seqno_rotate(frcti, now_ns);
+
seqno = snd_cr->seqno;
pci->seqno = hton32(seqno);
- if (now.tv_sec - rcv_cr->act.tv_sec < rcv_cr->inact) {
- pci->flags |= FRCT_FC;
- *((uint32_t *) pci) |= hton32(rcv_cr->rwe & 0x00FFFFFF);
+ rcv_idle = ts_age_ns(now_ns, rcv_cr->act);
+
+ if (rcv_idle < (int64_t) rcv_cr->inact) {
+ pci_flags |= FRCT_FC;
+ pci->window = hton32(frcti_advert_rwe(frcti));
}
if (!rtx) {
- snd_cr->lwe++;
+ STORE_RELEASE(&snd_cr->lwe, snd_cr->lwe + 1);
+ STORE_RELEASE(&snd_cr->rwe, snd_cr->lwe + RQ_SIZE);
} else {
- if (!frcti->probe) {
- frcti->rttseq = snd_cr->seqno;
- frcti->t_probe = now;
- frcti->probe = true;
-#ifdef PROC_FLOW_STATS
- frcti->n_prb++;
-#endif
- }
- if ((now.tv_sec - rcv_cr->act.tv_sec) * BILLION <= frcti->a) {
- pci->flags |= FRCT_ACK;
+ size_t p = RQ_SLOT(seqno);
+ frcti->snd_slots[p].time = now_ns;
+ /* Fresh send clears RTX bits. */
+ frcti->snd_slots[p].flags = 0;
+ if (rcv_idle <= (int64_t) frcti->t_a) {
+ pci_flags |= FRCT_ACK;
pci->ackno = hton32(rcv_cr->lwe);
rcv_cr->seqno = rcv_cr->lwe;
}
}
+ pci->flags = hton16(pci_flags);
+
+ frct_hcs_set(pci, frcti->stream);
+
snd_cr->seqno++;
- snd_cr->act = now;
+ STORE_RELEASE(&snd_cr->act, now_ns);
+
+ probe = rtt_probe_arm(frcti, now_ns, &probe_id, probe_nonce);
pthread_rwlock_unlock(&frcti->lock);
- if (rtx)
- timerwheel_rxm(frcti, seqno, spb);
+ if (probe)
+ frcti_rttp_snd(frcti, probe_id, 0, probe_nonce);
+
+ if (rtx) {
+ assert(rxm != NULL);
+ rxm_arm(frcti, seqno, rxm, spb);
+ tlp_arm(frcti);
+ }
return 0;
}
-static void rtt_estimator(struct frcti * frcti,
- time_t mrtt)
+/*
+ * Stream FIN is armed for rxm; needs to be in window.
+ * True only in stream mode, when snd_cr.seqno is not before
+ * snd_cr.lwe + RQ_SIZE. Always false outside stream mode.
+ */
+static __inline__ bool stream_fin_blocked(struct frcti * frcti)
{
- time_t srtt = frcti->srtt;
- time_t rttvar = frcti->mdev;
+ if (!frcti->stream)
+ return false;
- if (srtt == 0) { /* first measurement */
- srtt = mrtt;
- rttvar = mrtt >> 1;
- } else {
- time_t delta = mrtt - srtt;
- srtt += (delta >> 3);
- delta = (ABS(delta) - rttvar) >> 2;
-#ifdef FRCT_LINUX_RTT_ESTIMATOR
- if (delta < 0)
- delta >>= 3;
-#endif
- rttvar += delta;
+ return !before(frcti->snd_cr.seqno, frcti->snd_cr.lwe + RQ_SIZE);
+}
+
+/*
+ * Stream: 0-byte FRCT_FIN DATA so peer's flow_read returns 0 at this
+ * byte. Msg: control packet with FRCT_FIN flag, snd_cr.seqno carried
+ * in pci->ackno (sender packs via frcti_pkt_snd's ackno parameter).
+ * Does nothing unless FRCTFLINGER is set, and sends at most once:
+ * later calls see snd_fin_sent and return.
+ * NOTE(review): snd_fin_sent is committed before the stream FIN is
+ * reserved and sent; if frct_spb_reserve, frcti_snd or frct_tx fails
+ * the flag stays set and the FIN is not retried - confirm intended.
+ */
+static void frcti_fin_snd(struct frcti * frcti)
+{
+ struct ssm_pk_buff * spb;
+ bool already;
+ uint32_t fin_seqno;
+
+ if (!(frcti->snd_cr.cflags & FRCTFLINGER))
+ return;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ already = frcti->snd_fin_sent;
+
+ /* Defer before committing snd_fin_sent; linger loop retries. */
+ if (!already && stream_fin_blocked(frcti)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
}
-#ifdef PROC_FLOW_STATS
- frcti->n_rtt++;
-#endif
- frcti->srtt = MAX(1000L, srtt);
- frcti->mdev = MAX(100L, rttvar);
- frcti->rto = MAX(RTO_MIN, frcti->srtt + (frcti->mdev << MDEV_MUL));
-}
-
-/* Always queues the next application packet on the RQ. */
-static void __frcti_rcv(struct frcti * frcti,
- struct ssm_pk_buff * spb)
-{
- ssize_t idx;
- size_t pos;
- struct frct_pci * pci;
- struct timespec now;
- struct frct_cr * rcv_cr;
- struct frct_cr * snd_cr;
- uint32_t seqno;
- uint32_t ackno;
- uint32_t rwe;
- int fd = -1;
- assert(frcti);
+ frcti->snd_fin_sent = true;
+ fin_seqno = frcti->snd_cr.seqno;
+
+ if (!already && !frcti->stream)
+ frcti->snd_fin_seqno = fin_seqno; /* msg mode: recorded before the FIN is sent */
pthread_rwlock_unlock(&frcti->lock);
+
+ if (already)
+ return;
+
+ if (!frcti->stream) {
+ frcti_pkt_snd(frcti, FRCT_FIN, fin_seqno, 0);
+ return;
+ }
+
+ if (frct_spb_reserve(frcti_data_hdr_len(frcti), &spb) < 0)
+ return;
+
+ /* Reset spb to 0-len so frcti_snd's head_alloc populates PCI. */
+ ssm_pk_buff_truncate(spb, 0);
+
+ if (frcti_snd(frcti, spb, FRCT_FIN) < 0) {
+ frct_spb_release(spb);
+ return;
+ }
+
+ if (frct_tx(frcti, spb) < 0)
+ return;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ frcti->snd_fin_seqno = frcti->snd_cr.seqno - 1; /* stream mode: recorded only after a successful tx */
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+/*
+ * Decides whether frcti_dealloc should still send a last ACK.
+ * Returns false if rcv_cr->lwe equals rcv_cr->seqno, false if
+ * ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a) holds, true otherwise.
+ * Presumably lwe == seqno means nothing is left to acknowledge --
+ * verify against the ACK send path.
+ */
+static bool final_ack_due(struct frcti * frcti,
+ struct frct_cr * rcv_cr,
+ uint64_t now_ns)
+{
+ if (rcv_cr->lwe == rcv_cr->seqno)
+ return false;
+
+ if (ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a))
+ return false;
+
+ return true;
+}
+
+/*
+ * Snd-side has FLINGER cflag and unACK'd data below the FIN/seqno.
+ * Returns false when FRCTFLINGER is not set in snd_cr->cflags;
+ * otherwise true while snd_cr->lwe is still before edge. The caller
+ * picks edge (FIN seqno or current send seqno).
+ */
+static __inline__ bool snd_drain_pending(struct frct_cr * snd_cr,
+ uint32_t edge)
+{
+ if (!(snd_cr->cflags & FRCTFLINGER))
+ return false;
+
+ return before(snd_cr->lwe, edge);
+}
+
+/*
+ * Peer is still active and we haven't seen their FIN yet.
+ * Returns false once frcti->rcv_fin_seen is set; otherwise the
+ * negation of ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact).
+ */
+static __inline__ bool rcv_drain_pending(struct frcti * frcti,
+ struct frct_cr * rcv_cr,
+ uint64_t now_ns)
+{
+ if (frcti->rcv_fin_seen)
+ return false;
+
+ return !ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact);
+}
+
+/*
+ * Drain-loop predicate: snd-side unACK'd data OR peer still active.
+ * Calls frcti_fin_snd first, then evaluates both conditions under the
+ * read lock. The snd edge is snd_fin_seqno once snd_fin_sent is set,
+ * otherwise the current snd_cr.seqno.
+ */
+static bool frcti_lingering(struct frcti * frcti)
+{
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ uint32_t edge;
+ uint64_t now_ns;
+ bool snd_linger;
+ bool rcv_linger;
+
+ /* Idempotent; emits FIN once per side, both stream and msg. */
+ frcti_fin_snd(frcti);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_rdlock(&frcti->lock);
+
+ snd_cr = &frcti->snd_cr;
rcv_cr = &frcti->rcv_cr;
+
+ if (frcti->snd_fin_sent)
+ edge = frcti->snd_fin_seqno;
+ else
+ edge = snd_cr->seqno;
+
+ snd_linger = snd_drain_pending(snd_cr, edge);
+ rcv_linger = rcv_drain_pending(frcti, rcv_cr, now_ns);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return snd_linger || rcv_linger;
+}
+/*
+ * Calls frcti_fin_snd, sends one FRCT_ACK for rcv_cr->lwe when
+ * final_ack_due says so, and returns the larger of the remaining
+ * (act + inact - now) spans of rcv_cr and snd_cr, divided by BILLION
+ * and clamped at 0 (presumably seconds, given the ns timestamps).
+ * NOTE(review): now_ns is int64_t here but is fed from TS_TO_UINT64
+ * and passed to final_ack_due's uint64_t parameter, and ackno is a
+ * plain int holding rcv_cr->lwe -- confirm the signedness is intended.
+ */
+static time_t frcti_dealloc(struct frcti * frcti)
+{
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ int ackno;
+ bool due;
+ int64_t now_ns;
+ int64_t rcv;
+ int64_t snd;
+
snd_cr = &frcti->snd_cr;
+ rcv_cr = &frcti->rcv_cr;
+
+ /* Idempotent; usually already sent by frcti_lingering. */
+ frcti_fin_snd(frcti);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
- pci = (struct frct_pci *) ssm_pk_buff_head_release(spb, FRCT_PCILEN);
+ pthread_rwlock_rdlock(&frcti->lock);
- idx = ssm_pk_buff_get_idx(spb);
- seqno = ntoh32(pci->seqno);
- pos = seqno & (RQ_SIZE - 1);
+ ackno = rcv_cr->lwe;
+ rcv = (int64_t)(rcv_cr->act + rcv_cr->inact) - now_ns;
+ snd = (int64_t)(snd_cr->act + snd_cr->inact) - now_ns;
+ due = final_ack_due(frcti, rcv_cr, now_ns);
- pthread_rwlock_wrlock(&frcti->lock);
+ pthread_rwlock_unlock(&frcti->lock);
- if (now.tv_sec - rcv_cr->act.tv_sec > rcv_cr->inact) {
- if (pci->flags & FRCT_DRF) { /* New run. */
- rcv_cr->lwe = seqno;
- rcv_cr->rwe = seqno + RQ_SIZE;
- rcv_cr->seqno = seqno;
- } else if (pci->flags & FRCT_DATA) {
- goto drop_packet;
- }
- }
+ if (due)
+ frcti_pkt_snd(frcti, FRCT_ACK, ackno, 0);
- rcv_cr->act = now;
+ return (time_t) MAX((MAX(rcv, snd) / BILLION), 0);
+}
- /* For now, just send an immediate window update. */
- if (pci->flags & FRCT_RDVS) {
- fd = frcti->fd;
- rwe = rcv_cr->rwe;
- pthread_rwlock_unlock(&frcti->lock);
+__attribute__((hot))
+static void frcti_rcv(struct frcti * frcti,
+ struct ssm_pk_buff * spb)
+{
+ ssize_t idx;
+ size_t pos;
+ struct frct_pci * pci;
+ struct timespec now;
+ uint64_t now_ns;
+ struct frct_cr * rcv_cr;
+ uint32_t seqno;
+ uint16_t flags;
+ buffer_t pkt;
+ struct pending pending = { 0 };
+ bool in_order;
+ struct sack_args * sa = NULL;
+ bool send_sack = false;
+ /*
+ * Receive path for one packet. Order of processing:
+ * 1. Packets shorter than FRCT_PCILEN are released and ignored.
+ * 2. KA, RTTP, NACK and RDVS are dispatched before frcti->lock
+ * is taken; a FIN without DATA takes the write lock only to
+ * record the FIN. All of these leave via ctrl_done.
+ * 3. Everything else runs under the write lock: inactivity
+ * check, duplicate filters, ACK/SACK/FC processing, then
+ * queueing of DATA.
+ * 4. After the unlock: SACK or delayed ACK, TLP arming,
+ * pending_flush and frcti_rcv_probe.
+ * drop_packet unlocks, releases spb and calls ack_snd.
+ */
+ assert(frcti);
- __send_frct_pkt(fd, FRCT_FC, 0, rwe);
+ rcv_cr = &frcti->rcv_cr;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
- ssm_pool_remove(proc.pool, idx);
+ if (ssm_pk_buff_len(spb) < FRCT_PCILEN) {
+ frct_spb_release(spb);
return;
}
- if (pci->flags & FRCT_ACK) {
- ackno = ntoh32(pci->ackno);
- if (after(ackno, frcti->snd_cr.lwe))
- frcti->snd_cr.lwe = ackno;
+ pci = FRCT_HDR_POP(spb, frct_pci);
- if (frcti->probe && after(ackno, frcti->rttseq)) {
-#ifdef PROC_FLOW_STATS
- if (!(pci->flags & FRCT_DATA))
- frcti->n_dak++;
-#endif
- rtt_estimator(frcti, ts_diff_ns(&now, &frcti->t_probe));
- frcti->probe = false;
- }
+ idx = ssm_pk_buff_get_off(spb);
+ seqno = ntoh32(pci->seqno);
+ pos = RQ_SLOT(seqno);
+
+ flags = ntoh16(pci->flags);
+
+ pkt.data = ssm_pk_buff_head(spb);
+ pkt.len = ssm_pk_buff_len(spb);
+
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_rcv);
+
+ /* Stateless / lock-free dispatches. spb released via ctrl_done. */
+ if (flags & FRCT_KA) {
+ frcti_ka_rcv(frcti, pci, now_ns, flags);
+ goto ctrl_done;
}
- if (pci->flags & FRCT_FC) {
- uint32_t rwe;
+ if (flags & FRCT_RTTP) {
+ frcti_rttp_rcv(frcti, pkt, now_ns);
+ goto ctrl_done;
+ }
- rwe = ntoh32(*((uint32_t *)pci) & hton32(0x00FFFFFF));
- rwe |= snd_cr->rwe & 0xFF000000;
+ if (flags & FRCT_NACK) {
+ frcti_nack_rcv(frcti);
+ goto ctrl_done;
+ }
+
+ if (flags & FRCT_RDVS) {
+ frcti_rdv_rcv(frcti);
+ goto ctrl_done;
+ }
- /* Rollover for 24 bit */
- if (before(rwe, snd_cr->rwe) && snd_cr->rwe - rwe > 0x007FFFFF)
- rwe += 0x01000000;
+ /*
+ * Msg-mode FIN: control packet, FIN seqno carried in pci->ackno.
+ * Only the first FIN is recorded; repeats leave rcv_byte_fin as is.
+ */
+ if ((flags & FRCT_FIN) && !(flags & FRCT_DATA)) {
+ pthread_rwlock_wrlock(&frcti->lock);
+ if (!frcti->rcv_fin_seen) {
+ frcti->rcv_fin_seen = true;
+ frcti->rcv_byte_fin = ntoh32(pci->ackno);
+ }
+ pthread_rwlock_unlock(&frcti->lock);
+ goto ctrl_done;
+ }
- snd_cr->rwe = rwe;
+ pthread_rwlock_wrlock(&frcti->lock);
- pthread_mutex_lock(&frcti->mtx);
- if (!frcti->open) {
- frcti->open = true;
- pthread_cond_broadcast(&frcti->cond);
+ /* rcv_inact_check is a no-op for non-DATA non-DRF packets. */
+ if (flags & (FRCT_DATA | FRCT_DRF)) {
+ switch (rcv_inact_check(frcti, flags, seqno, now_ns)) {
+ case FRCT_INACT_NEED_NACK:
+ pthread_rwlock_unlock(&frcti->lock);
+ frcti_nack_snd(frcti, seqno - 1);
+ frct_spb_release(spb);
+ return;
+ case FRCT_INACT_DROP:
+ STAT_BUMP(frcti, inact_drop);
+ goto drop_packet;
+ case FRCT_ACTIVE:
+ /* FALLTHRU */
+ default:
+ break;
}
- pthread_mutex_unlock(&frcti->mtx);
}
- if (!(pci->flags & FRCT_DATA))
+ /* DATA-only act refresh: non-DATA would lock out DRF rebase. */
+ if (flags & FRCT_DATA)
+ STORE_RELEASE(&rcv_cr->act, now_ns);
+
+ /* Wire-dup ACK packet: same seqno as the previous emission. */
+ if (is_dup_ack(frcti, flags, seqno)) {
+ STAT_BUMP(frcti, ack_dup_rcv);
+ goto drop_packet;
+ }
+
+ /* Wire-dup of DATA: piggybacked ACK info already processed. */
+ if (is_dup_data(flags, seqno, rcv_cr->lwe)) {
+ rcv_cr->seqno = seqno;
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 1: dup below cum-ACK. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
+ goto drop_packet;
+ }
+
+ if (flags & FRCT_ACK)
+ frcti_ack_rcv(frcti, pci, flags, now_ns, &pending);
+
+ if (flags & FRCT_SACK)
+ frcti_sack_rcv(frcti, pkt, ntoh32(pci->ackno),
+ now_ns, &pending);
+
+ if (flags & FRCT_FC)
+ frcti_fc_rcv(frcti, pci);
+
+ if (!(flags & FRCT_DATA))
goto drop_packet;
if (before(seqno, rcv_cr->lwe)) {
- rcv_cr->seqno = seqno; /* Ensures we send a new ACK. */
-#ifdef PROC_FLOW_STATS
- frcti->n_dup++;
-#endif
+ /* Bump rcv_cr.seqno to force ack_snd to fire on the dup. */
+ rcv_cr->seqno = seqno;
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_dup_rcv);
+ else
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 1: dup below cum-ACK. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
goto drop_packet;
}
- if (rcv_cr->cflags & FRCTFRTX) {
+ if (!rq_accept(frcti, seqno, pos, flags))
+ goto drop_packet;
- if (!before(seqno, rcv_cr->rwe)) { /* Out of window. */
-#ifdef PROC_FLOW_STATS
- frcti->n_out++;
-#endif
+ if (frcti->stream) {
+ if (frcti_stream_data_rcv(frcti, spb, pos, flags) < 0) {
+ STAT_BUMP(frcti, strm_drop);
goto drop_packet;
}
-
- if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) {
-#ifdef PROC_FLOW_STATS
- frcti->n_rqo++;
-#endif
- goto drop_packet; /* Out of rq. */
- }
- if (frcti->rq[pos] != -1) {
-#ifdef PROC_FLOW_STATS
- frcti->n_dup++;
-#endif
- goto drop_packet; /* Duplicate in rq. */
- }
- fd = frcti->fd;
+ /* spb consumed by stash; do not release in drop path. */
+ spb = NULL;
} else {
- rcv_cr->lwe = seqno;
+ frcti_data_stash(frcti, idx, pos, flags);
+ }
+
+ /* Lazy alloc: only OOO arrivals can trigger a SACK send. */
+ if (after(seqno, rcv_cr->lwe) && frcti->sack_n_max > 0) {
+ size_t sa_sz = sizeof(*sa)
+ + frcti->sack_n_max * sizeof(sa->blocks[0]);
+ sa = malloc(sa_sz);
+ /*
+ * If alloc fails, sa stays NULL: sack_check is not called
+ * below and no SACK is sent for this packet.
+ */
}
- frcti->rq[pos] = idx;
+ send_sack = sa != NULL && sack_check(frcti, seqno, now_ns, sa);
+ in_order = !after(seqno, rcv_cr->lwe);
pthread_rwlock_unlock(&frcti->lock);
- if (fd != -1)
- timerwheel_delayed_ack(fd, frcti);
+ if (send_sack) {
+ STAT_BUMP(frcti, sack_snd);
+ if (sa->dsack)
+ STAT_BUMP(frcti, dsack_snd);
+ frcti_sack_snd(frcti, sa);
+ } else if (in_order) {
+ ack_arm(frcti);
+ }
+ /*
+ * NOTE(review): snd_cr.seqno and snd_cr.lwe are read below after
+ * frcti->lock was released -- confirm a stale read is acceptable.
+ */
+ if ((flags & FRCT_ACK) && frcti->snd_cr.seqno != frcti->snd_cr.lwe)
+ tlp_arm(frcti);
+
+ pending_flush(frcti, &pending);
+
+ frcti_rcv_probe(frcti, now_ns);
+ free(sa);
+ return;
+
+ ctrl_done:
+ frct_spb_release(spb);
return;
drop_packet:
pthread_rwlock_unlock(&frcti->lock);
- ssm_pool_remove(proc.pool, idx);
- send_frct_pkt(frcti);
- return;
+ frct_spb_release(spb);
+ /* with_sack=true: ack_snd no-ops if neither dsack nor SACK is due. */
+ ack_snd(frcti, true);
+
+ pending_flush(frcti, &pending);
+ free(sa);
}
+
+/*
+ * NULL-shim macros for the no-FRCT case.
+ *
+ * Each macro tests frcti against NULL and yields a fixed default (or
+ * does nothing) instead of calling into FRCT. FRCTI_PAYLOAD_CAP is the
+ * exception: it dereferences frcti unconditionally, so it is only
+ * safe behind a NULL check, as in FRCTI_NEEDS_FRAG. The frcti argument
+ * is expanded more than once; pass an expression without side effects.
+ */
+
+#define FRCTI_SND(frcti, spb, flags) \
+ ((frcti) == NULL ? 0 : frcti_snd((frcti), (spb), (flags)))
+
+#define FRCTI_RCV(frcti, spb) \
+ do { \
+ if ((frcti) != NULL) \
+ frcti_rcv((frcti), (spb)); \
+ } while (0)
+
+#define FRCTI_PDU_READY(frcti) \
+ ((frcti) != NULL && frcti_pdu_ready(frcti))
+
+#define FRCTI_CONSUME(frcti, buf, count) \
+ ((frcti) == NULL ? (ssize_t) -EAGAIN \
+ : (frcti)->stream \
+ ? frcti_consume_stream((frcti), (buf), (count)) \
+ : frcti_consume((frcti), (buf), (count)))
+
+#define FRCTI_IS_FRTX(frcti) \
+ ((frcti) != NULL && ((frcti)->rcv_cr.cflags & FRCTFRTX))
+
+#define FRCTI_IS_STREAM(frcti) ((frcti) != NULL && (frcti)->stream)
+
+#define FRCTI_PAYLOAD_CAP(frcti) \
+ ((frcti)->frag_mtu - frcti_data_hdr_len(frcti))
+
+#define FRCTI_NEEDS_FRAG(frcti, count) \
+ ((frcti) != NULL && (count) > FRCTI_PAYLOAD_CAP(frcti))
+
+#define FRCTI_IS_WINDOW_OPEN(frcti) \
+ ((frcti) == NULL ? true : frcti_is_window_open(frcti))
+
+#define FRCTI_IS_WINDOW_OPEN_N(frcti, n) \
+ ((frcti) == NULL ? true : frcti_is_window_open_n((frcti), (n)))
+
+#define FRCTI_LINGERING(frcti) \
+ ((frcti) == NULL ? false : frcti_lingering(frcti))
+
+#define FRCTI_DEALLOC(frcti) \
+ ((frcti) == NULL ? (time_t) 0 : frcti_dealloc(frcti))
+
diff --git a/src/lib/hash.c b/src/lib/hash.c
index 7adee968..903474df 100644
--- a/src/lib/hash.c
+++ b/src/lib/hash.c
@@ -39,6 +39,9 @@
#include <ouroboros/md5.h>
#include <ouroboros/sha3.h>
#endif
+#include <ouroboros/crc8.h>
+#include <ouroboros/crc16.h>
+#include <ouroboros/crc64.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
@@ -69,6 +72,14 @@ int hash_len_tbl [] = {
uint16_t hash_len(enum hash_algo algo)
{
+ if (algo == HASH_CRC8)
+ return CRC8_HASH_LEN;
+
+ if (algo == HASH_CRC16)
+ return CRC16_HASH_LEN;
+
+ if (algo == HASH_CRC64)
+ return CRC64_HASH_LEN;
#ifdef HAVE_LIBGCRYPT
return (uint16_t) gcry_md_get_algo_dlen(gcry_algo_tbl[algo]);
#else
@@ -81,12 +92,36 @@ void mem_hash(enum hash_algo algo,
const uint8_t * buf,
size_t len)
{
-#ifdef HAVE_LIBGCRYPT
- gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len);
-#else
+#ifndef HAVE_LIBGCRYPT
struct sha3_ctx sha3_ctx;
struct md5_ctx md5_ctx;
+#endif
+ if (algo == HASH_CRC8) {
+ uint8_t crc = 0;
+
+ crc8_autosar(&crc, buf, len);
+ *(uint8_t *) dst = crc;
+ return;
+ }
+ if (algo == HASH_CRC16) {
+ uint16_t crc = 0;
+
+ crc16_ccitt_false(&crc, buf, len);
+ *(uint16_t *) dst = htobe16(crc);
+ return;
+ }
+
+ if (algo == HASH_CRC64) {
+ uint64_t crc = 0;
+
+ crc64_nvme(&crc, buf, len);
+ *(uint64_t *) dst = htobe64(crc);
+ return;
+ }
+#ifdef HAVE_LIBGCRYPT
+ gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len);
+#else
switch (algo) {
case HASH_CRC32:
memset(dst, 0, CRC32_HASH_LEN);
@@ -131,3 +166,14 @@ void str_hash(enum hash_algo algo,
{
return mem_hash(algo, dst, (const uint8_t *) str, strlen(str));
}
+/*
+ * Bit mixer for a 64-bit key: three xor-shifts by 33 interleaved with
+ * two multiplications. The shift and both constants are those of the
+ * MurmurHash3 fmix64 finalizer. Pure function of key; an input of 0
+ * yields 0.
+ */
+uint64_t hash_mix64(uint64_t key)
+{
+ key ^= key >> 33;
+ key *= 0xff51afd7ed558ccdULL;
+ key ^= key >> 33;
+ key *= 0xc4ceb9fe1a85ec53ULL;
+ key ^= key >> 33;
+
+ return key;
+}
diff --git a/src/lib/pb/ipcp.proto b/src/lib/pb/ipcp.proto
index 9dc402f5..afee4f91 100644
--- a/src/lib/pb/ipcp.proto
+++ b/src/lib/pb/ipcp.proto
@@ -39,6 +39,7 @@ enum ipcp_msg_code {
IPCP_CONNECT = 10;
IPCP_DISCONNECT = 11;
IPCP_REPLY = 12;
+ IPCP_FLOW_UPDATE = 13;
}
message ipcp_msg {
@@ -54,7 +55,7 @@ message ipcp_msg {
optional int32 response = 10;
optional string comp = 11;
optional uint32 timeo_sec = 12;
- optional sint32 mpl = 13;
+ optional sint32 mpl = 13; /* MPL in ms. */
optional int32 result = 14;
optional uint32 uid = 15; /* 0 = GSPP, >0 = PUP uid */
}
diff --git a/src/lib/pb/irm.proto b/src/lib/pb/irm.proto
index 9ed0a29b..f54bc9ea 100644
--- a/src/lib/pb/irm.proto
+++ b/src/lib/pb/irm.proto
@@ -53,6 +53,8 @@ enum irm_msg_code {
IPCP_FLOW_REQ_ARR = 25;
IPCP_FLOW_ALLOC_REPLY = 26;
IRM_REPLY = 27;
+ IRM_FLOW_UPDATE = 28;
+ IPCP_FLOW_UPDATE_ARR = 29;
}
message timespec_msg {
@@ -88,12 +90,15 @@ message irm_msg {
repeated ipcp_list_msg ipcps = 17;
repeated name_info_msg names = 18;
optional timespec_msg timeo = 19;
- optional sint32 mpl = 20;
+ optional sint32 mpl = 20; /* MPL in ms. */
optional string comp = 21;
optional bytes pk = 22; /* piggyback */
optional uint32 timeo_sec = 23;
optional uint32 timeo_nsec = 24;
optional sint32 result = 25;
- optional bytes sym_key = 26; /* symmetric encryption key */
- optional sint32 cipher_nid = 27; /* cipher NID */
+ optional bytes sym_key = 26; /* symmetric encryption key */
+ optional sint32 cipher_nid = 27; /* cipher NID */
+ optional uint32 generation = 28; /* re-key batch generation */
+ optional bool rekey = 29; /* re-key watermark trigger */
+ optional bool rk_initiator = 30; /* re-key proof-holder side */
}
diff --git a/src/lib/pb/model.proto b/src/lib/pb/model.proto
index f1382f3d..4c1564a5 100644
--- a/src/lib/pb/model.proto
+++ b/src/lib/pb/model.proto
@@ -28,7 +28,7 @@ message qosspec_msg {
required uint32 availability = 3; /* Class of 9s. */
required uint32 loss = 4; /* Packet loss. */
required uint32 ber = 5; /* Bit error rate, ppb. */
- required uint32 in_order = 6; /* In-order delivery. */
+ required uint32 service = 6; /* enum qos_service. */
required uint32 max_gap = 7; /* In ms. */
required uint32 timeout = 8; /* Timeout in ms. */
}
@@ -37,10 +37,11 @@ message flow_info_msg {
required uint32 id = 1;
required uint32 n_pid = 2;
required uint32 n_1_pid = 3;
- required uint32 mpl = 4;
+ required uint32 mpl = 4; /* MPL in ms. */
required uint32 state = 5;
required qosspec_msg qos = 6;
required uint32 uid = 7;
+ required uint32 mtu = 8; /* Layer MTU (bytes). */
}
message name_info_msg {
diff --git a/src/lib/protobuf.c b/src/lib/protobuf.c
index 28b3aab2..a824d357 100644
--- a/src/lib/protobuf.c
+++ b/src/lib/protobuf.c
@@ -81,6 +81,7 @@ flow_info_msg_t * flow_info_s_to_msg(const struct flow_info * s)
msg->mpl = s->mpl;
msg->state = s->state;
msg->uid = s->uid;
+ msg->mtu = s->mtu;
msg->qos = qos_spec_s_to_msg(&s->qs);
if (msg->qos == NULL)
goto fail_msg;
@@ -107,6 +108,7 @@ struct flow_info flow_info_msg_to_s(const flow_info_msg_t * msg)
s.mpl = msg->mpl;
s.state = msg->state;
s.uid = msg->uid;
+ s.mtu = msg->mtu;
s.qs = qos_spec_msg_to_s(msg->qos);
return s;
@@ -757,7 +759,7 @@ qosspec_msg_t * qos_spec_s_to_msg(const struct qos_spec * s)
msg->availability = s->availability;
msg->loss = s->loss;
msg->ber = s->ber;
- msg->in_order = s->in_order;
+ msg->service = s->service;
msg->max_gap = s->max_gap;
msg->timeout = s->timeout;
@@ -775,7 +777,7 @@ struct qos_spec qos_spec_msg_to_s(const qosspec_msg_t * msg)
s.availability = msg->availability;
s.loss = msg->loss;
s.ber = msg->ber;
- s.in_order = msg->in_order;
+ s.service = msg->service;
s.max_gap = msg->max_gap;
s.timeout = msg->timeout;
diff --git a/src/lib/qoscube.c b/src/lib/qoscube.c
index 1eaa0d7c..5d7ae17d 100644
--- a/src/lib/qoscube.c
+++ b/src/lib/qoscube.c
@@ -29,15 +29,11 @@
qoscube_t qos_spec_to_cube(qosspec_t qs)
{
- if (qs.delay <= qos_voice.delay &&
- qs.bandwidth <= qos_voice.bandwidth &&
- qs.availability >= qos_voice.availability &&
- qs.max_gap <= qos_voice.max_gap)
+ if (qs.delay <= 50 && qs.bandwidth <= 100000
+ && qs.availability >= 5 && qs.max_gap <= 50)
return QOS_CUBE_VOICE;
- else if (qs.delay <= qos_video.delay &&
- qs.bandwidth <= qos_video.bandwidth &&
- qs.availability >= qos_video.availability &&
- qs.max_gap <= qos_video.max_gap)
+ else if (qs.delay <= 100 && qs.availability >= 3
+ && qs.max_gap <= 100)
return QOS_CUBE_VIDEO;
else
return QOS_CUBE_BE;
diff --git a/src/lib/random.c b/src/lib/random.c
index 96315132..a132f470 100644
--- a/src/lib/random.c
+++ b/src/lib/random.c
@@ -28,6 +28,8 @@
#include <stdlib.h>
#elif defined(HAVE_SYS_RANDOM)
#include <sys/random.h>
+#include <errno.h>
+#include <stdint.h>
#elif defined(HAVE_LIBGCRYPT)
#include <gcrypt.h>
#elif defined(HAVE_OPENSSL_RNG)
@@ -42,13 +44,28 @@ int random_buffer(void * buf,
arc4random_buf(buf, len);
return 0;
#elif defined(HAVE_SYS_RANDOM)
- return getrandom(buf, len, GRND_NONBLOCK);
+ size_t off = 0;
+ ssize_t ret;
+
+ while (off < len) {
+ ret = getrandom((uint8_t *) buf + off, len - off,
+ GRND_NONBLOCK);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ return -1;
+ }
+ off += (size_t) ret;
+ }
+
+ return 0;
#elif defined(HAVE_LIBGCRYPT)
gcry_randomize(buf, len, GCRY_STRONG_RANDOM);
return 0;
#elif defined(HAVE_OPENSSL_RNG)
- if (len > 0 && len < INT_MAX)
- return RAND_bytes((unsigned char *) buf, (int) len);
- return -1;
+ if (len == 0 || len >= INT_MAX)
+ return -1;
+
+ return RAND_bytes((unsigned char *) buf, (int) len) == 1 ? 0 : -1;
#endif
}
diff --git a/src/lib/rib.c b/src/lib/rib.c
index a8d535c9..6e421397 100644
--- a/src/lib/rib.c
+++ b/src/lib/rib.c
@@ -112,14 +112,14 @@ static int rib_read(const char * path,
(void) info;
(void) offset;
- pthread_rwlock_wrlock(&rib.lock);
+ pthread_rwlock_rdlock(&rib.lock);
list_for_each(p, &rib.reg_comps) {
struct reg_comp * r = list_entry(p, struct reg_comp, next);
if (strcmp(comp, r->path) == 0) {
- int ret = r->ops->read(path + 1, buf, size);
+ struct rib_ops * ops = r->ops;
pthread_rwlock_unlock(&rib.lock);
- return ret;
+ return ops->read(path + 1, buf, size);
}
}
@@ -160,19 +160,25 @@ static int rib_readdir(const char * path,
ssize_t len;
ssize_t i;
struct reg_comp * c;
+ struct rib_ops * ops;
c = list_entry(p, struct reg_comp, next);
if (strcmp(path + 1, c->path) != 0)
continue;
- assert(c->ops->readdir != NULL);
+ ops = c->ops;
+
+ assert(ops->readdir != NULL);
+
+ pthread_rwlock_unlock(&rib.lock);
- len = c->ops->readdir(&dir_entries);
+ len = ops->readdir(&dir_entries);
if (len < 0)
- break;
+ return 0;
for (i = 0; i < len; ++i)
filler(buf, dir_entries[i], NULL, 0);
freepp(char, dir_entries, len);
+ return 0;
}
}
diff --git a/src/lib/serdes-irm.c b/src/lib/serdes-irm.c
index 65f2c02d..1d9b4dec 100644
--- a/src/lib/serdes-irm.c
+++ b/src/lib/serdes-irm.c
@@ -174,6 +174,54 @@ int flow__irm_result_des(buffer_t * buf,
else
memset(sk->key, 0, SYMMKEYSZ);
+ sk->epoch = msg->has_generation ? (uint8_t) msg->generation : 0;
+
+ if (msg->sym_key.len == SYMMKEYSZ)
+ crypt_secure_clear(msg->sym_key.data, msg->sym_key.len);
+
+ irm_msg__free_unpacked(msg, NULL);
+
+ return 0;
+ fail:
+ irm_msg__free_unpacked(msg, NULL);
+ fail_msg:
+ return err;
+}
+
+int flow_rekey__irm_result_des(buffer_t * buf,
+ struct crypt_sk * sk,
+ bool * has_key,
+ bool * initiator)
+{
+ irm_msg_t * msg;
+ int err;
+
+ msg = irm_msg__unpack(NULL, buf->len, buf->data);
+ if (msg == NULL) {
+ err = -EIRMD;
+ goto fail_msg;
+ }
+
+ if (!msg->has_result) {
+ err = -EIRMD;
+ goto fail;
+ }
+
+ if (msg->result < 0) {
+ err = msg->result;
+ goto fail;
+ }
+
+ *has_key = msg->has_sym_key && msg->sym_key.len == SYMMKEYSZ;
+ if (*has_key) {
+ memcpy(sk->key, msg->sym_key.data, SYMMKEYSZ);
+ sk->nid = NID_undef;
+ sk->epoch = msg->has_generation ?
+ (uint8_t) msg->generation : 0;
+ *initiator = msg->has_rk_initiator && msg->rk_initiator;
+ crypt_secure_clear(msg->sym_key.data, msg->sym_key.len);
+ }
+
irm_msg__free_unpacked(msg, NULL);
return 0;
@@ -222,6 +270,44 @@ int flow_dealloc__irm_req_ser(buffer_t * buf,
return -ENOMEM;
}
+int flow_update__irm_req_ser(buffer_t * buf,
+ const struct flow_info * flow,
+ bool rekey)
+{
+ irm_msg_t * msg;
+ size_t len;
+ /*
+ * Serializes an IRM_FLOW_UPDATE request carrying the flow info and
+ * the rekey flag into buf. On entry buf->len is the capacity of
+ * buf->data; on success it is set to the packed size and 0 is
+ * returned. Every failure returns -ENOMEM, including a packed size
+ * of 0 or one larger than buf->len.
+ */
+ msg = malloc(sizeof(*msg));
+ if (msg == NULL)
+ goto fail_malloc;
+
+ irm_msg__init(msg);
+
+ msg->code = IRM_MSG_CODE__IRM_FLOW_UPDATE;
+ msg->flow_info = flow_info_s_to_msg(flow);
+ if (msg->flow_info == NULL)
+ goto fail_msg;
+
+ msg->has_rekey = true;
+ msg->rekey = rekey;
+
+ len = irm_msg__get_packed_size(msg);
+ if (len == 0 || len > buf->len)
+ goto fail_msg;
+
+ buf->len = len;
+
+ irm_msg__pack(msg, buf->data);
+ irm_msg__free_unpacked(msg, NULL);
+
+ return 0;
+
+ fail_msg:
+ irm_msg__free_unpacked(msg, NULL);
+ fail_malloc:
+ return -ENOMEM;
+}
+
int ipcp_flow_dealloc__irm_req_ser(buffer_t * buf,
const struct flow_info * flow)
{
@@ -398,6 +484,56 @@ int ipcp_flow_req_arr__irm_req_ser(buffer_t * buf,
return 0;
fail_msg:
+ /* hash/pk are borrowed from the caller; detach before free. */
+ msg->hash.len = 0;
+ msg->hash.data = NULL;
+ msg->pk.len = 0;
+ msg->pk.data = NULL;
+ irm_msg__free_unpacked(msg, NULL);
+ fail_malloc:
+ return -ENOMEM;
+}
+
+int ipcp_flow_update_arr__irm_req_ser(buffer_t * buf,
+ const struct flow_info * flow,
+ const buffer_t * data)
+{
+ irm_msg_t * msg;
+ size_t len;
+
+ msg = malloc(sizeof(*msg));
+ if (msg == NULL)
+ goto fail_malloc;
+
+ irm_msg__init(msg);
+
+ msg->code = IRM_MSG_CODE__IPCP_FLOW_UPDATE_ARR;
+ msg->flow_info = flow_info_s_to_msg(flow);
+ if (msg->flow_info == NULL)
+ goto fail_msg;
+
+ msg->has_pk = true;
+ msg->pk.len = data->len;
+ msg->pk.data = data->data;
+
+ len = irm_msg__get_packed_size(msg);
+ if (len == 0 || len > buf->len)
+ goto fail_msg;
+
+ buf->len = len;
+
+ irm_msg__pack(msg, buf->data);
+
+ /* Don't free data! */
+ msg->pk.len = 0;
+ msg->pk.data = NULL;
+ irm_msg__free_unpacked(msg, NULL);
+
+ return 0;
+ fail_msg:
+ /* pk.data is borrowed from the caller; detach before free. */
+ msg->pk.len = 0;
+ msg->pk.data = NULL;
irm_msg__free_unpacked(msg, NULL);
fail_malloc:
return -ENOMEM;
diff --git a/src/lib/ssm/flow_set.c b/src/lib/ssm/flow_set.c
index 73d0db55..2e33b408 100644
--- a/src/lib/ssm/flow_set.c
+++ b/src/lib/ssm/flow_set.c
@@ -58,9 +58,9 @@
#define QUEUESIZE ((SSM_RBUFF_SIZE) * sizeof(struct flowevent))
#define SSM_FSET_FILE_SIZE (SYS_MAX_FLOWS * sizeof(ssize_t) \
- + PROG_MAX_FQUEUES * sizeof(size_t) \
- + PROG_MAX_FQUEUES * sizeof(pthread_cond_t) \
- + PROG_MAX_FQUEUES * QUEUESIZE \
+ + PROC_MAX_FQUEUES * sizeof(size_t) \
+ + PROC_MAX_FQUEUES * sizeof(pthread_cond_t) \
+ + PROC_MAX_FQUEUES * QUEUESIZE \
+ sizeof(pthread_mutex_t))
#define fqueue_ptr(fs, idx) (fs->fqueues + (SSM_RBUFF_SIZE) * idx)
@@ -104,10 +104,10 @@ static struct ssm_flow_set * flow_set_create(pid_t pid,
set->mtable = shm_base;
set->heads = (size_t *) (set->mtable + SYS_MAX_FLOWS);
- set->conds = (pthread_cond_t *)(set->heads + PROG_MAX_FQUEUES);
- set->fqueues = (struct flowevent *) (set->conds + PROG_MAX_FQUEUES);
+ set->conds = (pthread_cond_t *)(set->heads + PROC_MAX_FQUEUES);
+ set->fqueues = (struct flowevent *) (set->conds + PROC_MAX_FQUEUES);
set->lock = (pthread_mutex_t *)
- (set->fqueues + PROG_MAX_FQUEUES * (SSM_RBUFF_SIZE));
+ (set->fqueues + PROC_MAX_FQUEUES * (SSM_RBUFF_SIZE));
return set;
@@ -164,7 +164,7 @@ struct ssm_flow_set * ssm_flow_set_create(pid_t pid)
if (pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK))
goto fail_condattr_set;
#endif
- for (i = 0; i < PROG_MAX_FQUEUES; ++i) {
+ for (i = 0; i < PROC_MAX_FQUEUES; ++i) {
set->heads[i] = 0;
if (pthread_cond_init(&set->conds[i], &cattr))
goto fail_init;
@@ -222,7 +222,7 @@ void ssm_flow_set_zero(struct ssm_flow_set * set,
ssize_t i = 0;
assert(set);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -242,7 +242,7 @@ int ssm_flow_set_add(struct ssm_flow_set * set,
{
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -264,7 +264,7 @@ void ssm_flow_set_del(struct ssm_flow_set * set,
{
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -282,7 +282,7 @@ int ssm_flow_set_has(struct ssm_flow_set * set,
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -299,26 +299,34 @@ void ssm_flow_set_notify(struct ssm_flow_set * set,
int event)
{
struct flowevent * e;
+ ssize_t idx;
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
pthread_mutex_lock(set->lock);
- if (set->mtable[flow_id] == -1) {
+ idx = set->mtable[flow_id];
+ if (idx == -1) {
pthread_mutex_unlock(set->lock);
return;
}
- e = fqueue_ptr(set, set->mtable[flow_id]) +
- set->heads[set->mtable[flow_id]];
+ /* Ring full: drop redundant FLOW_PKT, reserve a slot for ctrl. */
+ if (set->heads[idx] >= SSM_RBUFF_SIZE
+ || (event == FLOW_PKT && set->heads[idx] >= SSM_RBUFF_SIZE - 1)) {
+ pthread_mutex_unlock(set->lock);
+ return;
+ }
+
+ e = fqueue_ptr(set, idx) + set->heads[idx];
e->flow_id = flow_id;
e->event = event;
- ++set->heads[set->mtable[flow_id]];
+ ++set->heads[idx];
- pthread_cond_signal(&set->conds[set->mtable[flow_id]]);
+ pthread_cond_signal(&set->conds[idx]);
pthread_mutex_unlock(set->lock);
}
@@ -332,7 +340,7 @@ ssize_t ssm_flow_set_wait(const struct ssm_flow_set * set,
ssize_t ret = 0;
assert(set);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
assert(fqueue);
#ifndef HAVE_ROBUST_MUTEX
diff --git a/src/lib/ssm/pool.c b/src/lib/ssm/pool.c
index f17a6e65..705de147 100644
--- a/src/lib/ssm/pool.c
+++ b/src/lib/ssm/pool.c
@@ -24,6 +24,7 @@
#include "config.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/errno.h>
#include <ouroboros/pthread.h>
#include <ouroboros/ssm_pool.h>
@@ -37,10 +38,20 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <time.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
+static __inline__ uint64_t pool_now_ns(void)
+{
+ struct timespec ts;
+ /* CLOCK_MONOTONIC reading; the clock_gettime result is not checked. */
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ /* Flatten to nanoseconds: tv_sec * 1e9 + tv_nsec. */
+ return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+}
+
/* Global Shared Packet Pool (GSPP) configuration */
static const struct ssm_size_class_cfg ssm_gspp_cfg[SSM_POOL_MAX_CLASSES] = {
{ (1 << 8), SSM_GSPP_256_BLOCKS },
@@ -75,26 +86,6 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = {
#define GET_SHARD_FOR_PID(pid) ((int)((pid) % SSM_POOL_SHARDS))
-#define LOAD_RELAXED(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_RELAXED))
-
-#define LOAD_ACQUIRE(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_ACQUIRE))
-
-#define STORE_RELEASE(ptr, val) \
- (__atomic_store_n(ptr, val, __ATOMIC_RELEASE))
-
-#define LOAD(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_SEQ_CST))
-
-#define STORE(ptr, val) \
- (__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST))
-
-#define FETCH_ADD(ptr, val) \
- (__atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST))
-
-#define FETCH_SUB(ptr, val) \
- (__atomic_fetch_sub(ptr, val, __ATOMIC_SEQ_CST))
#define SSM_FILE_SIZE (SSM_POOL_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr))
#define SSM_GSPP_FILE_SIZE (SSM_GSPP_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr))
@@ -165,29 +156,6 @@ static __inline__ void list_add_head(struct _ssm_list_head * head,
STORE(&head->count, LOAD(&head->count) + 1);
}
-static __inline__ int select_size_class(struct ssm_pool * pool,
- size_t len)
-{
- size_t sz;
- int i;
-
- assert(pool != NULL);
-
- /* Total space needed: header + headspace + data + tailspace */
- sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_HEADSPACE + len
- + SSM_PK_BUFF_TAILSPACE;
-
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- struct _ssm_size_class * sc;
-
- sc = &pool->hdr->size_classes[i];
- if (sc->object_size > 0 && sz <= sc->object_size)
- return i;
- }
-
- return -1;
-}
-
static __inline__ int find_size_class_for_offset(struct ssm_pool * pool,
size_t offset)
{
@@ -278,6 +246,7 @@ static void init_size_classes(struct ssm_pool * pool)
STORE(&blk->refcount, 0);
blk->allocator_pid = 0;
+ blk->alloc_ts = 0;
STORE(&blk->next_offset, 0);
list_add_head(&sc->shards[0].free_list, blk,
@@ -308,19 +277,31 @@ static size_t reclaim_pid_from_sc(struct _ssm_size_class * sc,
size_t i;
size_t recovered = 0;
struct ssm_pk_buff * blk;
+ uint64_t now;
+ uint64_t min_age_ns;
- region = (uint8_t *) pool_base + sc->pool_start;
+ region = (uint8_t *) pool_base + sc->pool_start;
+ now = pool_now_ns();
+ min_age_ns = (uint64_t) SSM_POOL_RECLAIM_AGE_S * 1000000000ULL;
for (i = 0; i < sc->object_count; ++i) {
blk = (struct ssm_pk_buff *)(region + i * sc->object_size);
- if (blk->allocator_pid == pid && LOAD(&blk->refcount) > 0) {
- STORE(&blk->refcount, 0);
- blk->allocator_pid = 0;
- list_add_head(&shard->free_list, blk, pool_base);
- FETCH_ADD(&shard->free_count, 1);
- recovered++;
- }
+ if (blk->allocator_pid != pid)
+ continue;
+
+ if (LOAD(&blk->refcount) == 0)
+ continue;
+
+ /* Recent: a live consumer may still hold the handoff. */
+ if (now - blk->alloc_ts < min_age_ns)
+ continue;
+
+ STORE(&blk->refcount, 0);
+ blk->allocator_pid = 0;
+ list_add_head(&shard->free_list, blk, pool_base);
+ FETCH_ADD(&shard->free_count, 1);
+ recovered++;
}
return recovered;
@@ -381,6 +362,7 @@ static __inline__ ssize_t init_block(struct ssm_pool * pool,
{
STORE(&blk->refcount, 1);
blk->allocator_pid = getpid();
+ blk->alloc_ts = pool_now_ns();
blk->size = (uint32_t) (sc->object_size -
sizeof(struct ssm_pk_buff));
blk->pk_head = SSM_PK_BUFF_HEADSPACE;
@@ -702,7 +684,7 @@ ssize_t ssm_pool_alloc(struct ssm_pool * pool,
assert(pool != NULL);
assert(spb != NULL);
- idx = select_size_class(pool, count);
+ idx = select_size_class(pool->hdr, count);
if (idx >= 0)
return alloc_from_sc(pool, idx, count, ptr, spb);
@@ -720,7 +702,7 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
assert(pool != NULL);
assert(spb != NULL);
- idx = select_size_class(pool, count);
+ idx = select_size_class(pool->hdr, count);
if (idx >= 0)
return alloc_from_sc_b(pool, idx, count, ptr, spb, abstime);
@@ -746,7 +728,7 @@ ssize_t ssm_pool_read(uint8_t ** dst,
}
struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool,
- size_t off)
+ size_t off)
{
struct ssm_pk_buff * blk;
@@ -825,36 +807,36 @@ int ssm_pool_remove(struct ssm_pool * pool,
return 0;
}
-size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb)
+size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
return spb->off;
}
-uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb)
+uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
- return spb->data + spb->pk_head;
+ return (uint8_t *) spb->data + spb->pk_head;
}
-uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb)
+uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
- return spb->data + spb->pk_tail;
+ return (uint8_t *) spb->data + spb->pk_tail;
}
-size_t ssm_pk_buff_len(struct ssm_pk_buff * spb)
+size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
return spb->pk_tail - spb->pk_head;
}
-uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb,
+ size_t size)
{
assert(spb != NULL);
@@ -866,8 +848,8 @@ uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
return spb->data + spb->pk_head;
}
-uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb,
+ size_t size)
{
uint8_t * buf;
@@ -883,8 +865,8 @@ uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
return buf;
}
-uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb,
+ size_t size)
{
uint8_t * buf;
@@ -898,8 +880,8 @@ uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
return buf;
}
-uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb,
+ size_t size)
{
assert(spb != NULL);
assert(!(size > spb->pk_tail - spb->pk_head));
diff --git a/src/lib/ssm/rbuff.c b/src/lib/ssm/rbuff.c
index e4558c31..0121af89 100644
--- a/src/lib/ssm/rbuff.c
+++ b/src/lib/ssm/rbuff.c
@@ -74,12 +74,13 @@ struct ssm_rbuff {
ssize_t * shm_base; /* start of shared memory */
size_t * head; /* start of ringbuffer */
size_t * tail;
- size_t * acl; /* access control */
+ size_t * flags; /* out-of-band flags (RB_*) */
pthread_mutex_t * mtx; /* lock for cond vars only */
pthread_cond_t * add; /* signal when new data */
pthread_cond_t * del; /* signal when data removed */
pid_t pid; /* pid of the owner */
int flow_id; /* flow_id of the flow */
+ size_t n_users; /* in-flight users */
};
#define MM_FLAGS (PROT_READ | PROT_WRITE)
@@ -113,12 +114,13 @@ static struct ssm_rbuff * rbuff_create(pid_t pid,
rb->shm_base = shm_base;
rb->head = (size_t *) (rb->shm_base + (SSM_RBUFF_SIZE));
rb->tail = (size_t *) (rb->head + 1);
- rb->acl = (size_t *) (rb->tail + 1);
- rb->mtx = (pthread_mutex_t *) (rb->acl + 1);
+ rb->flags = (size_t *) (rb->tail + 1);
+ rb->mtx = (pthread_mutex_t *) (rb->flags + 1);
rb->add = (pthread_cond_t *) (rb->mtx + 1);
rb->del = rb->add + 1;
rb->pid = pid;
rb->flow_id = flow_id;
+ rb->n_users = 0;
return rb;
@@ -179,7 +181,7 @@ struct ssm_rbuff * ssm_rbuff_create(pid_t pid,
if (pthread_cond_init(rb->del, &cattr))
goto fail_del;
- *rb->acl = ACL_RDWR;
+ *rb->flags = RB_RDWR;
*rb->head = 0;
*rb->tail = 0;
@@ -228,27 +230,38 @@ void ssm_rbuff_close(struct ssm_rbuff * rb)
{
assert(rb);
+ /*
+ * Caller must set RB_FLOWDOWN first; if a user becomes
+ * cancellable, push a cleanup that decrements n_users.
+ */
+ while (__atomic_load_n(&rb->n_users, __ATOMIC_SEQ_CST) > 0) {
+ struct timespec tic = { 0, 100000 };
+ nanosleep(&tic, NULL);
+ }
+
rbuff_destroy(rb);
}
int ssm_rbuff_write(struct ssm_rbuff * rb,
- size_t idx)
+ size_t off)
{
- size_t acl;
+ size_t flags;
bool was_empty;
int ret = 0;
assert(rb != NULL);
- acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
- if (acl != ACL_RDWR) {
- if (acl & ACL_FLOWDOWN) {
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
+ flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST);
+ if (flags != RB_RDWR) {
+ if (flags & RB_FLOWDOWN) {
ret = -EFLOWDOWN;
- goto fail_acl;
+ goto fail_flags;
}
- if (acl & ACL_RDONLY) {
+ if (!(flags & RB_WR)) {
ret = -ENOTALLOC;
- goto fail_acl;
+ goto fail_flags;
}
}
@@ -261,7 +274,7 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
was_empty = IS_EMPTY(rb);
- HEAD(rb) = (ssize_t) idx;
+ HEAD(rb) = (ssize_t) off;
ADVANCE_HEAD(rb);
if (was_empty)
@@ -269,33 +282,37 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
pthread_mutex_unlock(rb->mtx);
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return 0;
fail_mutex:
pthread_mutex_unlock(rb->mtx);
- fail_acl:
+ fail_flags:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
int ssm_rbuff_write_b(struct ssm_rbuff * rb,
- size_t idx,
+ size_t off,
const struct timespec * abstime)
{
- size_t acl;
+ size_t flags;
int ret = 0;
bool was_empty;
assert(rb != NULL);
- acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
- if (acl != ACL_RDWR) {
- if (acl & ACL_FLOWDOWN) {
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
+ flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST);
+ if (flags != RB_RDWR) {
+ if (flags & RB_FLOWDOWN) {
ret = -EFLOWDOWN;
- goto fail_acl;
+ goto fail_flags;
}
- if (acl & ACL_RDONLY) {
+ if (!(flags & RB_WR)) {
ret = -ENOTALLOC;
- goto fail_acl;
+ goto fail_flags;
}
}
@@ -304,8 +321,8 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx);
while (IS_FULL(rb) && ret != -ETIMEDOUT) {
- acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
- if (acl & ACL_FLOWDOWN) {
+ flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST);
+ if (flags & RB_FLOWDOWN) {
ret = -EFLOWDOWN;
break;
}
@@ -316,7 +333,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
if (ret != -ETIMEDOUT && ret != -EFLOWDOWN) {
was_empty = IS_EMPTY(rb);
- HEAD(rb) = (ssize_t) idx;
+ HEAD(rb) = (ssize_t) off;
ADVANCE_HEAD(rb);
if (was_empty)
pthread_cond_broadcast(rb->add);
@@ -324,24 +341,28 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
pthread_mutex_unlock(rb->mtx);
- fail_acl:
+ fail_flags:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
-static int check_rb_acl(struct ssm_rbuff * rb)
+static int check_rb_flags(struct ssm_rbuff * rb)
{
- size_t acl;
+ size_t flags;
assert(rb != NULL);
- acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
+ flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST);
- if (acl & ACL_FLOWDOWN)
+ if (flags & RB_FLOWDOWN)
return -EFLOWDOWN;
- if (acl & ACL_FLOWPEER)
+ if (flags & RB_FLOWPEER)
return -EFLOWPEER;
+ if (!(flags & RB_RD))
+ return -ENOTALLOC;
+
return -EAGAIN;
}
@@ -351,11 +372,21 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb)
assert(rb != NULL);
- if (IS_EMPTY(rb))
- return check_rb_acl(rb);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
+ if (IS_EMPTY(rb)) {
+ ret = check_rb_flags(rb);
+ goto out;
+ }
robust_mutex_lock(rb->mtx);
+ if (IS_EMPTY(rb)) {
+ pthread_mutex_unlock(rb->mtx);
+ ret = check_rb_flags(rb);
+ goto out;
+ }
+
ret = TAIL(rb);
ADVANCE_TAIL(rb);
@@ -363,6 +394,8 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb)
pthread_mutex_unlock(rb->mtx);
+ out:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
@@ -370,13 +403,17 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
const struct timespec * abstime)
{
ssize_t idx = -1;
- size_t acl;
+ size_t flags;
assert(rb != NULL);
- acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
- if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN))
- return -EFLOWDOWN;
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
+ flags = __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST);
+ if (IS_EMPTY(rb) && (flags & RB_FLOWDOWN)) {
+ idx = -EFLOWDOWN;
+ goto out;
+ }
robust_mutex_lock(rb->mtx);
@@ -384,7 +421,7 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
while (IS_EMPTY(rb) &&
idx != -ETIMEDOUT &&
- check_rb_acl(rb) == -EAGAIN) {
+ check_rb_flags(rb) == -EAGAIN) {
idx = -robust_wait(rb->add, rb->mtx, abstime);
}
@@ -395,35 +432,55 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
ADVANCE_TAIL(rb);
pthread_cond_broadcast(rb->del);
} else if (idx != -ETIMEDOUT) {
- idx = check_rb_acl(rb);
+ idx = check_rb_flags(rb);
}
pthread_mutex_unlock(rb->mtx);
assert(idx != -EAGAIN);
+ out:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return idx;
}
-void ssm_rbuff_set_acl(struct ssm_rbuff * rb,
- uint32_t flags)
+void ssm_rbuff_set_bits(struct ssm_rbuff * rb,
+ uint32_t bits)
{
assert(rb != NULL);
- __atomic_store_n(rb->acl, (size_t) flags, __ATOMIC_SEQ_CST);
+ robust_mutex_lock(rb->mtx);
+ __atomic_fetch_or(rb->flags, (size_t) bits, __ATOMIC_SEQ_CST);
+ pthread_cond_broadcast(rb->add);
+ pthread_cond_broadcast(rb->del);
+ pthread_mutex_unlock(rb->mtx);
+}
+
+/*
+ * Clear the given RB_* bits in the ring buffer's flags word.
+ *
+ * The atomic AND is performed while holding rb->mtx, and both the add
+ * and del condition variables are broadcast before the mutex is
+ * released, so threads blocked on either one are woken up.
+ */
+void ssm_rbuff_clr_bits(struct ssm_rbuff * rb,
+ uint32_t bits)
+{
+ assert(rb != NULL);
+
+ robust_mutex_lock(rb->mtx);
+ /* Widen before inverting so the bits above 32 stay set in the mask. */
+ __atomic_fetch_and(rb->flags, ~(size_t) bits, __ATOMIC_SEQ_CST);
+ pthread_cond_broadcast(rb->add);
+ pthread_cond_broadcast(rb->del);
+ pthread_mutex_unlock(rb->mtx);
}
-uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb)
+uint32_t ssm_rbuff_get_flags(struct ssm_rbuff * rb)
{
assert(rb != NULL);
- return (uint32_t) __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
+ return (uint32_t) __atomic_load_n(rb->flags, __ATOMIC_SEQ_CST);
}
void ssm_rbuff_fini(struct ssm_rbuff * rb)
{
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
robust_mutex_lock(rb->mtx);
pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx);
@@ -432,6 +489,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb)
robust_wait(rb->del, rb->mtx, NULL);
pthread_cleanup_pop(true);
+
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
}
size_t ssm_rbuff_queued(struct ssm_rbuff * rb)
diff --git a/src/lib/ssm/ssm.h.in b/src/lib/ssm/ssm.h.in
index b9246c8b..57febae4 100644
--- a/src/lib/ssm/ssm.h.in
+++ b/src/lib/ssm/ssm.h.in
@@ -38,7 +38,6 @@
#define SSM_RBUFF_PREFIX "@SSM_RBUFF_PREFIX@"
#define SSM_FLOW_SET_PREFIX "@SSM_FLOW_SET_PREFIX@"
#define SSM_POOL_NAME "@SSM_POOL_NAME@"
-#define SSM_POOL_BLOCKS @SSM_POOL_BLOCKS@
#define SSM_RBUFF_SIZE @SSM_RBUFF_SIZE@
/* Packet buffer space reservation */
@@ -84,6 +83,7 @@
/* Size class configuration */
#define SSM_POOL_MAX_CLASSES 9
#define SSM_POOL_SHARDS @SSM_POOL_SHARDS@
+#define SSM_POOL_RECLAIM_AGE_S @SSM_POOL_RECLAIM_AGE_S@
/* Internal structures - exposed for testing */
#ifdef __cplusplus
@@ -126,6 +126,7 @@ struct ssm_pk_buff {
uint32_t pk_head; /* Head offset into data */
uint32_t pk_tail; /* Tail offset into data */
uint32_t off; /* Block offset in pool */
+ uint64_t alloc_ts; /* CLOCK_MONOTONIC ns at alloc */
uint8_t data[]; /* Packet data */
};
@@ -164,6 +165,24 @@ struct _ssm_pool_hdr {
struct _ssm_size_class size_classes[SSM_POOL_MAX_CLASSES];
};
+#define SSM_PK_BUFF_TOTALSPACE (SSM_PK_BUFF_HEADSPACE + SSM_PK_BUFF_TAILSPACE)
+/*
+ * Select the size class for a packet of len bytes.
+ *
+ * hdr: pool header holding the size class table.
+ * len: requested packet length, excluding the block header and the
+ *      reserved head/tail space.
+ *
+ * Returns the index of the first class (lowest index) whose object_size
+ * can hold the block header, SSM_PK_BUFF_TOTALSPACE and len bytes, or
+ * -1 if no class fits.
+ *
+ * The fit is tested by subtracting the fixed overhead from object_size
+ * rather than adding it to len: the sum could wrap around size_t for a
+ * very large len and then match a class that is far too small.
+ */
+static __inline__ int select_size_class(struct _ssm_pool_hdr * hdr,
+                                        size_t                 len)
+{
+        size_t ovh;
+        int    i;
+
+        ovh = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_TOTALSPACE;
+
+        for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
+                struct _ssm_size_class * sc = &hdr->size_classes[i];
+
+                /* ovh > 0, so unconfigured (size 0) classes are skipped. */
+                if (sc->object_size < ovh)
+                        continue;
+
+                if (len <= sc->object_size - ovh)
+                        return i;
+        }
+
+        return -1;
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/src/lib/ssm/tests/pool_sharding_test.c b/src/lib/ssm/tests/pool_sharding_test.c
index c53105e3..ec464a92 100644
--- a/src/lib/ssm/tests/pool_sharding_test.c
+++ b/src/lib/ssm/tests/pool_sharding_test.c
@@ -80,19 +80,13 @@ static int test_lazy_distribution(void)
goto fail_pool;
}
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc_idx = i;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- printf(" Class %d: count=%zu\n", i,
+ printf(" Class %d: object_size=%zu count=%zu\n", i,
+ hdr->size_classes[i].object_size,
hdr->size_classes[i].object_count);
}
goto fail_pool;
@@ -137,7 +131,6 @@ static int test_shard_migration(void)
ssize_t off;
int shard_idx;
int sc_idx;
- int i;
TEST_START();
@@ -149,18 +142,11 @@ static int test_shard_migration(void)
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc_idx = i;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
- goto fail;
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
+ goto fail_pool;
}
sc = &hdr->size_classes[sc_idx];
@@ -209,7 +195,6 @@ static int test_fallback_stealing(void)
size_t total_free;
size_t i;
int sc_idx;
- int c;
TEST_START();
@@ -221,18 +206,11 @@ static int test_fallback_stealing(void)
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (c = 0; c < SSM_POOL_MAX_CLASSES; c++) {
- if (hdr->size_classes[c].object_count > 0) {
- sc_idx = c;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
- goto fail;
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
+ goto fail_pool;
}
sc = &hdr->size_classes[sc_idx];
@@ -261,7 +239,7 @@ static int test_fallback_stealing(void)
/* Free them all - they go to local_shard */
for (i = 0; i < total_blocks / 2; i++) {
- size_t off = ssm_pk_buff_get_idx(spbs[i]);
+ size_t off = ssm_pk_buff_get_off(spbs[i]);
if (ssm_pool_remove(pool, off) != 0) {
printf("Remove %zu failed.\n", i);
free(spbs);
@@ -299,7 +277,7 @@ static int test_fallback_stealing(void)
/* Now all allocated blocks are in use again */
/* Cleanup - free all allocated blocks */
for (i = 0; i < total_blocks / 2; i++) {
- size_t off = ssm_pk_buff_get_idx(spbs[i]);
+ size_t off = ssm_pk_buff_get_off(spbs[i]);
ssm_pool_remove(pool, off);
}
@@ -396,20 +374,15 @@ static int test_multiprocess_sharding(void)
/* Verify blocks distributed across shards */
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc = NULL;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc = &hdr->size_classes[i];
- break;
- }
- }
-
- if (sc == NULL) {
- printf("No size classes configured.\n");
+ /* Inspect the class that TEST_SIZE allocations used */
+ i = select_size_class(hdr, TEST_SIZE);
+ if (i < 0) {
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
goto fail_pool;
}
+ sc = &hdr->size_classes[i];
+
/* After children allocate and free, blocks should be in shards
* (though exact distribution depends on PID values)
*/
diff --git a/src/lib/ssm/tests/pool_test.c b/src/lib/ssm/tests/pool_test.c
index 3fc19cd5..f86fbd9e 100644
--- a/src/lib/ssm/tests/pool_test.c
+++ b/src/lib/ssm/tests/pool_test.c
@@ -741,14 +741,14 @@ static int test_ssm_pk_buff_operations(void)
memcpy(head, data, dlen);
- tail = ssm_pk_buff_tail_alloc(spb, 32);
+ tail = ssm_pk_buff_push_tail(spb, 32);
if (tail == NULL) {
- printf("Tail_alloc failed.\n");
+ printf("push_tail failed.\n");
goto fail_ops;
}
if (ssm_pk_buff_len(spb) != POOL_256 + 32) {
- printf("Length after tail_alloc: %zu.\n",
+ printf("Length after push_tail: %zu.\n",
ssm_pk_buff_len(spb));
goto fail_ops;
}
@@ -758,14 +758,14 @@ static int test_ssm_pk_buff_operations(void)
goto fail_ops;
}
- tail = ssm_pk_buff_tail_release(spb, 32);
+ tail = ssm_pk_buff_pop_tail(spb, 32);
if (tail == NULL) {
- printf("Tail_release failed.\n");
+ printf("pop_tail failed.\n");
goto fail_ops;
}
if (ssm_pk_buff_len(spb) != POOL_256) {
- printf("Length after tail_release: %zu.\n",
+ printf("Length after pop_tail: %zu.\n",
ssm_pk_buff_len(spb));
goto fail_ops;
}
@@ -956,6 +956,8 @@ static int test_ssm_pool_reclaim_orphans(void)
ssize_t ret3;
pid_t my_pid;
pid_t fake_pid = 99999;
+ struct timespec now;
+ uint64_t old_ts;
TEST_START();
@@ -976,9 +978,15 @@ static int test_ssm_pool_reclaim_orphans(void)
goto fail_alloc;
}
- /* Simulate blocks from another process by changing allocator_pid */
+ /* Simulate blocks leaked by a dead process: foreign pid, aged out. */
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ old_ts = ((uint64_t) now.tv_sec - (SSM_POOL_RECLAIM_AGE_S + 1))
+ * 1000000000ULL + (uint64_t) now.tv_nsec;
+
spb1->allocator_pid = fake_pid;
spb2->allocator_pid = fake_pid;
+ spb1->alloc_ts = old_ts;
+ spb2->alloc_ts = old_ts;
/* Keep spb3 with our pid */
/* Reclaim orphans from fake_pid */
diff --git a/src/lib/ssm/tests/rbuff_test.c b/src/lib/ssm/tests/rbuff_test.c
index 58cb39c3..48e5a714 100644
--- a/src/lib/ssm/tests/rbuff_test.c
+++ b/src/lib/ssm/tests/rbuff_test.c
@@ -206,10 +206,10 @@ static int test_ssm_rbuff_fill_drain(void)
return TEST_RC_FAIL;
}
-static int test_ssm_rbuff_acl(void)
+static int test_ssm_rbuff_flags(void)
{
struct ssm_rbuff * rb;
- uint32_t acl;
+ uint32_t flags;
TEST_START();
@@ -219,16 +219,16 @@ static int test_ssm_rbuff_acl(void)
goto fail;
}
- acl = ssm_rbuff_get_acl(rb);
- if (acl != ACL_RDWR) {
- printf("Expected ACL_RDWR, got %u.\n", acl);
+ flags = ssm_rbuff_get_flags(rb);
+ if (flags != RB_RDWR) {
+ printf("Expected RB_RDWR, got %u.\n", flags);
goto fail_rb;
}
- ssm_rbuff_set_acl(rb, ACL_RDONLY);
- acl = ssm_rbuff_get_acl(rb);
- if (acl != ACL_RDONLY) {
- printf("Expected ACL_RDONLY, got %u.\n", acl);
+ ssm_rbuff_clr_bits(rb, RB_WR);
+ flags = ssm_rbuff_get_flags(rb);
+ if (flags != RB_RD) {
+ printf("Expected RB_RD, got %u.\n", flags);
goto fail_rb;
}
@@ -237,7 +237,7 @@ static int test_ssm_rbuff_acl(void)
goto fail_rb;
}
- ssm_rbuff_set_acl(rb, ACL_FLOWDOWN);
+ ssm_rbuff_set_bits(rb, RB_FLOWDOWN);
if (ssm_rbuff_write(rb, 1) != -EFLOWDOWN) {
printf("Expected -EFLOWDOWN on FLOWDOWN.\n");
goto fail_rb;
@@ -553,7 +553,7 @@ static int test_ssm_rbuff_blocking_flowdown(void)
clock_gettime(PTHREAD_COND_CLOCK, &now);
ts_add(&now, &interval, &abs_timeout);
- ssm_rbuff_set_acl(rb, ACL_FLOWDOWN);
+ ssm_rbuff_set_bits(rb, RB_FLOWDOWN);
ret = ssm_rbuff_read_b(rb, &abs_timeout);
if (ret != -EFLOWDOWN) {
@@ -561,7 +561,7 @@ static int test_ssm_rbuff_blocking_flowdown(void)
goto fail_rb;
}
- ssm_rbuff_set_acl(rb, ACL_RDWR);
+ ssm_rbuff_clr_bits(rb, RB_FLOWDOWN);
for (i = 0; i < SSM_RBUFF_SIZE - 1; ++i) {
if (ssm_rbuff_write(rb, i) < 0) {
@@ -573,7 +573,7 @@ static int test_ssm_rbuff_blocking_flowdown(void)
clock_gettime(PTHREAD_COND_CLOCK, &now);
ts_add(&now, &interval, &abs_timeout);
- ssm_rbuff_set_acl(rb, ACL_FLOWDOWN);
+ ssm_rbuff_set_bits(rb, RB_FLOWDOWN);
ret = ssm_rbuff_write_b(rb, 999, &abs_timeout);
if (ret != -EFLOWDOWN) {
@@ -581,7 +581,7 @@ static int test_ssm_rbuff_blocking_flowdown(void)
goto fail_rb;
}
- ssm_rbuff_set_acl(rb, ACL_RDWR);
+ ssm_rbuff_clr_bits(rb, RB_FLOWDOWN);
while (ssm_rbuff_read(rb) >= 0)
;
@@ -664,7 +664,7 @@ int rbuff_test(int argc,
ret |= test_ssm_rbuff_write_read();
ret |= test_ssm_rbuff_read_empty();
ret |= test_ssm_rbuff_fill_drain();
- ret |= test_ssm_rbuff_acl();
+ ret |= test_ssm_rbuff_flags();
ret |= test_ssm_rbuff_open_close();
ret |= test_ssm_rbuff_threaded();
ret |= test_ssm_rbuff_blocking();
diff --git a/src/lib/tests/CMakeLists.txt b/src/lib/tests/CMakeLists.txt
index 5a2f2c52..002d94af 100644
--- a/src/lib/tests/CMakeLists.txt
+++ b/src/lib/tests/CMakeLists.txt
@@ -10,20 +10,24 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
auth_test_slh_dsa.c
bitmap_test.c
btree_test.c
- crc32_test.c
crypt_test.c
hash_test.c
kex_test.c
kex_test_ml_kem.c
+ keyrot_test.c
md5_test.c
sha3_test.c
sockets_test.c
time_test.c
tpm_test.c
+ tw_test.c
)
add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests})
+target_include_directories(${PARENT_DIR}_test PRIVATE
+ ${CMAKE_SOURCE_DIR}/src/lib)
+
disable_test_logging_for_target(${PARENT_DIR}_test)
target_link_libraries(${PARENT_DIR}_test ouroboros-common)
diff --git a/src/lib/tests/auth_test.c b/src/lib/tests/auth_test.c
index 0f3ef715..af7cf81c 100644
--- a/src/lib/tests/auth_test.c
+++ b/src/lib/tests/auth_test.c
@@ -24,11 +24,14 @@
#include <test/test.h>
#include <ouroboros/crypt.h>
+#include <ouroboros/name.h>
#include <ouroboros/random.h>
#include <ouroboros/utils.h>
#include <test/certs/ecdsa.h>
+#include <string.h>
+
#define TEST_MSG_SIZE 1500
static int test_auth_create_destroy_ctx(void)
@@ -138,6 +141,47 @@ static int test_check_crt_name(void)
return TEST_RC_FAIL;
}
+/*
+ * Test that a "CN=" string embedded in another subject field is not
+ * mistaken for the certificate's common name.
+ *
+ * Loads confused_crt_ec, expects crypt_get_crt_name() to yield
+ * "attacker.unittest.o7s" and expects crypt_check_crt_name() not to
+ * return 0 for "victim.unittest.o7s".
+ *
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL. Once the certificate has
+ * been loaded it is freed on every exit path.
+ */
+static int test_crt_name_confusion(void)
+{
+ char name[NAME_SIZE + 1];
+ void * crt;
+
+ TEST_START();
+
+ if (crypt_load_crt_str(confused_crt_ec, &crt) < 0) {
+ printf("Failed to load name-confusion certificate.\n");
+ goto fail_load;
+ }
+
+ /* Must extract the real CN, not the "CN=" decoy in the O field. */
+ if (crypt_get_crt_name(crt, name) < 0) {
+ printf("Failed to extract name from certificate.\n");
+ goto fail_check;
+ }
+
+ if (strcmp(name, "attacker.unittest.o7s") != 0) {
+ printf("Extracted '%s', expected real CN.\n", name);
+ goto fail_check;
+ }
+
+ /* The decoy name in the O field must never authenticate. */
+ if (crypt_check_crt_name(crt, "victim.unittest.o7s") == 0) {
+ printf("Accepted spoofed name from O field.\n");
+ goto fail_check;
+ }
+
+ crypt_free_crt(crt);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_check:
+ crypt_free_crt(crt);
+ fail_load:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
static int test_load_free_privkey(void)
{
void * key;
@@ -400,6 +444,98 @@ static int test_verify_crt_missing_root_ca(void)
return TEST_RC_FAIL;
}
+/*
+ * auth_verify_crt_pin: pin must lie in the verified chain (NULL: any)
+ *
+ * Adds the root and intermediate CA to the store, then verifies the
+ * signed server certificate against four pins: the intermediate CA and
+ * the root CA (both must be accepted), an out-of-chain CA (must not
+ * return 0) and NULL (must be accepted).
+ *
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL. The failure labels release
+ * the certificates in reverse order of loading, then the auth context.
+ */
+static int test_verify_crt_pin(void)
+{
+ struct auth_ctx * auth;
+ void * _root_ca_crt;
+ void * _im_ca_crt;
+ void * _signed_server_crt;
+ void * _other_ca_crt;
+
+ TEST_START();
+
+ auth = auth_create_ctx();
+ if (auth == NULL) {
+ printf("Failed to create auth context.\n");
+ goto fail_create_ctx;
+ }
+
+ if (crypt_load_crt_str(root_ca_crt_ec, &_root_ca_crt) < 0) {
+ printf("Failed to load root crt from string.\n");
+ goto fail_load_root_ca;
+ }
+
+ if (crypt_load_crt_str(im_ca_crt_ec, &_im_ca_crt) < 0) {
+ printf("Failed to load intermediate crt from string.\n");
+ goto fail_load_im_ca;
+ }
+
+ if (crypt_load_crt_str(signed_server_crt_ec, &_signed_server_crt) < 0) {
+ printf("Failed to load signed crt from string.\n");
+ goto fail_load_signed;
+ }
+
+ if (crypt_load_crt_str(other_ca_crt_ec, &_other_ca_crt) < 0) {
+ printf("Failed to load out-of-chain crt from string.\n");
+ goto fail_load_other;
+ }
+
+ /* Only the chain CAs go into the store; other_ca stays outside. */
+ if (auth_add_crt_to_store(auth, _root_ca_crt) < 0) {
+ printf("Failed to add root ca crt to auth store.\n");
+ goto fail_verify;
+ }
+
+ if (auth_add_crt_to_store(auth, _im_ca_crt) < 0) {
+ printf("Failed to add intermediate ca crt to auth store.\n");
+ goto fail_verify;
+ }
+
+ if (auth_verify_crt_pin(auth, _signed_server_crt, _im_ca_crt) < 0) {
+ printf("Failed to accept pin on intermediate CA.\n");
+ goto fail_verify;
+ }
+
+ if (auth_verify_crt_pin(auth, _signed_server_crt, _root_ca_crt) < 0) {
+ printf("Failed to accept pin on root CA.\n");
+ goto fail_verify;
+ }
+
+ /* Negative case: any return other than 0 counts as a rejection. */
+ if (auth_verify_crt_pin(auth, _signed_server_crt, _other_ca_crt) == 0) {
+ printf("Failed to reject out-of-chain pin.\n");
+ goto fail_verify;
+ }
+
+ if (auth_verify_crt_pin(auth, _signed_server_crt, NULL) < 0) {
+ printf("Failed to accept NULL (any) pin.\n");
+ goto fail_verify;
+ }
+
+ crypt_free_crt(_other_ca_crt);
+ crypt_free_crt(_signed_server_crt);
+ crypt_free_crt(_im_ca_crt);
+ crypt_free_crt(_root_ca_crt);
+
+ auth_destroy_ctx(auth);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_verify:
+ crypt_free_crt(_other_ca_crt);
+ fail_load_other:
+ crypt_free_crt(_signed_server_crt);
+ fail_load_signed:
+ crypt_free_crt(_im_ca_crt);
+ fail_load_im_ca:
+ crypt_free_crt(_root_ca_crt);
+ fail_load_root_ca:
+ auth_destroy_ctx(auth);
+ fail_create_ctx:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
int test_auth_sign(void)
{
uint8_t buf[TEST_MSG_SIZE];
@@ -573,6 +709,7 @@ int auth_test(int argc,
#ifdef HAVE_OPENSSL
ret |= test_load_free_crt();
ret |= test_check_crt_name();
+ ret |= test_crt_name_confusion();
ret |= test_crypt_get_pubkey_crt();
ret |= test_load_free_privkey();
ret |= test_load_free_pubkey();
@@ -580,12 +717,14 @@ int auth_test(int argc,
ret |= test_store_add();
ret |= test_verify_crt();
ret |= test_verify_crt_missing_root_ca();
+ ret |= test_verify_crt_pin();
ret |= test_auth_sign();
ret |= test_auth_bad_signature();
ret |= test_crt_str();
#else
(void) test_load_free_crt;
(void) test_check_crt_name;
+ (void) test_crt_name_confusion;
(void) test_crypt_get_pubkey_crt;
(void) test_load_free_privkey;
(void) test_load_free_pubkey;
@@ -593,6 +732,7 @@ int auth_test(int argc,
(void) test_store_add;
(void) test_verify_crt;
(void) test_verify_crt_missing_root_ca;
+ (void) test_verify_crt_pin;
(void) test_auth_sign;
(void) test_auth_bad_signature;
(void) test_crt_str;
diff --git a/src/lib/tests/crypt_test.c b/src/lib/tests/crypt_test.c
index 028c4eb5..2d752238 100644
--- a/src/lib/tests/crypt_test.c
+++ b/src/lib/tests/crypt_test.c
@@ -30,6 +30,7 @@
#include <stdio.h>
#define TEST_PACKET_SIZE 1500
+#define TEST_N_PACKETS 1000
extern const uint16_t crypt_supported_nids[];
extern const uint16_t md_supported_nids[];
@@ -39,9 +40,10 @@ static int test_crypt_create_destroy(void)
struct crypt_ctx * ctx;
uint8_t key[SYMMKEYSZ];
struct crypt_sk sk = {
- .nid = NID_aes_256_gcm,
- .key = key,
- .rot_bit = KEY_ROTATION_BIT
+ .nid = NID_aes_256_gcm,
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_INIT
};
TEST_START();
@@ -67,18 +69,27 @@ static int test_crypt_create_destroy(void)
static int test_crypt_encrypt_decrypt(int nid)
{
uint8_t pkt[TEST_PACKET_SIZE];
- struct crypt_ctx * ctx;
+ struct crypt_ctx * tx;
+ struct crypt_ctx * rx;
uint8_t key[SYMMKEYSZ];
- struct crypt_sk sk = {
- .nid = NID_aes_256_gcm,
- .key = key,
- .rot_bit = KEY_ROTATION_BIT
+ struct crypt_sk sk_tx = {
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_INIT
+ };
+ struct crypt_sk sk_rx = {
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_RESP
};
buffer_t in;
buffer_t out;
buffer_t out2;
const char * cipher;
+ sk_tx.nid = nid;
+ sk_rx.nid = nid;
+
cipher = crypt_nid_to_str(nid);
TEST_START("(%s)", cipher);
@@ -92,53 +103,63 @@ static int test_crypt_encrypt_decrypt(int nid)
goto fail_init;
}
- ctx = crypt_create_ctx(&sk);
- if (ctx == NULL) {
- printf("Failed to initialize cryptography.\n");
+ tx = crypt_create_ctx(&sk_tx);
+ if (tx == NULL) {
+ printf("Failed to initialize TX cryptography.\n");
goto fail_init;
}
+ rx = crypt_create_ctx(&sk_rx);
+ if (rx == NULL) {
+ printf("Failed to initialize RX cryptography.\n");
+ goto fail_tx;
+ }
+
in.len = sizeof(pkt);
in.data = pkt;
- if (crypt_encrypt(ctx, in, &out) < 0) {
+ if (crypt_encrypt(tx, in, &out) < 0) {
printf("Encryption failed.\n");
goto fail_encrypt;
}
if (out.len < in.len) {
printf("Encryption returned too little data.\n");
- goto fail_encrypt;
+ goto fail_chk;
}
- if (crypt_decrypt(ctx, out, &out2) < 0) {
+ if (crypt_decrypt(rx, out, &out2) < 0) {
printf("Decryption failed.\n");
goto fail_decrypt;
}
if (out2.len != in.len) {
printf("Decrypted data length does not match original.\n");
- goto fail_chk;
+ goto fail_chk2;
}
if (memcmp(in.data, out2.data, in.len) != 0) {
printf("Decrypted data does not match original.\n");
- goto fail_chk;
+ goto fail_chk2;
}
- crypt_destroy_ctx(ctx);
freebuf(out2);
freebuf(out);
+ crypt_destroy_ctx(rx);
+ crypt_destroy_ctx(tx);
TEST_SUCCESS("(%s)", cipher);
return TEST_RC_SUCCESS;
- fail_chk:
+ fail_chk2:
freebuf(out2);
fail_decrypt:
+ fail_chk:
freebuf(out);
fail_encrypt:
- crypt_destroy_ctx(ctx);
+ crypt_destroy_ctx(rx);
+ fail_tx:
+ crypt_destroy_ctx(tx);
fail_init:
TEST_FAIL("(%s)", cipher);
return TEST_RC_FAIL;
@@ -155,6 +176,214 @@ static int test_encrypt_decrypt_all(void)
return ret;
}
+/*
+ * Round-trip TEST_N_PACKETS packets through a TX/RX context pair for
+ * the cipher identified by nid.
+ *
+ * Both contexts are created from the same random key at epoch 0; the
+ * TX side uses CRYPT_ROLE_INIT and the RX side CRYPT_ROLE_RESP. Every
+ * packet is encrypted with tx, decrypted with rx, and compared with the
+ * original in both length and content.
+ *
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_crypt_multi_packet(int nid)
+{
+ uint8_t pkt[TEST_PACKET_SIZE];
+ struct crypt_ctx * tx;
+ struct crypt_ctx * rx;
+ uint8_t key[SYMMKEYSZ];
+ struct crypt_sk sk_tx = {
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_INIT
+ };
+ struct crypt_sk sk_rx = {
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_RESP
+ };
+ buffer_t in;
+ buffer_t enc;
+ buffer_t dec;
+ const char * cipher;
+ int i;
+
+ sk_tx.nid = nid;
+ sk_rx.nid = nid;
+
+ cipher = crypt_nid_to_str(nid);
+ TEST_START("(%s)", cipher);
+
+ if (random_buffer(key, sizeof(key)) < 0) {
+ printf("Failed to generate random key.\n");
+ goto fail_init;
+ }
+
+ if (random_buffer(pkt, sizeof(pkt)) < 0) {
+ printf("Failed to generate random data.\n");
+ goto fail_init;
+ }
+
+ tx = crypt_create_ctx(&sk_tx);
+ if (tx == NULL) {
+ printf("Failed to create TX context.\n");
+ goto fail_init;
+ }
+
+ rx = crypt_create_ctx(&sk_rx);
+ if (rx == NULL) {
+ printf("Failed to create RX context.\n");
+ goto fail_tx;
+ }
+
+ in.len = sizeof(pkt);
+ in.data = pkt;
+
+ /* The same plaintext is sent each iteration; enc/dec are per-packet. */
+ for (i = 0; i < TEST_N_PACKETS; i++) {
+ if (crypt_encrypt(tx, in, &enc) < 0) {
+ printf("Encryption failed at packet %d.\n", i);
+ goto fail_rx;
+ }
+
+ if (crypt_decrypt(rx, enc, &dec) < 0) {
+ printf("Decryption failed at packet %d.\n", i);
+ freebuf(enc);
+ goto fail_rx;
+ }
+
+ if (dec.len != in.len ||
+ memcmp(in.data, dec.data, in.len) != 0) {
+ printf("Data mismatch at packet %d.\n", i);
+ freebuf(dec);
+ freebuf(enc);
+ goto fail_rx;
+ }
+
+ freebuf(dec);
+ freebuf(enc);
+ }
+
+ crypt_destroy_ctx(rx);
+ crypt_destroy_ctx(tx);
+
+ TEST_SUCCESS("(%s)", cipher);
+
+ return TEST_RC_SUCCESS;
+ fail_rx:
+ crypt_destroy_ctx(rx);
+ fail_tx:
+ crypt_destroy_ctx(tx);
+ fail_init:
+ TEST_FAIL("(%s)", cipher);
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Run test_crypt_multi_packet() for every entry of crypt_supported_nids
+ * up to the NID_undef terminator. Returns the OR of all results.
+ */
+static int test_multi_packet_all(void)
+{
+        const uint16_t * nid;
+        int              result = 0;
+
+        for (nid = crypt_supported_nids; *nid != NID_undef; ++nid)
+                result |= test_crypt_multi_packet(*nid);
+
+        return result;
+}
+
+/*
+ * Check that a packet whose header was modified after encryption is
+ * rejected by the receiver, for the cipher identified by nid.
+ *
+ * A TX (CRYPT_ROLE_INIT) and RX (CRYPT_ROLE_RESP) context are created
+ * from the same random key. Ciphers for which crypt_get_tagsz() is 0
+ * are reported as success without running the tamper check. Otherwise
+ * one packet is encrypted, a single bit in byte 5 of the ciphertext is
+ * flipped, and crypt_decrypt() must not return 0.
+ *
+ * Returns TEST_RC_SUCCESS or TEST_RC_FAIL.
+ */
+static int test_crypt_aad_tamper(int nid)
+{
+ uint8_t pkt[TEST_PACKET_SIZE];
+ struct crypt_ctx * tx;
+ struct crypt_ctx * rx;
+ uint8_t key[SYMMKEYSZ];
+ struct crypt_sk sk_tx = {
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_INIT
+ };
+ struct crypt_sk sk_rx = {
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_RESP
+ };
+ buffer_t in;
+ buffer_t enc;
+ buffer_t dec;
+ const char * cipher;
+
+ sk_tx.nid = nid;
+ sk_rx.nid = nid;
+
+ cipher = crypt_nid_to_str(nid);
+ TEST_START("(%s)", cipher);
+
+ if (random_buffer(key, sizeof(key)) < 0) {
+ printf("Failed to generate random key.\n");
+ goto fail_init;
+ }
+
+ if (random_buffer(pkt, sizeof(pkt)) < 0) {
+ printf("Failed to generate random data.\n");
+ goto fail_init;
+ }
+
+ tx = crypt_create_ctx(&sk_tx);
+ if (tx == NULL) {
+ printf("Failed to create TX context.\n");
+ goto fail_init;
+ }
+
+ rx = crypt_create_ctx(&sk_rx);
+ if (rx == NULL) {
+ printf("Failed to create RX context.\n");
+ goto fail_tx;
+ }
+
+ /* Only AEAD ciphers bind the selector as AAD. */
+ if (crypt_get_tagsz(tx) == 0) {
+ crypt_destroy_ctx(rx);
+ crypt_destroy_ctx(tx);
+
+ TEST_SUCCESS("(%s)", cipher);
+
+ return TEST_RC_SUCCESS;
+ }
+
+ in.len = sizeof(pkt);
+ in.data = pkt;
+
+ if (crypt_encrypt(tx, in, &enc) < 0) {
+ printf("Encryption failed.\n");
+ goto fail_rx;
+ }
+
+ /* Flip a seq byte: epoch/node stay valid so the AEAD tag rejects. */
+ /* NOTE(review): offset 5 is assumed to lie in the seq field - confirm. */
+ enc.data[5] ^= 0x01;
+
+ if (crypt_decrypt(rx, enc, &dec) == 0) {
+ printf("Decryption accepted a tampered selector.\n");
+ freebuf(dec);
+ freebuf(enc);
+ goto fail_rx;
+ }
+
+ /* dec is not freed here: the decrypt call above did not succeed. */
+ freebuf(enc);
+
+ crypt_destroy_ctx(rx);
+ crypt_destroy_ctx(tx);
+
+ TEST_SUCCESS("(%s)", cipher);
+
+ return TEST_RC_SUCCESS;
+ fail_rx:
+ crypt_destroy_ctx(rx);
+ fail_tx:
+ crypt_destroy_ctx(tx);
+ fail_init:
+ TEST_FAIL("(%s)", cipher);
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Run test_crypt_aad_tamper() for every entry of crypt_supported_nids
+ * up to the NID_undef terminator. Returns the OR of all results.
+ */
+static int test_aad_tamper_all(void)
+{
+        const uint16_t * nid;
+        int              result = 0;
+
+        for (nid = crypt_supported_nids; *nid != NID_undef; ++nid)
+                result |= test_crypt_aad_tamper(*nid);
+
+        return result;
+}
+
#ifdef HAVE_OPENSSL
#include <openssl/evp.h>
#include <openssl/obj_mac.h>
@@ -256,109 +485,17 @@ static int test_md_nid_values(void)
}
#endif
-static int test_key_rotation(void)
+static int test_crypt_headsz(void)
{
- uint8_t pkt[TEST_PACKET_SIZE];
- struct crypt_ctx * tx_ctx;
- struct crypt_ctx * rx_ctx;
- uint8_t key[SYMMKEYSZ];
- struct crypt_sk sk = {
- .nid = NID_aes_256_gcm,
- .key = key,
- .rot_bit = 7
- };
- buffer_t in;
- buffer_t enc;
- buffer_t dec;
- uint32_t i;
- uint32_t threshold;
-
- TEST_START();
-
- if (random_buffer(key, sizeof(key)) < 0) {
- printf("Failed to generate random key.\n");
- goto fail;
- }
-
- if (random_buffer(pkt, sizeof(pkt)) < 0) {
- printf("Failed to generate random data.\n");
- goto fail;
- }
-
- tx_ctx = crypt_create_ctx(&sk);
- if (tx_ctx == NULL) {
- printf("Failed to create TX context.\n");
- goto fail;
- }
-
- rx_ctx = crypt_create_ctx(&sk);
- if (rx_ctx == NULL) {
- printf("Failed to create RX context.\n");
- goto fail_tx;
- }
-
- in.len = sizeof(pkt);
- in.data = pkt;
-
- threshold = (1U << sk.rot_bit);
-
- /* Encrypt and decrypt across multiple rotations */
- for (i = 0; i < threshold * 3; i++) {
- if (crypt_encrypt(tx_ctx, in, &enc) < 0) {
- printf("Encryption failed at packet %u.\n", i);
- goto fail_rx;
- }
-
- if (crypt_decrypt(rx_ctx, enc, &dec) < 0) {
- printf("Decryption failed at packet %u.\n", i);
- freebuf(enc);
- goto fail_rx;
- }
-
- if (dec.len != in.len ||
- memcmp(in.data, dec.data, in.len) != 0) {
- printf("Data mismatch at packet %u.\n", i);
- freebuf(dec);
- freebuf(enc);
- goto fail_rx;
- }
-
- freebuf(dec);
- freebuf(enc);
- }
-
- crypt_destroy_ctx(rx_ctx);
- crypt_destroy_ctx(tx_ctx);
-
- TEST_SUCCESS();
-
- return TEST_RC_SUCCESS;
- fail_rx:
- crypt_destroy_ctx(rx_ctx);
- fail_tx:
- crypt_destroy_ctx(tx_ctx);
- fail:
- TEST_FAIL();
- return TEST_RC_FAIL;
-}
-
-static int test_key_phase_bit(void)
-{
- uint8_t pkt[TEST_PACKET_SIZE];
struct crypt_ctx * ctx;
uint8_t key[SYMMKEYSZ];
struct crypt_sk sk = {
- .nid = NID_aes_256_gcm,
- .key = key,
- .rot_bit = 7
+ .nid = NID_aes_256_gcm,
+ .key = key,
+ .epoch = 0,
+ .role = CRYPT_ROLE_INIT
};
- buffer_t in;
- buffer_t out;
- uint32_t count;
- uint32_t threshold;
- uint8_t phase_before;
- uint8_t phase_after;
- int ivsz;
+ int headsz;
TEST_START();
@@ -367,58 +504,15 @@ static int test_key_phase_bit(void)
goto fail;
}
- if (random_buffer(pkt, sizeof(pkt)) < 0) {
- printf("Failed to generate random data.\n");
- goto fail;
- }
-
ctx = crypt_create_ctx(&sk);
if (ctx == NULL) {
printf("Failed to initialize cryptography.\n");
goto fail;
}
- ivsz = crypt_get_ivsz(ctx);
- if (ivsz <= 0) {
- printf("Invalid IV size.\n");
- goto fail_ctx;
- }
-
- in.len = sizeof(pkt);
- in.data = pkt;
-
- /* Encrypt packets up to just before rotation threshold */
- threshold = (1U << sk.rot_bit);
-
- /* Encrypt threshold - 1 packets (indices 0 to threshold-2) */
- for (count = 0; count < threshold - 1; count++) {
- if (crypt_encrypt(ctx, in, &out) < 0) {
- printf("Encryption failed at count %u.\n", count);
- goto fail_ctx;
- }
- freebuf(out);
- }
-
- /* Packet at index threshold-1: phase should still be initial */
- if (crypt_encrypt(ctx, in, &out) < 0) {
- printf("Encryption failed before rotation.\n");
- goto fail_ctx;
- }
- phase_before = (out.data[0] & 0x80) ? 1 : 0;
- freebuf(out);
-
- /* Packet at index threshold: phase should have toggled */
- if (crypt_encrypt(ctx, in, &out) < 0) {
- printf("Encryption failed at rotation threshold.\n");
- goto fail_ctx;
- }
- phase_after = (out.data[0] & 0x80) ? 1 : 0;
- freebuf(out);
-
- /* Phase bit should have toggled */
- if (phase_before == phase_after) {
- printf("Phase bit did not toggle: before=%u, after=%u.\n",
- phase_before, phase_after);
+ headsz = crypt_get_headsz(ctx);
+ if (headsz != 6) {
+ printf("Unexpected header size: %d (expected 6).\n", headsz);
goto fail_ctx;
}
@@ -447,11 +541,13 @@ int crypt_test(int argc,
#ifdef HAVE_OPENSSL
ret |= test_cipher_nid_values();
ret |= test_md_nid_values();
- ret |= test_key_rotation();
- ret |= test_key_phase_bit();
+ ret |= test_multi_packet_all();
+ ret |= test_aad_tamper_all();
+ ret |= test_crypt_headsz();
#else
- (void) test_key_rotation;
- (void) test_key_phase_bit;
+ (void) test_multi_packet_all;
+ (void) test_aad_tamper_all;
+ (void) test_crypt_headsz;
return TEST_RC_SKIP;
#endif
diff --git a/src/lib/tests/hash_test.c b/src/lib/tests/hash_test.c
index e43847e1..a2ba62cc 100644
--- a/src/lib/tests/hash_test.c
+++ b/src/lib/tests/hash_test.c
@@ -39,6 +39,79 @@ struct vec_entry {
char * out;
};
+/* Test vector for test_mix64: an input and the value hash_mix64 must give. */
+struct mix_entry {
+ /* Value passed to hash_mix64(). */
+ uint64_t in;
+ /* Expected return value for that input. */
+ uint64_t out;
+};
+
+/* Compare str_hash(HASH_CRC8, ...) against known lowercase hex digests. */
+static int test_crc8(void)
+{
+        struct vec_entry   vec [] = {
+                { "",          "00" },
+                { "123456789", "df" },
+                { NULL,        NULL }
+        };
+        struct vec_entry * ent;
+        int                ret = 0;
+
+        TEST_START();
+
+        /* The table ends at the entry whose input is NULL. */
+        for (ent = vec; ent->in != NULL; ++ent) {
+                uint8_t digest;
+                char    hex[3];
+
+                str_hash(HASH_CRC8, &digest, ent->in);
+
+                sprintf(hex, "%02x", digest);
+                if (strcmp(hex, ent->out) == 0)
+                        continue;
+
+                printf("Hash failed %s != %s.\n", hex, ent->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
+/* Compare str_hash(HASH_CRC16, ...) against known lowercase hex digests. */
+static int test_crc16(void)
+{
+        struct vec_entry   vec [] = {
+                { "",          "ffff" },
+                { "123456789", "29b1" },
+                { NULL,        NULL }
+        };
+        struct vec_entry * ent;
+        int                ret = 0;
+
+        TEST_START();
+
+        /* The table ends at the entry whose input is NULL. */
+        for (ent = vec; ent->in != NULL; ++ent) {
+                uint8_t digest[2];
+                char    hex[5];
+
+                str_hash(HASH_CRC16, digest, ent->in);
+
+                /* Digest bytes are printed in array order. */
+                sprintf(hex, "%02x%02x", digest[0], digest[1]);
+                if (strcmp(hex, ent->out) == 0)
+                        continue;
+
+                printf("Hash failed %s != %s.\n", hex, ent->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
static int test_crc32(void)
{
int ret = 0;
@@ -74,6 +147,42 @@ static int test_crc32(void)
return ret;
}
+/* Compare str_hash(HASH_CRC64, ...) against known lowercase hex digests. */
+static int test_crc64(void)
+{
+        struct vec_entry   vec [] = {
+                { "",          "0000000000000000" },
+                { "123456789", "ae8b14860a799888" },
+                { "0123456789abcdef",
+                  "091485ca7018730e" },
+                { NULL,        NULL }
+        };
+        struct vec_entry * ent;
+        int                ret = 0;
+
+        TEST_START();
+
+        /* The table ends at the entry whose input is NULL. */
+        for (ent = vec; ent->in != NULL; ++ent) {
+                uint8_t digest[8];
+                char    hex[17];
+
+                str_hash(HASH_CRC64, digest, ent->in);
+
+                sprintf(hex, HASH_FMT64, HASH_VAL64(digest));
+                if (strcmp(hex, ent->out) == 0)
+                        continue;
+
+                printf("Hash failed %s != %s.\n", hex, ent->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
static int test_md5(void)
{
int ret = 0;
@@ -184,6 +293,36 @@ static int test_sha3(void)
return ret;
}
+/* Check hash_mix64() against a fixed table of input/output pairs. */
+static int test_mix64(void)
+{
+        struct mix_entry   vec [] = {
+                { 0x0000000000000000ULL, 0x0000000000000000ULL },
+                { 0x123456789abcdefeULL, 0xb1943cfea4f78f08ULL }
+        };
+        struct mix_entry * end = vec + sizeof(vec) / sizeof(vec[0]);
+        struct mix_entry * ent;
+        int                ret = 0;
+
+        TEST_START();
+
+        for (ent = vec; ent != end; ++ent) {
+                uint64_t got = hash_mix64(ent->in);
+
+                if (got == ent->out)
+                        continue;
+
+                printf("Mix failed %016llx != %016llx.\n",
+                       (unsigned long long) got,
+                       (unsigned long long) ent->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
int hash_test(int argc,
char ** argv)
{
@@ -192,11 +331,19 @@ int hash_test(int argc,
(void) argc;
(void) argv;
+ ret |= test_crc8();
+
+ ret |= test_crc16();
+
ret |= test_crc32();
+ ret |= test_crc64();
+
ret |= test_md5();
ret |= test_sha3();
+ ret |= test_mix64();
+
return ret;
}
diff --git a/src/lib/tests/kex_test.c b/src/lib/tests/kex_test.c
index 6a4f802e..0a00ccab 100644
--- a/src/lib/tests/kex_test.c
+++ b/src/lib/tests/kex_test.c
@@ -44,6 +44,9 @@
#define KEX_CONFIG_NONE \
"none\n"
+/* Config whose only line is encryption=none */
+#define KEX_CONFIG_NO_ENC \
+ "encryption=none\n"
+
#define KEX_CONFIG_WHITESPACE \
"# Comment line\n" \
"kex = X448" \
@@ -58,6 +61,31 @@
"kex=X25519\n" \
"digest=sha384\n"
+/* Config that sets auth=required */
+#define KEX_CONFIG_AUTH \
+ "auth=required\n"
+
+/* auth= with a value the tests expect the parser to reject */
+#define KEX_CONFIG_AUTH_INVALID \
+ "auth=mandatory\n"
+
+/* Config that sets auth=optional */
+#define KEX_CONFIG_AUTH_OPTIONAL \
+ "auth=optional\n"
+
+/* auth and digest lines first, encryption=none last */
+#define KEX_CONFIG_AUTH_THEN_NO_ENC \
+ "auth=required\n" \
+ "digest=sha512\n" \
+ "encryption=none\n"
+
+/* Same three settings with encryption=none first */
+#define KEX_CONFIG_NO_ENC_THEN_AUTH \
+ "encryption=none\n" \
+ "auth=required\n" \
+ "digest=sha512\n"
+
+/* Config that only sets a cacert path */
+#define KEX_CONFIG_CACERT \
+ "cacert=/etc/ouroboros/security/cacert/ca.crt\n"
+
+/* Misspelled key ("autth") the tests expect the parser to reject */
+#define KEX_CONFIG_UNKNOWN_KEY \
+ "autth=required\n"
+
/* Test key material for key loading tests */
#define X25519_PRIVKEY_PEM \
"-----BEGIN PRIVATE KEY-----\n" \
@@ -213,6 +241,7 @@ static int test_kex_dhe_derive(const char * algo)
memset(&kex, 0, sizeof(kex));
SET_KEX_ALGO(&kex, algo);
+ SET_KEX_KDF_NID(&kex, NID_sha256);
len = kex_pkp_create(&kex, &pkp1, buf1);
if (len < 0) {
@@ -324,6 +353,7 @@ static int test_kex_dhe_corrupted_pubkey(const char * algo)
memset(&kex, 0, sizeof(kex));
SET_KEX_ALGO(&kex, algo);
+ SET_KEX_KDF_NID(&kex, NID_sha256);
len = kex_pkp_create(&kex, &pkp, buf);
if (len < 0) {
@@ -375,6 +405,8 @@ static int test_kex_dhe_wrong_algo(void)
memset(&kex2, 0, sizeof(kex2));
SET_KEX_ALGO(&kex1, algo1);
SET_KEX_ALGO(&kex2, algo2);
+ SET_KEX_KDF_NID(&kex1, NID_sha256);
+ SET_KEX_KDF_NID(&kex2, NID_sha256);
if (kex_pkp_create(&kex1, &pkp1, buf1) < 0) {
printf("Failed to create first key pair.\n");
@@ -639,7 +671,8 @@ static int test_kex_parse_config_custom(void)
return TEST_RC_FAIL;
}
-static int test_kex_parse_config_none(void)
+/* The old bare 'none' keyword must be rejected loudly */
+static int test_kex_parse_config_none_rejected(void)
{
struct sec_config kex;
FILE * fp;
@@ -654,14 +687,51 @@ static int test_kex_parse_config_none(void)
goto fail;
}
+ if (parse_sec_config(&kex, fp) == 0) {
+ printf("Bare 'none' keyword should be rejected.\n");
+ fclose(fp);
+ goto fail;
+ }
+
+ fclose(fp);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+static int test_kex_parse_config_no_enc(void)
+{
+ struct sec_config kex;
+ FILE * fp;
+
+ TEST_START();
+
+ memset(&kex, 0, sizeof(kex));
+
+ fp = FMEMOPEN_STR(KEX_CONFIG_NO_ENC);
+ if (fp == NULL) {
+ printf("Failed to open memory stream.\n");
+ goto fail;
+ }
+
if (parse_sec_config(&kex, fp) < 0) {
- printf("Failed to parse 'none' config.\n");
+ printf("Failed to parse encryption=none config.\n");
fclose(fp);
goto fail;
}
- if (kex.x.nid != NID_undef) {
- printf("'none' keyword should disable encryption.\n");
+ if (kex.x.nid != NID_undef || kex.c.nid != NID_undef) {
+ printf("encryption=none should disable encryption.\n");
+ fclose(fp);
+ goto fail;
+ }
+
+ if (kex.d.nid != NID_sha256) {
+ printf("encryption=none should keep the digest.\n");
fclose(fp);
goto fail;
}
@@ -799,6 +869,277 @@ static int test_kex_parse_config_digest(void)
return TEST_RC_FAIL;
}
+/* auth=required must parse and leave the a.req flag set. */
+static int test_kex_parse_config_auth(void)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+
+        stream = FMEMOPEN_STR(KEX_CONFIG_AUTH);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        if (parse_sec_config(&cfg, stream) < 0) {
+                printf("Failed to parse auth config.\n");
+                goto fail_stream;
+        }
+
+        if (!cfg.a.req) {
+                printf("auth=required not parsed correctly.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/* An auth= value outside the accepted set must make parsing fail. */
+static int test_kex_parse_config_auth_invalid(void)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+
+        stream = FMEMOPEN_STR(KEX_CONFIG_AUTH_INVALID);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        /* A zero return means the bad value was accepted. */
+        if (parse_sec_config(&cfg, stream) == 0) {
+                printf("Invalid auth value should be rejected.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * When the config holds no auth= line, an a.req flag set by the caller
+ * before parsing must still be set afterwards.
+ */
+static int test_kex_parse_config_auth_seed(void)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+        cfg.a.req = true;
+
+        stream = FMEMOPEN_STR(KEX_CONFIG_NO_ENC);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        if (parse_sec_config(&cfg, stream) < 0) {
+                printf("Failed to parse config.\n");
+                goto fail_stream;
+        }
+
+        if (!cfg.a.req) {
+                printf("Seeded req_auth should survive parsing.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * An auth=optional line must override an a.req flag that the caller
+ * set before parsing.
+ */
+static int test_kex_parse_config_auth_optional(void)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+        cfg.a.req = true;
+
+        stream = FMEMOPEN_STR(KEX_CONFIG_AUTH_OPTIONAL);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        if (parse_sec_config(&cfg, stream) < 0) {
+                printf("Failed to parse auth=optional config.\n");
+                goto fail_stream;
+        }
+
+        if (cfg.a.req) {
+                printf("auth=optional should clear req_auth.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * Parse the given config text, which is expected to combine
+ * auth=required, digest=sha512 and encryption=none, and check that
+ * a.req stays set, x.nid is NID_undef and d.nid is NID_sha512.
+ */
+static int test_kex_parse_config_auth_no_enc(const char * config)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+
+        stream = FMEMOPEN_STR(config);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        if (parse_sec_config(&cfg, stream) < 0) {
+                printf("Failed to parse auth + encryption=none.\n");
+                goto fail_stream;
+        }
+
+        if (!cfg.a.req) {
+                printf("encryption=none should not drop required auth.\n");
+                goto fail_stream;
+        }
+
+        if (cfg.x.nid != NID_undef) {
+                printf("encryption=none should disable encryption.\n");
+                goto fail_stream;
+        }
+
+        if (cfg.d.nid != NID_sha512) {
+                printf("encryption=none should keep the digest.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/* A cacert= line must store the path in a.cacert and leave a.req clear. */
+static int test_kex_parse_config_cacert(void)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+
+        stream = FMEMOPEN_STR(KEX_CONFIG_CACERT);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        if (parse_sec_config(&cfg, stream) < 0) {
+                printf("Failed to parse cacert config.\n");
+                goto fail_stream;
+        }
+
+        if (strcmp(cfg.a.cacert,
+                   "/etc/ouroboros/security/cacert/ca.crt") != 0) {
+                printf("cacert not parsed correctly.\n");
+                goto fail_stream;
+        }
+
+        if (cfg.a.req) {
+                printf("cacert must not imply req_auth.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/* A line with an unrecognised key must make parsing fail. */
+static int test_kex_parse_config_unknown_key(void)
+{
+        struct sec_config cfg;
+        FILE *            stream;
+
+        TEST_START();
+
+        memset(&cfg, 0, sizeof(cfg));
+
+        stream = FMEMOPEN_STR(KEX_CONFIG_UNKNOWN_KEY);
+        if (stream == NULL) {
+                printf("Failed to open memory stream.\n");
+                goto fail;
+        }
+
+        /* A zero return means the unknown key was accepted. */
+        if (parse_sec_config(&cfg, stream) == 0) {
+                printf("Unknown key should be rejected.\n");
+                goto fail_stream;
+        }
+
+        fclose(stream);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_stream:
+        fclose(stream);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
int kex_test(int argc,
char ** argv)
{
@@ -809,7 +1150,16 @@ int kex_test(int argc,
ret |= test_kex_create_destroy();
ret |= test_kex_parse_config_empty();
- ret |= test_kex_parse_config_none();
+ ret |= test_kex_parse_config_none_rejected();
+ ret |= test_kex_parse_config_no_enc();
+ ret |= test_kex_parse_config_auth();
+ ret |= test_kex_parse_config_auth_invalid();
+ ret |= test_kex_parse_config_auth_seed();
+ ret |= test_kex_parse_config_auth_optional();
+ ret |= test_kex_parse_config_auth_no_enc(KEX_CONFIG_AUTH_THEN_NO_ENC);
+ ret |= test_kex_parse_config_auth_no_enc(KEX_CONFIG_NO_ENC_THEN_AUTH);
+ ret |= test_kex_parse_config_cacert();
+ ret |= test_kex_parse_config_unknown_key();
#ifdef HAVE_OPENSSL
ret |= test_kex_parse_config_custom();
ret |= test_kex_parse_config_whitespace();
diff --git a/src/lib/tests/keyrot_test.c b/src/lib/tests/keyrot_test.c
new file mode 100644
index 00000000..1c9f741b
--- /dev/null
+++ b/src/lib/tests/keyrot_test.c
@@ -0,0 +1,1083 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the key-rotation schedule
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include "config.h"
+
+#include <test/test.h>
+
+#ifdef HAVE_OPENSSL
+#include <ouroboros/crypt.h>
+#include <ouroboros/pthread.h>
+
+#include "crypt/keyrot.h"
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+/* Fixed seed (bytes 0x01 through 0x20) passed to keyrot_create() by the tests. */
+static const uint8_t SEED_A[SYMMKEYSZ] = {
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
+ 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
+ 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20
+};
+
+/* A keyrot context can be created from SEED_A and destroyed again. */
+static int test_create_destroy(void)
+{
+        struct keyrot * ctx;
+
+        TEST_START();
+
+        ctx = keyrot_create(SEED_A, 0, 0);
+        if (ctx != NULL) {
+                keyrot_destroy(ctx);
+
+                TEST_SUCCESS();
+
+                return TEST_RC_SUCCESS;
+        }
+
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * Epoch 16 must be refused by both keyrot_create() and keyrot_rekey().
+ *
+ * Returns TEST_RC_SUCCESS when both calls refuse it, TEST_RC_FAIL
+ * otherwise.
+ */
+static int test_epoch_range(void)
+{
+        struct keyrot * a;
+
+        TEST_START();
+
+        /* epoch is a 4-bit wire field; 16 and up must be refused. */
+        a = keyrot_create(SEED_A, 16, 0);
+        if (a != NULL)
+                goto fail_a; /* wrongly accepted: release it, then fail */
+
+        a = keyrot_create(SEED_A, 0, 0);
+        if (a == NULL)
+                goto fail;
+
+        /* A zero return means the out-of-range epoch was accepted. */
+        if (keyrot_rekey(a, SEED_A, 16) == 0)
+                goto fail_a;
+
+        keyrot_destroy(a);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_a:
+        keyrot_destroy(a);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * Two contexts created with identical arguments must hand out the
+ * same selector, key and nonce on their first keyrot_tx_next() call.
+ */
+static int test_tx_deterministic(void)
+{
+        struct keyrot * first;
+        struct keyrot * second;
+        uint8_t         sel1[KR_SELECTOR_LEN];
+        uint8_t         sel2[KR_SELECTOR_LEN];
+        uint8_t         nonce1[KR_NONCE_LEN];
+        uint8_t         nonce2[KR_NONCE_LEN];
+        const uint8_t * key1;
+        const uint8_t * key2;
+
+        TEST_START();
+
+        first = keyrot_create(SEED_A, 0, 0);
+        if (first == NULL)
+                goto fail;
+
+        second = keyrot_create(SEED_A, 0, 0);
+        if (second == NULL)
+                goto fail_first;
+
+        if (keyrot_tx_next(first, sel1, &key1, nonce1) != 0 ||
+            keyrot_tx_next(second, sel2, &key2, nonce2) != 0)
+                goto fail_second;
+
+        if (memcmp(sel1, sel2, KR_SELECTOR_LEN) != 0 ||
+            memcmp(key1, key2, SYMMKEYSZ) != 0 ||
+            memcmp(nonce1, nonce2, KR_NONCE_LEN) != 0)
+                goto fail_second;
+
+        keyrot_destroy(second);
+        keyrot_destroy(first);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_second:
+        keyrot_destroy(second);
+ fail_first:
+        keyrot_destroy(first);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * Check the selector bytes written by keyrot_tx_next(): the epoch in
+ * the high nibble of byte 0, the node in the low nibble of byte 0 plus
+ * byte 1, and a counter in bytes 2..5 that reads 0 then 1.
+ */
+static int test_selector_layout(void)
+{
+        struct keyrot * kr;
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        int             epoch;
+        int             node;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 3, 0);
+        if (kr == NULL)
+                goto fail;
+
+        /* First packet: epoch 3, node 0, seq 0 */
+        if (keyrot_tx_next(kr, sel, &key, nonce) != 0)
+                goto fail_kr;
+
+        epoch = sel[0] >> 4;
+        node  = ((sel[0] & 0x0F) << 8) | sel[1];
+        if (epoch != 3 || node != 0)
+                goto fail_kr;
+
+        /* All four seq bytes must be zero. */
+        if ((sel[2] | sel[3] | sel[4] | sel[5]) != 0)
+                goto fail_kr;
+
+        /* Second packet: seq advances to 1 */
+        if (keyrot_tx_next(kr, sel, &key, nonce) != 0)
+                goto fail_kr;
+
+        if (sel[5] != 1)
+                goto fail_kr;
+
+        keyrot_destroy(kr);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_kr:
+        keyrot_destroy(kr);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/* A fresh context must report KEY_NODE_COUNT from keyrot_tx_nodes_left(). */
+static int test_nodes_left_initial(void)
+{
+        struct keyrot * kr;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 0, 0);
+        if (kr == NULL)
+                goto fail;
+
+        if (keyrot_tx_nodes_left(kr) == KEY_NODE_COUNT) {
+                keyrot_destroy(kr);
+
+                TEST_SUCCESS();
+
+                return TEST_RC_SUCCESS;
+        }
+
+        keyrot_destroy(kr);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * For 256 consecutive packets, the key and nonce produced by
+ * keyrot_tx_next() on a role-0 context must match what a role-1
+ * context returns from keyrot_rx_lookup() for the same selector.
+ */
+static int test_roundtrip(void)
+{
+        struct keyrot * init; /* role 0 */
+        struct keyrot * resp; /* role 1 */
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         ntx[KR_NONCE_LEN];
+        uint8_t         nrx[KR_NONCE_LEN];
+        uint8_t         ktx[SYMMKEYSZ];
+        const uint8_t * ptx;
+        const uint8_t * prx;
+        struct kr_rx    rx;
+        int             left;
+
+        TEST_START();
+
+        init = keyrot_create(SEED_A, 0, 0);
+        if (init == NULL)
+                goto fail;
+
+        resp = keyrot_create(SEED_A, 0, 1);
+        if (resp == NULL)
+                goto fail_init;
+
+        for (left = 256; left > 0; left--) {
+                if (keyrot_tx_next(init, sel, &ptx, ntx) != 0)
+                        goto fail_resp;
+
+                /* Keep a copy of the TX key for the comparison below. */
+                memcpy(ktx, ptx, SYMMKEYSZ);
+
+                if (keyrot_rx_lookup(resp, sel, &prx, nrx, &rx) != 0 ||
+                    keyrot_rx_commit(resp, &rx) != 0)
+                        goto fail_resp;
+
+                if (memcmp(ktx, prx, SYMMKEYSZ) != 0 ||
+                    memcmp(ntx, nrx, KR_NONCE_LEN) != 0)
+                        goto fail_resp;
+        }
+
+        keyrot_destroy(resp);
+        keyrot_destroy(init);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_resp:
+        keyrot_destroy(resp);
+ fail_init:
+        keyrot_destroy(init);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * The first TX key of a role-0 context must differ from the first TX
+ * key of a role-1 context built from the same seed and epoch.
+ */
+static int test_direction_separation(void)
+{
+        struct keyrot * init; /* role 0 */
+        struct keyrot * resp; /* role 1 */
+        uint8_t         sel_init[KR_SELECTOR_LEN];
+        uint8_t         sel_resp[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        uint8_t         key_init[SYMMKEYSZ];
+        const uint8_t * p_init;
+        const uint8_t * p_resp;
+
+        TEST_START();
+
+        init = keyrot_create(SEED_A, 0, 0);
+        if (init == NULL)
+                goto fail;
+
+        resp = keyrot_create(SEED_A, 0, 1);
+        if (resp == NULL)
+                goto fail_init;
+
+        if (keyrot_tx_next(init, sel_init, &p_init, nonce) != 0)
+                goto fail_resp;
+
+        /* Keep a copy of the role-0 key for the comparison below. */
+        memcpy(key_init, p_init, SYMMKEYSZ);
+
+        if (keyrot_tx_next(resp, sel_resp, &p_resp, nonce) != 0)
+                goto fail_resp;
+
+        /* Same position, different role -> different leaf key */
+        if (memcmp(key_init, p_resp, SYMMKEYSZ) == 0)
+                goto fail_resp;
+
+        keyrot_destroy(resp);
+        keyrot_destroy(init);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_resp:
+        keyrot_destroy(resp);
+ fail_init:
+        keyrot_destroy(init);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * Hand-encode a selector: byte 0 carries the epoch in its high nibble
+ * and bits 8..11 of node in its low nibble, byte 1 the low 8 bits of
+ * node, bytes 2..5 the seq value, most significant byte first.
+ */
+static void mk_sel(uint8_t  epoch,
+                   uint16_t node,
+                   uint32_t seq,
+                   uint8_t  sel[KR_SELECTOR_LEN])
+{
+        const uint8_t node_hi = (uint8_t) ((node >> 8) & 0x0F);
+        int           i;
+
+        sel[0] = (uint8_t) ((epoch << 4) | node_hi);
+        sel[1] = (uint8_t) (node & 0xFF);
+
+        for (i = 0; i < 4; i++)
+                sel[2 + i] = (uint8_t) (seq >> (24 - 8 * i));
+}
+
+/*
+ * keyrot_rx_lookup() on a role-1 context is called for node 0, then
+ * node 5, then node 0 again, with no commit in between. Node 0 must
+ * give the same key both times, node 5 a different one, and a selector
+ * naming node KEY_NODE_COUNT must be refused.
+ */
+static int test_random_access(void)
+{
+ struct keyrot * b;
+ uint8_t s0[KR_SELECTOR_LEN];
+ uint8_t s5[KR_SELECTOR_LEN];
+ uint8_t n[KR_NONCE_LEN];
+ uint8_t k_first[SYMMKEYSZ];
+ uint8_t k_node5[SYMMKEYSZ];
+ const uint8_t * p;
+ struct kr_rx rx;
+
+ TEST_START();
+
+ b = keyrot_create(SEED_A, 0, 1);
+ if (b == NULL)
+ goto fail;
+
+ mk_sel(0, 0, 0, s0);
+ mk_sel(0, 5, 12345, s5); /* a far-ahead node, mid-span */
+
+ /* Jump straight to node 0 */
+ if (keyrot_rx_lookup(b, s0, &p, n, &rx) != 0)
+ goto fail_b;
+
+ /* Copy the key out before the next lookup overwrites p. */
+ memcpy(k_first, p, SYMMKEYSZ);
+
+ /* Jump forward to node 5 (simulates a burst skip) */
+ if (keyrot_rx_lookup(b, s5, &p, n, &rx) != 0)
+ goto fail_b;
+
+ memcpy(k_node5, p, SYMMKEYSZ);
+
+ /* Different nodes must yield different keys */
+ if (memcmp(k_first, k_node5, SYMMKEYSZ) == 0)
+ goto fail_b;
+
+ /* Jump back to node 0: still works, identical (no wedge) */
+ if (keyrot_rx_lookup(b, s0, &p, n, &rx) != 0)
+ goto fail_b;
+
+ if (memcmp(k_first, p, SYMMKEYSZ) != 0)
+ goto fail_b;
+
+ /* Out-of-range node must be rejected */
+ mk_sel(0, KEY_NODE_COUNT, 0, s0);
+ if (keyrot_rx_lookup(b, s0, &p, n, &rx) == 0)
+ goto fail_b;
+
+ keyrot_destroy(b);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_b:
+ keyrot_destroy(b);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Second fixed seed (bytes 0xa1 through 0xc0) passed to keyrot_rekey() by the tests. */
+static const uint8_t SEED_B[SYMMKEYSZ] = {
+ 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8,
+ 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
+ 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8,
+ 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0
+};
+
+/*
+ * Build the selector for epoch 0, node 0 and the given counter, look
+ * it up and commit it. Three outcomes are reported separately so a
+ * caller asserting a rejected commit cannot be satisfied by a failed
+ * lookup:
+ *    0  lookup and commit both succeeded
+ *    1  lookup succeeded, commit was refused
+ *   -1  lookup failed
+ */
+static int commit_ctr(struct keyrot * kr,
+                      uint32_t        ctr)
+{
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        struct kr_rx    rx;
+
+        mk_sel(0, 0, ctr, sel);
+
+        if (keyrot_rx_lookup(kr, sel, &key, nonce, &rx) != 0)
+                return -1;
+
+        if (keyrot_rx_commit(kr, &rx) != 0)
+                return 1;
+
+        return 0;
+}
+
+/*
+ * Drive the RX replay check through commit_ctr() on role-1 contexts:
+ * duplicate counters, in-window reordering, the too-old boundary
+ * relative to a counter of 4 * KEY_REPLAY_WINDOW, and a replay after a
+ * forward jump to KEY_REPLAY_WINDOW + 63 on a second, fresh context.
+ * commit_ctr() returns 0 for an accepted commit and 1 for a rejected one.
+ */
+static int test_replay_window(void)
+{
+ struct keyrot * b;
+ struct keyrot * c;
+ uint32_t base;
+ uint32_t jump;
+
+ TEST_START();
+
+ b = keyrot_create(SEED_A, 0, 1);
+ if (b == NULL)
+ goto fail;
+
+ /* Fresh counters accepted; an immediate replay is rejected. */
+ if (commit_ctr(b, 100) != 0)
+ goto fail_b;
+
+ if (commit_ctr(b, 100) != 1)
+ goto fail_b;
+
+ /* In-window reorder: accepted once, rejected on replay. */
+ if (commit_ctr(b, 105) != 0)
+ goto fail_b;
+
+ if (commit_ctr(b, 102) != 0)
+ goto fail_b;
+
+ if (commit_ctr(b, 102) != 1)
+ goto fail_b;
+
+ /* Too-old boundary: the window edge is rejected, just inside is not. */
+ base = 4 * KEY_REPLAY_WINDOW;
+ if (commit_ctr(b, base) != 0)
+ goto fail_b;
+
+ /* The tested edge lies KEY_REPLAY_WINDOW - 64 counters below base. */
+ if (commit_ctr(b, base - (KEY_REPLAY_WINDOW - 64)) != 1)
+ goto fail_b;
+
+ if (commit_ctr(b, base - (KEY_REPLAY_WINDOW - 64) + 1) != 0)
+ goto fail_b;
+
+ /*
+ * RFC 6479 slack-word regression: two low counters, then a
+ * forward jump of a full bitmap that aliases their slot, then a
+ * replay of a low counter. Without the reserved slack word this
+ * replay is wrongly accepted.
+ */
+ c = keyrot_create(SEED_A, 0, 1);
+ if (c == NULL)
+ goto fail_b;
+
+ if (commit_ctr(c, 70) != 0)
+ goto fail_c;
+
+ if (commit_ctr(c, 74) != 0)
+ goto fail_c;
+
+ jump = KEY_REPLAY_WINDOW + 63;
+ if (commit_ctr(c, jump) != 0)
+ goto fail_c;
+
+ if (commit_ctr(c, 74) != 1)
+ goto fail_c;
+
+ keyrot_destroy(c);
+ keyrot_destroy(b);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_c:
+ keyrot_destroy(c);
+ fail_b:
+ keyrot_destroy(b);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Four lookups of the same selector followed by two commits: the first
+ * commit must be accepted and the second refused, showing that only
+ * keyrot_rx_commit() marks the counter as used.
+ */
+static int test_lookup_no_commit(void)
+{
+        struct keyrot * kr;
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        struct kr_rx    rx;
+        int             left = 4;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 0, 1);
+        if (kr == NULL)
+                goto fail;
+
+        mk_sel(0, 0, 100, sel);
+
+        /* Repeated lookups are pre-AEAD and must not consume the slot. */
+        while (left-- > 0) {
+                if (keyrot_rx_lookup(kr, sel, &key, nonce, &rx) != 0)
+                        goto fail_kr;
+        }
+
+        /* The slot is still fresh, so the first commit accepts ... */
+        if (keyrot_rx_commit(kr, &rx) != 0)
+                goto fail_kr;
+
+        /* ... and only the commit advanced it, so the next is a replay. */
+        if (keyrot_rx_commit(kr, &rx) == 0)
+                goto fail_kr;
+
+        keyrot_destroy(kr);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_kr:
+        keyrot_destroy(kr);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * A selector looked up on epoch 0 is committed only after the context
+ * has been re-keyed to epoch 1. The commit must succeed, must leave
+ * keyrot_peer_switched() false, and a second commit of the same lookup
+ * must be refused.
+ */
+static int test_commit_prev_batch(void)
+{
+ struct keyrot * b;
+ uint8_t sel[KR_SELECTOR_LEN];
+ uint8_t n[KR_NONCE_LEN];
+ const uint8_t * k;
+ struct kr_rx rx;
+
+ TEST_START();
+
+ b = keyrot_create(SEED_A, 0, 1);
+ if (b == NULL)
+ goto fail;
+
+ /* Capture a packet under cur (epoch 0). */
+ mk_sel(0, 0, 7, sel);
+ if (keyrot_rx_lookup(b, sel, &k, n, &rx) != 0)
+ goto fail_b;
+
+ /* Re-key: the captured batch becomes prev and the flag clears. */
+ if (keyrot_rekey(b, SEED_B, 1) != 0)
+ goto fail_b;
+
+ /* The straggler commits under prev without claiming a switch. */
+ if (keyrot_rx_commit(b, &rx) != 0)
+ goto fail_b;
+
+ if (keyrot_peer_switched(b))
+ goto fail_b;
+
+ /* prev still holds a replay window: its replay is rejected. */
+ if (keyrot_rx_commit(b, &rx) == 0)
+ goto fail_b;
+
+ keyrot_destroy(b);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_b:
+ keyrot_destroy(b);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Commit counter 10, jump forward by one and a half windows, then
+ * commit the counter exactly one window above 10. That counter must be
+ * accepted once and refused as a replay the second time.
+ */
+static int test_replay_forward_clear(void)
+{
+        struct keyrot * kr;
+        /* alias shares low's slot a window away; the jump must clear it. */
+        const uint32_t  low   = 10;
+        const uint32_t  alias = low + KEY_REPLAY_WINDOW;
+        const uint32_t  jump  = alias + KEY_REPLAY_WINDOW / 2;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 0, 1);
+        if (kr == NULL)
+                goto fail;
+
+        /* All three fresh counters must be accepted, in this order. */
+        if (commit_ctr(kr, low) != 0 ||
+            commit_ctr(kr, jump) != 0 ||
+            commit_ctr(kr, alias) != 0)
+                goto fail_kr;
+
+        /* The second commit of alias is a replay. */
+        if (commit_ctr(kr, alias) != 1)
+                goto fail_kr;
+
+        keyrot_destroy(kr);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_kr:
+        keyrot_destroy(kr);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+/*
+ * After both ends re-key from epoch 0 to epoch 1, the receiver must
+ * return matching keys for new epoch-1 traffic, must still accept a
+ * lookup of a selector issued under epoch 0, and must refuse a
+ * selector naming epoch 7. No lookup in this test is committed.
+ */
+static int test_rekey_overlap(void)
+{
+ struct keyrot * a; /* role 0 */
+ struct keyrot * b; /* role 1 */
+ uint8_t old_sel[KR_SELECTOR_LEN];
+ uint8_t sel[KR_SELECTOR_LEN];
+ uint8_t ntx[KR_NONCE_LEN];
+ uint8_t nrx[KR_NONCE_LEN];
+ uint8_t ktx[SYMMKEYSZ];
+ const uint8_t * ptx;
+ const uint8_t * prx;
+ struct kr_rx rx;
+
+ TEST_START();
+
+ a = keyrot_create(SEED_A, 0, 0);
+ if (a == NULL)
+ goto fail;
+
+ b = keyrot_create(SEED_A, 0, 1);
+ if (b == NULL)
+ goto fail_a;
+
+ /* Send one gen-0 packet; keep its selector for the overlap. */
+ if (keyrot_tx_next(a, old_sel, &ptx, ntx) != 0)
+ goto fail_b;
+
+ /* Keep a copy of the TX key for the comparison below. */
+ memcpy(ktx, ptx, SYMMKEYSZ);
+ if (keyrot_rx_lookup(b, old_sel, &prx, nrx, &rx) != 0)
+ goto fail_b;
+
+ if (memcmp(ktx, prx, SYMMKEYSZ) != 0)
+ goto fail_b;
+
+ /* Both ends re-key to epoch 1 with a fresh seed. */
+ if (keyrot_rekey(a, SEED_B, 1) != 0)
+ goto fail_b;
+
+ if (keyrot_rekey(b, SEED_B, 1) != 0)
+ goto fail_b;
+
+ /* TX is gated until promotion; promote a to emit the new epoch. */
+ keyrot_tx_promote(a);
+
+ /* New gen-1 traffic works. */
+ if (keyrot_tx_next(a, sel, &ptx, ntx) != 0)
+ goto fail_b;
+
+ memcpy(ktx, ptx, SYMMKEYSZ);
+ if (keyrot_rx_lookup(b, sel, &prx, nrx, &rx) != 0)
+ goto fail_b;
+
+ if (memcmp(ktx, prx, SYMMKEYSZ) != 0)
+ goto fail_b;
+
+ /* A straggling gen-0 packet still decrypts (overlap window). */
+ if (keyrot_rx_lookup(b, old_sel, &prx, nrx, &rx) != 0)
+ goto fail_b;
+
+ /* An unknown epoch is rejected. */
+ mk_sel(7, 0, 0, sel);
+ if (keyrot_rx_lookup(b, sel, &prx, nrx, &rx) == 0)
+ goto fail_b;
+
+ keyrot_destroy(b);
+ keyrot_destroy(a);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_b:
+ keyrot_destroy(b);
+ fail_a:
+ keyrot_destroy(a);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * After a re-key to epoch 1, keyrot_tx_next() must keep writing epoch 0
+ * into the selector until keyrot_tx_promote() is called, and epoch 1
+ * afterwards. On the receiving side keyrot_peer_switched() must stay
+ * false after committing the epoch-0 packet and become true after
+ * committing the epoch-1 packet.
+ */
+static int test_tx_gate(void)
+{
+ struct keyrot * a; /* role 0 */
+ struct keyrot * b; /* role 1 */
+ uint8_t sel[KR_SELECTOR_LEN];
+ uint8_t n[KR_NONCE_LEN];
+ const uint8_t * p;
+ struct kr_rx rx;
+
+ TEST_START();
+
+ a = keyrot_create(SEED_A, 0, 0);
+ if (a == NULL)
+ goto fail;
+
+ b = keyrot_create(SEED_A, 0, 1);
+ if (b == NULL)
+ goto fail_a;
+
+ /* Both re-key to epoch 1; TX must stay on epoch 0 until promoted. */
+ if (keyrot_rekey(a, SEED_B, 1) != 0)
+ goto fail_b;
+
+ if (keyrot_rekey(b, SEED_B, 1) != 0)
+ goto fail_b;
+
+ /* a's TX still stamps the old epoch (0). */
+ if (keyrot_tx_next(a, sel, &p, n) != 0)
+ goto fail_b;
+
+ /* The epoch is the high nibble of the first selector byte. */
+ if ((sel[0] >> 4) != 0)
+ goto fail_b;
+
+ /* b decrypts the old-epoch packet via its prev batch. */
+ if (keyrot_rx_lookup(b, sel, &p, n, &rx) != 0)
+ goto fail_b;
+
+ if (keyrot_rx_commit(b, &rx) != 0)
+ goto fail_b;
+
+ /* b has not yet seen the new epoch from a. */
+ if (keyrot_peer_switched(b))
+ goto fail_b;
+
+ /* a promotes; its TX now stamps the new epoch (1). */
+ keyrot_tx_promote(a);
+ if (keyrot_tx_next(a, sel, &p, n) != 0)
+ goto fail_b;
+
+ if ((sel[0] >> 4) != 1)
+ goto fail_b;
+
+ /* b sees the new epoch and reports the peer switched. */
+ if (keyrot_rx_lookup(b, sel, &p, n, &rx) != 0)
+ goto fail_b;
+
+ if (keyrot_rx_commit(b, &rx) != 0)
+ goto fail_b;
+
+ if (!keyrot_peer_switched(b))
+ goto fail_b;
+
+ keyrot_destroy(b);
+ keyrot_destroy(a);
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_b:
+ keyrot_destroy(b);
+ fail_a:
+ keyrot_destroy(a);
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * The peer-switched flag is cleared by a re-key, is not affected by a
+ * key lookup, and is only set once a lookup result is committed.
+ */
+static int test_peer_switched_commit_only(void)
+{
+        struct keyrot * kr;
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        struct kr_rx    rx;
+        int             rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 0, 1);
+        if (kr == NULL)
+                goto out;
+
+        /* Directly after a re-key the flag must read as cleared. */
+        if (keyrot_rekey(kr, SEED_B, 1) != 0 || keyrot_peer_switched(kr))
+                goto out_kr;
+
+        mk_sel(1, 0, 0, sel);
+
+        /* Lookup happens pre-AEAD, so it must leave the flag alone. */
+        if (keyrot_rx_lookup(kr, sel, &key, nonce, &rx) != 0 ||
+            keyrot_peer_switched(kr))
+                goto out_kr;
+
+        /* Commit happens post-AEAD and is what records the switch. */
+        if (keyrot_rx_commit(kr, &rx) != 0 || !keyrot_peer_switched(kr))
+                goto out_kr;
+
+        rc = TEST_RC_SUCCESS;
+ out_kr:
+        keyrot_destroy(kr);
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * A lookup result that is committed only after two further re-keys
+ * must still be accepted: keyrot_rx_commit() is expected to return 0.
+ */
+static int test_commit_evicted(void)
+{
+        struct keyrot * kr;
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        struct kr_rx    rx;
+        int             rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 0, 1);
+        if (kr == NULL)
+                goto out;
+
+        /* Capture a lookup result against the initial batch. */
+        mk_sel(0, 0, 3, sel);
+        if (keyrot_rx_lookup(kr, sel, &key, nonce, &rx) != 0)
+                goto out_kr;
+
+        /* Two re-keys push that batch out of both cur and prev. */
+        if (keyrot_rekey(kr, SEED_B, 1) != 0 ||
+            keyrot_rekey(kr, SEED_A, 2) != 0)
+                goto out_kr;
+
+        /* Committing the stale result is a silent no-op, not an error. */
+        if (keyrot_rx_commit(kr, &rx) != 0)
+                goto out_kr;
+
+        rc = TEST_RC_SUCCESS;
+ out_kr:
+        keyrot_destroy(kr);
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * Concurrency: many TX threads + RX + re-key share one keyrot. The
+ * (epoch, counter) the TX side stamps must be globally unique (no AEAD
+ * nonce reuse). Capped below 16 re-keys so epoch maps 1:1 to a batch and
+ * the wire epoch never wraps (a wrapped epoch under a fresh key is not
+ * reuse but would false-trip the uniqueness check). Run under TSan to
+ * catch data races the static reviews can't.
+ */
+#define CT_THREADS 4 /* number of concurrent TX threads */
+#define CT_PKTS 2000 /* iterations per TX thread and for the RX thread */
+#define CT_REKEYS 8 /* re-keys issued by the re-key thread */
+
+struct ct_rec { /* one (epoch, counter) pair recorded by a TX thread */
+ uint8_t epoch; /* upper 4 bits of selector byte 0 */
+ uint64_t ctr; /* first 8 nonce bytes, folded most significant first */
+};
+
+struct ct_arg { /* per-TX-thread argument */
+ struct keyrot * kr; /* keyrot instance shared by all threads */
+ struct ct_rec * recs; /* this thread's slice of the record array */
+ size_t n; /* number of records written to recs so far */
+};
+
+/*
+ * TX worker: calls keyrot_tx_next() CT_PKTS times and logs the epoch
+ * and 64-bit counter of every successful call in its own record slice.
+ * Failed calls are skipped. Always returns NULL.
+ */
+static void * ct_tx_thread(void * a)
+{
+        struct ct_arg * ctx = a;
+        struct ct_rec * rec;
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        uint64_t        ctr;
+        size_t          pkt;
+        size_t          byte;
+
+        for (pkt = 0; pkt < CT_PKTS; pkt++) {
+                if (keyrot_tx_next(ctx->kr, sel, &key, nonce) == 0) {
+                        /* Fold the first 8 nonce bytes, MSB first. */
+                        ctr = 0;
+                        for (byte = 0; byte < 8; byte++)
+                                ctr = (ctr << 8) | nonce[byte];
+
+                        rec = &ctx->recs[ctx->n];
+                        rec->epoch = (uint8_t) (sel[0] >> 4);
+                        rec->ctr   = ctr;
+                        ctx->n++;
+                }
+        }
+
+        return NULL;
+}
+
+/*
+ * RX worker: issues CT_PKTS lookups with cycling epochs (i % 16) and
+ * increasing counters, committing whenever a lookup succeeds. The
+ * outcomes are not checked; the thread only generates RX-side load
+ * while re-keys happen. Always returns NULL.
+ */
+static void * ct_rx_thread(void * a)
+{
+        struct keyrot * kr = a;
+        uint8_t         sel[KR_SELECTOR_LEN];
+        uint8_t         nonce[KR_NONCE_LEN];
+        const uint8_t * key;
+        struct kr_rx    rx;
+        size_t          i = 0;
+
+        while (i < CT_PKTS) {
+                mk_sel((uint8_t) (i % 16), 0, (uint32_t) i, sel);
+
+                if (keyrot_rx_lookup(kr, sel, &key, nonce, &rx) == 0)
+                        (void) keyrot_rx_commit(kr, &rx);
+
+                ++i;
+        }
+
+        return NULL;
+}
+
+/*
+ * Re-key worker: every 2 ms installs the next epoch (1..CT_REKEYS),
+ * alternating between SEED_B (odd epochs) and SEED_A (even epochs),
+ * and promotes TX right away. Stops at the first failed re-key.
+ * Always returns NULL.
+ */
+static void * ct_rekey_thread(void * a)
+{
+        struct keyrot * kr = a;
+        struct timespec pause;
+        int             epoch = 1;
+
+        pause.tv_sec  = 0;
+        pause.tv_nsec = 2 * 1000 * 1000; /* 2 ms */
+
+        while (epoch <= CT_REKEYS) {
+                nanosleep(&pause, NULL);
+
+                if (keyrot_rekey(kr, (epoch & 1) ? SEED_B : SEED_A,
+                                 (uint8_t) epoch) != 0)
+                        return NULL;
+
+                keyrot_tx_promote(kr);
+                ++epoch;
+        }
+
+        return NULL;
+}
+
+/*
+ * qsort() comparator for struct ct_rec: orders by epoch first, then
+ * by counter. Returns -1, 0 or 1.
+ */
+static int ct_cmp(const void * x,
+                  const void * y)
+{
+        const struct ct_rec * lhs = x;
+        const struct ct_rec * rhs = y;
+
+        if (lhs->epoch < rhs->epoch)
+                return -1;
+
+        if (lhs->epoch > rhs->epoch)
+                return 1;
+
+        if (lhs->ctr < rhs->ctr)
+                return -1;
+
+        if (lhs->ctr > rhs->ctr)
+                return 1;
+
+        return 0;
+}
+
+/*
+ * Runs CT_THREADS TX threads, one RX thread and one re-key thread on a
+ * single keyrot instance, then checks that no (epoch, counter) pair
+ * was stamped twice by the TX side.
+ *
+ * Thread creation is checked: only threads that were actually started
+ * are joined, and a failed spawn fails the test instead of joining an
+ * indeterminate pthread_t.
+ */
+static int test_concurrent_nonce_unique(void)
+{
+        struct keyrot * kr;
+        struct ct_arg   arg[CT_THREADS];
+        pthread_t       tx[CT_THREADS];
+        pthread_t       rx;
+        pthread_t       rk;
+        struct ct_rec * all;
+        size_t          total;
+        size_t          n_tx;
+        size_t          i;
+        bool            rx_up = false;
+        bool            rk_up = false;
+
+        TEST_START();
+
+        kr = keyrot_create(SEED_A, 0, 0);
+        if (kr == NULL)
+                goto fail;
+
+        all = malloc(sizeof(*all) * CT_THREADS * CT_PKTS);
+        if (all == NULL)
+                goto fail_kr;
+
+        /* Each TX thread writes to its own CT_PKTS-sized slice. */
+        for (i = 0; i < CT_THREADS; i++) {
+                arg[i].kr   = kr;
+                arg[i].n    = 0;
+                arg[i].recs = all + i * CT_PKTS;
+        }
+
+        for (n_tx = 0; n_tx < CT_THREADS; n_tx++)
+                if (pthread_create(&tx[n_tx], NULL,
+                                   ct_tx_thread, &arg[n_tx]) != 0)
+                        break;
+
+        if (n_tx == CT_THREADS) {
+                rx_up = pthread_create(&rx, NULL, ct_rx_thread, kr) == 0;
+                rk_up = pthread_create(&rk, NULL, ct_rekey_thread, kr) == 0;
+        }
+
+        /* Join exactly the threads that were started. */
+        for (i = 0; i < n_tx; i++)
+                pthread_join(tx[i], NULL);
+
+        if (rx_up)
+                pthread_join(rx, NULL);
+
+        if (rk_up)
+                pthread_join(rk, NULL);
+
+        if (n_tx != CT_THREADS || !rx_up || !rk_up) {
+                printf("Failed to start test threads.\n");
+                goto fail_all;
+        }
+
+        /* Compact the per-thread slices into one contiguous run. */
+        total = 0;
+        for (i = 0; i < CT_THREADS; i++) {
+                memmove(all + total, all + i * CT_PKTS,
+                        arg[i].n * sizeof(*all));
+                total += arg[i].n;
+        }
+
+        /* After sorting, any duplicate pair ends up adjacent. */
+        qsort(all, total, sizeof(*all), ct_cmp);
+
+        for (i = 1; i < total; i++)
+                if (ct_cmp(&all[i - 1], &all[i]) == 0) {
+                        printf("(epoch %u, ctr %llu) reused\n",
+                               (unsigned int) all[i].epoch,
+                               (unsigned long long) all[i].ctr);
+                        goto fail_all;
+                }
+
+        free(all);
+
+        keyrot_destroy(kr);
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+ fail_all:
+        free(all);
+ fail_kr:
+        keyrot_destroy(kr);
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+#endif /* HAVE_OPENSSL */
+
+int keyrot_test(int argc, /* unused */
+ char ** argv) /* unused */
+{ /* Test entry point: runs every keyrot test in order and returns the OR of their return codes. */
+ int ret = 0; /* stays 0 when the tests are compiled out */
+
+ (void) argc;
+ (void) argv;
+
+#ifdef HAVE_OPENSSL /* the tests are only built when HAVE_OPENSSL is defined */
+ ret |= test_create_destroy();
+ ret |= test_epoch_range();
+ ret |= test_tx_deterministic();
+ ret |= test_selector_layout();
+ ret |= test_nodes_left_initial();
+ ret |= test_roundtrip();
+ ret |= test_direction_separation();
+ ret |= test_random_access();
+ ret |= test_peer_switched_commit_only();
+ ret |= test_commit_evicted();
+ ret |= test_replay_window();
+ ret |= test_lookup_no_commit();
+ ret |= test_commit_prev_batch();
+ ret |= test_replay_forward_clear();
+ ret |= test_rekey_overlap();
+ ret |= test_tx_gate();
+ ret |= test_concurrent_nonce_unique();
+#endif
+ return ret;
+}
diff --git a/src/lib/tests/tpm_test.c b/src/lib/tests/tpm_test.c
index df1d8850..7cc049cd 100644
--- a/src/lib/tests/tpm_test.c
+++ b/src/lib/tests/tpm_test.c
@@ -21,7 +21,7 @@
*/
-#include "tpm.c"
+#include <ouroboros/tpm.h>
#include <test/test.h>
diff --git a/src/lib/tests/tw_test.c b/src/lib/tests/tw_test.c
new file mode 100644
index 00000000..32c302c4
--- /dev/null
+++ b/src/lib/tests/tw_test.c
@@ -0,0 +1,663 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic timing-wheel tests
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#define _DEFAULT_SOURCE
+#else
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "config.h"
+
+#include <test/test.h>
+
+#include <ouroboros/time.h>
+#include <ouroboros/tw.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+struct payload { /* wheel entry plus a fire counter */
+ struct tw_entry tw; /* entry handed to tw_post()/tw_cancel() */
+ int fired; /* incremented by cb_count() on every fire */
+};
+
+struct cancel_payload { /* entry whose callback cancels another entry */
+ struct tw_entry tw;
+ int fired; /* incremented by cb_cancel_sibling() */
+ struct tw_entry * sibling; /* entry cancelled from the callback */
+};
+
+struct repost_payload { /* entry whose callback re-posts another entry */
+ struct tw_entry tw;
+ int fired; /* incremented by cb_repost_sibling() */
+ struct payload * sibling; /* payload re-posted from the callback */
+ uint64_t repost_at; /* deadline (ns) used for the re-post */
+};
+
+/* Fire callback: bumps the fire counter of the struct payload in arg. */
+static void cb_count(void * arg)
+{
+        struct payload * item = arg;
+
+        item->fired += 1;
+}
+
+/* Fire callback: counts the fire, then cancels the sibling entry. */
+static void cb_cancel_sibling(void * arg)
+{
+        struct cancel_payload * self = arg;
+
+        self->fired += 1;
+
+        tw_cancel(self->sibling);
+}
+
+/*
+ * Fire callback: counts the fire, then posts the sibling payload at
+ * repost_at with cb_count as its callback.
+ */
+static void cb_repost_sibling(void * arg)
+{
+        struct repost_payload * self = arg;
+        struct payload *        sib;
+
+        self->fired += 1;
+
+        sib = self->sibling;
+        tw_post(&sib->tw, self->repost_at, cb_count, sib);
+}
+
+/* Current PTHREAD_COND_CLOCK time as a 64-bit nanosecond count. */
+static uint64_t now_ns(void)
+{
+        struct timespec t;
+
+        clock_gettime(PTHREAD_COND_CLOCK, &t);
+
+        return TS_TO_UINT64(t);
+}
+
+/* Sleeps for ns nanoseconds with nanosleep(); the result is ignored. */
+static void sleep_ns(uint64_t ns)
+{
+        struct timespec delay;
+
+        UINT64_TO_TS(ns, &delay);
+
+        nanosleep(&delay, NULL);
+}
+
+/* The wheel can be initialised and torn down again. */
+static int test_tw_init_fini(void)
+{
+        TEST_START();
+
+        if (tw_init() < 0) {
+                printf("tw_init failed.\n");
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        tw_fini();
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+}
+
+/* An entry posted 5 ms ahead has fired once after a 20 ms sleep. */
+static int test_tw_post_fires_after_deadline(void)
+{
+        struct payload item;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        tw_post(&item.tw, now_ns() + 5 * MILLION, cb_count, &item);
+
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (item.fired != 1) {
+                printf("expected 1 fire, got %d\n", item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/* An entry posted 100 ms ahead has not fired after only 2 ms. */
+static int test_tw_no_fire_before_deadline(void)
+{
+        struct payload item;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        tw_post(&item.tw, now_ns() + 100 * MILLION, cb_count, &item);
+
+        sleep_ns(2 * MILLION);
+        tw_move();
+
+        if (item.fired != 0) {
+                printf("expected 0 fires, got %d\n", item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/* An entry cancelled before its deadline never fires. */
+static int test_tw_cancel_prevents_fire(void)
+{
+        struct payload item;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        tw_post(&item.tw, now_ns() + 5 * MILLION, cb_count, &item);
+        tw_cancel(&item.tw);
+
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (item.fired != 0) {
+                printf("cancelled entry fired %d times\n", item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/* Cancelling an entry that was never posted (twice) is harmless. */
+static int test_tw_cancel_unposted_is_noop(void)
+{
+        struct tw_entry entry;
+        int             round;
+
+        TEST_START();
+
+        if (tw_init() < 0) {
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        tw_init_entry(&entry);
+
+        for (round = 0; round < 2; ++round)
+                tw_cancel(&entry);
+
+        tw_fini();
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+}
+
+/* A due entry fires exactly once, however often the wheel is moved. */
+static int test_tw_fire_only_once(void)
+{
+        struct payload item;
+        int            moves;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        tw_post(&item.tw, now_ns() + 3 * MILLION, cb_count, &item);
+
+        sleep_ns(20 * MILLION);
+
+        for (moves = 0; moves < 3; ++moves)
+                tw_move();
+
+        if (item.fired != 1) {
+                printf("expected 1 fire, got %d after 3 moves\n", item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * Multi-level: a 300 ms deadline must be placed on level 1 and still
+ * fire once the deadline has passed.
+ */
+static int test_tw_post_level1_fires(void)
+{
+        struct payload item;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        tw_post(&item.tw, now_ns() + 300 * MILLION, cb_count, &item);
+
+        if (item.tw.lvl != 1) {
+                printf("expected level 1 placement, got %zu\n", item.tw.lvl);
+                goto out_fini;
+        }
+
+        sleep_ns(320 * MILLION);
+        tw_move();
+
+        if (item.fired != 1) {
+                printf("level-1 entry didn't fire (got %d)\n", item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/* Sixteen entries with deadlines 1..16 ms ahead all fire by 40 ms. */
+static int test_tw_many_entries_all_fire(void)
+{
+        struct payload pl[16];
+        const size_t   cnt = sizeof(pl) / sizeof(pl[0]);
+        size_t         total = 0;
+        size_t         i;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        for (i = 0; i < cnt; ++i) {
+                tw_init_entry(&pl[i].tw);
+                pl[i].fired = 0;
+                tw_post(&pl[i].tw, now_ns() + (1 + i) * MILLION,
+                        cb_count, &pl[i]);
+        }
+
+        sleep_ns(40 * MILLION);
+        tw_move();
+
+        for (i = 0; i < cnt; ++i)
+                total += pl[i].fired;
+
+        if (total != 16) {
+                printf("expected 16 fires, got %zu\n", total);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        for (i = 0; i < cnt; ++i)
+                tw_cancel(&pl[i].tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/* On an empty wheel tw_next_expiry reports tv_nsec == -1. */
+static int test_tw_next_expiry_empty(void)
+{
+        struct timespec next;
+        int             rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_next_expiry(&next);
+
+        if (next.tv_nsec == -1)
+                rc = TEST_RC_SUCCESS;
+        else
+                printf("expected tv_nsec=-1, got %ld\n", (long) next.tv_nsec);
+
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * With one entry posted 50 ms ahead, tw_next_expiry must report a time
+ * within -2..+4 ms of that deadline.
+ */
+static int test_tw_next_expiry_returns_deadline(void)
+{
+        struct payload  item;
+        struct timespec next;
+        uint64_t        want;
+        uint64_t        got;
+        int64_t         skew;
+        int             rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        want = now_ns() + 50 * MILLION;
+        tw_post(&item.tw, want, cb_count, &item);
+
+        tw_next_expiry(&next);
+        got = TS_TO_UINT64(next);
+
+        /* Level-0 quantization gives ±1 slot of skew. */
+        skew = (int64_t)(got) - (int64_t)(want);
+        if (!(skew >= -2 * MILLION && skew <= 4 * MILLION)) {
+                printf("deadline not in -2..+4 ms, skew=%ld ns\n", (long) skew);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/* An entry that has fired can be posted again and fires a second time. */
+static int test_tw_repost_after_fire(void)
+{
+        struct payload item;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        /* First round. */
+        tw_post(&item.tw, now_ns() + 3 * MILLION, cb_count, &item);
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (item.fired != 1) {
+                printf("first fire missed\n");
+                goto out_fini;
+        }
+
+        /* Second round on the same entry. */
+        tw_post(&item.tw, now_ns() + 3 * MILLION, cb_count, &item);
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (item.fired != 2) {
+                printf("second fire missed (fired=%d)\n", item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * Posting an already-posted entry replaces its schedule: the entry
+ * fires once for the second (3 ms) deadline and never for the first
+ * (30 ms) one.
+ */
+static int test_tw_double_post_replaces(void)
+{
+        struct payload item;
+        int            rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&item.tw);
+        item.fired = 0;
+
+        tw_post(&item.tw, now_ns() + 30 * MILLION, cb_count, &item);
+        tw_post(&item.tw, now_ns() + 3 * MILLION, cb_count, &item);
+
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (item.fired != 1) {
+                printf("expected 1 fire after replace, got %d\n", item.fired);
+                goto out_fini;
+        }
+
+        /* Move past the original 30 ms deadline as well. */
+        sleep_ns(40 * MILLION);
+        tw_move();
+
+        if (item.fired != 1) {
+                printf("first schedule fired after replace (got %d)\n",
+                       item.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&item.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * Two entries share a deadline; the first one's callback cancels the
+ * second, which therefore must not fire.
+ */
+static int test_tw_fire_cancels_sibling(void)
+{
+        struct cancel_payload first;
+        struct payload        second;
+        uint64_t              due;
+        int                   rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&first.tw);
+        tw_init_entry(&second.tw);
+        first.fired   = 0;
+        first.sibling = &second.tw;
+        second.fired  = 0;
+
+        due = now_ns() + 3 * MILLION;
+        tw_post(&first.tw, due, cb_cancel_sibling, &first);
+        tw_post(&second.tw, due, cb_count, &second);
+
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (first.fired != 1) {
+                printf("a expected 1 fire, got %d\n", first.fired);
+                goto out_fini;
+        }
+
+        if (second.fired != 0) {
+                printf("b should not have fired (got %d)\n", second.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&first.tw);
+        tw_cancel(&second.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * Two entries share a deadline; the first one's callback re-posts the
+ * second to a later deadline. The second must not fire in the first
+ * pass and must fire once after the new deadline.
+ */
+static int test_tw_fire_posts_sibling(void)
+{
+        struct repost_payload first;
+        struct payload        second;
+        uint64_t              due;
+        int                   rc = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0)
+                goto out;
+
+        tw_init_entry(&first.tw);
+        tw_init_entry(&second.tw);
+        first.fired     = 0;
+        first.sibling   = &second;
+        first.repost_at = now_ns() + 30 * MILLION;
+        second.fired    = 0;
+
+        due = now_ns() + 3 * MILLION;
+        tw_post(&first.tw, due, cb_repost_sibling, &first);
+        tw_post(&second.tw, due, cb_count, &second);
+
+        sleep_ns(20 * MILLION);
+        tw_move();
+
+        if (first.fired != 1) {
+                printf("a expected 1 fire, got %d\n", first.fired);
+                goto out_fini;
+        }
+
+        if (second.fired != 0) {
+                printf("b fired before reposted deadline (got %d)\n",
+                       second.fired);
+                goto out_fini;
+        }
+
+        /* Cross the re-posted deadline. */
+        sleep_ns(25 * MILLION);
+        tw_move();
+
+        if (second.fired != 1) {
+                printf("b expected 1 fire after repost, got %d\n",
+                       second.fired);
+                goto out_fini;
+        }
+
+        rc = TEST_RC_SUCCESS;
+ out_fini:
+        tw_cancel(&first.tw);
+        tw_cancel(&second.tw);
+        tw_fini();
+ out:
+        if (rc == TEST_RC_SUCCESS) {
+                TEST_SUCCESS();
+        } else {
+                TEST_FAIL();
+        }
+
+        return rc;
+}
+
+/*
+ * Test entry point: runs every timing-wheel test in order and returns
+ * the OR of their return codes. argc and argv are unused.
+ */
+int tw_test(int     argc,
+            char ** argv)
+{
+        static int (* const tests[])(void) = {
+                test_tw_init_fini,
+                test_tw_post_fires_after_deadline,
+                test_tw_no_fire_before_deadline,
+                test_tw_cancel_prevents_fire,
+                test_tw_cancel_unposted_is_noop,
+                test_tw_fire_only_once,
+                test_tw_post_level1_fires,
+                test_tw_many_entries_all_fire,
+                test_tw_next_expiry_empty,
+                test_tw_next_expiry_returns_deadline,
+                test_tw_repost_after_fire,
+                test_tw_double_post_replaces,
+                test_tw_fire_cancels_sibling,
+                test_tw_fire_posts_sibling
+        };
+        size_t i;
+        int    ret = 0;
+
+        (void) argc;
+        (void) argv;
+
+        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i)
+                ret |= tests[i]();
+
+        return ret;
+}
diff --git a/src/lib/timerwheel.c b/src/lib/timerwheel.c
deleted file mode 100644
index 2c796c96..00000000
--- a/src/lib/timerwheel.c
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- * Ouroboros - Copyright (C) 2016 - 2026
- *
- * Timerwheel
- *
- * Dimitri Staessens <dimitri@ouroboros.rocks>
- * Sander Vrijders <sander@ouroboros.rocks>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * version 2.1 as published by the Free Software Foundation.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., http://www.fsf.org/about/contact/.
- */
-
-#include <ouroboros/list.h>
-
-/* Overflow limits range to about 6 hours. */
-#define ts_to_ns(ts) (ts.tv_sec * BILLION + ts.tv_nsec)
-#define ts_to_rxm_slot(ts) (ts_to_ns(ts) >> RXMQ_RES)
-#define ts_to_ack_slot(ts) (ts_to_ns(ts) >> ACKQ_RES)
-
-struct rxm {
- struct list_head next;
- uint32_t seqno;
-#ifndef RXM_BUFFER_ON_HEAP
- struct ssm_pk_buff * spb;
-#endif
- struct frct_pci * pkt;
- size_t len;
- time_t t0; /* Time when original was sent (us). */
- struct frcti * frcti;
- int fd;
- int flow_id; /* Prevent rtx when fd reused. */
-};
-
-struct ack {
- struct list_head next;
- struct frcti * frcti;
- int fd;
- int flow_id;
-};
-
-struct {
- /*
- * At a 1 ms min resolution, every level bumps the
- * resolution by a factor of 16.
- */
- struct list_head rxms[RXMQ_LVLS][RXMQ_SLOTS];
-
- struct list_head acks[ACKQ_SLOTS];
- bool map[ACKQ_SLOTS][PROG_MAX_FLOWS];
-
- size_t prv_rxm[RXMQ_LVLS]; /* Last processed rxm slots. */
- size_t prv_ack; /* Last processed ack slot. */
- pthread_mutex_t lock;
-} rw;
-
-static void timerwheel_fini(void)
-{
- size_t i;
- size_t j;
- struct list_head * p;
- struct list_head * h;
-
- pthread_mutex_lock(&rw.lock);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- for (j = 0; j < RXMQ_SLOTS; j++) {
- list_for_each_safe(p, h, &rw.rxms[i][j]) {
- struct rxm * rxm;
- rxm = list_entry(p, struct rxm, next);
- list_del(&rxm->next);
-#ifdef RXM_BUFFER_ON_HEAP
- free(rxm->pkt);
-#else
- ssm_pk_buff_ack(rxm->spb);
- ipcp_spb_release(rxm->spb);
-#endif
- free(rxm);
- }
- }
- }
-
- for (i = 0; i < ACKQ_SLOTS; ++i) {
- list_for_each_safe(p, h, &rw.acks[i]) {
- struct ack * a = list_entry(p, struct ack, next);
- list_del(&a->next);
- free(a);
- }
- }
-
- pthread_mutex_unlock(&rw.lock);
-
- pthread_mutex_destroy(&rw.lock);
-}
-
-static int timerwheel_init(void)
-{
- struct timespec now;
- size_t i;
- size_t j;
-
- if (pthread_mutex_init(&rw.lock, NULL))
- return -1;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- rw.prv_rxm[i] = (ts_to_rxm_slot(now) - 1);
- rw.prv_rxm[i] >>= (RXMQ_BUMP * i);
- rw.prv_rxm[i] &= (RXMQ_SLOTS - 1);
- for (j = 0; j < RXMQ_SLOTS; ++j)
- list_head_init(&rw.rxms[i][j]);
- }
-
- rw.prv_ack = (ts_to_ack_slot(now) - 1) & (ACKQ_SLOTS - 1);
- for (i = 0; i < ACKQ_SLOTS; ++i)
- list_head_init(&rw.acks[i]);
-
- return 0;
-}
-
-static void timerwheel_move(void)
-{
- struct timespec now;
- struct list_head * p;
- struct list_head * h;
- size_t rxm_slot;
- size_t ack_slot;
- size_t i;
- size_t j;
-
- pthread_mutex_lock(&rw.lock);
-
- pthread_cleanup_push(__cleanup_mutex_unlock, &rw.lock);
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- rxm_slot = ts_to_rxm_slot(now);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- size_t j_max_slot = rxm_slot & (RXMQ_SLOTS - 1);
- j = rw.prv_rxm[i];
- if (j_max_slot < j)
- j_max_slot += RXMQ_SLOTS;
- while (j++ < j_max_slot) {
- list_for_each_safe(p, h,
- &rw.rxms[i][j & (RXMQ_SLOTS - 1)]) {
- struct rxm * r;
- struct frct_cr * snd_cr;
- struct frct_cr * rcv_cr;
- size_t slot;
- size_t rslot;
- ssize_t idx;
- struct ssm_pk_buff * spb;
- struct frct_pci * pci;
- struct flow * f;
- uint32_t snd_lwe;
- uint32_t rcv_lwe;
- size_t lvl = 0;
-
- r = list_entry(p, struct rxm, next);
-
- list_del(&r->next);
-
- snd_cr = &r->frcti->snd_cr;
- rcv_cr = &r->frcti->rcv_cr;
- f = &proc.flows[r->fd];
-#ifndef RXM_BUFFER_ON_HEAP
- ssm_pk_buff_ack(r->spb);
-#endif
- if (f->frcti == NULL
- || f->info.id != r->flow_id)
- goto cleanup;
-
- pthread_rwlock_rdlock(&r->frcti->lock);
-
- snd_lwe = snd_cr->lwe;
- rcv_lwe = rcv_cr->lwe;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- /* Has been ack'd, remove. */
- if (before(r->seqno, snd_lwe))
- goto cleanup;
-
- /* Check for r-timer expiry. */
- if (ts_to_ns(now) - r->t0 > r->frcti->r)
- goto flow_down;
-
- pthread_rwlock_wrlock(&r->frcti->lock);
-
- if (r->seqno == r->frcti->rttseq) {
- r->frcti->rto +=
- r->frcti->rto >> RTO_DIV;
- r->frcti->probe = false;
- }
-#ifdef PROC_FLOW_STATS
- r->frcti->n_rtx++;
-#endif
- rslot = r->frcti->rto >> RXMQ_RES;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- /* Schedule at least in the next time slot. */
- slot = ts_to_ns(now) >> RXMQ_RES;
-
- while (rslot >= RXMQ_SLOTS) {
- ++lvl;
- rslot >>= RXMQ_BUMP;
- slot >>= RXMQ_BUMP;
- }
-
- if (lvl >= RXMQ_LVLS) /* Can't reschedule */
- goto flow_down;
-
- rslot = (rslot + slot + 1) & (RXMQ_SLOTS - 1);
-#ifdef RXM_BLOCKING
- if (ipcp_spb_reserve(&spb, r->len) < 0)
-#else
- if (ssm_pool_alloc(proc.pool, r->len, NULL,
- &spb) < 0)
-#endif
- goto reschedule; /* rdrbuff full */
-
- pci = (struct frct_pci *) ssm_pk_buff_head(spb);
- memcpy(pci, r->pkt, r->len);
-#ifndef RXM_BUFFER_ON_HEAP
- ipcp_spb_release(r->spb);
- r->spb = spb;
- r->pkt = pci;
- ssm_pk_buff_wait_ack(spb);
-#endif
- idx = ssm_pk_buff_get_idx(spb);
-
- /* Retransmit the copy. */
- pci->ackno = hton32(rcv_lwe);
-#ifdef RXM_BLOCKING
- if (ssm_rbuff_write_b(f->tx_rb, idx, NULL) < 0)
-#else
- if (ssm_rbuff_write(f->tx_rb, idx) < 0)
-#endif
- goto flow_down;
- ssm_flow_set_notify(f->set, f->info.id,
- FLOW_PKT);
- reschedule:
- list_add(&r->next, &rw.rxms[lvl][rslot]);
- continue;
-
- flow_down:
- ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN);
- ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN);
- cleanup:
-#ifdef RXM_BUFFER_ON_HEAP
- free(r->pkt);
-#else
- ipcp_spb_release(r->spb);
-#endif
- free(r);
- }
- }
- rw.prv_rxm[i] = rxm_slot & (RXMQ_SLOTS - 1);
- /* Move up a level in the wheel. */
- rxm_slot >>= RXMQ_BUMP;
- }
-
- ack_slot = ts_to_ack_slot(now) & (ACKQ_SLOTS - 1) ;
-
- j = rw.prv_ack;
-
- if (ack_slot < j)
- ack_slot += ACKQ_SLOTS;
-
- while (j++ < ack_slot) {
- list_for_each_safe(p, h, &rw.acks[j & (ACKQ_SLOTS - 1)]) {
- struct ack * a;
- struct flow * f;
-
- a = list_entry(p, struct ack, next);
-
- list_del(&a->next);
-
- f = &proc.flows[a->fd];
-
- rw.map[j & (ACKQ_SLOTS - 1)][a->fd] = false;
-
- if (f->info.id == a->flow_id && f->frcti != NULL)
- send_frct_pkt(a->frcti);
-
- free(a);
- }
- }
-
- rw.prv_ack = ack_slot & (ACKQ_SLOTS - 1);
-
- pthread_cleanup_pop(true);
-}
-
-static int timerwheel_rxm(struct frcti * frcti,
- uint32_t seqno,
- struct ssm_pk_buff * spb)
-{
- struct timespec now;
- struct rxm * r;
- size_t slot;
- size_t lvl = 0;
- time_t rto_slot;
-
- r = malloc(sizeof(*r));
- if (r == NULL)
- return -ENOMEM;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- r->t0 = ts_to_ns(now);
- r->seqno = seqno;
- r->frcti = frcti;
- r->len = ssm_pk_buff_len(spb);
-#ifdef RXM_BUFFER_ON_HEAP
- r->pkt = malloc(r->len);
- if (r->pkt == NULL) {
- free(r);
- return -ENOMEM;
- }
- memcpy(r->pkt, ssm_pk_buff_head(spb), r->len);
-#else
- r->spb = spb;
- r->pkt = (struct frct_pci *) ssm_pk_buff_head(spb);
-#endif
- pthread_rwlock_rdlock(&r->frcti->lock);
-
- rto_slot = frcti->rto >> RXMQ_RES;
- slot = r->t0 >> RXMQ_RES;
-
- r->fd = frcti->fd;
- r->flow_id = proc.flows[r->fd].info.id;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- while (rto_slot >= RXMQ_SLOTS) {
- ++lvl;
- rto_slot >>= RXMQ_BUMP;
- slot >>= RXMQ_BUMP;
- }
-
- if (lvl >= RXMQ_LVLS) { /* Out of timerwheel range. */
-#ifdef RXM_BUFFER_ON_HEAP
- free(r->pkt);
-#endif
- free(r);
- return -EPERM;
- }
-
- slot = (slot + rto_slot + 1) & (RXMQ_SLOTS - 1);
-
- pthread_mutex_lock(&rw.lock);
-
- list_add_tail(&r->next, &rw.rxms[lvl][slot]);
-#ifndef RXM_BUFFER_ON_HEAP
- ssm_pk_buff_wait_ack(spb);
-#endif
- pthread_mutex_unlock(&rw.lock);
-
- return 0;
-}
-
-static int timerwheel_delayed_ack(int fd,
- struct frcti * frcti)
-{
- struct timespec now;
- struct ack * a;
- size_t slot;
-
- a = malloc(sizeof(*a));
- if (a == NULL)
- return -ENOMEM;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_rdlock(&frcti->lock);
-
- slot = (((ts_to_ns(now) + (TICTIME << 1)) >> ACKQ_RES) + 1)
- & (ACKQ_SLOTS - 1);
-
- pthread_rwlock_unlock(&frcti->lock);
-
- a->fd = fd;
- a->frcti = frcti;
- a->flow_id = proc.flows[fd].info.id;
-
- pthread_mutex_lock(&rw.lock);
-
- if (rw.map[slot][fd]) {
- pthread_mutex_unlock(&rw.lock);
- free(a);
- return 0;
- }
-
- rw.map[slot][fd] = true;
-
- list_add_tail(&a->next, &rw.acks[slot]);
-
- pthread_mutex_unlock(&rw.lock);
-
- return 0;
-}
diff --git a/src/lib/tw.c b/src/lib/tw.c
new file mode 100644
index 00000000..ccde7dd1
--- /dev/null
+++ b/src/lib/tw.c
@@ -0,0 +1,307 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic deadline-ordered callback queue (timing wheel)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#define _DEFAULT_SOURCE
+#else
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "config.h"
+
+#include <ouroboros/list.h>
+#include <ouroboros/pthread.h>
+#include <ouroboros/time.h>
+#include <ouroboros/tw.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/* 3 levels × 256 slots, 1 ms / 16 ms / 256 ms per-slot resolution. */
+#define TW_LVLS 3 /* number of wheel levels */
+#define TW_SLOTS 256 /* slots per level; power of two so TW_SLOT can mask */
+#define TW_BUMP 4 /* extra shift per level: each level is 2^4 = 16x coarser */
+#define TW_RES 20 /* 2^20 ns ≈ 1 ms per slot at level 0. */
+
+#define TW_SLOT(x) ((x) & (TW_SLOTS - 1)) /* wrap a slot index into 0..TW_SLOTS-1 */
+
+static struct { /* process-wide wheel state */
+ struct list_head levels[TW_LVLS][TW_SLOTS]; /* pending entries per level and slot */
+ size_t prv[TW_LVLS]; /* per level: last slot tw_move() has processed */
+ pthread_mutex_t mtx; /* held while slot lists are read or modified */
+ pthread_mutex_t move_mtx; /* try-locked by tw_move(); a second caller returns at once */
+ bool initialised; /* set by tw_init(), cleared by tw_fini() */
+} tw;
+
+/* Slot resolution of level lvl, as a right-shift count applied to ns. */
+static size_t tw_lvl_res(size_t lvl)
+{
+        size_t shift = TW_RES;
+
+        shift += TW_BUMP * lvl;
+
+        return shift;
+}
+
+/*
+ * Smallest level whose slot range covers the time left until the
+ * deadline. Deadlines at or before now_ns count as zero time left;
+ * anything beyond the range of the wheel ends up on the top level.
+ */
+static size_t tw_pick_lvl(uint64_t now_ns,
+                          uint64_t deadline_ns)
+{
+        uint64_t left = 0;
+        size_t   lvl;
+
+        if (deadline_ns > now_ns)
+                left = deadline_ns - now_ns;
+
+        for (lvl = 0; lvl < TW_LVLS - 1; ++lvl)
+                if ((left >> tw_lvl_res(lvl)) < TW_SLOTS)
+                        break;
+
+        return lvl;
+}
+
+/* Slot index (0..TW_SLOTS-1) that timestamp ns maps to on level lvl. */
+static size_t tw_slot(uint64_t ns,
+                      size_t   lvl)
+{
+        uint64_t tick = ns >> tw_lvl_res(lvl);
+
+        return TW_SLOT(tick);
+}
+
+/*
+ * Initialises the wheel: creates both mutexes, empties every slot and
+ * sets each level's cursor to the slot just before the current time.
+ * Returns 0 on success, -1 if a mutex could not be created.
+ */
+int tw_init(void)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        size_t          lvl;
+        size_t          slot;
+
+        assert(!tw.initialised);
+
+        if (pthread_mutex_init(&tw.mtx, NULL))
+                return -1;
+
+        if (pthread_mutex_init(&tw.move_mtx, NULL)) {
+                pthread_mutex_destroy(&tw.mtx);
+                return -1;
+        }
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        for (lvl = 0; lvl < TW_LVLS; ++lvl) {
+                /* Start one slot behind so the current slot is processed. */
+                tw.prv[lvl] = TW_SLOT(tw_slot(now_ns, lvl) - 1);
+
+                for (slot = 0; slot < TW_SLOTS; ++slot)
+                        list_head_init(&tw.levels[lvl][slot]);
+        }
+
+        tw.initialised = true;
+
+        return 0;
+}
+
+/*
+ * Tears the wheel down. Every slot is asserted to be empty, so callers
+ * must cancel their entries first; then both mutexes are destroyed.
+ */
+void tw_fini(void)
+{
+        size_t n;
+
+        assert(tw.initialised);
+
+        /* Walk all level/slot pairs in one flat pass. */
+        for (n = 0; n < TW_LVLS * TW_SLOTS; ++n)
+                assert(list_is_empty(&tw.levels[n / TW_SLOTS][n % TW_SLOTS]));
+
+        pthread_mutex_destroy(&tw.move_mtx);
+        pthread_mutex_destroy(&tw.mtx);
+
+        tw.initialised = false;
+}
+
+/* Resets an entry to the unposted state: no callback, no deadline. */
+void tw_init_entry(struct tw_entry * e)
+{
+        e->fire        = NULL;
+        e->arg         = NULL;
+        e->deadline_ns = 0;
+        e->lvl         = 0;
+
+        list_head_init(&e->next);
+}
+
+/*
+ * Schedules fire(arg) to run from tw_move() once deadline_ns (on
+ * PTHREAD_COND_CLOCK, in nanoseconds) has passed. Posting an entry
+ * that is already queued replaces its previous schedule.
+ *
+ * Everything happens under tw.mtx:
+ *  - the entry fields are written under the same lock tw_move() holds
+ *    while inspecting queued entries, so a re-post of a queued entry
+ *    cannot race with that inspection;
+ *  - the clock is read under the lock, as tw_move() does, so the slot
+ *    chosen here is never behind the slot tw_move() last processed.
+ */
+void tw_post(struct tw_entry * e,
+             uint64_t          deadline_ns,
+             tw_fire_fn_t      fire,
+             void *            arg)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        uint64_t        slot_ns;
+        size_t          lvl;
+        size_t          slot;
+
+        assert(tw.initialised);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        lvl = tw_pick_lvl(now_ns, deadline_ns);
+
+        /*
+         * A deadline that already passed would map to a slot behind the
+         * cursor and only be visited after a full wrap of the wheel, so
+         * place such entries relative to the current time instead.
+         */
+        slot_ns = deadline_ns > now_ns ? deadline_ns : now_ns;
+
+        /* +1 so deadline <= slot_start; lands later in slot. */
+        slot = TW_SLOT(tw_slot(slot_ns, lvl) + 1);
+
+        e->deadline_ns = deadline_ns;
+        e->fire        = fire;
+        e->arg         = arg;
+        e->lvl         = lvl;
+
+        /* Unlink a previous schedule before queueing the new one. */
+        if (!list_is_empty(&e->next))
+                list_del(&e->next);
+
+        list_add_tail(&e->next, &tw.levels[lvl][slot]);
+
+        pthread_mutex_unlock(&tw.mtx);
+}
+
+/*
+ * Removes an entry from the wheel so it will not fire. A NULL entry or
+ * an entry that is not queued is left alone.
+ */
+void tw_cancel(struct tw_entry * e)
+{
+        bool queued;
+
+        if (e == NULL)
+                return;
+
+        assert(tw.initialised);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        queued = !list_is_empty(&e->next);
+        if (queued) {
+                list_del(&e->next);
+                list_head_init(&e->next);
+        }
+
+        pthread_mutex_unlock(&tw.mtx);
+}
+
+/*
+ * Advances every level from its last processed slot up to the slot of
+ * the current time and runs the callbacks of all entries whose
+ * deadline has passed. Entries found in a visited slot that are not
+ * due yet are put back into that slot.
+ *
+ * Only one caller moves the wheel at a time: if move_mtx is already
+ * taken the call returns immediately. tw.mtx is released around each
+ * callback, so a callback may post or cancel entries itself.
+ */
+void tw_move(void)
+{
+        struct timespec    now;
+        struct list_head   deferred;
+        struct list_head * p;
+        tw_fire_fn_t       fire;
+        void *             arg;
+        uint64_t           now_ns;
+        size_t             i;
+        size_t             j;
+        size_t             cur;
+
+        assert(tw.initialised);
+
+        if (pthread_mutex_trylock(&tw.move_mtx) != 0)
+                return;
+
+        pthread_cleanup_push(__cleanup_mutex_unlock, &tw.move_mtx);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        pthread_cleanup_push(__cleanup_mutex_unlock, &tw.mtx);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        for (i = 0; i < TW_LVLS; ++i) {
+                cur = tw_slot(now_ns, i);
+
+                /* Unwrap so the walk below can count upwards. */
+                j = tw.prv[i];
+                if (cur < j)
+                        cur += TW_SLOTS;
+
+                while (j++ < cur) {
+                        size_t s = TW_SLOT(j);
+
+                        /* Pop-front so fire may mutate any entry. */
+                        list_head_init(&deferred);
+
+                        while (!list_is_empty(&tw.levels[i][s])) {
+                                struct tw_entry * e;
+                                p = tw.levels[i][s].nxt;
+                                e = list_entry(p, struct tw_entry, next);
+                                list_del(&e->next);
+
+                                /* Not due yet: park it until the slot is done. */
+                                if (e->deadline_ns > now_ns) {
+                                        list_add_tail(&e->next, &deferred);
+                                        continue;
+                                }
+
+                                /*
+                                 * Take the callback and its argument while
+                                 * the lock is still held; once it is dropped
+                                 * the entry is no longer read here.
+                                 */
+                                fire = e->fire;
+                                arg  = e->arg;
+
+                                pthread_mutex_unlock(&tw.mtx);
+                                fire(arg);
+                                pthread_mutex_lock(&tw.mtx);
+                        }
+
+                        /* Put the entries that were not due back in place. */
+                        while (!list_is_empty(&deferred)) {
+                                p = deferred.nxt;
+                                list_del(p);
+                                list_add_tail(p, &tw.levels[i][s]);
+                        }
+                }
+
+                tw.prv[i] = TW_SLOT(cur);
+        }
+
+        pthread_cleanup_pop(true); /* tw.mtx */
+        pthread_cleanup_pop(true); /* tw.move_mtx */
+}
+
+/*
+ * Estimated time of the first non-empty slot on level lvl, scanning
+ * forward from the slot after the current one. The result is now_ns
+ * plus the slot distance in that level's resolution. Returns
+ * INT64_MAX if the level holds no entries.
+ */
+static int64_t tw_lvl_earliest(size_t   lvl,
+                               uint64_t now_ns)
+{
+        size_t base  = tw_slot(now_ns, lvl);
+        size_t ahead = 1;
+
+        while (ahead <= TW_SLOTS) {
+                if (!list_is_empty(&tw.levels[lvl][TW_SLOT(base + ahead)]))
+                        return (int64_t)(now_ns +
+                                ((uint64_t) ahead << tw_lvl_res(lvl)));
+
+                ++ahead;
+        }
+
+        return INT64_MAX;
+}
+
+/*
+ * Writes the estimated time of the next pending expiry, taken over all
+ * levels, to out. On an empty wheel out is set to tv_sec = 0 and
+ * tv_nsec = -1.
+ */
+void tw_next_expiry(struct timespec * out)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        int64_t         best = INT64_MAX;
+        int64_t         cand;
+        size_t          lvl;
+
+        assert(tw.initialised);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        for (lvl = 0; lvl < TW_LVLS; ++lvl) {
+                cand = tw_lvl_earliest(lvl, now_ns);
+                best = cand < best ? cand : best;
+        }
+
+        pthread_mutex_unlock(&tw.mtx);
+
+        if (best != INT64_MAX) {
+                UINT64_TO_TS((uint64_t) best, out);
+                return;
+        }
+
+        /* Empty wheel: tv_nsec=-1 is an invalid normalised value. */
+        out->tv_sec  = 0;
+        out->tv_nsec = -1;
+}
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
index 3cec8172..6b418838 100644
--- a/src/tools/CMakeLists.txt
+++ b/src/tools/CMakeLists.txt
@@ -63,6 +63,11 @@ target_include_directories(operf PRIVATE ${TOOLS_INCLUDE_DIRS})
target_link_libraries(operf PRIVATE ouroboros-dev)
install(TARGETS operf RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+add_executable(oftp oftp/oftp.c)
+target_include_directories(oftp PRIVATE ${TOOLS_INCLUDE_DIRS})
+target_link_libraries(oftp PRIVATE ouroboros-dev)
+install(TARGETS oftp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
add_executable(ovpn ovpn/ovpn.c)
target_include_directories(ovpn PRIVATE ${TOOLS_INCLUDE_DIRS})
diff --git a/src/tools/irm/irm_ipcp_connect.c b/src/tools/irm/irm_ipcp_connect.c
index f88c36dc..fb21faec 100644
--- a/src/tools/irm/irm_ipcp_connect.c
+++ b/src/tools/irm/irm_ipcp_connect.c
@@ -100,16 +100,18 @@ int do_connect_ipcp(int argc,
}
if (qos != NULL) {
- if (strcmp(qos, "best") == 0)
- qs = qos_best_effort;
- else if (strcmp(qos, "raw") == 0)
+ if (strcmp(qos, "raw") == 0)
qs = qos_raw;
- else if (strcmp(qos, "video") == 0)
- qs = qos_video;
- else if (strcmp(qos, "voice") == 0)
- qs = qos_voice;
- else if (strcmp(qos, "data") == 0)
- qs = qos_data;
+ else if (strcmp(qos, "safe") == 0)
+ qs = qos_raw_safe;
+ else if (strcmp(qos, "rt") == 0)
+ qs = qos_rt;
+ else if (strcmp(qos, "rt-safe") == 0)
+ qs = qos_rt_safe;
+ else if (strcmp(qos, "msg") == 0)
+ qs = qos_msg;
+ else if (strcmp(qos, "stream") == 0)
+ qs = qos_stream;
else
printf("Unknown QoS cube, defaulting to raw.\n");
}
@@ -126,7 +128,7 @@ int do_connect_ipcp(int argc,
if (wildcard_match(comp, MGMT) == 0) {
component = MGMT_COMP;
- /* FIXME: move to qos_data when stable */
+ /* FIXME: move to qos_msg when stable */
if (irm_connect_ipcp(pid, dst, component, qos_raw))
return -1;
}
diff --git a/src/tools/irm/irm_name_create.c b/src/tools/irm/irm_name_create.c
index 1055700c..40a51193 100644
--- a/src/tools/irm/irm_name_create.c
+++ b/src/tools/irm/irm_name_create.c
@@ -51,10 +51,10 @@
#define RR "round-robin"
#define SPILL "spillover"
-#define SENC "<security_dir>/server/<name>/enc.conf"
+#define SSEC "<security_dir>/server/<name>/sec.conf"
#define SCRT "<security_dir>/server/<name>/crt.pem"
#define SKEY "<security_dir>/server/<name>/key.pem"
-#define CENC "<security_dir>/client/<name>/enc.conf"
+#define CSEC "<security_dir>/client/<name>/sec.conf"
#define CCRT "<security_dir>/client/<name>/crt.pem"
#define CKEY "<security_dir>/client/<name>/key.pem"
@@ -63,10 +63,10 @@ static void usage(void)
printf("Usage: irm name create\n"
" <name>. max %d chars.\n"
" [lb LB_POLICY], default: %s\n"
- " [sencpath <path>, default: " SENC "]\n"
+ " [ssecpath <path>, default: " SSEC "]\n"
" [scrtpath <path>, default: " SCRT "]\n"
" [skeypath <path>, default: " SKEY "]\n"
- " [cencpath <path>, default: " CENC "]\n"
+ " [csecpath <path>, default: " CSEC "]\n"
" [ccrtpath <path>, default: " CCRT "]\n"
" [ckeypath <path>, default: " CKEY "]\n"
"\n"
@@ -105,10 +105,10 @@ int do_create_name(int argc,
{
struct name_info info = {};
char * name = NULL;
- char * sencpath = NULL;
+ char * ssecpath = NULL;
char * scrtpath = NULL;
char * skeypath = NULL;
- char * cencpath = NULL;
+ char * csecpath = NULL;
char * ccrtpath = NULL;
char * ckeypath = NULL;
char * lb_pol = RR;
@@ -119,14 +119,14 @@ int do_create_name(int argc,
while (argc > 0) {
if (matches(*argv, "lb") == 0) {
lb_pol = *(argv + 1);
- } else if (matches(*argv, "sencpath") == 0) {
- sencpath = *(argv + 1);
+ } else if (matches(*argv, "ssecpath") == 0) {
+ ssecpath = *(argv + 1);
} else if (matches(*argv, "scrtpath") == 0) {
scrtpath = *(argv + 1);
} else if (matches(*argv, "skeypath") == 0) {
skeypath = *(argv + 1);
- } else if (matches(*argv, "cencpath") == 0) {
- cencpath = *(argv + 1);
+ } else if (matches(*argv, "csecpath") == 0) {
+ csecpath = *(argv + 1);
} else if (matches(*argv, "ccrtpath") == 0) {
ccrtpath = *(argv + 1);
} else if (matches(*argv, "ckeypath") == 0) {
@@ -151,7 +151,7 @@ int do_create_name(int argc,
strcpy(info.name, name);
- if (sencpath != NULL && cp_chk_path(info.s.enc, sencpath) < 0)
+ if (ssecpath != NULL && cp_chk_path(info.s.sec, ssecpath) < 0)
goto fail;
if (scrtpath != NULL && cp_chk_path(info.s.crt, scrtpath) < 0)
@@ -160,7 +160,7 @@ int do_create_name(int argc,
if (skeypath != NULL && cp_chk_path(info.s.key, skeypath) < 0)
goto fail;
- if (cencpath != NULL && cp_chk_path(info.c.enc, cencpath) < 0)
+ if (csecpath != NULL && cp_chk_path(info.c.sec, csecpath) < 0)
goto fail;
if (ccrtpath != NULL && cp_chk_path(info.c.crt, ccrtpath) < 0)
diff --git a/src/tools/ocbr/ocbr_client.c b/src/tools/ocbr/ocbr_client.c
index 9dd9904c..36c07d43 100644
--- a/src/tools/ocbr/ocbr_client.c
+++ b/src/tools/ocbr/ocbr_client.c
@@ -37,8 +37,11 @@
*/
#include <ouroboros/dev.h>
+#include <ouroboros/qos.h>
#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
volatile bool stop;
@@ -86,6 +89,11 @@ int client_main(char * server,
struct timespec end;
struct timespec intv = {(gap / BILLION), gap % BILLION};
int ms;
+ const char * qenv;
+ qosspec_t qs;
+ qosspec_t * qsp;
+
+ qsp = NULL;
stop = false;
@@ -98,16 +106,38 @@ int client_main(char * server,
sigaction(SIGHUP, &sig_act, NULL) ||
sigaction(SIGPIPE, &sig_act, NULL)) {
printf("Failed to install sighandler.\n");
- return -1;
+ return 2;
}
printf("Client started, duration %d, rate %lu b/s, size %d B.\n",
duration, rate, size);
- fd = flow_alloc(server, NULL, NULL);
+ qenv = getenv("OCBR_QOS");
+ if (qenv != NULL) {
+ if (strcmp(qenv, "raw") == 0)
+ qs = qos_raw;
+ else if (strcmp(qenv, "safe") == 0)
+ qs = qos_raw_safe;
+ else if (strcmp(qenv, "rt") == 0)
+ qs = qos_rt;
+ else if (strcmp(qenv, "rt_safe") == 0)
+ qs = qos_rt_safe;
+ else if (strcmp(qenv, "msg") == 0)
+ qs = qos_msg;
+ else if (strcmp(qenv, "stream") == 0)
+ qs = qos_stream;
+ else {
+ fprintf(stderr,
+ "Unknown OCBR_QOS='%s', using raw.\n", qenv);
+ qs = qos_raw;
+ }
+ qsp = &qs;
+ printf("OCBR_QOS=%s\n", qenv);
+ }
+ fd = flow_alloc(server, qsp, NULL);
if (fd < 0) {
printf("Failed to allocate flow.\n");
- return -1;
+ return 2;
}
clock_gettime(CLOCK_REALTIME, &start);
diff --git a/src/tools/oecho/oecho.c b/src/tools/oecho/oecho.c
index 14caab53..ef0a168f 100644
--- a/src/tools/oecho/oecho.c
+++ b/src/tools/oecho/oecho.c
@@ -101,20 +101,20 @@ static int client_main(void)
fd = flow_alloc("oecho", NULL, NULL);
if (fd < 0) {
printf("Failed to allocate flow.\n");
- return -1;
+ return 2;
}
if (flow_write(fd, message, strlen(message) + 1) < 0) {
printf("Failed to write packet.\n");
flow_dealloc(fd);
- return -1;
+ return 1;
}
count = flow_read(fd, buf, BUF_SIZE);
if (count < 0) {
printf("Failed to read packet.\n");
flow_dealloc(fd);
- return -1;
+ return 1;
}
printf("Server replied with %.*s\n", (int) count, buf);
@@ -126,7 +126,7 @@ static int client_main(void)
int main(int argc, char ** argv)
{
- int ret = -1;
+ int ret = 0;
bool server = false;
argc--;
diff --git a/src/tools/oftp/oftp.c b/src/tools/oftp/oftp.c
new file mode 100644
index 00000000..1ae99403
--- /dev/null
+++ b/src/tools/oftp/oftp.c
@@ -0,0 +1,441 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * A minimal file-transfer tool over an FRCT stream flow
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <ouroboros/crc64.h>
+#include <ouroboros/dev.h>
+#include <ouroboros/errno.h>
+#include <ouroboros/fccntl.h>
+#include <ouroboros/qos.h>
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#define BUF_SIZE 16384
+
+static volatile sig_atomic_t stop = 0;
+
+ /*
+  * Optional tuning knob: when OFTP_FRCT_RTO_MIN parses (base 10) to a
+  * positive number, hand it to the flow through fccntl(FRCTSRTOMIN).
+  * An unset variable or a value <= 0 leaves the flow untouched; a
+  * failing fccntl() only produces a warning on stderr.
+  */
+ static void apply_rto_min_env(int fd)
+ {
+         const char * val = getenv("OFTP_FRCT_RTO_MIN");
+         long         rto;
+
+         if (val == NULL)
+                 return;
+
+         rto = strtol(val, NULL, 10);
+
+         /* The cast fixes the type read back from the variadic call. */
+         if (rto > 0 && fccntl(fd, FRCTSRTOMIN, (time_t) rto) < 0)
+                 fprintf(stderr,
+                         "oftp: failed to set RTO_MIN=%ld ns\n", rto);
+ }
+
+ /*
+  * Optional tuning knob: when OFTP_FRCT_STREAM_RING_SZ parses (base 10)
+  * to a positive number, hand it to the flow through
+  * fccntl(FRCTSRRINGSZ). An unset variable or a value <= 0 leaves the
+  * flow untouched; a failing fccntl() only produces a warning on stderr.
+  */
+ static void apply_stream_ring_sz_env(int fd)
+ {
+         const char * val = getenv("OFTP_FRCT_STREAM_RING_SZ");
+         long         sz;
+
+         if (val == NULL)
+                 return;
+
+         sz = strtol(val, NULL, 10);
+
+         /* The cast fixes the type read back from the variadic call. */
+         if (sz > 0 && fccntl(fd, FRCTSRRINGSZ, (size_t) sz) < 0)
+                 fprintf(stderr,
+                         "oftp: failed to set STREAM_RING_SZ=%ld\n", sz);
+ }
+
+ /* Signal handler: raises the 'stop' flag polled by the transfer loops. */
+ static void on_signal(int signo)
+ {
+         stop = 1;
+
+         (void) signo; /* The signal number is not needed. */
+ }
+
+ /* Print the command-line synopsis to stdout. */
+ static void usage(void)
+ {
+         static const char help[] =
+                 "Usage: oftp [OPTION]...\n"
+                 "Stream-mode file transfer over an Ouroboros flow.\n\n"
+                 " -l, --listen Run as the receiver (server)\n"
+                 " -n, --name NAME Destination service name (client)\n"
+                 " -i, --in FILE Read input from FILE (default stdin)\n"
+                 " -o, --out FILE Write output to FILE (default stdout)\n"
+                 " -N, --bytes SIZE Stop after SIZE bytes "
+                 "(K/M/G suffix; client only)\n"
+                 " --help Display this help text and exit\n";
+
+         printf("%s", help);
+ }
+
+ /*
+  * Parse a byte count with an optional binary suffix.
+  *
+  * s:   number in any base strtoul() accepts with base 0 (decimal,
+  *      0-prefixed octal, 0x-prefixed hex), optionally followed by
+  *      exactly one of k/K, m/M, g/G (powers of 1024).
+  * out: receives the byte count on success; left untouched on failure.
+  *
+  * Returns 0 on success, -1 when s has no digits, contains a minus sign,
+  * has characters after the suffix, or does not fit in a size_t.
+  */
+ static int parse_size(const char * s, size_t * out)
+ {
+         char *        end;
+         unsigned long v;
+         size_t        mul;
+
+         /* strtoul() would silently wrap "-1" to ULONG_MAX. */
+         if (strchr(s, '-') != NULL)
+                 return -1;
+
+         v = strtoul(s, &end, 0);
+         if (end == s)
+                 return -1;
+
+         /* strtoul() saturates at ULONG_MAX when the input is too big. */
+         if (v == (unsigned long) -1)
+                 return -1;
+
+         mul = 1;
+         switch (*end) {
+         case '\0':
+                 break;
+         case 'k':
+         case 'K':
+                 mul = 1024UL;
+                 ++end;
+                 break;
+         case 'm':
+         case 'M':
+                 mul = 1024UL * 1024UL;
+                 ++end;
+                 break;
+         case 'g':
+         case 'G':
+                 mul = 1024UL * 1024UL * 1024UL;
+                 ++end;
+                 break;
+         default:
+                 return -1;
+         }
+
+         /* Nothing may follow the suffix (e.g. "10Kfoo"). */
+         if (*end != '\0')
+                 return -1;
+
+         /* Refuse values whose scaled size would wrap around. */
+         if ((unsigned long) (size_t) v != v || (size_t) v > SIZE_MAX / mul)
+                 return -1;
+
+         *out = (size_t) v * mul;
+
+         return 0;
+ }
+
+ /*
+  * Print a one-line transfer summary on stderr.
+  *
+  * tag:    verb shown in the line (callers pass "sent" / "received").
+  * total:  number of bytes transferred.
+  * crc:    running CRC-64 of the data, printed as 16 hex digits.
+  * t0, t1: start and end timestamps; a non-positive interval is
+  *         replaced by 1 ns so the rate computation never divides by 0.
+  */
+ static void report_xfer(const char *            tag,
+                         size_t                  total,
+                         uint64_t                crc,
+                         const struct timespec * t0,
+                         const struct timespec * t1)
+ {
+         time_t d_sec  = t1->tv_sec - t0->tv_sec;
+         long   d_nsec = t1->tv_nsec - t0->tv_nsec;
+         double secs;
+         double rate;
+
+         secs = d_sec + d_nsec / 1e9;
+         if (secs <= 0.0)
+                 secs = 1e-9;
+
+         rate = ((double) total / (1024.0 * 1024.0)) / secs;
+
+         fprintf(stderr,
+                 "oftp: %s %zu bytes in %.3f s (%.2f MiB/s) "
+                 "crc64=%016" PRIx64 "\n",
+                 tag, total, secs, rate, crc);
+ }
+
+ /*
+  * Copy data from 'in' to flow 'fd' in chunks of up to BUF_SIZE bytes.
+  *
+  * max_bytes: upper bound on the bytes read from 'in'; 0 means no limit.
+  *
+  * The loop ends on end of input, on reaching max_bytes, or when the
+  * 'stop' flag is raised. Returns 0 after printing a "sent" summary,
+  * 1 if flow_write() fails or 'in' is in an error state afterwards.
+  */
+ static int xfer_to_flow(int fd, FILE * in, size_t max_bytes)
+ {
+         char            buf[BUF_SIZE];
+         struct timespec t_start;
+         struct timespec t_end;
+         uint64_t        crc   = 0;
+         size_t          total = 0;
+         size_t          chunk;
+         size_t          got;
+         size_t          done;
+         ssize_t         rv;
+
+         clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+         for (;;) {
+                 if (stop)
+                         break;
+
+                 /* Never read past the requested byte budget. */
+                 chunk = sizeof(buf);
+                 if (max_bytes > 0) {
+                         size_t left = max_bytes - total;
+
+                         if (left < chunk)
+                                 chunk = left;
+                 }
+
+                 if (chunk == 0)
+                         break;
+
+                 got = fread(buf, 1, chunk, in);
+                 if (got == 0)
+                         break;
+
+                 crc64_nvme(&crc, buf, got);
+
+                 /*
+                  * NOTE(review): this assumes flow_write() returns the
+                  * number of bytes it accepted; a return of 0 would
+                  * spin here. Confirm against the flow_write contract.
+                  */
+                 for (done = 0; done < got; done += (size_t) rv) {
+                         rv = flow_write(fd, buf + done, got - done);
+                         if (rv < 0) {
+                                 fprintf(stderr,
+                                         "flow_write failed: %zd\n", rv);
+                                 return 1;
+                         }
+
+                         total += (size_t) rv;
+                 }
+         }
+
+         clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+         if (ferror(in)) {
+                 fprintf(stderr, "Input read error.\n");
+                 return 1;
+         }
+
+         report_xfer("sent", total, crc, &t_start, &t_end);
+
+         return 0;
+ }
+
+ /*
+  * Drain flow 'fd' into 'out' until the peer signals end of stream.
+  *
+  * Returns:
+  *   0  flow_read() returned 0 (clean EOF) and 'out' was flushed;
+  *   1  flow_read() failed, or writing/flushing 'out' failed;
+  *   2  flow_read() reported -EFLOWDOWN / -EFLOWPEER, or the 'stop'
+  *      flag was raised before EOF.
+  *
+  * Timing for the "received" summary starts at the first data chunk.
+  */
+ static int xfer_from_flow(int fd, FILE * out)
+ {
+         char            buf[BUF_SIZE];
+         size_t          total;
+         ssize_t         n;
+         uint64_t        crc;
+         struct timespec timeout;
+         struct timespec t0;
+         struct timespec t1;
+         bool            started;
+
+         total           = 0;
+         crc             = 0;
+         started         = false;
+         timeout.tv_sec  = 1;
+         timeout.tv_nsec = 0;
+
+         /* Short timeout so SIGTERM/SIGINT 'stop' is observed promptly. */
+         fccntl(fd, FLOWSRCVTIMEO, &timeout);
+
+         while (!stop) {
+                 n = flow_read(fd, buf, sizeof(buf));
+                 if (n == 0) {
+                         /* Clean EOF: peer sent EOS and we drained it. */
+                         clock_gettime(CLOCK_MONOTONIC, &t1);
+                         /*
+                          * Buffered data may only hit the file here; a
+                          * failed flush means the output is incomplete.
+                          */
+                         if (fflush(out) != 0) {
+                                 fprintf(stderr, "Output write error.\n");
+                                 return 1;
+                         }
+                         if (!started)
+                                 t0 = t1;
+                         report_xfer("received", total, crc, &t0, &t1);
+                         return 0;
+                 }
+                 if (n == -ETIMEDOUT)
+                         continue;
+                 if (n < 0) {
+                         /* Peer aborted before EOS: partial transfer. */
+                         if (n == -EFLOWDOWN || n == -EFLOWPEER) {
+                                 fprintf(stderr,
+                                         "oftp: peer aborted at %zu B\n",
+                                         total);
+                                 return 2;
+                         }
+                         fprintf(stderr,
+                                 "flow_read failed: %zd\n", n);
+                         return 1;
+                 }
+                 if (!started) {
+                         clock_gettime(CLOCK_MONOTONIC, &t0);
+                         started = true;
+                 }
+                 crc64_nvme(&crc, buf, (size_t) n);
+                 if (fwrite(buf, 1, (size_t) n, out) != (size_t) n) {
+                         fprintf(stderr, "Output write error.\n");
+                         return 1;
+                 }
+                 total += (size_t) n;
+         }
+
+         /* Receiver was signalled (SIGINT/SIGTERM) before EOF. */
+         fflush(out);
+         fprintf(stderr, "oftp: interrupted at %zu B\n", total);
+         return 2;
+ }
+
+ /*
+  * Receiver side: accept one flow and store its stream in outpath.
+  *
+  * outpath: file to create, or NULL to write to stdout. The file is
+  *          opened with O_CREAT | O_EXCL | O_NOFOLLOW and mode 0600, so
+  *          an existing file or a symlink is refused.
+  *
+  * Returns the xfer_from_flow() result, or 1 when the output cannot be
+  * opened or closed, flow_accept() fails, or the accepted flow is not a
+  * stream flow. On every non-zero return a file created here is removed
+  * again, so a later run with the same outpath is not blocked by a
+  * stale empty or partial file.
+  */
+ static int server_main(const char * outpath)
+ {
+         FILE *    out = stdout;
+         int       fd;
+         int       ofd;
+         int       rc;
+         qosspec_t qs;
+
+         if (outpath != NULL) {
+                 ofd = open(outpath,
+                            O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW,
+                            0600);
+                 if (ofd < 0) {
+                         perror("open");
+                         return 1;
+                 }
+                 out = fdopen(ofd, "wb");
+                 if (out == NULL) {
+                         perror("fdopen");
+                         close(ofd);
+                         unlink(outpath);
+                         return 1;
+                 }
+         }
+
+         fprintf(stderr, "oftp: listening...\n");
+
+         fd = flow_accept(&qs, NULL);
+         if (fd < 0) {
+                 fprintf(stderr, "flow_accept failed: %d\n", fd);
+                 rc = 1;
+                 goto finish;
+         }
+
+         if (qs.service != SVC_STREAM) {
+                 fprintf(stderr,
+                         "oftp: rejecting non-stream flow (service=%u)\n",
+                         qs.service);
+                 flow_dealloc(fd);
+                 rc = 1;
+                 goto finish;
+         }
+
+         apply_rto_min_env(fd);
+         apply_stream_ring_sz_env(fd);
+
+         rc = xfer_from_flow(fd, out);
+
+         flow_dealloc(fd);
+
+  finish:
+         if (out != stdout) {
+                 /* A failing close means buffered data was lost. */
+                 if (fclose(out) != 0 && rc == 0) {
+                         perror("fclose");
+                         rc = 1;
+                 }
+                 /* Drop the empty or half-written file on any failure. */
+                 if (rc != 0)
+                         unlink(outpath);
+         }
+
+         return rc;
+ }
+
+ /*
+  * Sender side: allocate a qos_stream flow to 'name' and push the input
+  * over it.
+  *
+  * inpath:    file to send, or NULL to read from stdin.
+  * max_bytes: byte limit forwarded to xfer_to_flow().
+  *
+  * Returns 1 if the input file cannot be opened, 2 if flow_alloc()
+  * fails, otherwise the xfer_to_flow() result.
+  */
+ static int client_main(const char * name,
+                        const char * inpath,
+                        size_t       max_bytes)
+ {
+         FILE *    in = stdin;
+         qosspec_t qs = qos_stream;
+         int       fd;
+         int       rc;
+
+         if (inpath != NULL) {
+                 in = fopen(inpath, "rb");
+                 if (in == NULL) {
+                         perror("fopen");
+                         return 1;
+                 }
+         }
+
+         fd = flow_alloc(name, &qs, NULL);
+         if (fd >= 0) {
+                 apply_rto_min_env(fd);
+                 apply_stream_ring_sz_env(fd);
+
+                 rc = xfer_to_flow(fd, in, max_bytes);
+
+                 flow_dealloc(fd);
+         } else {
+                 fprintf(stderr, "flow_alloc failed: %d\n", fd);
+                 rc = 2;
+         }
+
+         /* Only close what was opened here; stdin stays open. */
+         if (in != stdin)
+                 fclose(in);
+
+         return rc;
+ }
+
+ /*
+  * Entry point: install the signal handlers, parse the options and run
+  * as receiver (-l) or sender (-n NAME).
+  *
+  * Exit status: 0 for --help; 1 for an unknown option, an option that
+  * lacks its value, a bad -N size or a missing name in sender mode;
+  * otherwise whatever server_main() / client_main() returns.
+  */
+ int main(int argc, char ** argv)
+ {
+         struct sigaction sa;
+         const char *     name      = NULL;
+         const char *     inpath    = NULL;
+         const char *     outpath   = NULL;
+         size_t           max_bytes = 0;
+         bool             server    = false;
+
+         /* SIGINT/SIGTERM raise 'stop'; SIGPIPE is ignored. */
+         memset(&sa, 0, sizeof(sa));
+         sa.sa_handler = on_signal;
+         sigaction(SIGINT, &sa, NULL);
+         sigaction(SIGTERM, &sa, NULL);
+         signal(SIGPIPE, SIG_IGN);
+
+         for (--argc, ++argv; argc > 0; --argc, ++argv) {
+                 const char * opt = *argv;
+                 const char * val;
+
+                 /* Options that take no value. */
+                 if (!strcmp(opt, "-l") || !strcmp(opt, "--listen")) {
+                         server = true;
+                         continue;
+                 }
+
+                 if (!strcmp(opt, "--help")) {
+                         usage();
+                         return 0;
+                 }
+
+                 /* Everything else needs a value after the option. */
+                 if (argc <= 1) {
+                         usage();
+                         return 1;
+                 }
+
+                 val = argv[1];
+
+                 if (!strcmp(opt, "-n") || !strcmp(opt, "--name")) {
+                         name = val;
+                 } else if (!strcmp(opt, "-i") || !strcmp(opt, "--in")) {
+                         inpath = val;
+                 } else if (!strcmp(opt, "-o") || !strcmp(opt, "--out")) {
+                         outpath = val;
+                 } else if (!strcmp(opt, "-N") || !strcmp(opt, "--bytes")) {
+                         if (parse_size(val, &max_bytes) < 0) {
+                                 fprintf(stderr,
+                                         "oftp: bad size '%s'\n", val);
+                                 return 1;
+                         }
+                 } else {
+                         usage();
+                         return 1;
+                 }
+
+                 /* Skip the value that was just consumed. */
+                 --argc;
+                 ++argv;
+         }
+
+         if (server)
+                 return server_main(outpath);
+
+         if (name != NULL)
+                 return client_main(name, inpath, max_bytes);
+
+         usage();
+
+         return 1;
+ }
diff --git a/src/tools/operf/operf.c b/src/tools/operf/operf.c
index 1872b351..0198e871 100644
--- a/src/tools/operf/operf.c
+++ b/src/tools/operf/operf.c
@@ -248,5 +248,5 @@ int main(int argc, char ** argv)
if (ret < 0)
exit(EXIT_FAILURE);
- exit(EXIT_SUCCESS);
+ exit(ret);
}
diff --git a/src/tools/operf/operf_client.c b/src/tools/operf/operf_client.c
index 7e8f1a9b..e478aeff 100644
--- a/src/tools/operf/operf_client.c
+++ b/src/tools/operf/operf_client.c
@@ -185,7 +185,7 @@ int client_main(void)
sigaction(SIGHUP, &sig_act, NULL) ||
sigaction(SIGPIPE, &sig_act, NULL)) {
printf("Failed to install sighandler.\n");
- return -1;
+ return 2;
}
client.sent = 0;
@@ -196,7 +196,7 @@ int client_main(void)
fd = flow_alloc(client.server_name, NULL, NULL);
if (fd < 0) {
printf("Failed to allocate flow.\n");
- return -1;
+ return 2;
}
if (client.conf.test_type == TEST_TYPE_BI)
@@ -207,7 +207,7 @@ int client_main(void)
if (flow_write(fd, &client.conf, sizeof(client.conf)) < 0) {
printf("Failed to send configuration.\n");
flow_dealloc(fd);
- return -1;
+ return 1;
}
sleep(1);
diff --git a/src/tools/oping/oping.c b/src/tools/oping/oping.c
index 763c0d62..10e1e23c 100644
--- a/src/tools/oping/oping.c
+++ b/src/tools/oping/oping.c
@@ -60,7 +60,7 @@
#include <errno.h>
#include <float.h>
-#define OPING_BUF_SIZE 1500
+#define OPING_BUF_SIZE 16384
#define ECHO_REQUEST 0
#define ECHO_REPLY 1
#define OPING_MAX_FLOWS 256
@@ -81,8 +81,9 @@
" -F, --flood-busy Flood with busy-polling (lower latency)\n" \
" -i, --interval Interval (default 1000ms)\n" \
" -n, --server-name Name of the oping server\n" \
-" -q, --qos QoS (raw, best, video, voice, data)\n" \
+" -q, --qos QoS (raw, safe, rt, rt-safe, msg)\n" \
" -s, --size Payload size (B, default 64)\n" \
+" -W, --timeout Per-packet recv timeout, ms (default 2000)\n" \
" -Q, --quiet Only print final statistics\n" \
" -D, --timeofday Print time of day before each line\n" \
"\n" \
@@ -93,9 +94,11 @@ struct {
int interval;
uint32_t count;
int size;
+ int timeout; /* per-packet recv timeout, ms */
bool timestamp;
bool flood;
bool flood_busy;
+ long duration;
qosspec_t qs;
/* stats */
@@ -175,18 +178,20 @@ int main(int argc,
argc--;
argv++;
- client.s_apn = NULL;
- client.interval = 1000;
- client.size = 64;
- client.count = INT_MAX;
- client.timestamp = false;
- client.flood = false;
+ client.s_apn = NULL;
+ client.interval = 1000;
+ client.size = 64;
+ client.count = INT_MAX;
+ client.timeout = 2000;
+ client.timestamp = false;
+ client.flood = false;
client.flood_busy = false;
- client.qs = qos_raw;
- client.quiet = false;
- server.quiet = false;
- server.poll = false;
- server.busy = false;
+ client.duration = 0;
+ client.qs = qos_raw;
+ client.quiet = false;
+ server.quiet = false;
+ server.poll = false;
+ server.busy = false;
while (argc > 0) {
if ((strcmp(*argv, "-i") == 0 ||
@@ -216,6 +221,12 @@ int main(int argc,
argc > 1) {
client.size = strtol(*(++argv), &rem, 10);
--argc;
+ } else if ((strcmp(*argv, "-W") == 0 ||
+ strcmp(*argv, "--timeout") == 0) &&
+ argc > 1) {
+ client.timeout = strtol(*(++argv), &rem, 10);
+ client.timeout *= time_mul(rem);
+ --argc;
} else if ((strcmp(*argv, "-q") == 0 ||
strcmp(*argv, "--qos") == 0) &&
argc > 1) {
@@ -249,23 +260,25 @@ int main(int argc,
}
if (duration > 0) {
- if (client.interval == 0)
+ if (client.flood || client.flood_busy)
+ client.duration = duration;
+ else if (client.interval == 0)
client.count = duration * 10;
else
client.count = duration / client.interval;
}
if (qos != NULL) {
- if (strcmp(qos, "best") == 0)
- client.qs = qos_best_effort;
- else if (strcmp(qos, "raw") == 0)
+ if (strcmp(qos, "raw") == 0)
client.qs = qos_raw;
- else if (strcmp(qos, "video") == 0)
- client.qs = qos_video;
- else if (strcmp(qos, "voice") == 0)
- client.qs = qos_voice;
- else if (strcmp(qos, "data") == 0)
- client.qs = qos_data;
+ else if (strcmp(qos, "safe") == 0)
+ client.qs = qos_raw_safe;
+ else if (strcmp(qos, "rt") == 0)
+ client.qs = qos_rt;
+ else if (strcmp(qos, "rt-safe") == 0)
+ client.qs = qos_rt_safe;
+ else if (strcmp(qos, "msg") == 0)
+ client.qs = qos_msg;
else
printf("Unknown QoS cube, defaulting to raw.\n");
}
@@ -298,7 +311,7 @@ int main(int argc,
if (ret < 0)
exit(EXIT_FAILURE);
- exit(EXIT_SUCCESS);
+ exit(ret);
fail:
usage();
diff --git a/src/tools/oping/oping_client.c b/src/tools/oping/oping_client.c
index 23807f65..4b01315d 100644
--- a/src/tools/oping/oping_client.c
+++ b/src/tools/oping/oping_client.c
@@ -47,6 +47,7 @@ void shutdown_client(int signo, siginfo_t * info, void * c)
case SIGINT:
case SIGTERM:
case SIGHUP:
+ case SIGALRM:
stop = true;
default:
return;
@@ -89,7 +90,7 @@ static void print_rtt(int len, int seq,
void * reader(void * o)
{
- struct timespec timeout = {client.interval / 1000 + 2, 0};
+ struct timespec timeout;
struct timespec now = {0, 0};
struct timespec sent;
@@ -100,6 +101,9 @@ void * reader(void * o)
double ms = 0;
uint32_t exp_id = 0;
+ timeout.tv_sec = client.timeout / 1000;
+ timeout.tv_nsec = (client.timeout % 1000) * MILLION;
+
fccntl(fd, FLOWSRCVTIMEO, &timeout);
while (!stop && client.rcvd != client.count) {
@@ -284,18 +288,15 @@ static int flood_busy_ping(int fd)
msg->tv_sec = sent.tv_sec;
msg->tv_nsec = sent.tv_nsec;
- if (flow_write(fd, buf,
- client.size) < 0) {
- printf("Failed to send "
- "packet.\n");
+ if (flow_write(fd, buf, client.size) < 0) {
+ printf("Failed to send packet.\n");
break;
}
++client.sent;
do {
- n = flow_read(fd, buf,
- OPING_BUF_SIZE);
+ n = flow_read(fd, buf, OPING_BUF_SIZE);
} while (n == -EAGAIN && !stop);
if (n < 0)
@@ -315,9 +316,7 @@ static int flood_busy_ping(int fd)
update_rtt_stats(ms);
if (!client.quiet)
- print_rtt(client.size,
- ntohl(msg->id), ms,
- NULL);
+ print_rtt(client.size, ntohl(msg->id), ms, NULL);
}
return 0;
@@ -371,9 +370,7 @@ static int flood_ping(int fd)
update_rtt_stats(ms);
if (!client.quiet)
- print_rtt(client.size,
- ntohl(msg->id), ms,
- NULL);
+ print_rtt(client.size, ntohl(msg->id), ms, NULL);
}
return 0;
@@ -404,25 +401,34 @@ static int client_main(void)
if (sigaction(SIGINT, &sig_act, NULL) ||
sigaction(SIGTERM, &sig_act, NULL) ||
sigaction(SIGHUP, &sig_act, NULL) ||
- sigaction(SIGPIPE, &sig_act, NULL)) {
+ sigaction(SIGPIPE, &sig_act, NULL) ||
+ sigaction(SIGALRM, &sig_act, NULL)) {
printf("Failed to install sighandler.\n");
- return -1;
+ return 2;
}
if (client_init()) {
printf("Failed to initialize client.\n");
- return -1;
+ return 2;
}
fd = flow_alloc(client.s_apn, &client.qs, NULL);
if (fd < 0) {
printf("Failed to allocate flow: %d.\n", fd);
client_fini();
- return -1;
+ return 2;
}
fccntl(fd, FLOWSFLAGS, FLOWFRDWR | FLOWFRNOPART);
+ if (client.duration > 0) {
+ struct itimerval it;
+ memset(&it, 0, sizeof(it));
+ it.it_value.tv_sec = client.duration / 1000;
+ it.it_value.tv_usec = (client.duration % 1000) * 1000;
+ setitimer(ITIMER_REAL, &it, NULL);
+ }
+
clock_gettime(CLOCK_REALTIME, &tic);
if (client.flood_busy)
@@ -439,5 +445,5 @@ static int client_main(void)
flow_dealloc(fd);
client_fini();
- return 0;
+ return client.rcvd == client.sent ? 0 : 1;
}
diff --git a/src/tools/oping/oping_server.c b/src/tools/oping/oping_server.c
index 33af28c4..e98ca040 100644
--- a/src/tools/oping/oping_server.c
+++ b/src/tools/oping/oping_server.c
@@ -237,6 +237,14 @@ int server_main(void)
return -1;
}
+ if (pthread_mutex_init(&server.lock, NULL)) {
+ fqueue_destroy(server.fq);
+ fset_destroy(server.flows);
+ return -1;
+ }
+
+ memset(server.times, 0, sizeof(server.times));
+
pthread_create(&server.cleaner_pt, NULL, cleaner_thread, NULL);
if (server.busy) {
@@ -255,11 +263,13 @@ int server_main(void)
pthread_cancel(server.cleaner_pt);
- fset_destroy(server.flows);
- fqueue_destroy(server.fq);
-
+ /* Join cancellable threads before tearing down their fset. */
pthread_join(server.server_pt, NULL);
pthread_join(server.cleaner_pt, NULL);
+ pthread_mutex_destroy(&server.lock);
+ fset_destroy(server.flows);
+ fqueue_destroy(server.fq);
+
return 0;
}