summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.ci/woodpecker/10-build.yaml9
-rw-r--r--.gitignore1
-rw-r--r--CMakeLists.txt1
-rw-r--r--cmake/config/ipcp/broadcast.cmake3
-rw-r--r--cmake/config/ipcp/common.cmake10
-rw-r--r--cmake/config/ipcp/eth.cmake4
-rw-r--r--cmake/config/ipcp/local.cmake32
-rw-r--r--cmake/config/ipcp/udp.cmake4
-rw-r--r--cmake/config/ipcp/unicast.cmake2
-rw-r--r--cmake/config/irmd.cmake4
-rw-r--r--cmake/config/lib.cmake43
-rw-r--r--cmake/config/ssm.cmake61
-rw-r--r--cmake/tags.cmake21
-rw-r--r--cmake/utils/CPUUtils.cmake82
-rw-r--r--doc/man/flow_alloc.388
-rw-r--r--doc/man/flow_read.346
-rw-r--r--doc/man/fqueue.321
-rw-r--r--include/ouroboros/atomics.h39
-rw-r--r--include/ouroboros/crc16.h43
-rw-r--r--include/ouroboros/crc64.h44
-rw-r--r--include/ouroboros/crc8.h43
-rw-r--r--include/ouroboros/errno.h1
-rw-r--r--include/ouroboros/fccntl.h13
-rw-r--r--include/ouroboros/flow.h3
-rw-r--r--include/ouroboros/hash.h3
-rw-r--r--include/ouroboros/ipcp-dev.h4
-rw-r--r--include/ouroboros/np1_flow.h2
-rw-r--r--include/ouroboros/qos.h57
-rw-r--r--include/ouroboros/ssm_pk_buff.h24
-rw-r--r--include/ouroboros/ssm_pool.h12
-rw-r--r--include/ouroboros/ssm_rbuff.h4
-rw-r--r--include/ouroboros/time.h6
-rw-r--r--include/ouroboros/tpm.h1
-rw-r--r--include/ouroboros/tw.h77
-rw-r--r--src/ipcpd/broadcast/dt.c2
-rw-r--r--src/ipcpd/broadcast/main.c2
-rw-r--r--src/ipcpd/config.h.in13
-rw-r--r--src/ipcpd/eth/eth.c556
-rw-r--r--src/ipcpd/ipcp.c3
-rw-r--r--src/ipcpd/ipcp.h1
-rw-r--r--src/ipcpd/local/main.c9
-rw-r--r--src/ipcpd/udp/udp.c74
-rw-r--r--src/ipcpd/unicast/dt.c32
-rw-r--r--src/ipcpd/unicast/fa.c27
-rw-r--r--src/ipcpd/unicast/routing/graph.c8
-rw-r--r--src/ipcpd/unicast/routing/link-state.c2
-rw-r--r--src/irmd/main.c13
-rw-r--r--src/irmd/oap.c130
-rw-r--r--src/irmd/oap/auth.c1
-rw-r--r--src/irmd/oap/srv.c23
-rw-r--r--src/irmd/oap/tests/oap_test.c7
-rw-r--r--src/irmd/reg/flow.c2
-rw-r--r--src/irmd/reg/reg.c6
-rw-r--r--src/irmd/reg/tests/flow_test.c20
-rw-r--r--src/irmd/reg/tests/reg_test.c109
-rw-r--r--src/lib/CMakeLists.txt7
-rw-r--r--src/lib/config.h.in25
-rw-r--r--src/lib/crc/crc16.c61
-rw-r--r--src/lib/crc/crc32.c (renamed from src/lib/crc32.c)0
-rw-r--r--src/lib/crc/crc64.c363
-rw-r--r--src/lib/crc/crc8.c62
-rw-r--r--src/lib/crc/tests/CMakeLists.txt21
-rw-r--r--src/lib/crc/tests/crc16_test.c67
-rw-r--r--src/lib/crc/tests/crc32_test.c (renamed from src/lib/tests/crc32_test.c)0
-rw-r--r--src/lib/crc/tests/crc64_test.c126
-rw-r--r--src/lib/crc/tests/crc8_test.c67
-rw-r--r--src/lib/dev.c1329
-rw-r--r--src/lib/frct.c4245
-rw-r--r--src/lib/hash.c30
-rw-r--r--src/lib/pb/ipcp.proto2
-rw-r--r--src/lib/pb/irm.proto6
-rw-r--r--src/lib/pb/model.proto5
-rw-r--r--src/lib/protobuf.c6
-rw-r--r--src/lib/qoscube.c12
-rw-r--r--src/lib/random.c7
-rw-r--r--src/lib/rib.c18
-rw-r--r--src/lib/ssm/flow_set.c24
-rw-r--r--src/lib/ssm/pool.c78
-rw-r--r--src/lib/ssm/rbuff.c60
-rw-r--r--src/lib/ssm/ssm.h.in19
-rw-r--r--src/lib/ssm/tests/pool_sharding_test.c69
-rw-r--r--src/lib/ssm/tests/pool_test.c12
-rw-r--r--src/lib/tests/CMakeLists.txt2
-rw-r--r--src/lib/tests/hash_test.c110
-rw-r--r--src/lib/tests/tpm_test.c2
-rw-r--r--src/lib/tests/tw_test.c663
-rw-r--r--src/lib/timerwheel.c414
-rw-r--r--src/lib/tw.c307
-rw-r--r--src/tools/CMakeLists.txt5
-rw-r--r--src/tools/irm/irm_ipcp_connect.c22
-rw-r--r--src/tools/ocbr/ocbr_client.c36
-rw-r--r--src/tools/oecho/oecho.c8
-rw-r--r--src/tools/oftp/oftp.c441
-rw-r--r--src/tools/operf/operf.c2
-rw-r--r--src/tools/operf/operf_client.c6
-rw-r--r--src/tools/oping/oping.c61
-rw-r--r--src/tools/oping/oping_client.c42
-rw-r--r--src/tools/oping/oping_server.c16
98 files changed, 8748 insertions, 1958 deletions
diff --git a/.ci/woodpecker/10-build.yaml b/.ci/woodpecker/10-build.yaml
index 0a82c469..31b9b9b4 100644
--- a/.ci/woodpecker/10-build.yaml
+++ b/.ci/woodpecker/10-build.yaml
@@ -88,13 +88,6 @@ steps:
done
done
- for rxm_heap in TRUE FALSE; do
- for rxm_block in TRUE FALSE; do
- echo "--- HEAP=$rxm_heap BLOCKING=$rxm_block ---"
- run_build \
- -DRXM_BUFFER_ON_HEAP=$rxm_heap \
- -DRXM_BLOCKING=$rxm_block
- done
- done
+ run_build
diff --git a/.gitignore b/.gitignore
index 43f47a46..b10e8173 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
*~
*#
build/
+/tags
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c886146d..bfabd711 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,5 +69,6 @@ add_subdirectory(src/ipcpd)
add_subdirectory(src/tools)
setup_coverage_target()
include(doc)
+include(tags)
include(install)
diff --git a/cmake/config/ipcp/broadcast.cmake b/cmake/config/ipcp/broadcast.cmake
index 79f41d10..f521ed8e 100644
--- a/cmake/config/ipcp/broadcast.cmake
+++ b/cmake/config/ipcp/broadcast.cmake
@@ -4,3 +4,6 @@ set(IPCP_BROADCAST_TARGET ipcpd-broadcast)
set(IPCP_BROADCAST_MPL 100 CACHE STRING
"Default maximum packet lifetime for the Broadcast IPCP, in ms")
+
+set(IPCP_BROADCAST_MTU 1400 CACHE STRING
+ "Layer MTU advertised by the Broadcast IPCP, in bytes")
diff --git a/cmake/config/ipcp/common.cmake b/cmake/config/ipcp/common.cmake
index ffd5dc32..0c873b76 100644
--- a/cmake/config/ipcp/common.cmake
+++ b/cmake/config/ipcp/common.cmake
@@ -41,3 +41,13 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(IPCP_LINUX_TIMERSLACK_NS 100 CACHE STRING
"Slack value for high resolution timers on Linux systems.")
endif()
+
+# ipcpd-eth flow statistics (requires FUSE - eth.c relies on
+# IPCP_ETH_FLOW_STATS implying HAVE_FUSE for rib_reg/rib_unreg)
+if(HAVE_FUSE)
+ set(IPCP_ETH_FLOW_STATS FALSE CACHE BOOL
+ "Enable ipcpd-eth flow statistics via RIB")
+ if(IPCP_ETH_FLOW_STATS)
+ message(STATUS "ipcpd-eth flow statistics enabled")
+ endif()
+endif()
diff --git a/cmake/config/ipcp/eth.cmake b/cmake/config/ipcp/eth.cmake
index 4b9007d2..d336d647 100644
--- a/cmake/config/ipcp/eth.cmake
+++ b/cmake/config/ipcp/eth.cmake
@@ -10,6 +10,10 @@ set(IPCP_ETH_WR_THR 1 CACHE STRING
"Number of writer threads in Ethernet IPCP")
set(IPCP_ETH_QDISC_BYPASS false CACHE BOOL
"Bypass the Qdisc in the kernel when using raw sockets")
+set(IPCP_ETH_SNDBUF 0 CACHE STRING
+ "Raw socket SO_SNDBUF in bytes; 0 = leave kernel default (wmem_default)")
+set(IPCP_ETH_RCVBUF 0 CACHE STRING
+ "Raw socket SO_RCVBUF in bytes; 0 = leave kernel default (rmem_default)")
set(IPCP_ETH_LO_MTU 9000 CACHE STRING
"Restrict Ethernet MTU over loopback interfaces")
set(IPCP_ETH_MGMT_FRAME_SIZE 9000 CACHE STRING
diff --git a/cmake/config/ipcp/local.cmake b/cmake/config/ipcp/local.cmake
index 88ee8998..70423cd1 100644
--- a/cmake/config/ipcp/local.cmake
+++ b/cmake/config/ipcp/local.cmake
@@ -2,8 +2,38 @@
set(IPCP_LOCAL_TARGET ipcpd-local)
-set(IPCP_LOCAL_MPL 100 CACHE STRING
+set(IPCP_LOCAL_MPL 50 CACHE STRING
"Default maximum packet lifetime for the Local IPCP, in ms")
+set(IPCP_LOCAL_MTU 65000 CACHE STRING
+ "Layer MTU advertised by the Local IPCP, in bytes")
+
set(IPCP_LOCAL_POLLING FALSE CACHE BOOL
"Enable active polling in the Local IPCP for low-latency mode")
+
+# IPCP_LOCAL_MTU must fit in the largest enabled GSPP and PUP class
+# (sender-side allocation: daemons use GSPP, apps use PUP). Reserve a
+# margin for sizeof(struct ssm_pk_buff) + HEADSPACE + TAILSPACE.
+math(EXPR _ssm_pk_overhead
+ "${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE} + 64")
+
+foreach(_pool GSPP PUP)
+ set(_largest 0)
+ foreach(_pair "256;256" "512;512" "1K;1024" "2K;2048" "4K;4096"
+ "16K;16384" "64K;65536" "256K;262144" "1M;1048576")
+ list(GET _pair 0 _name)
+ list(GET _pair 1 _bytes)
+ if(SSM_${_pool}_${_name}_BLOCKS GREATER 0
+ AND _bytes GREATER _largest)
+ set(_largest ${_bytes})
+ endif()
+ endforeach()
+ math(EXPR _avail "${_largest} - ${_ssm_pk_overhead}")
+ if(IPCP_LOCAL_MTU GREATER _avail)
+ message(FATAL_ERROR
+ "IPCP_LOCAL_MTU (${IPCP_LOCAL_MTU}) exceeds largest enabled "
+ "SSM_${_pool} class minus per-block overhead "
+ "(${_largest} - ${_ssm_pk_overhead} = ${_avail} bytes). "
+ "Lower IPCP_LOCAL_MTU or enable a larger SSM_${_pool}_*_BLOCKS.")
+ endif()
+endforeach()
diff --git a/cmake/config/ipcp/udp.cmake b/cmake/config/ipcp/udp.cmake
index 0124c261..af84a844 100644
--- a/cmake/config/ipcp/udp.cmake
+++ b/cmake/config/ipcp/udp.cmake
@@ -10,3 +10,7 @@ set(IPCP_UDP_WR_THR 3 CACHE STRING
"Number of writer threads in UDP IPCPs")
set(IPCP_UDP_MPL 5000 CACHE STRING
"Default maximum packet lifetime for the UDP IPCPs, in ms")
+set(IPCP_UDP4_MTU 1472 CACHE STRING
+ "Fallback UDP4 layer MTU when getsockopt(IP_MTU) is unavailable, in bytes")
+set(IPCP_UDP6_MTU 1452 CACHE STRING
+ "Fallback UDP6 layer MTU when getsockopt(IPV6_MTU) is unavailable, in bytes")
diff --git a/cmake/config/ipcp/unicast.cmake b/cmake/config/ipcp/unicast.cmake
index 3b5b0ce7..b8d4d516 100644
--- a/cmake/config/ipcp/unicast.cmake
+++ b/cmake/config/ipcp/unicast.cmake
@@ -4,6 +4,8 @@ set(IPCP_UNICAST_TARGET ipcpd-unicast)
set(IPCP_UNICAST_MPL 100 CACHE STRING
"Default maximum packet lifetime for the Unicast IPCP, in ms")
+set(IPCP_UNICAST_MTU 1400 CACHE STRING
+ "Layer MTU advertised by the Unicast IPCP, in bytes (TODO: derive per-flow from n-1 path MTU minus DT PCI)")
set(PFT_SIZE 256 CACHE STRING
"Prefix forwarding table size for the Unicast IPCP")
diff --git a/cmake/config/irmd.cmake b/cmake/config/irmd.cmake
index b86a40c5..45d9e73d 100644
--- a/cmake/config/irmd.cmake
+++ b/cmake/config/irmd.cmake
@@ -10,8 +10,8 @@ set(ENROLL_TIMEOUT 20000 CACHE STRING
"Timeout for an IPCP to enroll (ms)")
set(REG_TIMEOUT 20000 CACHE STRING
"Timeout for registering a name (ms)")
-set(QUERY_TIMEOUT 200 CACHE STRING
- "Timeout to query a name with an IPCP (ms)")
+set(QUERY_TIMEOUT 2000 CACHE STRING
+ "Timeout to query a name with an IPCP (ms); must exceed shim retry budget")
set(CONNECT_TIMEOUT 20000 CACHE STRING
"Timeout to connect an IPCP to another IPCP (ms)")
set(FLOW_ALLOC_TIMEOUT 20000 CACHE STRING
diff --git a/cmake/config/lib.cmake b/cmake/config/lib.cmake
index 287f30dc..25130519 100644
--- a/cmake/config/lib.cmake
+++ b/cmake/config/lib.cmake
@@ -4,11 +4,11 @@
# Flow limits
set(SYS_MAX_FLOWS 10240 CACHE STRING
"Maximum number of total flows for this system")
-set(PROG_MAX_FLOWS 4096 CACHE STRING
+set(PROC_MAX_FLOWS 4096 CACHE STRING
"Maximum number of flows in an application")
-set(PROG_RES_FDS 64 CACHE STRING
+set(PROC_RES_FDS 64 CACHE STRING
"Number of reserved flow descriptors per application")
-set(PROG_MAX_FQUEUES 32 CACHE STRING
+set(PROC_MAX_FQUEUES 32 CACHE STRING
"Maximum number of flow sets per application")
# Threading
@@ -28,18 +28,28 @@ set(SOCKET_TIMEOUT 500 CACHE STRING
set(QOS_DISABLE_CRC TRUE CACHE BOOL
"Ignores ber setting on all QoS cubes")
-# Delta-t protocol timers
-set(DELTA_T_MPL 60 CACHE STRING
- "Maximum packet lifetime (s)")
-set(DELTA_T_ACK 10 CACHE STRING
- "Maximum time to acknowledge a packet (s)")
-set(DELTA_T_RTX 120 CACHE STRING
- "Maximum time to retransmit a packet (s)")
+include(utils/CPUUtils)
+detect_pclmul()
+detect_pmull()
+if(HAVE_PCLMUL)
+ message(STATUS "CRC-64/NVMe backend: PCLMUL (x86 SSE4.1+PCLMUL)")
+elseif(HAVE_PMULL)
+ message(STATUS "CRC-64/NVMe backend: PMULL (aarch64 crypto)")
+else()
+ message(STATUS "CRC-64/NVMe backend: byte table (no acceleration)")
+endif()
+
+# Delta-t protocol timers (Watson bound: 3*MPL + A + R).
+# MPL is reported per IPCP (IPCP_*_MPL); A and R are FRCT-wide.
+set(DELTA_T_ACK 1000 CACHE STRING
+ "Maximum time to acknowledge a packet (ms)")
+set(DELTA_T_RTX 30000 CACHE STRING
+ "Maximum time to retransmit a packet (ms)")
# FRCT configuration
-set(FRCT_REORDER_QUEUE_SIZE 256 CACHE STRING
+set(FRCT_REORDER_QUEUE_SIZE 128 CACHE STRING
"Size of the reordering queue, must be a power of 2")
-set(FRCT_START_WINDOW 64 CACHE STRING
+set(FRCT_START_WINDOW 128 CACHE STRING
"Start window, must be a power of 2")
set(FRCT_LINUX_RTT_ESTIMATOR TRUE CACHE BOOL
"Use Linux RTT estimator formula instead of the TCP RFC formula")
@@ -48,15 +58,13 @@ set(FRCT_RTO_MDEV_MULTIPLIER 2 CACHE STRING
set(FRCT_RTO_INC_FACTOR 0 CACHE STRING
"Divisor for RTO increase after timeout: RTO += RTX >> X, 0: Karn/Partridge")
set(FRCT_RTO_MIN 250 CACHE STRING
- "Minimum Retransmission Timeout (RTO) for FRCT (us)")
+ "Hard floor for Retransmission Timeout (RTO) for FRCT (us)")
set(FRCT_TICK_TIME 5000 CACHE STRING
"Tick time for FRCT activity (retransmission, acknowledgments) (us)")
+set(FRCT_DEBUG_STDOUT FALSE CACHE BOOL
+ "Print FRCT final counters to stdout at flow teardown")
# Retransmission (RXM) configuration
-set(RXM_BUFFER_ON_HEAP FALSE CACHE BOOL
- "Store packets for retransmission on the heap instead of in packet buffer")
-set(RXM_BLOCKING TRUE CACHE BOOL
- "Use blocking writes for retransmission")
set(RXM_MIN_RESOLUTION 20 CACHE STRING
"Minimum retransmission delay (ns), as a power to 2")
set(RXM_WHEEL_MULTIPLIER 4 CACHE STRING
@@ -92,3 +100,4 @@ if(HAVE_FUSE)
message(STATUS "Application flow statistics disabled")
endif()
endif()
+
diff --git a/cmake/config/ssm.cmake b/cmake/config/ssm.cmake
index c1f34655..913396ec 100644
--- a/cmake/config/ssm.cmake
+++ b/cmake/config/ssm.cmake
@@ -15,14 +15,22 @@ set(SSM_PUP_NAME_FMT "/${SSM_PREFIX}.pup.%d" CACHE INTERNAL
# Packet buffer configuration
set(SSM_POOL_NAME "/${SHM_PREFIX}.pool" CACHE INTERNAL
"Name for the main POSIX shared memory pool")
-set(SSM_POOL_BLOCKS 16384 CACHE STRING
- "Number of blocks in SSM packet pool, must be a power of 2")
set(SSM_PK_BUFF_HEADSPACE 256 CACHE STRING
"Bytes of headspace to reserve for future headers")
set(SSM_PK_BUFF_TAILSPACE 32 CACHE STRING
"Bytes of tailspace to reserve for future tails")
-set(SSM_RBUFF_SIZE 1024 CACHE STRING
+# Sized to absorb burst arrivals from fragmented SDUs without
+# overflowing at the eth->FRCT boundary. Must hold at least one
+# full FRCT reorder window plus margin for transient app-thread
+# unavailability; 4x FRCT_REORDER_QUEUE_SIZE leaves comfortable
+# burst headroom. Floor at 1024 for small RQ configs.
+math(EXPR _SSM_RBUFF_DEFAULT "${FRCT_REORDER_QUEUE_SIZE} * 4")
+if(_SSM_RBUFF_DEFAULT LESS 1024)
+ set(_SSM_RBUFF_DEFAULT 1024)
+endif()
+set(SSM_RBUFF_SIZE ${_SSM_RBUFF_DEFAULT} CACHE STRING
"Number of blocks in rbuff buffer, must be a power of 2")
+unset(_SSM_RBUFF_DEFAULT)
set(SSM_RBUFF_PREFIX "/${SHM_PREFIX}.rbuff." CACHE INTERNAL
"Prefix for rbuff POSIX shared memory filenames")
set(SSM_FLOW_SET_PREFIX "/${SHM_PREFIX}.set." CACHE INTERNAL
@@ -36,7 +44,7 @@ set(SSM_POOL_SHARDS 4 CACHE STRING
# Shared by all processes in 'ouroboros' group (~60 MB total)
set(SSM_GSPP_256_BLOCKS 1024 CACHE STRING
"GSPP: Number of 256B blocks")
-set(SSM_GSPP_512_BLOCKS 768 CACHE STRING
+set(SSM_GSPP_512_BLOCKS 2048 CACHE STRING
"GSPP: Number of 512B blocks")
set(SSM_GSPP_1K_BLOCKS 512 CACHE STRING
"GSPP: Number of 1KB blocks")
@@ -55,13 +63,13 @@ set(SSM_GSPP_1M_BLOCKS 16 CACHE STRING
# Per-User Pool (PUP) - for unprivileged applications
# Each unprivileged app gets its own smaller pool (~7.5 MB total)
-set(SSM_PUP_256_BLOCKS 128 CACHE STRING
+set(SSM_PUP_256_BLOCKS 512 CACHE STRING
"PUP: Number of 256B blocks")
-set(SSM_PUP_512_BLOCKS 96 CACHE STRING
+set(SSM_PUP_512_BLOCKS 512 CACHE STRING
"PUP: Number of 512B blocks")
-set(SSM_PUP_1K_BLOCKS 64 CACHE STRING
+set(SSM_PUP_1K_BLOCKS 512 CACHE STRING
"PUP: Number of 1KB blocks")
-set(SSM_PUP_2K_BLOCKS 48 CACHE STRING
+set(SSM_PUP_2K_BLOCKS 512 CACHE STRING
"PUP: Number of 2KB blocks")
set(SSM_PUP_4K_BLOCKS 32 CACHE STRING
"PUP: Number of 4KB blocks")
@@ -74,6 +82,23 @@ set(SSM_PUP_256K_BLOCKS 2 CACHE STRING
set(SSM_PUP_1M_BLOCKS 0 CACHE STRING
"PUP: Number of 1MB blocks")
+# Zero classes too small for spb header + HEADSPACE + TAILSPACE + 1 B.
+math(EXPR _SSM_MIN_USEFUL_CLASS
+ "32 + ${SSM_PK_BUFF_HEADSPACE} + ${SSM_PK_BUFF_TAILSPACE}")
+foreach(_pair "256:256" "512:512" "1K:1024" "2K:2048")
+ string(REPLACE ":" ";" _p "${_pair}")
+ list(GET _p 0 _suffix)
+ list(GET _p 1 _size)
+ if(_size LESS _SSM_MIN_USEFUL_CLASS)
+ set(SSM_GSPP_${_suffix}_BLOCKS 0)
+ set(SSM_PUP_${_suffix}_BLOCKS 0)
+ endif()
+endforeach()
+unset(_SSM_MIN_USEFUL_CLASS)
+unset(_p)
+unset(_suffix)
+unset(_size)
+
# SSM pool size calculations
include(utils/HumanReadable)
@@ -129,3 +154,23 @@ message(STATUS " Blocks: ${SSM_PUP_256_BLOCKS}, ${SSM_PUP_512_BLOCKS}, "
"${SSM_PUP_1K_BLOCKS}, ${SSM_PUP_2K_BLOCKS}, ${SSM_PUP_4K_BLOCKS}, "
"${SSM_PUP_16K_BLOCKS}, ${SSM_PUP_64K_BLOCKS}, ${SSM_PUP_256K_BLOCKS}, "
"${SSM_PUP_1M_BLOCKS}")
+
+# FRCT reorder queue must fit in each enabled 256B-2KB size class. If
+# FRCT_REORDER_QUEUE_SIZE >= a class's block count, the receiver
+# advertises a window the pool cannot back; np1_flow_write fails under
+# load and one dropped fragment wedges the flow. Zeroed classes are skipped.
+foreach(_class 256 512 1K 2K)
+ if(SSM_PUP_${_class}_BLOCKS GREATER 0
+ AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_PUP_${_class}_BLOCKS)
+ message(FATAL_ERROR
+ "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be "
+ "< SSM_PUP_${_class}_BLOCKS (${SSM_PUP_${_class}_BLOCKS}): "
+ "the FC window cannot exceed the pool that backs OOO stashing.")
+ endif()
+ if(SSM_GSPP_${_class}_BLOCKS GREATER 0
+ AND NOT FRCT_REORDER_QUEUE_SIZE LESS SSM_GSPP_${_class}_BLOCKS)
+ message(FATAL_ERROR
+ "FRCT_REORDER_QUEUE_SIZE (${FRCT_REORDER_QUEUE_SIZE}) must be "
+ "< SSM_GSPP_${_class}_BLOCKS (${SSM_GSPP_${_class}_BLOCKS}).")
+ endif()
+endforeach()
diff --git a/cmake/tags.cmake b/cmake/tags.cmake
new file mode 100644
index 00000000..00e6f0d6
--- /dev/null
+++ b/cmake/tags.cmake
@@ -0,0 +1,21 @@
+find_program(CTAGS_EXECUTABLE
+ NAMES ctags-universal universal-ctags ctags
+ DOC "Generate a ctags index for source navigation: make tags")
+mark_as_advanced(CTAGS_EXECUTABLE)
+
+if(CTAGS_EXECUTABLE)
+ add_custom_target(tags
+ COMMAND ${CTAGS_EXECUTABLE}
+ -R
+ --languages=C
+ --c-kinds=+p
+ --fields=+S
+ --exclude=build
+ --exclude=build-claude
+ --exclude=build_tmp
+ --exclude=.git
+ -f ${CMAKE_SOURCE_DIR}/tags
+ ${CMAKE_SOURCE_DIR}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ COMMENT "Generating ctags index at ${CMAKE_SOURCE_DIR}/tags")
+endif()
diff --git a/cmake/utils/CPUUtils.cmake b/cmake/utils/CPUUtils.cmake
new file mode 100644
index 00000000..8ca7683a
--- /dev/null
+++ b/cmake/utils/CPUUtils.cmake
@@ -0,0 +1,82 @@
+include(CheckCSourceRuns)
+
+# Compile + run a probe so we only enable a feature the host CPU
+# actually implements (toolchains accept flags the silicon may lack).
+# Cross-compile without an emulator: feature off.
+function(detect_cpu_feature _result_var _flags _source)
+ set(_save_flags "${CMAKE_REQUIRED_FLAGS}")
+ set(_save_quiet "${CMAKE_REQUIRED_QUIET}")
+ set(CMAKE_REQUIRED_FLAGS "${_save_flags} ${_flags}")
+ set(CMAKE_REQUIRED_QUIET TRUE)
+ if(CMAKE_CROSSCOMPILING AND NOT CMAKE_CROSSCOMPILING_EMULATOR)
+ set(${_result_var} FALSE CACHE INTERNAL
+ "${_result_var} (cross-compile without emulator: off)")
+ else()
+ check_c_source_runs("${_source}" ${_result_var})
+ endif()
+ set(CMAKE_REQUIRED_FLAGS "${_save_flags}")
+ set(CMAKE_REQUIRED_QUIET "${_save_quiet}")
+endfunction()
+
+# x86 PCLMULQDQ + SSE4.1. argc-derived input defeats constant folding;
+# SIGILL handler exits cleanly so the kernel skips the core dump.
+function(detect_pclmul)
+ detect_cpu_feature(_HAVE_PCLMUL "-mpclmul"
+"#include <wmmintrin.h>
+#include <signal.h>
+#include <unistd.h>
+static void on_sigill(int sig) { (void) sig; _exit(1); }
+int main(int argc, char ** argv) {
+ __m128i a;
+ __m128i b;
+ (void) argv;
+ signal(SIGILL, on_sigill);
+ a = _mm_set1_epi32(argc);
+ b = _mm_clmulepi64_si128(a, a, 0);
+ return _mm_cvtsi128_si32(b) & 0;
+}")
+ detect_cpu_feature(_HAVE_SSE41 "-msse4.1"
+"#include <smmintrin.h>
+#include <signal.h>
+#include <unistd.h>
+static void on_sigill(int sig) { (void) sig; _exit(1); }
+int main(int argc, char ** argv) {
+ __m128i a;
+ (void) argv;
+ signal(SIGILL, on_sigill);
+ a = _mm_set1_epi32(argc);
+ return _mm_extract_epi32(a, 0) & 0;
+}")
+ if(_HAVE_PCLMUL AND _HAVE_SSE41)
+ set(HAVE_PCLMUL TRUE CACHE INTERNAL
+ "x86 PCLMUL + SSE4.1 intrinsics available")
+ else()
+ unset(HAVE_PCLMUL CACHE)
+ endif()
+endfunction()
+
+# aarch64 FEAT_PMULL (vmull_p64). Pi 4's BCM2711 accepts +crypto at
+# compile time but lacks the hardware — the runtime probe catches that.
+function(detect_pmull)
+ detect_cpu_feature(_HAVE_PMULL "-march=armv8-a+crypto"
+"#include <arm_neon.h>
+#include <signal.h>
+#include <stdint.h>
+#include <unistd.h>
+static void on_sigill(int sig) { (void) sig; _exit(1); }
+int main(int argc, char ** argv) {
+ poly64_t a;
+ poly128_t c;
+ (void) argv;
+ signal(SIGILL, on_sigill);
+ a = (poly64_t) (uint64_t) argc;
+ c = vmull_p64(a, a);
+ return (int) (vgetq_lane_u64((uint64x2_t) c, 0) & 0);
+}")
+ if(_HAVE_PMULL)
+ set(HAVE_PMULL TRUE CACHE INTERNAL
+ "aarch64 PMULL intrinsics available")
+ else()
+ unset(HAVE_PMULL CACHE)
+ endif()
+endfunction()
diff --git a/doc/man/flow_alloc.3 b/doc/man/flow_alloc.3
index dbe5323c..8a9b5f5b 100644
--- a/doc/man/flow_alloc.3
+++ b/doc/man/flow_alloc.3
@@ -62,10 +62,60 @@ The \fBflow_dealloc\fR() function will release any resources
associated with the flow. This call may block and keep reliable flows
active until all packets are acknowledged.
-A \fBqosspec_t\fR specifies the following QoS characteristics of a
-flow:
-
-TODO: specify a qosspec_t
+A \fBqosspec_t\fR specifies the QoS characteristics of a flow.
+The fields are:
+
+.TP
+\fBdelay\fR (ms)
+Maximum one-way delay.
+.TP
+\fBbandwidth\fR (bits/s)
+Minimum bandwidth.
+.TP
+\fBavailability\fR
+Class of 9s (e.g. 5 = 99.999%).
+.TP
+\fBloss\fR
+Tolerated packet loss; 0 selects reliable delivery.
+.TP
+\fBber\fR
+Tolerated bit error rate (errors per billion bits); 0 enables an
+end-to-end integrity check (corrupted packets are dropped).
+.TP
+\fBservice\fR
+Framing / reliability class: \fBSVC_RAW\fR (0) disables FRCT;
+\fBSVC_MESSAGE\fR (1) preserves SDU boundaries; \fBSVC_STREAM\fR (2) is
+a byte stream with no SDU boundaries. \fBSVC_STREAM\fR requires
+\fIloss\fR = 0; otherwise
+\fBflow_alloc\fR()/\fBflow_accept\fR() returns \fB-EINVAL\fR.
+.TP
+\fBmax_gap\fR (ms)
+Maximum tolerated inter-packet gap. Packets exceeding the gap
+budget are dropped under the real-time cubes.
+.TP
+\fBtimeout\fR (ms)
+Peer-liveness timeout; 0 disables. Only applies when FRCT is
+enabled (service > 0).
+
+.PP
+The library provides predefined cubes:
+
+.TP
+\fBqos_raw\fR
+No guarantees, no integrity check.
+.TP
+\fBqos_raw_safe\fR
+Best-effort with end-to-end integrity (ber = 0).
+.TP
+\fBqos_rt\fR / \fBqos_rt_safe\fR
+Real-time messages, optimised for latency over reliability;
+\fBqos_rt_safe\fR adds an end-to-end integrity check.
+.TP
+\fBqos_msg\fR
+Reliable, SDU-preserving delivery.
+.TP
+\fBqos_stream\fR
+Reliable byte stream; no SDU boundaries are preserved.
.SH RETURN VALUE
@@ -117,13 +167,39 @@ _
\fBflow_dealloc\fR() & Thread safety & MT-Safe
.TE
+.SH NOTES
+The returned file descriptor is subject to a single-reader and
+single-writer discipline \(em at most one thread may call
+.BR flow_read ()
+(or monitor the fd via
+.BR fevent ())
+and at most one thread may call
+.BR flow_write ()
+concurrently. See
+.BR flow_read (3),
+.BR flow_write (3),
+and
+.BR fevent (3)
+for details.
+.PP
+.BR flow_dealloc ()
+must not be called concurrently with any thread that is inside
+.BR flow_read (),
+.BR flow_write (),
+.BR fevent (),
+or any other Ouroboros library call on the same fd; the result is
+undefined behaviour. Applications must serialise teardown with
+in-flight use, e.g. by signalling worker threads to drop the fd
+before calling
+.BR flow_dealloc ().
+
.SH TERMINOLOGY
Please see \fBouroboros-glossary\fR(7).
.SH SEE ALSO
-.BR fccntl "(3), " flow_read "(3), " fqueue "(3), " fset "(3), " \
-ouroboros (8)
+.BR fccntl "(3), " fevent "(3), " flow_read "(3), " flow_write "(3), " \
+fqueue "(3), " fset "(3), " ouroboros (8)
.SH COLOPHON
This page is part of the Ouroboros project, found at
diff --git a/doc/man/flow_read.3 b/doc/man/flow_read.3
index acc1f61e..d4a5e883 100644
--- a/doc/man/flow_read.3
+++ b/doc/man/flow_read.3
@@ -39,8 +39,7 @@ end of the datagram.
On success, \fBflow_write\fR() returns the number of bytes written. On
failure, a negative value indicating the error will be returned.
-Partial writes needs to be explicitly enabled. Passing a
-NULL pointer for \fIbuf\fR returns 0 with no other effects.
+Passing a NULL pointer for \fIbuf\fR returns 0 with no other effects.
.SH ERRORS
.B -EINVAL
@@ -62,7 +61,8 @@ The flow has been reported down.
The flow's peer is unresponsive (flow timed out).
.B -EMSGSIZE
-The buffer was too large to be written.
+The received packet does not fit in the caller's buffer and partial
+reads are disabled (see \fBfccntl\fR(3), \fBFLOWFRNOPART\fR).
.SH ATTRIBUTES
@@ -74,11 +74,47 @@ LB|LB|LB
L|L|L.
Interface & Attribute & Value
_
-\fBflow_read\fR() & Thread safety & MT-Safe
+\fBflow_read\fR() & Thread safety & MT-Safe race:fd
_
-\fBflow_write\fR() & Thread safety & MT-Safe
+\fBflow_write\fR() & Thread safety & MT-Safe race:fd
.TE
+.SH THREAD SAFETY
+Only one thread may call
+.BR flow_read ()
+on a given file descriptor at any time. Partial-read state kept
+across calls assumes a single logical reader; two threads racing
+.BR flow_read ()
+on the same fd is undefined behaviour. Likewise, only one thread
+may call
+.BR flow_write ()
+on a given fd at a time; having two writer threads on the same fd is
+undefined behaviour.
+.PP
+Combining a writer thread with a reader thread (one thread calling
+.BR flow_write (),
+another calling
+.BR flow_read ()
+or
+.BR fevent ())
+is permitted and safe. The writer does not need a dedicated reader
+thread \(em when the FRCT send window fills,
+.BR flow_write ()
+drives its own inbound rx draining internally to process incoming
+ACKs and reopen the window, clamped by the caller's
+.BR fccntl (3)
+send-timeout if any.
+.PP
+Monitoring the same fd via
+.BR fevent ()
+from a different thread is well-defined but racy: events reported
+by
+.BR fevent ()
+may already have been consumed by the racing
+.BR flow_read (),
+so the second reader may then block. See
+.BR fevent (3).
+
.SH TERMINOLOGY
Please see \fBouroboros-glossary\fR(7).
diff --git a/doc/man/fqueue.3 b/doc/man/fqueue.3
index 72a0bc25..f2fb8c9f 100644
--- a/doc/man/fqueue.3
+++ b/doc/man/fqueue.3
@@ -116,6 +116,27 @@ _
\fBfevent\fR() & Thread safety & MT-Safe
.TE
+.SH THREAD SAFETY
+.BR fevent ()
+and
+.BR flow_read ()
+used on the same fd from distinct threads are well-defined but racy:
+events reported by
+.BR fevent ()
+may already have been consumed by the racing
+.BR flow_read (),
+so the reader may then block. This is the same pattern as
+.BR select (2)
++
+.BR read (2)
+from distinct threads. The intended pattern is that the thread
+invoking
+.BR fevent ()
+is the same thread that calls
+.BR flow_read ()
+on the fds returned by
+.BR fqueue_next ().
+
.SH TERMINOLOGY
Please see \fBouroboros-glossary\fR(7).
diff --git a/include/ouroboros/atomics.h b/include/ouroboros/atomics.h
new file mode 100644
index 00000000..8e667522
--- /dev/null
+++ b/include/ouroboros/atomics.h
@@ -0,0 +1,39 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Atomic helpers
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#ifndef OUROBOROS_LIB_ATOMICS_H
+#define OUROBOROS_LIB_ATOMICS_H
+
+#define LOAD_RELAXED(p) (__atomic_load_n(p, __ATOMIC_RELAXED))
+#define LOAD_ACQUIRE(p) (__atomic_load_n(p, __ATOMIC_ACQUIRE))
+#define LOAD(p) (__atomic_load_n(p, __ATOMIC_SEQ_CST))
+
+#define STORE_RELAXED(p, v) (__atomic_store_n(p, v, __ATOMIC_RELAXED))
+#define STORE_RELEASE(p, v) (__atomic_store_n(p, v, __ATOMIC_RELEASE))
+#define STORE(p, v) (__atomic_store_n(p, v, __ATOMIC_SEQ_CST))
+
+#define FETCH_ADD_RELAXED(p, v) (__atomic_fetch_add(p, v, __ATOMIC_RELAXED))
+#define FETCH_SUB_RELAXED(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_RELAXED))
+#define FETCH_ADD(p, v) (__atomic_fetch_add(p, v, __ATOMIC_SEQ_CST))
+#define FETCH_SUB(p, v) (__atomic_fetch_sub(p, v, __ATOMIC_SEQ_CST))
+
+#endif /* OUROBOROS_LIB_ATOMICS_H */
diff --git a/include/ouroboros/crc16.h b/include/ouroboros/crc16.h
new file mode 100644
index 00000000..df4d4f57
--- /dev/null
+++ b/include/ouroboros/crc16.h
@@ -0,0 +1,43 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * Polynomial: ITU-T V.41 / CCITT-FALSE, CRC-16/IBM-3740.
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue
+ *
+ * Intended for medium-size header check sequences (typ. <= 4 KiB).
+ * Hamming distance HD=4 up to 32751 message bits.
+ */
+
+#ifndef OUROBOROS_LIB_CRC16_H
+#define OUROBOROS_LIB_CRC16_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC16_HASH_LEN 2
+
+void crc16_ccitt_false(uint16_t * crc,
+ const void * buf,
+ size_t len);
+
+#endif /* OUROBOROS_LIB_CRC16_H */
diff --git a/include/ouroboros/crc64.h b/include/ouroboros/crc64.h
new file mode 100644
index 00000000..f6e407a0
--- /dev/null
+++ b/include/ouroboros/crc64.h
@@ -0,0 +1,44 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 64-bit Cyclic Redundancy Check (NVMe variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * Polynomial: NVM Express Base Spec, CRC-64/NVMe.
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue
+ *
+ * Fold-by-N (PCLMUL/PMULL) algorithm:
+ * V. Gopal et al., "Fast CRC Computation for Generic Polynomials
+ * Using PCLMULQDQ", Intel white paper, 2009.
+ */
+
+#ifndef OUROBOROS_LIB_CRC64_H
+#define OUROBOROS_LIB_CRC64_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC64_HASH_LEN 8
+
+void crc64_nvme(uint64_t * crc,
+ const void * buf,
+ size_t len);
+
+#endif /* OUROBOROS_LIB_CRC64_H */
diff --git a/include/ouroboros/crc8.h b/include/ouroboros/crc8.h
new file mode 100644
index 00000000..97502a25
--- /dev/null
+++ b/include/ouroboros/crc8.h
@@ -0,0 +1,43 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 8-bit Cyclic Redundancy Check (AUTOSAR variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * Polynomial: AUTOSAR SWS_CRC, CRC-8/AUTOSAR.
+ * reveng catalog: https://reveng.sourceforge.io/crc-catalogue
+ *
+ * Intended for short header check sequences (typ. <= 32 bytes).
+ * Hamming distance HD=4 up to 119 message bits, HD=3 up to 247.
+ */
+
+#ifndef OUROBOROS_LIB_CRC8_H
+#define OUROBOROS_LIB_CRC8_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC8_HASH_LEN 1
+
+void crc8_autosar(uint8_t * crc,
+ const void * buf,
+ size_t len);
+
+#endif /* OUROBOROS_LIB_CRC8_H */
diff --git a/include/ouroboros/errno.h b/include/ouroboros/errno.h
index 9d84df88..eedd978f 100644
--- a/include/ouroboros/errno.h
+++ b/include/ouroboros/errno.h
@@ -37,5 +37,6 @@
#ifndef EAUTH /* Exists on BSD */
#define EAUTH 1009 /* Authentication error */
#endif
+#define EREPLAY 1010 /* OAP replay detected */
#endif /* OUROBOROS_ERRNO_H */
diff --git a/include/ouroboros/fccntl.h b/include/ouroboros/fccntl.h
index d3baea8f..e91e91dd 100644
--- a/include/ouroboros/fccntl.h
+++ b/include/ouroboros/fccntl.h
@@ -50,6 +50,12 @@
#define FRCTFRESCNTL 00000002 /* Feedback from receiver */
#define FRCTFLINGER 00000004 /* Send unsent data */
+/* All user-visible bits (readable via FRCTGFLAGS). */
+#define FRCTFMASK (FRCTFRTX | FRCTFRESCNTL | FRCTFLINGER)
+
+/* Subset writable via FRCTSFLAGS; FRCTFRTX is fixed at flow_alloc. */
+#define FRCTFSETMASK (FRCTFRESCNTL | FRCTFLINGER)
+
/* Flow operations */
#define FLOWSRCVTIMEO 00000001 /* Set read timeout */
#define FLOWGRCVTIMEO 00000002 /* Get read timeout */
@@ -60,10 +66,17 @@
#define FLOWGFLAGS 00000007 /* Get flags for flow */
#define FLOWGRXQLEN 00000010 /* Get queue length on rx */
#define FLOWGTXQLEN 00000011 /* Get queue length on tx */
+#define FLOWGMTU 00000012 /* Get per-packet MTU */
/* FRCT operations */
#define FRCTSFLAGS 00001000 /* Set flags for FRCT */
#define FRCTGFLAGS 00002000 /* Get flags for FRCT */
+#define FRCTSMAXSDU 00003000 /* Set max recv SDU size */
+#define FRCTGMAXSDU 00004000 /* Get max recv SDU size */
+#define FRCTSRRINGSZ 00005000 /* Set stream rcv ring sz */
+#define FRCTGRRINGSZ 00006000 /* Get stream rcv ring sz */
+#define FRCTSRTOMIN 00007000 /* Set RTO floor (ns) */
+#define FRCTGRTOMIN 00010000 /* Get RTO floor (ns) */
__BEGIN_DECLS
diff --git a/include/ouroboros/flow.h b/include/ouroboros/flow.h
index fe4582e7..8b096410 100644
--- a/include/ouroboros/flow.h
+++ b/include/ouroboros/flow.h
@@ -25,6 +25,7 @@
#include <ouroboros/qos.h>
+#include <stdint.h>
#include <sys/types.h>
#define SYMMKEYSZ 32
@@ -50,6 +51,8 @@ struct flow_info {
time_t mpl;
+ uint32_t mtu; /* n-1 layer MTU in bytes, 0 = unknown */
+
struct qos_spec qs;
enum flow_state state;
diff --git a/include/ouroboros/hash.h b/include/ouroboros/hash.h
index 0838df97..17ab98ac 100644
--- a/include/ouroboros/hash.h
+++ b/include/ouroboros/hash.h
@@ -38,6 +38,9 @@ enum hash_algo {
HASH_SHA3_512 = DIR_HASH_SHA3_512,
HASH_CRC32,
HASH_MD5,
+ HASH_CRC64,
+ HASH_CRC8,
+ HASH_CRC16,
};
#define HASH_FMT32 "%02x%02x%02x%02x"
diff --git a/include/ouroboros/ipcp-dev.h b/include/ouroboros/ipcp-dev.h
index 93236271..d23f757e 100644
--- a/include/ouroboros/ipcp-dev.h
+++ b/include/ouroboros/ipcp-dev.h
@@ -28,16 +28,20 @@
#include <ouroboros/ssm_pool.h>
#include <ouroboros/utils.h>
+#include <stdint.h>
+
int ipcp_create_r(const struct ipcp_info * info);
int ipcp_flow_req_arr(const buffer_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data);
int ipcp_flow_alloc_reply(int fd,
int response,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data);
int ipcp_flow_read(int fd,
diff --git a/include/ouroboros/np1_flow.h b/include/ouroboros/np1_flow.h
index 6f341cfc..309d01c2 100644
--- a/include/ouroboros/np1_flow.h
+++ b/include/ouroboros/np1_flow.h
@@ -37,12 +37,12 @@ int np1_flow_dealloc(int flow_id,
time_t timeo);
static const qosspec_t qos_np1 = {
+ .service = SVC_RAW,
.delay = UINT32_MAX,
.bandwidth = 0,
.availability = 0,
.loss = UINT32_MAX,
.ber = UINT32_MAX,
- .in_order = 0,
.max_gap = UINT32_MAX,
.timeout = 0
};
diff --git a/include/ouroboros/qos.h b/include/ouroboros/qos.h
index 6b0bbc17..7980ad00 100644
--- a/include/ouroboros/qos.h
+++ b/include/ouroboros/qos.h
@@ -28,79 +28,88 @@
#define DEFAULT_PEER_TIMEOUT 120000
+/* qos_spec.service: framing / reliability class. */
+enum qos_service {
+ SVC_RAW = 0, /* No FRCT; best-effort raw messages */
+ SVC_MESSAGE = 1, /* FRCT, reliable ordered messages */
+ SVC_STREAM = 2, /* FRCT, reliable ordered byte stream */
+};
+
typedef struct qos_spec {
+ uint8_t service; /* enum qos_service; gates FRCT (>0). */
uint32_t delay; /* In ms. */
uint64_t bandwidth; /* In bits/s. */
uint8_t availability; /* Class of 9s. */
uint32_t loss; /* Packet loss. */
uint32_t ber; /* Bit error rate, errors per billion bits. */
- uint8_t in_order; /* In-order delivery, enables FRCT. */
uint32_t max_gap; /* In ms. */
uint32_t timeout; /* Peer timeout time, in ms, 0 = no timeout. */
} qosspec_t;
+/* "_safe" = integrity check (ber=0). "rt" = latency over reliability. */
+
static const qosspec_t qos_raw = {
+ .service = SVC_RAW,
.delay = UINT32_MAX,
.bandwidth = 0,
.availability = 0,
.loss = 1,
.ber = 1,
- .in_order = 0,
.max_gap = UINT32_MAX,
- .timeout = DEFAULT_PEER_TIMEOUT
+ .timeout = 0
};
-static const qosspec_t qos_raw_no_errors = {
+static const qosspec_t qos_raw_safe = {
+ .service = SVC_RAW,
.delay = UINT32_MAX,
.bandwidth = 0,
.availability = 0,
.loss = 1,
.ber = 0,
- .in_order = 0,
.max_gap = UINT32_MAX,
- .timeout = DEFAULT_PEER_TIMEOUT
+ .timeout = 0
};
-static const qosspec_t qos_best_effort = {
- .delay = UINT32_MAX,
- .bandwidth = 0,
- .availability = 0,
+static const qosspec_t qos_rt = {
+ .service = SVC_MESSAGE,
+ .delay = 100,
+ .bandwidth = UINT64_MAX,
+ .availability = 3,
.loss = 1,
- .ber = 0,
- .in_order = 1,
- .max_gap = UINT32_MAX,
+ .ber = 1,
+ .max_gap = 100,
.timeout = DEFAULT_PEER_TIMEOUT
};
-static const qosspec_t qos_video = {
+static const qosspec_t qos_rt_safe = {
+ .service = SVC_MESSAGE,
.delay = 100,
.bandwidth = UINT64_MAX,
.availability = 3,
.loss = 1,
.ber = 0,
- .in_order = 1,
.max_gap = 100,
.timeout = DEFAULT_PEER_TIMEOUT
};
-static const qosspec_t qos_voice = {
- .delay = 50,
- .bandwidth = 100000,
- .availability = 5,
- .loss = 1,
+static const qosspec_t qos_msg = {
+ .service = SVC_MESSAGE,
+ .delay = 1000,
+ .bandwidth = 0,
+ .availability = 0,
+ .loss = 0,
.ber = 0,
- .in_order = 1,
- .max_gap = 50,
+ .max_gap = 2000,
.timeout = DEFAULT_PEER_TIMEOUT
};
-static const qosspec_t qos_data = {
+static const qosspec_t qos_stream = {
+ .service = SVC_STREAM,
.delay = 1000,
.bandwidth = 0,
.availability = 0,
.loss = 0,
.ber = 0,
- .in_order = 1,
.max_gap = 2000,
.timeout = DEFAULT_PEER_TIMEOUT
};
diff --git a/include/ouroboros/ssm_pk_buff.h b/include/ouroboros/ssm_pk_buff.h
index 1b779ad1..1d5597c7 100644
--- a/include/ouroboros/ssm_pk_buff.h
+++ b/include/ouroboros/ssm_pk_buff.h
@@ -28,25 +28,25 @@
struct ssm_pk_buff;
-size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb);
+size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb);
-uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb);
+uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb);
-uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb);
+uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb);
-size_t ssm_pk_buff_len(struct ssm_pk_buff * spb);
+size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb);
-uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb,
+ size_t size);
-uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb,
+ size_t size);
-uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb,
+ size_t size);
-uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb,
- size_t size);
+uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb,
+ size_t size);
void ssm_pk_buff_truncate(struct ssm_pk_buff * spb,
size_t len);
diff --git a/include/ouroboros/ssm_pool.h b/include/ouroboros/ssm_pool.h
index 89eff8eb..bba76798 100644
--- a/include/ouroboros/ssm_pool.h
+++ b/include/ouroboros/ssm_pool.h
@@ -32,7 +32,7 @@
struct ssm_pool;
-/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user) */
+/* Pool API: uid = 0 for GSPP (privileged), uid > 0 for PUP (per-user). */
struct ssm_pool * ssm_pool_create(uid_t uid,
gid_t gid);
@@ -46,13 +46,13 @@ int ssm_pool_mlock(struct ssm_pool * pool);
void ssm_pool_gspp_purge(void);
-/* Alloc count bytes, returns block index, a ptr and pk_buff. */
+/* Alloc count bytes, returns block offset, a ptr and pk_buff. */
ssize_t ssm_pool_alloc(struct ssm_pool * pool,
size_t count,
uint8_t ** ptr,
struct ssm_pk_buff ** spb);
-ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
+ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
size_t count,
uint8_t ** ptr,
struct ssm_pk_buff ** spb,
@@ -60,13 +60,13 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
ssize_t ssm_pool_read(uint8_t ** dst,
struct ssm_pool * pool,
- size_t idx);
+ size_t off);
struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool,
- size_t idx);
+ size_t off);
int ssm_pool_remove(struct ssm_pool * pool,
- size_t idx);
+ size_t off);
void ssm_pool_reclaim_orphans(struct ssm_pool * pool,
pid_t pid);
diff --git a/include/ouroboros/ssm_rbuff.h b/include/ouroboros/ssm_rbuff.h
index ffa10b8e..2443b63d 100644
--- a/include/ouroboros/ssm_rbuff.h
+++ b/include/ouroboros/ssm_rbuff.h
@@ -55,10 +55,10 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb);
int ssm_rbuff_mlock(struct ssm_rbuff * rb);
int ssm_rbuff_write(struct ssm_rbuff * rb,
- size_t idx);
+ size_t off);
int ssm_rbuff_write_b(struct ssm_rbuff * rb,
- size_t idx,
+ size_t off,
const struct timespec * abstime);
ssize_t ssm_rbuff_read(struct ssm_rbuff * rb);
diff --git a/include/ouroboros/time.h b/include/ouroboros/time.h
index 3d037a3c..a4136e8e 100644
--- a/include/ouroboros/time.h
+++ b/include/ouroboros/time.h
@@ -46,6 +46,12 @@
#define TS_TO_UINT64(ts) \
((uint64_t)(ts).tv_sec * BILLION + (uint64_t)(ts).tv_nsec)
+#define UINT64_TO_TS(ns, ts) \
+ do { \
+ (ts)->tv_sec = (time_t)((ns) / BILLION); \
+ (ts)->tv_nsec = (long)((ns) % BILLION); \
+ } while (0)
+
#define TIMEVAL_INIT_S(s) {(s), 0}
#define TIMEVAL_INIT_MS(ms) {(ms) / 1000, ((ms) % 1000) * 1000}
#define TIMEVAL_INIT_US(us) {(us) / MILLION, ((us) % MILLION)}
diff --git a/include/ouroboros/tpm.h b/include/ouroboros/tpm.h
index c01a235c..56c04701 100644
--- a/include/ouroboros/tpm.h
+++ b/include/ouroboros/tpm.h
@@ -24,6 +24,7 @@
#define OUROBOROS_LIB_TPM_H
#include <stdbool.h>
+#include <sys/types.h>
struct tpm;
diff --git a/include/ouroboros/tw.h b/include/ouroboros/tw.h
new file mode 100644
index 00000000..156f99db
--- /dev/null
+++ b/include/ouroboros/tw.h
@@ -0,0 +1,77 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic deadline-ordered callback queue (timing wheel)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#ifndef OUROBOROS_TW_H
+#define OUROBOROS_TW_H
+
+#include <ouroboros/cdefs.h>
+#include <ouroboros/list.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <time.h>
+
+typedef void (*tw_fire_fn_t)(void * arg);
+
+struct tw_entry {
+ struct list_head next;
+ uint64_t deadline_ns;
+ tw_fire_fn_t fire;
+ void * arg;
+ size_t lvl;
+};
+
+__BEGIN_DECLS
+
+int tw_init(void);
+
+void tw_fini(void);
+
+void tw_init_entry(struct tw_entry * e);
+
+/*
+ * Schedule e to fire at deadline_ns. If e is already posted,
+ * the previous schedule is cancelled and replaced.
+ */
+void tw_post(struct tw_entry * e,
+ uint64_t deadline_ns,
+ tw_fire_fn_t fire,
+ void * arg);
+
+void tw_cancel(struct tw_entry * e);
+
+/*
+ * Advance the wheel and fire due callbacks. Callbacks run with the wheel
+ * unlocked and may call tw_post / tw_cancel on any entry, including the one
+ * currently firing. Concurrent tw_move from a second thread is a no-op.
+ */
+void tw_move(void);
+
+/*
+ * Write the absolute deadline of the earliest pending entry to *out.
+ * Empty wheel is signalled by out->tv_nsec == -1.
+ */
+void tw_next_expiry(struct timespec * out);
+
+__END_DECLS
+
+#endif /* OUROBOROS_TW_H */
diff --git a/src/ipcpd/broadcast/dt.c b/src/ipcpd/broadcast/dt.c
index 30e89a4f..95483e33 100644
--- a/src/ipcpd/broadcast/dt.c
+++ b/src/ipcpd/broadcast/dt.c
@@ -28,7 +28,7 @@
#include "config.h"
-#define BROADCAST_MTU 1400 /* FIXME: avoid packet copy. */
+#define BROADCAST_MTU IPCP_BROADCAST_MTU /* FIXME: avoid packet copy. */
#define DT "dt"
#define OUROBOROS_PREFIX DT
diff --git a/src/ipcpd/broadcast/main.c b/src/ipcpd/broadcast/main.c
index b3cbdc79..77e22531 100644
--- a/src/ipcpd/broadcast/main.c
+++ b/src/ipcpd/broadcast/main.c
@@ -242,7 +242,7 @@ static int broadcast_ipcp_join(int fd,
notifier_event(NOTIFY_DT_CONN_ADD, &conn);
- ipcp_flow_alloc_reply(fd, 0, mpl, &data);
+ ipcp_flow_alloc_reply(fd, 0, mpl, IPCP_BROADCAST_MTU, &data);
return 0;
}
diff --git a/src/ipcpd/config.h.in b/src/ipcpd/config.h.in
index 0b4252e5..7edec526 100644
--- a/src/ipcpd/config.h.in
+++ b/src/ipcpd/config.h.in
@@ -23,8 +23,8 @@
#define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@
#define SYS_MAX_FLOWS @SYS_MAX_FLOWS@
-#define PROG_RES_FDS @PROG_RES_FDS@
-#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@
+#define PROC_RES_FDS @PROC_RES_FDS@
+#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@
#define SOCKET_TIMEOUT @SOCKET_TIMEOUT@
#define CONNECT_TIMEOUT @CONNECT_TIMEOUT@
@@ -46,11 +46,13 @@
#define IPCP_SCHED_THR_MUL @IPCP_SCHED_THR_MUL@
#define PFT_SIZE @PFT_SIZE@
#define IPCP_UNICAST_MPL @IPCP_UNICAST_MPL@
+#define IPCP_UNICAST_MTU @IPCP_UNICAST_MTU@
#define CONNMGR_RCV_TIMEOUT @CONNMGR_RCV_TIMEOUT@
#cmakedefine DISABLE_CORE_LOCK
#cmakedefine BUILD_CONTAINER
#cmakedefine IPCP_FLOW_STATS
+#cmakedefine IPCP_ETH_FLOW_STATS
#cmakedefine IPCP_DEBUG_LOCAL
#ifdef CONFIG_OUROBOROS_DEBUG
#cmakedefine DEBUG_PROTO_DHT
@@ -65,6 +67,8 @@
#define IPCP_UDP_RD_THR @IPCP_UDP_RD_THR@
#define IPCP_UDP_WR_THR @IPCP_UDP_WR_THR@
#define IPCP_UDP_MPL @IPCP_UDP_MPL@
+#define IPCP_UDP4_MTU @IPCP_UDP4_MTU@
+#define IPCP_UDP6_MTU @IPCP_UDP6_MTU@
/* eth */
#cmakedefine HAVE_NETMAP
@@ -76,10 +80,13 @@
#define IPCP_ETH_LO_MTU @IPCP_ETH_LO_MTU@
#define IPCP_ETH_MGMT_FRAME_SIZE @IPCP_ETH_MGMT_FRAME_SIZE@
#define IPCP_ETH_MPL @IPCP_ETH_MPL@
+#define IPCP_ETH_SNDBUF @IPCP_ETH_SNDBUF@
+#define IPCP_ETH_RCVBUF @IPCP_ETH_RCVBUF@
/* local */
#define IPCP_LOCAL_MPL @IPCP_LOCAL_MPL@
+#define IPCP_LOCAL_MTU @IPCP_LOCAL_MTU@
/* broadcast */
-/* local */
#define IPCP_BROADCAST_MPL @IPCP_BROADCAST_MPL@
+#define IPCP_BROADCAST_MTU @IPCP_BROADCAST_MTU@
diff --git a/src/ipcpd/eth/eth.c b/src/ipcpd/eth/eth.c
index 4be7775e..103ba881 100644
--- a/src/ipcpd/eth/eth.c
+++ b/src/ipcpd/eth/eth.c
@@ -37,12 +37,14 @@
#include "config.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/endian.h>
#include <ouroboros/hash.h>
#include <ouroboros/errno.h>
#include <ouroboros/list.h>
#include <ouroboros/utils.h>
#include <ouroboros/bitmap.h>
+#include <ouroboros/crc8.h>
#include <ouroboros/dev.h>
#include <ouroboros/ipcp-dev.h>
#include <ouroboros/fqueue.h>
@@ -50,6 +52,14 @@
#include <ouroboros/time.h>
#include <ouroboros/fccntl.h>
#include <ouroboros/pthread.h>
+#include <ouroboros/rib.h>
+
+#ifndef IPCP_ETH_FLOW_STATS
+#undef FETCH_ADD_RELAXED
+#define FETCH_ADD_RELAXED(p, v) ((void) 0)
+#undef FETCH_SUB_RELAXED
+#define FETCH_SUB_RELAXED(p, v) ((void) 0)
+#endif
#include "ipcp.h"
#include "np1.h"
@@ -122,7 +132,8 @@
#define MGMT_EID 0
#define DIX_EID_SIZE sizeof(uint16_t)
#define DIX_LENGTH_SIZE sizeof(uint16_t)
-#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE)
+#define DIX_HCS_SIZE CRC8_HASH_LEN
+#define DIX_HEADER_SIZE (DIX_EID_SIZE + DIX_LENGTH_SIZE + DIX_HCS_SIZE)
#define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + DIX_HEADER_SIZE)
#define MAX_EIDS (1 << (8 * DIX_EID_SIZE))
#define ETH_MAX_PACKET_SIZE (ETH_MTU - DIX_HEADER_SIZE)
@@ -130,16 +141,20 @@
#elif defined(BUILD_ETH_LLC)
#define THIS_TYPE IPCP_ETH_LLC
#define MGMT_SAP 0x01
-#define LLC_HEADER_SIZE 3
+#define LLC_FIELDS_SIZE 3
+#define LLC_HCS_SIZE CRC8_HASH_LEN
+#define LLC_HEADER_SIZE (LLC_FIELDS_SIZE + LLC_HCS_SIZE)
#define ETH_HEADER_TOT_SIZE (ETH_HEADER_SIZE + LLC_HEADER_SIZE)
#define MAX_SAPS 64
#define ETH_MAX_PACKET_SIZE (ETH_MTU - LLC_HEADER_SIZE)
#define ETH_FRAME_SIZE (ETH_HEADER_SIZE + ETH_MTU_MAX)
#endif
-#define NAME_QUERY_TIMEO 2000 /* ms */
-#define MGMT_TIMEO 100 /* ms */
+#define NAME_QUERY_TIMEO 1900 /* ms total budget */
+#define NAME_QUERY_RETRIES 3 /* retransmits, 4 attempts total */
+#define MGMT_TIMEO 100 /* ms */
#define MGMT_FRAME_SIZE IPCP_ETH_MGMT_FRAME_SIZE
+#define ETH_RIB_PATH "eth"
#define FLOW_REQ 0
#define FLOW_REPLY 1
@@ -165,7 +180,7 @@ struct mgmt_msg {
uint32_t delay;
uint32_t timeout;
int32_t response;
- uint8_t in_order;
+ uint8_t service;
#if defined (BUILD_ETH_DIX)
uint8_t code;
uint8_t availability;
@@ -185,6 +200,7 @@ struct eth_frame {
uint8_t ssap;
uint8_t cf;
#endif
+ uint8_t hcs;
uint8_t payload;
} __attribute__((packed));
@@ -196,6 +212,17 @@ struct ef {
int8_t r_sap;
#endif
uint8_t r_addr[MAC_SIZE];
+#ifdef IPCP_ETH_FLOW_STATS
+ struct {
+ time_t stamp;
+ size_t p_rcv;
+ size_t b_rcv;
+ size_t p_dlv_f;
+ size_t p_snd;
+ size_t b_snd;
+ size_t p_snd_f;
+ } stat;
+#endif
};
struct mgmt_frame {
@@ -233,6 +260,22 @@ struct {
struct ef * fd_to_ef;
fset_t * np1_flows;
pthread_rwlock_t flows_lock;
+#ifdef IPCP_ETH_FLOW_STATS
+ struct {
+ size_t n_flows;
+ size_t n_rcv;
+ size_t n_snd;
+ size_t n_mgmt_rcv;
+ size_t n_mgmt_snd;
+ size_t n_bad_id;
+ size_t n_dlv_f;
+ size_t n_buf_f;
+ size_t n_rcv_f;
+ size_t n_snd_f;
+ size_t kern_rcv;
+ size_t kern_drp;
+ } stat;
+#endif
pthread_t packet_writer[IPCP_ETH_WR_THR];
pthread_t packet_reader[IPCP_ETH_RD_THR];
@@ -284,7 +327,14 @@ static int eth_data_init(void)
eth_data.fd_to_ef[i].r_sap = -1;
#endif
memset(&eth_data.fd_to_ef[i].r_addr, 0, MAC_SIZE);
+#ifdef IPCP_ETH_FLOW_STATS
+ memset(&eth_data.fd_to_ef[i].stat, 0,
+ sizeof(eth_data.fd_to_ef[i].stat));
+#endif
}
+#ifdef IPCP_ETH_FLOW_STATS
+ memset(&eth_data.stat, 0, sizeof(eth_data.stat));
+#endif
eth_data.shim_data = shim_data_create();
if (eth_data.shim_data == NULL)
@@ -357,6 +407,227 @@ static void eth_data_fini(void)
free(eth_data.fd_to_ef);
}
+#ifdef IPCP_ETH_FLOW_STATS
+static int eth_rib_read(const char * path,
+ char * buf,
+ size_t len)
+{
+ struct ef * flow;
+ int fd;
+ char tmstr[RIB_TM_STRLEN];
+ struct tm * tm;
+ time_t stamp;
+ char * entry;
+
+ entry = strstr(path, RIB_SEPARATOR) + 1;
+ assert(entry);
+
+ if (len < 2048)
+ return 0;
+
+ buf[0] = '\0';
+
+ if (strcmp(entry, "summary") == 0) {
+ int n;
+#if defined(HAVE_RAW_SOCKETS)
+ int rcvbuf = 0;
+ int sndbuf = 0;
+ int queued = 0;
+ socklen_t optlen = sizeof(rcvbuf);
+# if defined(__linux__)
+ struct tpacket_stats tp_stats;
+ socklen_t tp_len = sizeof(tp_stats);
+# endif
+
+ getsockopt(eth_data.s_fd, SOL_SOCKET,
+ SO_RCVBUF, &rcvbuf, &optlen);
+ optlen = sizeof(sndbuf);
+ getsockopt(eth_data.s_fd, SOL_SOCKET,
+ SO_SNDBUF, &sndbuf, &optlen);
+ ioctl(eth_data.s_fd, FIONREAD, &queued);
+# if defined(__linux__)
+ if (getsockopt(eth_data.s_fd, SOL_PACKET,
+ PACKET_STATISTICS,
+ &tp_stats, &tp_len) == 0) {
+ FETCH_ADD_RELAXED(&eth_data.stat.kern_rcv,
+ tp_stats.tp_packets);
+ FETCH_ADD_RELAXED(&eth_data.stat.kern_drp,
+ tp_stats.tp_drops);
+ }
+# endif
+#endif
+ n = sprintf(buf,
+ "Active flows: %20zu\n"
+ "Total frames received: %20zu\n"
+ "Total frames sent: %20zu\n"
+ "Management frames received: %20zu\n"
+ "Management frames sent: %20zu\n"
+ "Bad EID/SAP frames: %20zu\n"
+ "Delivery (N+1) failures: %20zu\n"
+ "Buffer alloc failures: %20zu\n"
+ "Frame read failures: %20zu\n"
+ "Frame send failures: %20zu\n",
+ LOAD_RELAXED(&eth_data.stat.n_flows),
+ LOAD_RELAXED(&eth_data.stat.n_rcv),
+ LOAD_RELAXED(&eth_data.stat.n_snd),
+ LOAD_RELAXED(&eth_data.stat.n_mgmt_rcv),
+ LOAD_RELAXED(&eth_data.stat.n_mgmt_snd),
+ LOAD_RELAXED(&eth_data.stat.n_bad_id),
+ LOAD_RELAXED(&eth_data.stat.n_dlv_f),
+ LOAD_RELAXED(&eth_data.stat.n_buf_f),
+ LOAD_RELAXED(&eth_data.stat.n_rcv_f),
+ LOAD_RELAXED(&eth_data.stat.n_snd_f));
+#if defined(HAVE_RAW_SOCKETS)
+ n += sprintf(buf + n,
+ "Socket rcvbuf (bytes): %20d\n"
+ "Socket sndbuf (bytes): %20d\n"
+ "Socket queued (bytes): %20d\n",
+ rcvbuf, sndbuf, queued);
+# if defined(__linux__)
+ n += sprintf(buf + n,
+ "Kernel frames received: %20zu\n"
+ "Kernel frames dropped: %20zu\n",
+ LOAD_RELAXED(&eth_data.stat.kern_rcv),
+ LOAD_RELAXED(&eth_data.stat.kern_drp));
+# endif
+#endif
+ return n;
+ }
+
+ fd = atoi(entry);
+
+ if (fd < 0 || fd >= SYS_MAX_FLOWS)
+ return -1;
+
+ flow = &eth_data.fd_to_ef[fd];
+
+ pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+ stamp = flow->stat.stamp;
+ if (stamp == 0) {
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+ return 0;
+ }
+
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+
+ tm = gmtime(&stamp);
+ strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm);
+
+ sprintf(buf,
+ "Flow established at: %20s\n"
+ "Sent (packets): %20zu\n"
+ "Sent (bytes): %20zu\n"
+ "Send failed (packets): %20zu\n"
+ "Received (packets): %20zu\n"
+ "Received (bytes): %20zu\n"
+ "Delivery (N+1) failures: %20zu\n",
+ tmstr,
+ LOAD_RELAXED(&flow->stat.p_snd),
+ LOAD_RELAXED(&flow->stat.b_snd),
+ LOAD_RELAXED(&flow->stat.p_snd_f),
+ LOAD_RELAXED(&flow->stat.p_rcv),
+ LOAD_RELAXED(&flow->stat.b_rcv),
+ LOAD_RELAXED(&flow->stat.p_dlv_f));
+
+ return strlen(buf);
+}
+
+static int eth_rib_readdir(char *** buf)
+{
+ char entry[RIB_PATH_LEN + 1];
+ size_t i;
+ int idx = 0;
+ int n_entries;
+
+ pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+ n_entries = (int) LOAD_RELAXED(&eth_data.stat.n_flows) + 1;
+
+ *buf = malloc(sizeof(**buf) * n_entries);
+ if (*buf == NULL)
+ goto fail_entries;
+
+ (*buf)[idx] = malloc(strlen("summary") + 1);
+ if ((*buf)[idx] == NULL)
+ goto fail_entry;
+
+ strcpy((*buf)[idx++], "summary");
+
+ for (i = 0; i < SYS_MAX_FLOWS && idx < n_entries; ++i) {
+ if (eth_data.fd_to_ef[i].stat.stamp == 0)
+ continue;
+
+ sprintf(entry, "%zu", i);
+
+ (*buf)[idx] = malloc(strlen(entry) + 1);
+ if ((*buf)[idx] == NULL)
+ goto fail_entry;
+
+ strcpy((*buf)[idx++], entry);
+ }
+
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+
+ return idx;
+
+ fail_entry:
+ while (idx-- > 0)
+ free((*buf)[idx]);
+ free(*buf);
+ fail_entries:
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+ return -ENOMEM;
+}
+
+static int eth_rib_getattr(const char * path,
+ struct rib_attr * attr)
+{
+ int fd;
+ char * entry;
+ struct ef * flow;
+
+ entry = strstr(path, RIB_SEPARATOR) + 1;
+ assert(entry);
+
+ if (strcmp(entry, "summary") == 0) {
+ attr->size = 2048;
+ attr->mtime = 0;
+ return 0;
+ }
+
+ fd = atoi(entry);
+
+ if (fd < 0 || fd >= SYS_MAX_FLOWS) {
+ attr->size = 0;
+ attr->mtime = 0;
+ return 0;
+ }
+
+ flow = &eth_data.fd_to_ef[fd];
+
+ pthread_rwlock_rdlock(&eth_data.flows_lock);
+
+ if (flow->stat.stamp != 0) {
+ attr->size = 2048;
+ attr->mtime = flow->stat.stamp;
+ } else {
+ attr->size = 0;
+ attr->mtime = 0;
+ }
+
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+
+ return 0;
+}
+
+static struct rib_ops eth_r_ops = {
+ .read = eth_rib_read,
+ .readdir = eth_rib_readdir,
+ .getattr = eth_rib_getattr
+};
+#endif /* IPCP_ETH_FLOW_STATS */
+
#ifdef BUILD_ETH_LLC
static uint8_t reverse_bits(uint8_t b)
{
@@ -409,12 +680,18 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr,
e_frame->ethertype = eth_data.ethertype;
e_frame->eid = htons(deid);
e_frame->length = htons(len);
+ mem_hash(HASH_CRC8, &e_frame->hcs,
+ (uint8_t *) &e_frame->eid,
+ DIX_EID_SIZE + DIX_LENGTH_SIZE);
frame_len = ETH_HEADER_TOT_SIZE + len;
#elif defined(BUILD_ETH_LLC)
e_frame->length = htons(LLC_HEADER_SIZE + len);
e_frame->dsap = dsap;
e_frame->ssap = ssap;
e_frame->cf = cf;
+ mem_hash(HASH_CRC8, &e_frame->hcs,
+ (uint8_t *) &e_frame->dsap,
+ LLC_FIELDS_SIZE);
frame_len = ETH_HEADER_TOT_SIZE + len;
#endif
@@ -440,10 +717,7 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr,
}
assert(FD_ISSET(eth_data.s_fd, &fds));
- if (sendto(eth_data.s_fd,
- frame,
- frame_len,
- 0,
+ if (sendto(eth_data.s_fd, frame, frame_len, 0,
(struct sockaddr *) &eth_data.device,
sizeof(eth_data.device)) <= 0) {
log_dbg("Failed to send message: %s.", strerror(errno));
@@ -451,6 +725,8 @@ static int eth_ipcp_send_frame(const uint8_t * dst_addr,
}
#endif /* HAVE_NETMAP */
+ FETCH_ADD_RELAXED(&eth_data.stat.n_snd, 1);
+
return 0;
}
@@ -490,7 +766,7 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr,
msg->availability = qs.availability;
msg->loss = hton32(qs.loss);
msg->ber = hton32(qs.ber);
- msg->in_order = qs.in_order;
+ msg->service = qs.service;
msg->max_gap = hton32(qs.max_gap);
msg->timeout = hton32(qs.timeout);
@@ -508,6 +784,9 @@ static int eth_ipcp_alloc(const uint8_t * dst_addr,
buf, len + data->len);
free(buf);
+ if (ret == 0)
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
return ret;
}
@@ -558,6 +837,8 @@ static int eth_ipcp_alloc_resp(uint8_t * dst_addr,
return -1;
}
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
free(buf);
return 0;
@@ -575,7 +856,8 @@ static int eth_ipcp_req(uint8_t * r_addr,
{
int fd;
- fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL, data);
+ fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_ETH_MPL,
+ ETH_MAX_PACKET_SIZE, data);
if (fd < 0) {
log_err("Could not get new flow from IRMd.");
return -1;
@@ -622,7 +904,7 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr,
fd = eth_data.ef_to_fd[dsap];
#endif
if (fd < 0) {
- pthread_rwlock_unlock(& eth_data.flows_lock);
+ pthread_rwlock_unlock(&eth_data.flows_lock);
log_err("No flow found with that SAP.");
return -1; /* -EFLOWNOTFOUND */
}
@@ -647,7 +929,8 @@ static int eth_ipcp_alloc_reply(uint8_t * r_addr,
#elif defined(BUILD_ETH_LLC)
log_dbg("Flow reply, fd %d, SSAP %d, DSAP %d.", fd, ssap, dsap);
#endif
- if ((ret = ipcp_flow_alloc_reply(fd, response, mpl, data)) < 0) {
+ if ((ret = ipcp_flow_alloc_reply(fd, response, mpl,
+ ETH_MAX_PACKET_SIZE, data)) < 0) {
log_err("Failed to reply to flow allocation.");
return -1;
}
@@ -689,6 +972,8 @@ static int eth_ipcp_name_query_req(const uint8_t * hash,
return -1;
}
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
free(buf);
}
@@ -718,20 +1003,24 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf,
qosspec_t qs;
buffer_t data;
+ if (len < sizeof(*msg))
+ return -1;
+
msg = (struct mgmt_msg *) buf;
switch (msg->code) {
case FLOW_REQ:
msg_len = sizeof(*msg) + ipcp_dir_hash_len();
- assert(len >= msg_len);
+ if (len < msg_len)
+ return -1;
qs.delay = ntoh32(msg->delay);
qs.bandwidth = ntoh64(msg->bandwidth);
qs.availability = msg->availability;
qs.loss = ntoh32(msg->loss);
qs.ber = ntoh32(msg->ber);
- qs.in_order = msg->in_order;
+ qs.service = msg->service;
qs.max_gap = ntoh32(msg->max_gap);
qs.timeout = ntoh32(msg->timeout);
@@ -752,8 +1041,6 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf,
}
break;
case FLOW_REPLY:
- assert(len >= sizeof(*msg));
-
data.data = (uint8_t *) buf + sizeof(*msg);
data.len = len - sizeof(*msg);
@@ -769,9 +1056,13 @@ static int eth_ipcp_mgmt_frame(const uint8_t * buf,
&data);
break;
case NAME_QUERY_REQ:
+ if (len < sizeof(*msg) + ipcp_dir_hash_len())
+ return -1;
eth_ipcp_name_query_req(buf + sizeof(*msg), r_addr);
break;
case NAME_QUERY_REPLY:
+ if (len < sizeof(*msg) + ipcp_dir_hash_len())
+ return -1;
eth_ipcp_name_query_reply(buf + sizeof(*msg), r_addr);
break;
default:
@@ -844,6 +1135,12 @@ static void * eth_ipcp_packet_reader(void * o)
fd_set fds;
int frame_len;
#endif
+#if defined(HAVE_RAW_SOCKETS)
+ struct sockaddr_ll src;
+ socklen_t slen;
+#endif
+ size_t eth_len;
+ uint8_t hcs;
struct eth_frame * e_frame;
struct mgmt_frame * frame;
@@ -881,24 +1178,58 @@ static void * eth_ipcp_packet_reader(void * o)
if (select(eth_data.s_fd + 1, &fds, NULL, NULL, NULL) < 0)
continue;
assert(FD_ISSET(eth_data.s_fd, &fds));
- if (ipcp_spb_reserve(&spb, ETH_MTU))
+ if (ipcp_spb_reserve(&spb, ETH_MTU)) {
+ FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1);
continue;
- buf = ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE);
+ }
+ buf = ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE);
if (buf == NULL) {
log_dbg("Failed to allocate header.");
ipcp_spb_release(spb);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1);
continue;
}
- frame_len = recv(eth_data.s_fd, buf,
- ETH_MTU + ETH_HEADER_TOT_SIZE, 0);
+ slen = sizeof(src);
+ /* MSG_DONTWAIT: RD_THR>1 race-loser bails with EAGAIN. */
+ frame_len = recvfrom(eth_data.s_fd, buf,
+ ETH_MTU + ETH_HEADER_TOT_SIZE,
+ MSG_DONTWAIT,
+ (struct sockaddr *) &src, &slen);
#endif
- if (frame_len <= 0) {
- log_dbg("Failed to receive frame.");
+ if (frame_len == 0) {
+ ipcp_spb_release(spb);
+ continue; /* Spurious */
+ }
+
+ if (frame_len < 0) {
ipcp_spb_release(spb);
+
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ continue;
+
+ log_dbg("Failed to rcv frame: %s.", strerror(errno));
+ FETCH_ADD_RELAXED(&eth_data.stat.n_rcv_f, 1);
continue;
}
#endif
+#if defined(HAVE_NETMAP)
+ eth_len = hdr.len;
+#elif defined(HAVE_BPF)
+ eth_len = ((struct bpf_hdr *) buf)->bh_caplen;
+#else
+ eth_len = (size_t) frame_len;
+#endif
+ /* Defense in depth: reject before parsing dereferences. */
+ if (eth_len < ETH_HEADER_TOT_SIZE)
+ goto fail_frame;
+
+#if defined(HAVE_RAW_SOCKETS)
+ /* Drop our own egress. */
+ if (src.sll_pkttype == PACKET_OUTGOING)
+ goto fail_frame;
+#endif
+
#if defined(HAVE_BPF) && !defined(HAVE_NETMAP)
e_frame = (struct eth_frame *)
(buf + ((struct bpf_hdr *) buf)->bh_hdrlen);
@@ -916,6 +1247,8 @@ static void * eth_ipcp_packet_reader(void * o)
e_frame->dst_hwaddr,
MAC_SIZE) &&
memcmp(br_addr, e_frame->dst_hwaddr, MAC_SIZE)) {
+ FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1);
+ goto fail_frame;
}
#endif
length = ntohs(e_frame->length);
@@ -923,17 +1256,41 @@ static void * eth_ipcp_packet_reader(void * o)
if (e_frame->ethertype != eth_data.ethertype)
goto fail_frame;
+ if (length > ETH_MTU)
+ goto fail_frame;
+
deid = ntohs(e_frame->eid);
- if (deid == MGMT_EID) {
#elif defined (BUILD_ETH_LLC)
if (length > 0x05FF) /* DIX */
goto fail_frame;
+ if (length < LLC_HEADER_SIZE || length > ETH_MTU)
+ goto fail_frame;
+
length -= LLC_HEADER_SIZE;
dsap = reverse_bits(e_frame->dsap);
ssap = reverse_bits(e_frame->ssap);
+#endif
+
+ if (eth_len < ETH_HEADER_TOT_SIZE + (size_t) length)
+ goto fail_frame;
+#if defined(BUILD_ETH_DIX)
+ mem_hash(HASH_CRC8, &hcs,
+ (uint8_t *) &e_frame->eid,
+ DIX_EID_SIZE + DIX_LENGTH_SIZE);
+#elif defined(BUILD_ETH_LLC)
+ mem_hash(HASH_CRC8, &hcs,
+ (uint8_t *) &e_frame->dsap,
+ LLC_FIELDS_SIZE);
+#endif
+ if (hcs != e_frame->hcs)
+ goto fail_frame;
+
+#if defined(BUILD_ETH_DIX)
+ if (deid == MGMT_EID) {
+#elif defined (BUILD_ETH_LLC)
if (ssap == MGMT_SAP && dsap == MGMT_SAP) {
#endif
ipcp_spb_release(spb); /* No need for the N+1 buffer. */
@@ -958,6 +1315,8 @@ static void * eth_ipcp_packet_reader(void * o)
list_add(&frame->next, &eth_data.mgmt_frames);
pthread_cond_signal(&eth_data.mgmt_cond);
pthread_mutex_unlock(&eth_data.mgmt_lock);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_rcv, 1);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_rcv, 1);
} else {
pthread_rwlock_rdlock(&eth_data.flows_lock);
@@ -968,6 +1327,7 @@ static void * eth_ipcp_packet_reader(void * o)
#endif
if (fd < 0) {
pthread_rwlock_unlock(&eth_data.flows_lock);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1);
goto fail_frame;
}
@@ -976,13 +1336,18 @@ static void * eth_ipcp_packet_reader(void * o)
|| memcmp(eth_data.fd_to_ef[fd].r_addr,
e_frame->src_hwaddr, MAC_SIZE)) {
pthread_rwlock_unlock(&eth_data.flows_lock);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_bad_id, 1);
goto fail_frame;
}
#endif
+ FETCH_ADD_RELAXED(&eth_data.fd_to_ef[fd].stat.p_rcv, 1);
+ FETCH_ADD_RELAXED(&eth_data.fd_to_ef[fd].stat.b_rcv,
+ length);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_rcv, 1);
pthread_rwlock_unlock(&eth_data.flows_lock);
#ifndef HAVE_NETMAP
- ssm_pk_buff_head_release(spb, ETH_HEADER_TOT_SIZE);
+ ssm_pk_buff_pop(spb, ETH_HEADER_TOT_SIZE);
ssm_pk_buff_truncate(spb, length);
#else
if (ipcp_spb_reserve(&spb, length))
@@ -991,8 +1356,13 @@ static void * eth_ipcp_packet_reader(void * o)
buf = ssm_pk_buff_head(spb);
memcpy(buf, &e_frame->payload, length);
#endif
- if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0)
+ if (np1_flow_write(fd, spb, NP1_GET_POOL(fd)) < 0) {
ipcp_spb_release(spb);
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.p_dlv_f,
+ 1);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_dlv_f, 1);
+ }
continue;
fail_frame:
@@ -1048,10 +1418,11 @@ static void * eth_ipcp_packet_writer(void * o)
len = ssm_pk_buff_len(spb);
- if (ssm_pk_buff_head_alloc(spb, ETH_HEADER_TOT_SIZE)
+ if (ssm_pk_buff_push(spb, ETH_HEADER_TOT_SIZE)
== NULL) {
log_dbg("Failed to allocate header.");
ipcp_spb_release(spb);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_buf_f, 1);
continue;
}
@@ -1075,8 +1446,20 @@ static void * eth_ipcp_packet_writer(void * o)
dsap, ssap,
#endif
ssm_pk_buff_head(spb),
- len))
+ len)) {
log_dbg("Failed to send frame.");
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.p_snd_f,
+ 1);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_snd_f, 1);
+ } else {
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.p_snd,
+ 1);
+ FETCH_ADD_RELAXED(
+ &eth_data.fd_to_ef[fd].stat.b_snd,
+ len);
+ }
ipcp_spb_release(spb);
}
}
@@ -1424,12 +1807,14 @@ static int eth_init_bpf(struct ifreq * ifr)
return -1;
}
#elif defined(HAVE_RAW_SOCKETS)
+#define SOCKOPT()
static int eth_init_raw_socket(struct ifreq * ifr)
{
int idx;
- int flags;
+ int sndbuf;
+ int rcvbuf;
#if defined(IPCP_ETH_QDISC_BYPASS)
- int qdisc_bypass = 1;
+ int qdisc_bypass = 1;
#endif /* ENABLE_QDISC_BYPASS */
idx = if_nametoindex(ifr->ifr_name);
@@ -1437,6 +1822,7 @@ static int eth_init_raw_socket(struct ifreq * ifr)
log_err("Failed to retrieve interface index.");
return -1;
}
+
memset(&(eth_data.device), 0, sizeof(eth_data.device));
eth_data.device.sll_ifindex = idx;
eth_data.device.sll_family = AF_PACKET;
@@ -1453,17 +1839,6 @@ static int eth_init_raw_socket(struct ifreq * ifr)
goto fail_socket;
}
- flags = fcntl(eth_data.s_fd, F_GETFL, 0);
- if (flags < 0) {
- log_err("Failed to get flags.");
- goto fail_device;
- }
-
- if (fcntl(eth_data.s_fd, F_SETFL, flags | O_NONBLOCK)) {
- log_err("Failed to set socket non-blocking.");
- goto fail_device;
- }
-
#if defined(IPCP_ETH_QDISC_BYPASS)
if (setsockopt(eth_data.s_fd, SOL_PACKET, PACKET_QDISC_BYPASS,
&qdisc_bypass, sizeof(qdisc_bypass))) {
@@ -1471,6 +1846,18 @@ static int eth_init_raw_socket(struct ifreq * ifr)
}
#endif
+ sndbuf = IPCP_ETH_SNDBUF;
+ if (sndbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_SNDBUF,
+ &sndbuf, sizeof(sndbuf))) {
+ log_info("Failed to set SO_SNDBUF to %d.", sndbuf);
+ }
+
+ rcvbuf = IPCP_ETH_RCVBUF;
+ if (rcvbuf > 0 && setsockopt(eth_data.s_fd, SOL_SOCKET, SO_RCVBUF,
+ &rcvbuf, sizeof(rcvbuf))) {
+ log_info("Failed to set SO_RCVBUF to %d.", rcvbuf);
+ }
+
if (bind(eth_data.s_fd, (struct sockaddr *) &eth_data.device,
sizeof(eth_data.device)) < 0) {
log_err("Failed to bind socket to interface.");
@@ -1543,6 +1930,12 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf)
return -1;
}
#endif /* HAVE_NETMAP */
+#ifdef IPCP_ETH_FLOW_STATS
+ if (rib_reg(ETH_RIB_PATH, &eth_r_ops)) {
+ log_err("Failed to register RIB.");
+ goto fail_rib_reg;
+ }
+#endif
#if defined(__linux__)
if (pthread_create(&eth_data.if_monitor, NULL,
eth_ipcp_if_monitor, NULL)) {
@@ -1606,6 +1999,10 @@ static int eth_ipcp_bootstrap(struct ipcp_config * conf)
#if defined(__linux__)
fail_monitor:
#endif
+#ifdef IPCP_ETH_FLOW_STATS
+ rib_unreg(ETH_RIB_PATH);
+ fail_rib_reg:
+#endif
#if defined(HAVE_NETMAP)
nm_close(eth_data.nmd);
#elif defined(HAVE_BPF)
@@ -1637,12 +2034,14 @@ static int eth_ipcp_unreg(const uint8_t * hash)
static int eth_ipcp_query(const uint8_t * hash)
{
uint8_t r_addr[MAC_SIZE];
- struct timespec timeout = TIMESPEC_INIT_MS(NAME_QUERY_TIMEO);
+ struct timespec timeout;
struct dir_query * query;
int ret;
+ int attempt;
uint8_t * buf;
struct mgmt_msg * msg;
size_t len;
+ long per_ms;
if (shim_data_dir_has(eth_data.shim_data, hash))
return 0;
@@ -1662,32 +2061,46 @@ static int eth_ipcp_query(const uint8_t * hash)
memset(r_addr, 0xff, MAC_SIZE);
- query = shim_data_dir_query_create(eth_data.shim_data, hash);
- if (query == NULL) {
- free(buf);
- return -1;
- }
+ per_ms = NAME_QUERY_TIMEO / (NAME_QUERY_RETRIES + 1);
+
+ ret = -1;
+ for (attempt = 0; attempt <= NAME_QUERY_RETRIES; ++attempt) {
+ query = shim_data_dir_query_create(eth_data.shim_data, hash);
+ if (query == NULL) {
+ ret = -1;
+ break;
+ }
- if (eth_ipcp_send_frame(r_addr,
+ if (eth_ipcp_send_frame(r_addr,
#if defined(BUILD_ETH_DIX)
- MGMT_EID,
+ MGMT_EID,
#elif defined(BUILD_ETH_LLC)
- reverse_bits(MGMT_SAP),
- reverse_bits(MGMT_SAP),
+ reverse_bits(MGMT_SAP),
+ reverse_bits(MGMT_SAP),
#endif
- buf, len)) {
- log_err("Failed to send management frame.");
+ buf, len)) {
+ log_err("Failed to send management frame.");
+ shim_data_dir_query_destroy(eth_data.shim_data,
+ query);
+ ret = -1;
+ break;
+ }
+
+ FETCH_ADD_RELAXED(&eth_data.stat.n_mgmt_snd, 1);
+
+ timeout.tv_sec = per_ms / 1000;
+ timeout.tv_nsec = (per_ms % 1000) * 1000000L;
+
+ ret = shim_data_dir_query_wait(query, &timeout);
+
shim_data_dir_query_destroy(eth_data.shim_data, query);
- free(buf);
- return -1;
+
+ if (ret != -ETIMEDOUT)
+ break;
}
free(buf);
- ret = shim_data_dir_query_wait(query, &timeout);
-
- shim_data_dir_query_destroy(eth_data.shim_data, query);
-
return ret;
}
@@ -1748,6 +2161,14 @@ static int eth_ipcp_flow_alloc(int fd,
}
fset_add(eth_data.np1_flows, fd);
+#ifdef IPCP_ETH_FLOW_STATS
+ pthread_rwlock_wrlock(&eth_data.flows_lock);
+ memset(&eth_data.fd_to_ef[fd].stat, 0,
+ sizeof(eth_data.fd_to_ef[fd].stat));
+ eth_data.fd_to_ef[fd].stat.stamp = time(NULL);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_flows, 1);
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+#endif
#if defined(BUILD_ETH_LLC)
log_dbg("Assigned SAP %d for fd %d.", ssap, fd);
#endif
@@ -1808,6 +2229,14 @@ static int eth_ipcp_flow_alloc_resp(int fd,
}
fset_add(eth_data.np1_flows, fd);
+#ifdef IPCP_ETH_FLOW_STATS
+ pthread_rwlock_wrlock(&eth_data.flows_lock);
+ memset(&eth_data.fd_to_ef[fd].stat, 0,
+ sizeof(eth_data.fd_to_ef[fd].stat));
+ eth_data.fd_to_ef[fd].stat.stamp = time(NULL);
+ FETCH_ADD_RELAXED(&eth_data.stat.n_flows, 1);
+ pthread_rwlock_unlock(&eth_data.flows_lock);
+#endif
#if defined(BUILD_ETH_LLC)
log_dbg("Assigned SAP %d for fd %d.", ssap, fd);
#endif
@@ -1836,6 +2265,12 @@ static int eth_ipcp_flow_dealloc(int fd)
#endif
memset(&eth_data.fd_to_ef[fd].r_addr, 0, MAC_SIZE);
+#ifdef IPCP_ETH_FLOW_STATS
+ memset(&eth_data.fd_to_ef[fd].stat, 0,
+ sizeof(eth_data.fd_to_ef[fd].stat));
+ FETCH_SUB_RELAXED(&eth_data.stat.n_flows, 1);
+#endif
+
pthread_rwlock_unlock(&eth_data.flows_lock);
ipcp_flow_dealloc(fd);
@@ -1902,6 +2337,9 @@ int main(int argc,
#ifdef __linux__
pthread_join(eth_data.if_monitor, NULL);
#endif
+#ifdef IPCP_ETH_FLOW_STATS
+ rib_unreg(ETH_RIB_PATH);
+#endif
}
ipcp_stop();
diff --git a/src/ipcpd/ipcp.c b/src/ipcpd/ipcp.c
index 5ad2401f..1052a686 100644
--- a/src/ipcpd/ipcp.c
+++ b/src/ipcpd/ipcp.c
@@ -363,6 +363,7 @@ static void * acceptloop(void * o)
int ipcp_wait_flow_req_arr(const uint8_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct timespec ts = TIMESPEC_INIT_MS(ALLOC_TIMEOUT);
@@ -392,7 +393,7 @@ int ipcp_wait_flow_req_arr(const uint8_t * dst,
assert(ipcpd.alloc_id == -1);
- fd = ipcp_flow_req_arr(&hash, qs, mpl, data);
+ fd = ipcp_flow_req_arr(&hash, qs, mpl, mtu, data);
if (fd < 0) {
pthread_mutex_unlock(&ipcpd.alloc_lock);
log_err("Failed to get fd for flow.");
diff --git a/src/ipcpd/ipcp.h b/src/ipcpd/ipcp.h
index 26a780a3..0adcc694 100644
--- a/src/ipcpd/ipcp.h
+++ b/src/ipcpd/ipcp.h
@@ -98,6 +98,7 @@ enum ipcp_state ipcp_get_state(void);
int ipcp_wait_flow_req_arr(const uint8_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data);
int ipcp_wait_flow_resp(const int fd);
diff --git a/src/ipcpd/local/main.c b/src/ipcpd/local/main.c
index 2c867317..eb9836f2 100644
--- a/src/ipcpd/local/main.c
+++ b/src/ipcpd/local/main.c
@@ -203,7 +203,8 @@ static int local_ipcp_flow_alloc(int fd,
HASH_VAL32(dst), fd);
assert(dst);
- out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL, data);
+ out_fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_LOCAL_MPL,
+ IPCP_LOCAL_MTU, data);
if (out_fd < 0) {
log_dbg("Flow allocation failed: %d", out_fd);
return -1;
@@ -255,14 +256,16 @@ static int local_ipcp_flow_alloc_resp(int fd,
}
if (response < 0) {
- ipcp_flow_alloc_reply(out_fd, response, mpl, data);
+ ipcp_flow_alloc_reply(out_fd, response, mpl,
+ IPCP_LOCAL_MTU, data);
log_info("Flow allocation rejected, fds (%d, %d).", out_fd, fd);
return 0;
}
fset_add(local_data.flows, fd);
- if (ipcp_flow_alloc_reply(out_fd, response, mpl, data) < 0) {
+ if (ipcp_flow_alloc_reply(out_fd, response, mpl,
+ IPCP_LOCAL_MTU, data) < 0) {
log_err("Failed to reply to allocation");
fset_del(local_data.flows, fd);
return -1;
diff --git a/src/ipcpd/udp/udp.c b/src/ipcpd/udp/udp.c
index 452bbc1a..93e88b9b 100644
--- a/src/ipcpd/udp/udp.c
+++ b/src/ipcpd/udp/udp.c
@@ -47,6 +47,10 @@
#include <stdlib.h>
#include <sys/wait.h>
#include <fcntl.h>
+#include <unistd.h>
+#if defined(__linux__)
+#include <netinet/ip.h>
+#endif
#define FLOW_REQ 1
#define FLOW_REPLY 2
@@ -87,7 +91,7 @@ struct mgmt_msg {
uint8_t code;
/* QoS parameters from spec */
uint8_t availability;
- uint8_t in_order;
+ uint8_t service;
} __attribute__((packed));
struct mgmt_frame {
@@ -130,6 +134,53 @@ static const char * __inet_ntop(const struct __ADDR * addr,
return inet_ntop(__AF, addr, buf, __ADDRSTRLEN);
}
+#if defined(BUILD_IPCP_UDP4)
+#define UDP_MTU_FALLBACK IPCP_UDP4_MTU
+#define UDP_IP_OVERHEAD 28U /* IPv4 + UDP */
+#else
+#define UDP_MTU_FALLBACK IPCP_UDP6_MTU
+#define UDP_IP_OVERHEAD 48U /* IPv6 + UDP */
+#endif
+/* Usable UDP payload size towards saddr; UDP_MTU_FALLBACK if unknown. */
+static uint32_t udp_query_mtu(const struct __SOCKADDR * saddr)
+{
+#if defined(__linux__) && (defined(IP_MTU) || defined(IPV6_MTU))
+ int sock;
+ int mtu = 0;
+ socklen_t len = sizeof(mtu);
+
+ sock = socket(__AF, SOCK_DGRAM, IPPROTO_UDP);
+ if (sock < 0)
+ return UDP_MTU_FALLBACK;
+ /* The MTU socket options below need a connected socket (ip(7)). */
+ if (connect(sock, (const struct sockaddr *) saddr,
+ sizeof(*saddr)) < 0)
+ goto fallback;
+ /* Read the MTU the kernel currently has for this destination. */
+#if defined(BUILD_IPCP_UDP4) && defined(IP_MTU)
+ if (getsockopt(sock, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
+ goto fallback;
+#elif defined(BUILD_IPCP_UDP6) && defined(IPV6_MTU)
+ if (getsockopt(sock, IPPROTO_IPV6, IPV6_MTU, &mtu, &len) < 0)
+ goto fallback;
+#else
+ goto fallback;
+#endif
+ close(sock);
+
+ if (mtu <= (int) UDP_IP_OVERHEAD)
+ return UDP_MTU_FALLBACK;
+ /* Strip the IP + UDP header overhead to get the payload size. */
+ return (uint32_t) mtu - UDP_IP_OVERHEAD;
+
+ fallback:
+ close(sock);
+#else
+ (void) saddr;
+#endif
+ return UDP_MTU_FALLBACK; /* No lookup support, or lookup failed. */
+}
+
static int udp_data_init(void)
{
int i;
@@ -220,7 +271,7 @@ static int udp_ipcp_port_alloc(const struct __SOCKADDR * r_saddr,
msg->availability = qs.availability;
msg->loss = hton32(qs.loss);
msg->ber = hton32(qs.ber);
- msg->in_order = qs.in_order;
+ msg->service = qs.service;
msg->max_gap = hton32(qs.max_gap);
msg->timeout = hton32(qs.timeout);
@@ -285,7 +336,8 @@ static int udp_ipcp_port_req(struct __SOCKADDR * c_saddr,
{
int fd;
- fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL, data);
+ fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UDP_MPL,
+ udp_query_mtu(c_saddr), data);
if (fd < 0) {
log_err("Could not get new flow from IRMd.");
return -1;
@@ -332,7 +384,8 @@ static int udp_ipcp_port_alloc_reply(const struct __SOCKADDR * saddr,
pthread_rwlock_unlock(&udp_data.flows_lock);
- if (ipcp_flow_alloc_reply(s_eid, response, mpl, data) < 0) {
+ if (ipcp_flow_alloc_reply(s_eid, response, mpl,
+ udp_query_mtu(saddr), data) < 0) {
log_err("Failed to reply to flow allocation.");
return -1;
}
@@ -352,13 +405,18 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
qosspec_t qs;
buffer_t data;
+ /* Defence against malformed/corrupted wire input. */
+ if (len < sizeof(*msg))
+ return -1;
+
msg = (struct mgmt_msg *) buf;
switch (msg->code) {
case FLOW_REQ:
msg_len = sizeof(*msg) + ipcp_dir_hash_len();
- assert(len >= msg_len);
+ if (len < msg_len)
+ return -1;
data.len = len - msg_len;
data.data = (uint8_t *) buf + msg_len;
@@ -369,7 +427,7 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
qs.availability = msg->availability;
qs.loss = ntoh32(msg->loss);
qs.ber = ntoh32(msg->ber);
- qs.in_order = msg->in_order;
+ qs.service = msg->service;
qs.max_gap = ntoh32(msg->max_gap);
qs.timeout = ntoh32(msg->timeout);
@@ -377,8 +435,6 @@ static int udp_ipcp_mgmt_frame(struct __SOCKADDR c_saddr,
(uint8_t *) (msg + 1), qs,
&data);
case FLOW_REPLY:
- assert(len >= sizeof(*msg));
-
data.len = len - sizeof(*msg);
data.data = (uint8_t *) buf + sizeof(*msg);
@@ -549,7 +605,7 @@ static void * udp_ipcp_packet_writer(void * o)
continue;
}
- buf = ssm_pk_buff_head_alloc(spb, OUR_HEADER_LEN);
+ buf = ssm_pk_buff_push(spb, OUR_HEADER_LEN);
if (buf == NULL) {
log_dbg("Failed to allocate header.");
ipcp_spb_release(spb);
diff --git a/src/ipcpd/unicast/dt.c b/src/ipcpd/unicast/dt.c
index 252477f4..cc54efa1 100644
--- a/src/ipcpd/unicast/dt.c
+++ b/src/ipcpd/unicast/dt.c
@@ -139,7 +139,7 @@ static void dt_pci_shrink(struct ssm_pk_buff * spb)
{
assert(spb);
- ssm_pk_buff_head_release(spb, dt_pci_info.head_size);
+ ssm_pk_buff_pop(spb, dt_pci_info.head_size);
}
struct {
@@ -168,12 +168,12 @@ struct {
size_t f_nhp_pkt[QOS_CUBE_MAX];
size_t f_nhp_bytes[QOS_CUBE_MAX];
pthread_mutex_t lock;
- } stat[PROG_MAX_FLOWS];
+ } stat[PROC_MAX_FLOWS];
size_t n_flows;
#endif
struct bmp * res_fds;
- struct comp_info comps[PROG_RES_FDS];
+ struct comp_info comps[PROC_RES_FDS];
pthread_rwlock_t lock;
pthread_t listener;
@@ -220,7 +220,7 @@ static int dt_rib_read(const char * path,
tm = gmtime(&dt.stat[fd].stamp);
strftime(tmstr, sizeof(tmstr), RIB_TM_FORMAT, tm);
- if (fd >= PROG_RES_FDS) {
+ if (fd >= PROC_RES_FDS) {
fccntl(fd, FLOWGRXQLEN, &rxqlen);
fccntl(fd, FLOWGTXQLEN, &txqlen);
}
@@ -296,7 +296,7 @@ static int dt_rib_readdir(char *** buf)
if (*buf == NULL)
goto fail_entries;
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
pthread_mutex_lock(&dt.stat[i].lock);
if (dt.stat[i].stamp == 0) {
@@ -514,7 +514,7 @@ static void packet_handler(int fd,
#endif
} else {
dt_pci_shrink(spb);
- if (dt_pci.eid >= PROG_RES_FDS) {
+ if (dt_pci.eid >= PROC_RES_FDS) {
uint8_t ecn = *(head + dt_pci_info.ecn_o);
fa_np1_rcv(dt_pci.eid, ecn, spb);
return;
@@ -636,13 +636,13 @@ int dt_init(struct dt_config cfg)
goto fail_rwlock_init;
}
- dt.res_fds = bmp_create(PROG_RES_FDS, 0);
+ dt.res_fds = bmp_create(PROC_RES_FDS, 0);
if (dt.res_fds == NULL)
goto fail_res_fds;
#ifdef IPCP_FLOW_STATS
memset(dt.stat, 0, sizeof(dt.stat));
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
if (pthread_mutex_init(&dt.stat[i].lock, NULL)) {
log_err("Failed to init mutex for flow %d.", i);
for (j = 0; j < i; ++j)
@@ -662,7 +662,7 @@ int dt_init(struct dt_config cfg)
fail_rib_reg:
#ifdef IPCP_FLOW_STATS
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
pthread_mutex_destroy(&dt.stat[i].lock);
fail_stat_lock:
#endif
@@ -691,7 +691,7 @@ void dt_fini(void)
sprintf(dtstr, "%s.%" PRIu64, DT, dt.addr);
rib_unreg(dtstr);
#ifdef IPCP_FLOW_STATS
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
pthread_mutex_destroy(&dt.stat[i].lock);
#endif
bmp_destroy(dt.res_fds);
@@ -791,7 +791,7 @@ int dt_reg_comp(void * comp,
void dt_unreg_comp(int eid)
{
- assert(eid >= 0 && eid < PROG_RES_FDS);
+ assert(eid >= 0 && eid < PROC_RES_FDS);
pthread_rwlock_wrlock(&dt.lock);
@@ -823,7 +823,7 @@ int dt_write_packet(uint64_t dst_addr,
#ifdef IPCP_FLOW_STATS
len = ssm_pk_buff_len(spb);
- if (eid < PROG_RES_FDS) {
+ if (eid < PROC_RES_FDS) {
pthread_mutex_lock(&dt.stat[eid].lock);
++dt.stat[eid].lcl_r_pkt[qc];
@@ -837,7 +837,7 @@ int dt_write_packet(uint64_t dst_addr,
log_dbg("Could not get nhop for " ADDR_FMT32 ".",
ADDR_VAL32(&dst_addr));
#ifdef IPCP_FLOW_STATS
- if (eid < PROG_RES_FDS) {
+ if (eid < PROC_RES_FDS) {
pthread_mutex_lock(&dt.stat[eid].lock);
++dt.stat[eid].lcl_r_pkt[qc];
@@ -849,7 +849,7 @@ int dt_write_packet(uint64_t dst_addr,
return -EPERM;
}
- head = ssm_pk_buff_head_alloc(spb, dt_pci_info.head_size);
+ head = ssm_pk_buff_push(spb, dt_pci_info.head_size);
if (head == NULL) {
log_dbg("Failed to allocate DT header.");
goto fail_write;
@@ -876,7 +876,7 @@ int dt_write_packet(uint64_t dst_addr,
#ifdef IPCP_FLOW_STATS
pthread_mutex_lock(&dt.stat[fd].lock);
- if (dt_pci.eid < PROG_RES_FDS) {
+ if (dt_pci.eid < PROC_RES_FDS) {
++dt.stat[fd].lcl_w_pkt[qc];
dt.stat[fd].lcl_w_bytes[qc] += len;
}
@@ -891,7 +891,7 @@ int dt_write_packet(uint64_t dst_addr,
#ifdef IPCP_FLOW_STATS
pthread_mutex_lock(&dt.stat[fd].lock);
- if (eid < PROG_RES_FDS) {
+ if (eid < PROC_RES_FDS) {
++dt.stat[fd].lcl_w_pkt[qc];
dt.stat[fd].lcl_w_bytes[qc] += len;
}
diff --git a/src/ipcpd/unicast/fa.c b/src/ipcpd/unicast/fa.c
index c157d71c..c0447885 100644
--- a/src/ipcpd/unicast/fa.c
+++ b/src/ipcpd/unicast/fa.c
@@ -81,7 +81,7 @@ struct fa_msg {
uint16_t ece;
uint8_t code;
uint8_t availability;
- uint8_t in_order;
+ uint8_t service;
} __attribute__((packed));
struct cmd {
@@ -111,7 +111,7 @@ struct fa_flow {
struct {
pthread_rwlock_t flows_lock;
- struct fa_flow flows[PROG_MAX_FLOWS];
+ struct fa_flow flows[PROC_MAX_FLOWS];
#ifdef IPCP_FLOW_STATS
size_t n_flows;
#endif
@@ -145,7 +145,7 @@ static int fa_rib_read(const char * path,
fd = atoi(entry);
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -1;
if (len < 1536)
@@ -225,7 +225,7 @@ static int fa_rib_readdir(char *** buf)
if (*buf == NULL)
goto fail_entries;
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
struct fa_flow * flow;
flow = &fa.flows[i];
@@ -306,7 +306,7 @@ static int eid_to_fd(uint64_t eid)
fd = eid & 0xFFFFFFFF;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -1;
flow = &fa.flows[fd];
@@ -496,11 +496,12 @@ static int fa_handle_flow_req(struct fa_msg * msg,
qs.availability = msg->availability;
qs.loss = ntoh32(msg->loss);
qs.ber = ntoh32(msg->ber);
- qs.in_order = msg->in_order;
+ qs.service = msg->service;
qs.max_gap = ntoh32(msg->max_gap);
qs.timeout = ntoh32(msg->timeout);
- fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL, &data);
+ fd = ipcp_wait_flow_req_arr(dst, qs, IPCP_UNICAST_MPL,
+ IPCP_UNICAST_MTU, &data);
if (fd < 0)
return fd;
@@ -528,7 +529,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg,
time_t mpl = IPCP_UNICAST_MPL;
int response;
- assert(len >= sizeof(*msg));
+ if (len < sizeof(*msg))
+ return -EINVAL;
data.data = (uint8_t *) msg + sizeof(*msg);
data.len = len - sizeof(*msg);
@@ -558,7 +560,8 @@ static int fa_handle_flow_reply(struct fa_msg * msg,
pthread_rwlock_unlock(&fa.flows_lock);
- if (ipcp_flow_alloc_reply(fd, response, mpl, &data) < 0) {
+ if (ipcp_flow_alloc_reply(fd, response, mpl,
+ IPCP_UNICAST_MTU, &data) < 0) {
log_err("Failed to reply for flow allocation on fd %d.", fd);
return -EIRMD;
}
@@ -572,8 +575,8 @@ static int fa_handle_flow_update(struct fa_msg * msg,
struct fa_flow * flow;
int fd;
- (void) len;
- assert(len >= sizeof(*msg));
+ if (len < sizeof(*msg))
+ return -EINVAL;
pthread_rwlock_wrlock(&fa.flows_lock);
@@ -789,7 +792,7 @@ int fa_alloc(int fd,
msg->availability = qs.availability;
msg->loss = hton32(qs.loss);
msg->ber = hton32(qs.ber);
- msg->in_order = qs.in_order;
+ msg->service = qs.service;
msg->max_gap = hton32(qs.max_gap);
msg->timeout = hton32(qs.timeout);
diff --git a/src/ipcpd/unicast/routing/graph.c b/src/ipcpd/unicast/routing/graph.c
index 0226c762..c168eb7d 100644
--- a/src/ipcpd/unicast/routing/graph.c
+++ b/src/ipcpd/unicast/routing/graph.c
@@ -603,9 +603,9 @@ static int graph_routing_table_lfa(struct graph * graph,
struct list_head * table,
int ** dist)
{
- int * n_dist[PROG_MAX_FLOWS];
- uint64_t addrs[PROG_MAX_FLOWS];
- int n_index[PROG_MAX_FLOWS];
+ int * n_dist[PROC_MAX_FLOWS];
+ uint64_t addrs[PROC_MAX_FLOWS];
+ int n_index[PROC_MAX_FLOWS];
struct list_head * p;
struct list_head * q;
struct vertex * v;
@@ -618,7 +618,7 @@ static int graph_routing_table_lfa(struct graph * graph,
if (graph_routing_table_simple(graph, s_addr, table, dist))
goto fail_table;
- for (j = 0; j < PROG_MAX_FLOWS; j++) {
+ for (j = 0; j < PROC_MAX_FLOWS; j++) {
n_dist[j] = NULL;
n_index[j] = -1;
addrs[j] = -1;
diff --git a/src/ipcpd/unicast/routing/link-state.c b/src/ipcpd/unicast/routing/link-state.c
index 051dd98d..c4ea9e1c 100644
--- a/src/ipcpd/unicast/routing/link-state.c
+++ b/src/ipcpd/unicast/routing/link-state.c
@@ -415,7 +415,7 @@ static void calculate_pff(struct routing_i * instance)
struct list_head table;
struct list_head * p;
struct list_head * q;
- int fds[PROG_MAX_FLOWS];
+ int fds[PROC_MAX_FLOWS];
assert(instance);
diff --git a/src/irmd/main.c b/src/irmd/main.c
index a85a9bf0..4808934f 100644
--- a/src/irmd/main.c
+++ b/src/irmd/main.c
@@ -86,7 +86,9 @@
#define TIMESYNC_SLACK 100 /* ms */
#define OAP_SEEN_TIMER 20 /* s */
#define DEALLOC_TIME 300 /* s */
-#define DIRECT_MPL 1 /* s */
+#define DIRECT_MPL 20 /* ms */
+/* bytes; in-process, bounded only by PUP/GSPP. */
+#define DIRECT_MTU 65000
enum irm_state {
IRMD_NULL = 0,
@@ -910,6 +912,10 @@ static int flow_accept(struct flow_info * flow,
flow->uid = reg_get_proc_uid(flow->n_pid);
err = oap_srv_process(&info, req_hdr, &resp_hdr, data, sk);
+ if (err == -EREPLAY) {
+ log_warn("Dropping replayed alloc request for %s.", name);
+ goto fail_replay;
+ }
if (err < 0) {
log_err("OAP processing failed for %s.", name);
goto fail_oap;
@@ -938,6 +944,9 @@ static int flow_accept(struct flow_info * flow,
fail_oap:
if (!reg_flow_is_direct(flow->id))
ipcp_flow_alloc_resp(flow, err, resp_hdr);
+ fail_replay:
+ freebuf(req_hdr);
+ freebuf(resp_hdr);
fail_wait:
reg_destroy_flow(flow->id);
fail_flow:
@@ -1209,6 +1218,7 @@ static int flow_alloc_direct(const char * dst,
acc.n_1_pid = flow->n_pid;
acc.mpl = DIRECT_MPL;
+ acc.mtu = DIRECT_MTU;
acc.qs = flow->qs;
acc.state = FLOW_ALLOCATED;
@@ -1244,6 +1254,7 @@ static int flow_alloc_direct(const char * dst,
flow->id = acc.id;
flow->n_1_pid = acc.n_pid;
flow->mpl = DIRECT_MPL;
+ flow->mtu = DIRECT_MTU;
flow->state = FLOW_ALLOCATED;
log_info("Flow %d allocated (direct) for %d to %s.",
diff --git a/src/irmd/oap.c b/src/irmd/oap.c
deleted file mode 100644
index 1831f533..00000000
--- a/src/irmd/oap.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Ouroboros - Copyright (C) 2016 - 2026
- *
- * OAP - Shared credential and configuration loading
- *
- * Dimitri Staessens <dimitri@ouroboros.rocks>
- * Sander Vrijders <sander@ouroboros.rocks>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., http://www.fsf.org/about/contact/.
- */
-
-#if defined(__linux__) || defined(__CYGWIN__)
- #define _DEFAULT_SOURCE
-#else
- #define _POSIX_C_SOURCE 200809L
-#endif
-
-#define OUROBOROS_PREFIX "irmd/oap"
-
-#include <ouroboros/crypt.h>
-#include <ouroboros/errno.h>
-#include <ouroboros/logs.h>
-
-#include "config.h"
-
-#include <assert.h>
-#include <string.h>
-#include <sys/stat.h>
-
-/*
- * Shared credential and configuration loading helpers
- */
-
-#ifndef OAP_TEST_MODE
-
-static bool file_exists(const char * path)
-{
- struct stat s;
-
- if (stat(path, &s) < 0 && errno == ENOENT) {
- log_dbg("File %s does not exist.", path);
- return false;
- }
-
- return true;
-}
-
-int load_credentials(const char * name,
- const struct name_sec_paths * paths,
- void ** pkp,
- void ** crt)
-{
- assert(paths != NULL);
- assert(pkp != NULL);
- assert(crt != NULL);
-
- *pkp = NULL;
- *crt = NULL;
-
- if (!file_exists(paths->crt) || !file_exists(paths->key)) {
- log_info("No authentication certificates for %s.", name);
- return 0;
- }
-
- if (crypt_load_crt_file(paths->crt, crt) < 0) {
- log_err("Failed to load %s for %s.", paths->crt, name);
- goto fail_crt;
- }
-
- if (crypt_load_privkey_file(paths->key, pkp) < 0) {
- log_err("Failed to load %s for %s.", paths->key, name);
- goto fail_key;
- }
-
- log_info("Loaded authentication certificates for %s.", name);
-
- return 0;
-
- fail_key:
- crypt_free_crt(*crt);
- *crt = NULL;
- fail_crt:
- return -EAUTH;
-}
-
-int load_kex_config(const char * name,
- const char * path,
- struct sec_config * cfg)
-{
- assert(name != NULL);
- assert(cfg != NULL);
-
- memset(cfg, 0, sizeof(*cfg));
-
- /* Load encryption config */
- if (!file_exists(path))
- log_dbg("No encryption %s for %s.", path, name);
-
- if (load_sec_config_file(cfg, path) < 0) {
- log_warn("Failed to load %s for %s.", path, name);
- return -1;
- }
-
- if (!IS_KEX_ALGO_SET(cfg)) {
- log_info("Key exchange not configured for %s.", name);
- return 0;
- }
-
- if (cfg->c.nid == NID_undef || crypt_nid_to_str(cfg->c.nid) == NULL) {
- log_err("Invalid cipher NID %d for %s.", cfg->c.nid, name);
- return -ECRYPT;
- }
-
- log_info("Encryption enabled for %s.", name);
-
- return 0;
-}
-
-#endif /* OAP_TEST_MODE */
diff --git a/src/irmd/oap/auth.c b/src/irmd/oap/auth.c
index 4b86f055..d165de73 100644
--- a/src/irmd/oap/auth.c
+++ b/src/irmd/oap/auth.c
@@ -174,6 +174,7 @@ int oap_check_hdr(const struct oap_hdr * hdr)
fail_replay:
pthread_mutex_unlock(&oap_auth.replay.mtx);
free(new);
+ return -EREPLAY;
fail_stamp:
return -EAUTH;
}
diff --git a/src/irmd/oap/srv.c b/src/irmd/oap/srv.c
index afc54acc..587a8f9f 100644
--- a/src/irmd/oap/srv.c
+++ b/src/irmd/oap/srv.c
@@ -180,11 +180,7 @@ static int negotiate_cipher(const struct oap_hdr * peer_hdr,
cli_rank = crypt_kdf_rank(peer_hdr->kdf_nid);
srv_rank = crypt_kdf_rank(kcfg->k.nid);
- /*
- * For client-encap KEM, the KDF is baked into
- * the ciphertext. The server must use the client's
- * KDF and can only verify the minimum.
- */
+ /* Client-encap KEM bakes KDF into ciphertext; verify min. */
if (OAP_KEX_ROLE(peer_hdr) == KEM_MODE_CLIENT_ENCAP) {
if (srv_rank > cli_rank) {
log_err_id(id, "Client KDF too weak.");
@@ -388,11 +384,12 @@ int oap_srv_process(const struct name_info * info,
uint8_t hash_buf[MAX_HASH_SIZE];
buffer_t req_hash = BUF_INIT;
ssize_t hash_ret;
- char cli_name[NAME_SIZE + 1]; /* TODO */
+ char cli_name[NAME_SIZE + 1];
uint8_t * id;
void * pkp = NULL;
void * crt = NULL;
int req_md_nid;
+ int ret;
assert(info != NULL);
assert(rsp_buf != NULL);
@@ -427,8 +424,13 @@ int oap_srv_process(const struct name_info * info,
id = peer_hdr.id.data; /* Logging */
- if (oap_check_hdr(&peer_hdr) < 0) {
- log_err_id(id, "OAP header failed replay check.");
+ ret = oap_check_hdr(&peer_hdr);
+ if (ret == -EREPLAY) {
+ log_warn_id(id, "OAP header failed replay check.");
+ goto fail_replay;
+ }
+ if (ret < 0) {
+ log_err_id(id, "OAP header check failed.");
goto fail_auth;
}
@@ -491,6 +493,11 @@ int oap_srv_process(const struct name_info * info,
fail_cred:
return -EAUTH;
+ fail_replay:
+ crypt_free_crt(crt);
+ crypt_free_key(pkp);
+ return -EREPLAY;
+
fail_kex:
crypt_free_crt(crt);
crypt_free_key(pkp);
diff --git a/src/irmd/oap/tests/oap_test.c b/src/irmd/oap/tests/oap_test.c
index a324b586..a525d988 100644
--- a/src/irmd/oap/tests/oap_test.c
+++ b/src/irmd/oap/tests/oap_test.c
@@ -32,6 +32,7 @@
#include <ouroboros/crypt.h>
#include <ouroboros/endian.h>
+#include <ouroboros/errno.h>
#include <ouroboros/flow.h>
#include <ouroboros/name.h>
#include <ouroboros/random.h>
@@ -1053,9 +1054,9 @@ static int test_oap_replay_packet(void)
freebuf(ctx.req_hdr);
ctx.req_hdr = saved_req;
- /* Replayed request should fail */
- if (oap_srv_process_ctx(&ctx) == 0) {
- printf("Server should reject replayed packet.\n");
+ /* Replay must return -EREPLAY so callers can drop silently. */
+ if (oap_srv_process_ctx(&ctx) != -EREPLAY) {
+ printf("Replayed packet rejection != -EREPLAY.\n");
goto fail_cleanup;
}
diff --git a/src/irmd/reg/flow.c b/src/irmd/reg/flow.c
index 93c3e128..5c709dea 100644
--- a/src/irmd/reg/flow.c
+++ b/src/irmd/reg/flow.c
@@ -42,6 +42,7 @@ struct reg_flow * reg_flow_create(const struct flow_info * info)
assert(info->n_pid != 0);
assert(info->n_1_pid == 0);
assert(info->mpl == 0);
+ assert(info->mtu == 0);
assert(info->state == FLOW_INIT);
flow = malloc(sizeof(*flow));
@@ -160,6 +161,7 @@ int reg_flow_update(struct reg_flow * flow,
assert(info->mpl != 0);
flow->info.mpl = info->mpl;
+ flow->info.mtu = info->mtu;
if (flow->info.state == FLOW_ALLOC_PENDING)
break;
diff --git a/src/irmd/reg/reg.c b/src/irmd/reg/reg.c
index 0025f695..365064e5 100644
--- a/src/irmd/reg/reg.c
+++ b/src/irmd/reg/reg.c
@@ -1820,7 +1820,11 @@ int reg_respond_alloc(struct flow_info * info,
goto fail_flow;
}
- assert(flow->info.state == FLOW_ALLOC_PENDING);
+ if (flow->info.state != FLOW_ALLOC_PENDING) {
+ log_warn("Flow %d already responded.", info->id);
+ goto fail_flow;
+ }
+
assert(flow->rsp_data.len == 0);
assert(flow->rsp_data.data == NULL);
diff --git a/src/irmd/reg/tests/flow_test.c b/src/irmd/reg/tests/flow_test.c
index 7e1c1360..18214078 100644
--- a/src/irmd/reg/tests/flow_test.c
+++ b/src/irmd/reg/tests/flow_test.c
@@ -122,6 +122,21 @@ static int test_reg_flow_create_has_mpl(void) {
return TEST_RC_SUCCESS;
}
+static int test_reg_flow_create_has_mtu(void) {
+ struct flow_info info = {
+ .id = 1,
+ .n_pid = 1,
+ .n_1_pid = 0,
+ .mtu = 1400,
+ .qs = qos_raw,
+ .state = FLOW_INIT /* valid, so only the mtu assert can fire */
+ };
+ /* A preset MTU must be rejected by assert(info->mtu == 0). */
+ reg_flow_create(&info); /* assert fail */
+
+ return TEST_RC_SUCCESS;
+}
+
static int test_reg_flow_update(void)
{
struct reg_flow * f;
@@ -136,7 +151,7 @@ static int test_reg_flow_update(void)
struct flow_info upd = {
.id = 1,
.n_pid = 1,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_DEALLOCATED
};
@@ -179,7 +194,7 @@ static int test_reg_flow_update_wrong_id(void)
struct flow_info upd = {
.id = 2,
.n_pid = 1,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_DEALLOCATED
};
@@ -210,6 +225,7 @@ static int test_reg_flow_assert_fails(void)
ret |= test_assert_fail(test_reg_flow_create_has_n_1_pid);
ret |= test_assert_fail(test_reg_flow_create_wrong_state);
ret |= test_assert_fail(test_reg_flow_create_has_mpl);
+ ret |= test_assert_fail(test_reg_flow_create_has_mtu);
ret |= test_assert_fail(test_reg_flow_update_wrong_id);
return ret;
diff --git a/src/irmd/reg/tests/reg_test.c b/src/irmd/reg/tests/reg_test.c
index f4b0188b..0b1014f9 100644
--- a/src/irmd/reg/tests/reg_test.c
+++ b/src/irmd/reg/tests/reg_test.c
@@ -31,6 +31,7 @@
#define TEST_N_1_PID 3999
#define TEST_FAKE_ID 9128349
#define TEST_MPL 5
+#define TEST_MTU 1400
#define TEST_PROG "reg_test" /* own binary for binary check */
#define TEST_IPCP "testipcp"
#define TEST_NAME "testname"
@@ -239,7 +240,7 @@ static int test_reg_accept_flow_success(void)
struct flow_info n_1_info = {
.n_1_pid = TEST_N_1_PID,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_ALLOCATED /* RESPONSE SUCCESS */
};
@@ -266,6 +267,7 @@ static int test_reg_accept_flow_success(void)
n_1_info.id = info.id;
n_1_info.mpl = 1;
+ n_1_info.mtu = TEST_MTU;
pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info);
@@ -284,6 +286,11 @@ static int test_reg_accept_flow_success(void)
goto fail;
}
+ if (info.mtu != TEST_MTU) {
+ printf("MTU not propagated.\n");
+ goto fail;
+ }
+
if (rbuf.data == NULL) {
printf("rbuf data not returned.\n");
goto fail;
@@ -336,7 +343,7 @@ static int test_reg_accept_flow_success_no_crypt(void)
struct flow_info n_1_info = {
.n_1_pid = TEST_N_1_PID,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_ALLOCATED /* RESPONSE SUCCESS */
};
@@ -363,6 +370,7 @@ static int test_reg_accept_flow_success_no_crypt(void)
n_1_info.id = info.id;
n_1_info.mpl = 1;
+ n_1_info.mtu = TEST_MTU;
pthread_create(&thr, NULL, test_flow_respond_accept, &n_1_info);
@@ -381,6 +389,11 @@ static int test_reg_accept_flow_success_no_crypt(void)
goto fail;
}
+ if (info.mtu != TEST_MTU) {
+ printf("MTU not propagated.\n");
+ goto fail;
+ }
+
if (rbuf.data == NULL) {
printf("rbuf data was not returned.\n");
goto fail;
@@ -431,7 +444,7 @@ static int test_reg_allocate_flow_fail(void)
struct flow_info n_1_info = {
.n_1_pid = TEST_N_1_PID,
- .qs = qos_data,
+ .qs = qos_msg,
.state = FLOW_DEALLOCATED /* RESPONSE FAIL */
};
@@ -489,6 +502,93 @@ static int test_reg_allocate_flow_fail(void)
return TEST_RC_FAIL;
}
+static int test_reg_respond_alloc_duplicate(void)
+{
+ pthread_t thr;
+ struct timespec abstime;
+ struct timespec timeo = TIMESPEC_INIT_S(1);
+ buffer_t rbuf = BUF_INIT;
+ buffer_t empty = BUF_INIT;
+ struct flow_info dup_info;
+ /* A second response to an already answered flow must return -1. */
+ struct flow_info info = {
+ .n_pid = TEST_PID,
+ .qs = qos_raw
+ };
+
+ struct flow_info n_1_info = {
+ .n_1_pid = TEST_N_1_PID,
+ .qs = qos_msg,
+ .state = FLOW_ALLOCATED /* RESPONSE SUCCESS */
+ };
+
+ TEST_START();
+ /* Absolute deadline for the wait below: current time plus timeo. */
+ clock_gettime(PTHREAD_COND_CLOCK, &abstime);
+ ts_add(&abstime, &timeo, &abstime);
+
+ if (reg_init() < 0) {
+ printf("Failed to init registry.\n");
+ goto fail;
+ }
+
+ if (reg_create_flow(&info) < 0) {
+ printf("Failed to add flow.\n");
+ goto fail;
+ }
+
+ info.n_1_pid = TEST_N_1_PID;
+
+ if (reg_prepare_flow_alloc(&info) < 0) {
+ printf("Failed to prepare flow for alloc.\n");
+ goto fail;
+ }
+ /* Responder addresses the flow by info.id (presumably set on create). */
+ n_1_info.id = info.id;
+ n_1_info.mpl = 1;
+ n_1_info.mtu = TEST_MTU;
+ /* Helper thread presumably sends the first response while we wait. */
+ pthread_create(&thr, NULL, test_flow_respond_alloc, &n_1_info);
+
+ if (reg_wait_flow_allocated(&info, &rbuf, &abstime) < 0) {
+ printf("Flow allocation failed.\n");
+ pthread_join(thr, NULL);
+ reg_destroy_flow(info.id);
+ reg_fini();
+ goto fail;
+ }
+
+ pthread_join(thr, NULL);
+ freebuf(rbuf);
+
+ if (info.mtu != TEST_MTU) {
+ printf("MTU not propagated.\n");
+ goto fail;
+ }
+
+ /* Duplicate reply on an already-ALLOCATED flow must not assert. */
+ dup_info = n_1_info;
+ dup_info.state = FLOW_DEALLOCATED;
+
+ if (reg_respond_alloc(&dup_info, &empty, -EREPLAY) != -1) {
+ printf("Duplicate respond_alloc should return -1.\n");
+ goto fail;
+ }
+ /* Tear the flow down again before shutting the registry down. */
+ reg_dealloc_flow(&info);
+ reg_dealloc_flow_resp(&info);
+ reg_destroy_flow(n_1_info.id);
+
+ reg_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail:
+ REG_TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
struct direct_alloc_info {
struct flow_info info;
buffer_t rsp;
@@ -564,7 +664,7 @@ static int test_reg_direct_flow_success(void)
dai.info.id = info.id;
dai.info.n_1_pid = TEST_N_1_PID;
dai.info.mpl = TEST_MPL;
- dai.info.qs = qos_data;
+ dai.info.qs = qos_msg;
dai.info.state = FLOW_ALLOCATED;
dai.rsp.len = 0;
dai.rsp.data = NULL;
@@ -679,6 +779,7 @@ static int test_reg_flow(void) {
rc |= test_reg_accept_flow_success();
rc |= test_reg_accept_flow_success_no_crypt();
rc |= test_reg_allocate_flow_fail();
+ rc |= test_reg_respond_alloc_duplicate();
rc |= test_reg_direct_flow_success();
return rc;
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 79263924..6cd3a8a4 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -17,7 +17,10 @@ protobuf_generate_c(IPCP_PROTO_SRCS IPCP_PROTO_HDRS
set(SOURCE_FILES_COMMON
bitmap.c
btree.c
- crc32.c
+ crc/crc8.c
+ crc/crc16.c
+ crc/crc32.c
+ crc/crc64.c
crypt.c
hash.c
lockfile.c
@@ -36,6 +39,7 @@ set(SOURCE_FILES_COMMON
ssm/pool.c
sockets.c
tpm.c
+ tw.c
utils.c
)
@@ -155,5 +159,6 @@ configure_file("${CMAKE_CURRENT_SOURCE_DIR}/ssm/ssm.h.in"
if(BUILD_TESTS)
add_subdirectory(tests)
+ add_subdirectory(crc/tests)
add_subdirectory(ssm/tests)
endif()
diff --git a/src/lib/config.h.in b/src/lib/config.h.in
index 08e9baf6..7124a974 100644
--- a/src/lib/config.h.in
+++ b/src/lib/config.h.in
@@ -20,6 +20,14 @@
* Foundation, Inc., http://www.fsf.org/about/contact/.
*/
+#ifndef MILLION
+#define MILLION 1000000LL
+#endif
+
+#ifndef BILLION
+#define BILLION 1000000000LL
+#endif
+
#cmakedefine HAVE_SYS_RANDOM
#cmakedefine HAVE_EXPLICIT_BZERO
#cmakedefine HAVE_LIBGCRYPT
@@ -37,6 +45,8 @@
#cmakedefine QOS_DISABLE_CRC
#cmakedefine HAVE_OPENSSL_RNG
+#cmakedefine HAVE_PCLMUL
+#cmakedefine HAVE_PMULL
#define SHM_LOCKFILE_NAME "@SHM_LOCKFILE_NAME@"
#define FLOW_ALLOC_TIMEOUT @FLOW_ALLOC_TIMEOUT@
@@ -60,16 +70,18 @@
#cmakedefine PROC_FLOW_STATS
#endif
+#cmakedefine FRCT_DEBUG_STDOUT
+
#define PTHREAD_COND_CLOCK @PTHREAD_COND_CLOCK@
-#define PROG_MAX_FLOWS @PROG_MAX_FLOWS@
-#define PROG_RES_FDS @PROG_RES_FDS@
-#define PROG_MAX_FQUEUES @PROG_MAX_FQUEUES@
+#define PROC_MAX_FLOWS @PROC_MAX_FLOWS@
+#define PROC_RES_FDS @PROC_RES_FDS@
+#define PROC_MAX_FQUEUES @PROC_MAX_FQUEUES@
/* Default Delta-t parameters */
#cmakedefine FRCT_LINUX_RTT_ESTIMATOR
-#define DELT_A (@DELTA_T_ACK@) /* ns */
-#define DELT_R (@DELTA_T_RTX@) /* ns */
+#define DELT_A (@DELTA_T_ACK@) /* ms */
+#define DELT_R (@DELTA_T_RTX@) /* ms */
#define RQ_SIZE (@FRCT_REORDER_QUEUE_SIZE@)
#define START_WINDOW (@FRCT_START_WINDOW@)
@@ -80,9 +92,6 @@
#define TICTIME (@FRCT_TICK_TIME@ * 1000) /* ns */
/* Retransmission tuning */
-#cmakedefine RXM_BUFFER_ON_HEAP
-#cmakedefine RXM_BLOCKING
-
#define RXMQ_RES (@RXM_MIN_RESOLUTION@) /* 2^N ns */
#define RXMQ_BUMP (@RXM_WHEEL_MULTIPLIER@)
#define RXMQ_LVLS (@RXM_WHEEL_LEVELS@)
diff --git a/src/lib/crc/crc16.c b/src/lib/crc/crc16.c
new file mode 100644
index 00000000..9dc59429
--- /dev/null
+++ b/src/lib/crc/crc16.c
@@ -0,0 +1,61 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 16-bit Cyclic Redundancy Check (CCITT-FALSE variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-16/CCITT-FALSE (reveng catalog, alias CRC-16/IBM-3740):
+ * poly = 0x1021
+ * init = 0xffff
+ * refin = false
+ * refout = false
+ * xorout = 0x0000
+ * check = crc16_ccitt_false("123456789") == 0x29b1
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc16.h>
+
+/*
+ * Bit-by-bit MSB-first CRC. *crc is XORed with 0xffff on entry, so a seed
+ * of 0 starts a fresh CRC; the register is stored back with no exit XOR.
+ * NOTE(review): a result is thus not a valid seed - confirm one-shot use.
+ */
+void crc16_ccitt_false(uint16_t *   crc,
+                       const void * buf,
+                       size_t       len)
+{
+        const uint8_t * in  = (const uint8_t *) buf;
+        uint16_t        reg = *crc ^ 0xffff;
+        int             bit;
+
+        while (len-- > 0) {
+                reg ^= ((uint16_t) *in++) << 8;
+                for (bit = 8; bit > 0; bit--) {
+                        uint16_t top = reg & 0x8000;
+                        reg = (uint16_t) (reg << 1);
+                        if (top)
+                                reg ^= 0x1021;
+                }
+        }
+
+        *crc = reg;
+}
diff --git a/src/lib/crc32.c b/src/lib/crc/crc32.c
index 0fdb62b1..0fdb62b1 100644
--- a/src/lib/crc32.c
+++ b/src/lib/crc/crc32.c
diff --git a/src/lib/crc/crc64.c b/src/lib/crc/crc64.c
new file mode 100644
index 00000000..1b6fb5f6
--- /dev/null
+++ b/src/lib/crc/crc64.c
@@ -0,0 +1,363 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 64-bit Cyclic Redundancy Check (NVMe variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-64/NVMe (reveng catalog):
+ * poly = 0xad93d23594c93659
+ * init = 0xffffffffffffffff
+ * refin = true
+ * refout = true
+ * xorout = 0xffffffffffffffff
+ * check = crc64_nvme("123456789") == 0xae8b14860a799888
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc64.h>
+
+/*
+ * Reflected CRC-64/NVMe table. Polynomial in reflected form:
+ * 0x9a6c9329ac4bc9b5 (bitrev of 0xad93d23594c93659).
+ */
+static const uint64_t crc64_nvme_tab[256] = {
+ 0x0000000000000000ULL, 0x7f6ef0c830358979ULL,
+ 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL,
+ 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL,
+ 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL,
+ 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL,
+ 0x58c10d24087fec87ULL, 0x27affdec384a65feULL,
+ 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL,
+ 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL,
+ 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL,
+ 0x863d1eabe8d57d73ULL, 0xf953ee63d8e0f40aULL,
+ 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL,
+ 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL,
+ 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL,
+ 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL,
+ 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL,
+ 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL,
+ 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL,
+ 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL,
+ 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL,
+ 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL,
+ 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL,
+ 0xa900f35319033385ULL, 0xd66e039b2936bafcULL,
+ 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL,
+ 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL,
+ 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL,
+ 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL,
+ 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL,
+ 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL,
+ 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL,
+ 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL,
+ 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL,
+ 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL,
+ 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL,
+ 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL,
+ 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL,
+ 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL,
+ 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL,
+ 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL,
+ 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL,
+ 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL,
+ 0xafba2586f2d042eeULL, 0xd0d4d54ec2e5cb97ULL,
+ 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL,
+ 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL,
+ 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL,
+ 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL,
+ 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL,
+ 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL,
+ 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL,
+ 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL,
+ 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL,
+ 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL,
+ 0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL,
+ 0x8087c87e03060c18ULL, 0xffe938b633338561ULL,
+ 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL,
+ 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL,
+ 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL,
+ 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL,
+ 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL,
+ 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL,
+ 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL,
+ 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL,
+ 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL,
+ 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL,
+ 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL,
+ 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL,
+ 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL,
+ 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL,
+ 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL,
+ 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL,
+ 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL,
+ 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL,
+ 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL,
+ 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL,
+ 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL,
+ 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL,
+ 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL,
+ 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL,
+ 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL,
+ 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL,
+ 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL,
+ 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL,
+ 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL,
+ 0xa2cf882d2576a038ULL, 0xdda178e515432941ULL,
+ 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL,
+ 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL,
+ 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL,
+ 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL,
+ 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL,
+ 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL,
+ 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL,
+ 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL,
+ 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL,
+ 0xb5517ed15d9d8743ULL, 0xca3f8e196da80e3aULL,
+ 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL,
+ 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL,
+ 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL,
+ 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL,
+ 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL,
+ 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL,
+ 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL,
+ 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL,
+ 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL,
+ 0x224840532670ac20ULL, 0x5d26b09b16452559ULL,
+ 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL,
+ 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL,
+ 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL,
+ 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL,
+ 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL,
+ 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL,
+ 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL,
+ 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL,
+ 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL,
+ 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL,
+ 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL,
+ 0x759552905f188d57ULL, 0x0afba2586f2d042eULL,
+ 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL,
+ 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL,
+ 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL,
+ 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL,
+ 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL,
+ 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL,
+ 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL,
+ 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL,
+ 0xf3a84c3bb7cdf024ULL, 0x8cc6bcf387f8795dULL,
+ 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL,
+ 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL,
+ 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL,
+ 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL
+};
+
+static __inline__ uint64_t crc64_nvme_step(uint64_t        c,
+                                           const uint8_t * p,
+                                           size_t          len)
+{
+        /* One table lookup per input byte, low byte of c consumed first. */
+        while (len-- > 0) {
+                uint8_t idx = (uint8_t) ((c ^ *p++) & 0xff);
+                c = (c >> 8) ^ crc64_nvme_tab[idx];
+        }
+        return c;
+}
+
+void crc64_nvme_table(uint64_t *   crc,
+                      const void * buf,
+                      size_t       len)
+{
+        const uint8_t * bytes = (const uint8_t *) buf;
+        uint64_t        reg;
+
+        /* All-ones XOR on the way in (init) and on the way out (xorout). */
+        reg  = crc64_nvme_step(~*crc, bytes, len);
+        *crc = ~reg;
+}
+
+#ifdef HAVE_PCLMUL
+
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+/*
+ * Fold-by-16 constants for reflected CRC-64/NVMe. Properties of the
+ * polynomial; identical between the PCLMUL and PMULL backends.
+ * k3 = bitrev64(x^(128+64) mod P) << 1
+ * k4 = bitrev64(x^(128+0) mod P) << 1
+ */
+static const uint64_t k3_clmul = 0xeadc41fd2ba3d420ULL;
+static const uint64_t k4_clmul = 0x21e9761e252621acULL;
+
+__attribute__((target("pclmul,sse4.1")))
+static __m128i fold16(__m128i x,
+                      __m128i k)
+{
+        /*
+         * Carry-less products low*low (imm 0x00) and high*high (imm 0x11)
+         * of the 64-bit halves of x and k, XORed together.
+         */
+        return _mm_xor_si128(_mm_clmulepi64_si128(x, k, 0x00),
+                             _mm_clmulepi64_si128(x, k, 0x11));
+}
+
+/*
+ * Fold-by-16 over 16-byte chunks; the 128-bit folded state is then
+ * emitted as 16 little-endian bytes and run through the byte-table
+ * loop together with any tail (<=15 bytes). The 16-byte minimum on
+ * the bulk loop is why the short-input path uses the table directly.
+ */
+__attribute__((target("pclmul,sse4.1")))
+static void crc64_nvme_clmul(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint64_t seed;
+ uint64_t c;
+ size_t off;
+ __m128i x;
+ __m128i k;
+ uint8_t post[16];
+
+ p = (const uint8_t *) buf;
+ seed = *crc;
+
+ if (len < 16) {
+ c = crc64_nvme_step(seed ^ UINT64_MAX, p, len);
+ *crc = c ^ UINT64_MAX;
+ return;
+ }
+ /* Initial state: first 16 bytes with the inverted seed in the low lane. */
+ x = _mm_loadu_si128((const __m128i *) p);
+ x = _mm_xor_si128(x, _mm_cvtsi64_si128((int64_t)
+ (seed ^ UINT64_MAX)));
+ /* k3 lands in the low 64 bits of k, k4 in the high 64 bits. */
+ k = _mm_set_epi64x((int64_t) k4_clmul, (int64_t) k3_clmul);
+
+ off = 16;
+ while (off + 16 <= len) {
+ __m128i d;
+
+ d = _mm_loadu_si128((const __m128i *) (p + off));
+ x = _mm_xor_si128(fold16(x, k), d);
+ off += 16;
+ }
+ /* Spill the folded state and finish it, plus the tail, via the table. */
+ _mm_storeu_si128((__m128i *) post, x);
+
+ c = crc64_nvme_step(0, post, 16);
+ c = crc64_nvme_step(c, p + off, len - off);
+
+ *crc = c ^ UINT64_MAX;
+}
+
+#endif /* HAVE_PCLMUL */
+
+#ifdef HAVE_PMULL
+
+#include <arm_neon.h>
+
+/* Same fold-by-16 constants as the PCLMUL path (poly properties). */
+static const uint64_t k3_pmull = 0xeadc41fd2ba3d420ULL;
+static const uint64_t k4_pmull = 0x21e9761e252621acULL;
+
+__attribute__((target("+crypto")))
+static uint64x2_t fold16_pmull(uint64x2_t x,
+                               uint64x2_t k)
+{
+        poly128_t lo_prod;
+        poly128_t hi_prod;
+
+        /* Lane 0 of x times lane 0 of k, passed as scalar operands. */
+        lo_prod = vmull_p64((poly64_t) vgetq_lane_u64(x, 0),
+                            (poly64_t) vgetq_lane_u64(k, 0));
+        /* High lanes multiplied directly from the 128-bit vectors. */
+        hi_prod = vmull_high_p64(vreinterpretq_p64_u64(x),
+                                 vreinterpretq_p64_u64(k));
+
+        return veorq_u64(vreinterpretq_u64_p128(lo_prod),
+                         vreinterpretq_u64_p128(hi_prod));
+}
+
+__attribute__((target("+crypto")))
+static void crc64_nvme_pmull(uint64_t * crc,
+ const void * buf,
+ size_t len)
+{
+ const uint8_t * p;
+ uint64_t seed;
+ uint64_t c;
+ size_t off;
+ uint64x2_t x;
+ uint64x2_t k;
+ uint64_t seed_lane[2];
+ uint64_t k_lanes[2];
+ uint8_t post[16];
+ /* PMULL variant: fold 16-byte blocks, then finish through the table. */
+ p = (const uint8_t *) buf;
+ seed = *crc;
+
+ if (len < 16) {
+ c = crc64_nvme_step(seed ^ UINT64_MAX, p, len);
+ *crc = c ^ UINT64_MAX;
+ return;
+ }
+ /* Initial state: first 16 bytes with the inverted seed XORed into lane 0. */
+ x = vld1q_u64((const uint64_t *) p);
+ seed_lane[0] = seed ^ UINT64_MAX;
+ seed_lane[1] = 0;
+ x = veorq_u64(x, vld1q_u64(seed_lane));
+ /* Lane 0 holds k3, lane 1 holds k4. */
+ k_lanes[0] = k3_pmull;
+ k_lanes[1] = k4_pmull;
+ k = vld1q_u64(k_lanes);
+
+ off = 16;
+ while (off + 16 <= len) {
+ uint64x2_t d;
+ /* NOTE(review): byte pointer cast to uint64_t * - confirm alignment. */
+ d = vld1q_u64((const uint64_t *) (p + off));
+ x = veorq_u64(fold16_pmull(x, k), d);
+ off += 16;
+ }
+ /* Spill the folded state and finish it, plus the tail, via the table. */
+ vst1q_u8(post, vreinterpretq_u8_u64(x));
+
+ c = crc64_nvme_step(0, post, 16);
+ c = crc64_nvme_step(c, p + off, len - off);
+
+ *crc = c ^ UINT64_MAX;
+}
+#endif /* HAVE_PMULL */
+/* Backend is fixed at build time: PCLMUL first, then PMULL, else table. */
+void crc64_nvme(uint64_t *   crc,
+                const void * buf,
+                size_t       len)
+{
+#if defined(HAVE_PCLMUL)
+        crc64_nvme_clmul(crc, buf, len);
+#elif defined(HAVE_PMULL)
+        crc64_nvme_pmull(crc, buf, len);
+#else
+        crc64_nvme_table(crc, buf, len);
+#endif
+}
diff --git a/src/lib/crc/crc8.c b/src/lib/crc/crc8.c
new file mode 100644
index 00000000..20976b29
--- /dev/null
+++ b/src/lib/crc/crc8.c
@@ -0,0 +1,62 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * 8-bit Cyclic Redundancy Check (AUTOSAR variant)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+/*
+ * CRC-8/AUTOSAR (reveng catalog):
+ * poly = 0x2f
+ * init = 0xff
+ * refin = false
+ * refout = false
+ * xorout = 0xff
+ * check = crc8_autosar("123456789") == 0xdf
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc8.h>
+
+
+/*
+ * Bit-by-bit MSB-first CRC. *crc is XORed with 0xff both on entry (init)
+ * and on exit (xorout), so a seed of 0 starts a fresh CRC and a returned
+ * value can be passed back in to continue over further data.
+ */
+void crc8_autosar(uint8_t *    crc,
+                  const void * buf,
+                  size_t       len)
+{
+        const uint8_t * in  = (const uint8_t *) buf;
+        uint8_t         reg = *crc ^ 0xff;
+        int             bit;
+
+        while (len-- > 0) {
+                reg ^= *in++;
+                for (bit = 8; bit > 0; bit--) {
+                        uint8_t top = reg & 0x80;
+                        reg = (uint8_t) (reg << 1);
+                        if (top)
+                                reg ^= 0x2f;
+                }
+        }
+
+        *crc = reg ^ 0xff;
+}
diff --git a/src/lib/crc/tests/CMakeLists.txt b/src/lib/crc/tests/CMakeLists.txt
new file mode 100644
index 00000000..11daca5a
--- /dev/null
+++ b/src/lib/crc/tests/CMakeLists.txt
@@ -0,0 +1,21 @@
+get_filename_component(PARENT_PATH ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+get_filename_component(PARENT_DIR ${PARENT_PATH} NAME)
+# PARENT_DIR now holds the name of the directory one level above this one.
+compute_test_prefix()
+
+create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
+ # Add new tests here
+ crc8_test.c
+ crc16_test.c
+ crc32_test.c
+ crc64_test.c
+ )
+# The generated source list is built into one ${PARENT_DIR}_test binary.
+add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests})
+
+disable_test_logging_for_target(${PARENT_DIR}_test)
+target_link_libraries(${PARENT_DIR}_test ouroboros-common)
+
+add_dependencies(build_tests ${PARENT_DIR}_test)
+
+ouroboros_register_tests(TARGET ${PARENT_DIR}_test TESTS ${${PARENT_DIR}_tests})
diff --git a/src/lib/crc/tests/crc16_test.c b/src/lib/crc/tests/crc16_test.c
new file mode 100644
index 00000000..03a5b504
--- /dev/null
+++ b/src/lib/crc/tests/crc16_test.c
@@ -0,0 +1,67 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-16/CCITT-FALSE function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc16.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* reveng-catalog smoke vectors: empty input and the "123456789" check. */
+static int test_crc16_ccitt_false_basic(void)
+{
+        uint16_t empty = 0;
+        uint16_t check = 0;
+
+        TEST_START();
+
+        /* Seed 0, no data: the 0xffff init value comes back unchanged. */
+        crc16_ccitt_false(&empty, "", 0);
+        if (empty != 0xffff)
+                goto fail;
+
+        crc16_ccitt_false(&check, "123456789", 9);
+        if (check != 0x29b1)
+                goto fail;
+
+        TEST_SUCCESS();
+        return TEST_RC_SUCCESS;
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+int crc16_test(int     argc,
+               char ** argv)
+{
+        /*
+         * Suite entry point; the arguments are intentionally unused.
+         */
+        (void) argc;
+        (void) argv;
+
+        return test_crc16_ccitt_false_basic();
+}
diff --git a/src/lib/tests/crc32_test.c b/src/lib/crc/tests/crc32_test.c
index 5a1ddd87..5a1ddd87 100644
--- a/src/lib/tests/crc32_test.c
+++ b/src/lib/crc/tests/crc32_test.c
diff --git a/src/lib/crc/tests/crc64_test.c b/src/lib/crc/tests/crc64_test.c
new file mode 100644
index 00000000..cf3f5ca3
--- /dev/null
+++ b/src/lib/crc/tests/crc64_test.c
@@ -0,0 +1,126 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-64/NVMe function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc64.h>
+#include <ouroboros/random.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* Reference impl, internal to libouroboros-common. */
+extern void crc64_nvme_table(uint64_t * crc,
+ const void * buf,
+ size_t len);
+
+/* reveng-catalog smoke vectors plus a 16-byte fold-boundary check. */
+static int test_crc64_nvme_basic(void)
+{
+        uint64_t empty = 0;
+        uint64_t check = 0;
+        uint64_t fold  = 0;
+
+        TEST_START();
+
+        crc64_nvme(&empty, "", 0);
+        if (empty != 0x0000000000000000ULL)
+                goto fail;
+
+        crc64_nvme(&check, "123456789", 9);
+        if (check != 0xae8b14860a799888ULL)
+                goto fail;
+
+        /* 16 bytes: first length past the len < 16 table-only shortcut. */
+        crc64_nvme(&fold, "0123456789abcdef", 16);
+        if (fold != 0x091485ca7018730eULL)
+                goto fail;
+
+        TEST_SUCCESS();
+        return TEST_RC_SUCCESS;
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL)
+/*
+ * Cross-check crc64_nvme() (accelerated build) against the byte-table.
+ */
+static int test_crc64_nvme_random(void)
+{
+        static const size_t lens[] = {
+                0, 1, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+                129, 255, 256, 257, 1023, 1024, 1025, 4096
+        };
+        uint8_t  buf[4096];
+        size_t   i;
+        uint64_t ref;
+        uint64_t got;
+
+        TEST_START();
+
+        if (random_buffer(buf, sizeof(buf)) < 0) {
+                printf("Failed to generate random data.\n");
+                goto fail;
+        }
+
+        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
+                ref = 0;
+                crc64_nvme_table(&ref, buf, lens[i]);
+
+                got = 0;
+                crc64_nvme(&got, buf, lens[i]);
+
+                if (ref == got)
+                        continue;
+
+                printf("Mismatch at len=%zu: table=0x%016lx disp=0x%016lx\n",
+                       lens[i], (unsigned long) ref, (unsigned long) got);
+                goto fail;
+        }
+
+        TEST_SUCCESS();
+        return TEST_RC_SUCCESS;
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+#endif /* closes after the function body, so no stray brace is left */
+
+int crc64_test(int     argc,
+               char ** argv)
+{
+        int rc;
+
+        (void) argc;
+        (void) argv;
+        rc = test_crc64_nvme_basic();
+#if defined(HAVE_PCLMUL) || defined(HAVE_PMULL)
+        /* The cross-check only exists when an accelerated backend is built. */
+        rc |= test_crc64_nvme_random();
+#endif
+        return rc;
+}
diff --git a/src/lib/crc/tests/crc8_test.c b/src/lib/crc/tests/crc8_test.c
new file mode 100644
index 00000000..f7bb33b8
--- /dev/null
+++ b/src/lib/crc/tests/crc8_test.c
@@ -0,0 +1,67 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Test of the CRC-8/AUTOSAR function
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#include "config.h"
+
+#include <ouroboros/crc8.h>
+
+#include <test/test.h>
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* reveng-catalog smoke vectors: empty input and the "123456789" check. */
+static int test_crc8_autosar_basic(void)
+{
+        uint8_t empty = 0;
+        uint8_t check = 0;
+
+        TEST_START();
+
+        /* Seed 0, no data: the entry and exit XORs cancel, leaving 0. */
+        crc8_autosar(&empty, "", 0);
+        if (empty != 0x00)
+                goto fail;
+
+        crc8_autosar(&check, "123456789", 9);
+        if (check != 0xdf)
+                goto fail;
+
+        TEST_SUCCESS();
+        return TEST_RC_SUCCESS;
+ fail:
+        TEST_FAIL();
+        return TEST_RC_FAIL;
+}
+
+int crc8_test(int     argc,
+              char ** argv)
+{
+        /*
+         * Suite entry point; the arguments are intentionally unused.
+         */
+        (void) argc;
+        (void) argv;
+
+        return test_crc8_autosar_basic();
+}
diff --git a/src/lib/dev.c b/src/lib/dev.c
index 9cfc24ee..ae0401b7 100644
--- a/src/lib/dev.c
+++ b/src/lib/dev.c
@@ -29,10 +29,13 @@
#include "config.h"
#include "ssm.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/bitmap.h>
#include <ouroboros/cep.h>
+#include <ouroboros/crc16.h>
#include <ouroboros/crypt.h>
#include <ouroboros/dev.h>
+#include <ouroboros/endian.h>
#include <ouroboros/errno.h>
#include <ouroboros/fccntl.h>
#include <ouroboros/flow.h>
@@ -45,32 +48,33 @@
#include <ouroboros/np1_flow.h>
#include <ouroboros/pthread.h>
#include <ouroboros/random.h>
+#ifdef PROC_FLOW_STATS
+#include <ouroboros/rib.h>
+#endif
#include <ouroboros/serdes-irm.h>
+#include <ouroboros/sockets.h>
#include <ouroboros/ssm_flow_set.h>
#include <ouroboros/ssm_pool.h>
#include <ouroboros/ssm_rbuff.h>
-#include <ouroboros/sockets.h>
+#include <ouroboros/tw.h>
#include <ouroboros/utils.h>
-#ifdef PROC_FLOW_STATS
-#include <ouroboros/rib.h>
-#endif
+#include <assert.h>
#ifdef HAVE_LIBGCRYPT
#include <gcrypt.h>
#endif
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
#include <stdarg.h>
#include <stdbool.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include <sys/types.h>
#ifndef CLOCK_REALTIME_COARSE
#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
#endif
-/* Partial read information. */
#define NO_PART -1
#define DONE_PART -2
@@ -78,19 +82,12 @@
#define SECMEMSZ 16384
#define MSGBUFSZ 2048
-/* map flow_ids to flow descriptors; track state of the flow */
struct fmap {
int fd;
- /* TODO: use actual flow state */
enum flow_state state;
};
-#define frcti_to_flow(frcti) \
- ((struct flow *)((uint8_t *) frcti - offsetof(struct flow, frcti)))
-
struct flow {
- struct list_head next;
-
struct flow_info info;
struct ssm_rbuff * rx_rb;
@@ -135,16 +132,10 @@ struct {
struct flow * flows;
struct fmap * id_to_fd;
- struct list_head flow_list;
pthread_mutex_t mtx;
pthread_cond_t cond;
- pthread_t tx;
- pthread_t rx;
- size_t n_frcti;
- fset_t * frct_set;
-
pthread_rwlock_t lock;
} proc;
@@ -243,7 +234,7 @@ static int proc_announce(const struct proc_info * proc)
return irm__irm_result_des(&msg);
}
-/* IRMd will clean up the mess if this fails */
+/* IRMd cleans up on failure. */
static void proc_exit(void)
{
uint8_t buf[SOCK_BUF_SIZE];
@@ -264,7 +255,7 @@ static int spb_encrypt(struct flow * flow,
uint8_t * tail;
if (flow->crypt == NULL)
- return 0; /* No encryption */
+ return 0;
in.data = ssm_pk_buff_head(spb);
in.len = ssm_pk_buff_len(spb);
@@ -272,11 +263,11 @@ static int spb_encrypt(struct flow * flow,
if (crypt_encrypt(flow->crypt, in, &out) < 0)
goto fail_encrypt;
- head = ssm_pk_buff_head_alloc(spb, flow->headsz);
+ head = ssm_pk_buff_push(spb, flow->headsz);
if (head == NULL)
goto fail_alloc;
- tail = ssm_pk_buff_tail_alloc(spb, flow->tailsz);
+ tail = ssm_pk_buff_push_tail(spb, flow->tailsz);
if (tail == NULL)
goto fail_alloc;
@@ -299,7 +290,7 @@ static int spb_decrypt(struct flow * flow,
uint8_t * head;
if (flow->crypt == NULL)
- return 0; /* No decryption */
+ return 0;
in.data = ssm_pk_buff_head(spb);
in.len = ssm_pk_buff_len(spb);
@@ -308,8 +299,8 @@ static int spb_decrypt(struct flow * flow,
return -ENOMEM;
- head = ssm_pk_buff_head_release(spb, flow->headsz) + flow->headsz;
- ssm_pk_buff_tail_release(spb, flow->tailsz);
+ head = ssm_pk_buff_pop(spb, flow->headsz) + flow->headsz;
+ ssm_pk_buff_pop_tail(spb, flow->tailsz);
memcpy(head, out.data, out.len);
@@ -318,130 +309,280 @@ static int spb_decrypt(struct flow * flow,
return 0;
}
-#include "frct.c"
-
-void * flow_tx(void * o)
+/* tw_move under proc.lock rdlock; gates teardown vs in-flight fires. */
+static void tw_move_safe(void)
{
- struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
-
- (void) o;
+ pthread_rwlock_rdlock(&proc.lock);
- while (true) {
- timerwheel_move();
+ pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
- nanosleep(&tic, NULL);
- }
+ tw_move();
- return (void *) 0;
+ pthread_cleanup_pop(1);
}
-static void flow_send_keepalive(struct flow * flow,
- struct timespec now)
+static int crc_add(struct ssm_pk_buff * spb,
+ size_t head_skip)
{
- struct ssm_pk_buff * spb;
- ssize_t idx;
- uint8_t * ptr;
-
- idx = ssm_pool_alloc(proc.pool, 0, &ptr, &spb);
- if (idx < 0)
- return;
+ uint8_t * head;
+ uint8_t * tail;
- pthread_rwlock_wrlock(&proc.lock);
+ tail = ssm_pk_buff_push_tail(spb, CRCLEN);
+ if (tail == NULL)
+ return -ENOMEM;
- flow->snd_act = now;
+ head = ssm_pk_buff_head(spb) + head_skip;
- if (ssm_rbuff_write(flow->tx_rb, idx))
- ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ mem_hash(HASH_CRC32, tail, head, tail - head);
- pthread_rwlock_unlock(&proc.lock);
+ return 0;
}
-/* Needs rdlock on proc. */
-static void _flow_keepalive(struct flow * flow)
+static int crc_check(struct ssm_pk_buff * spb,
+ size_t head_skip)
{
- struct timespec now;
- struct timespec s_act;
- struct timespec r_act;
- int flow_id;
- time_t timeo;
- uint32_t acl;
+ uint32_t crc;
+ uint8_t * head = ssm_pk_buff_head(spb) + head_skip;
+ uint8_t * tail = ssm_pk_buff_pop_tail(spb, CRCLEN);
+
+ mem_hash(HASH_CRC32, &crc, head, tail - head);
- s_act = flow->snd_act;
- r_act = flow->rcv_act;
+ return !(crc == *((uint32_t *) tail));
+}
- flow_id = flow->info.id;
- timeo = flow->info.qs.timeout;
+/* FRCT included here so it can use proc and dev.c statics directly. */
+#include "frct.c"
- acl = ssm_rbuff_get_acl(flow->rx_rb);
- if (timeo == 0 || acl & (ACL_FLOWPEER | ACL_FLOWDOWN))
- return;
+/*
+ * SACK / DATA carry trailer CRC32; HCS protects the headers on every
+ * FRCT packet. Decrypt before any check so plaintext is authoritative.
+ */
+static bool invalid_pkt(struct flow * flow,
+ struct ssm_pk_buff * spb)
+{
+ const struct frct_pci * pci;
+ uint16_t flags;
+ size_t pci_total;
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (spb == NULL || ssm_pk_buff_len(spb) == 0)
+ return true;
- if (ts_diff_ns(&now, &r_act) > (int64_t) timeo * MILLION) {
- ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWPEER);
- ssm_flow_set_notify(proc.fqset, flow_id, FLOW_PEER);
- return;
+ if (spb_decrypt(flow, spb) < 0)
+ return true;
+
+ if (flow->frcti == NULL) {
+ if (flow->info.qs.ber == 0 && crc_check(spb, 0) != 0)
+ return true;
+ return false;
}
- if (ts_diff_ns(&now, &s_act) > (int64_t) timeo * (MILLION >> 2)) {
- pthread_rwlock_unlock(&proc.lock);
+ if (ssm_pk_buff_len(spb) < FRCT_PCILEN)
+ return true;
- flow_send_keepalive(flow, now);
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ flags = ntoh16(pci->flags);
- pthread_rwlock_rdlock(&proc.lock);
+ /* Untrusted flag read; mismatch on HCS will drop on corrupt. */
+ if (flags & FRCT_DATA)
+ pci_total = frcti_data_hdr_len(flow->frcti);
+ else
+ pci_total = frcti_ctrl_hdr_len(flow->frcti);
+
+ if (ssm_pk_buff_len(spb) < pci_total)
+ return true;
+
+ if (frct_hcs_check(pci, flow->frcti) != 0)
+ return true;
+
+ /* HCS valid: CRC32 on SACK; or on DATA if ber = 0. */
+ if (flags & FRCT_SACK) {
+ if (crc_check(spb, pci_total) != 0)
+ return true;
+
+ } else if ((flags & FRCT_DATA) && flow->info.qs.ber == 0) {
+ if (crc_check(spb, pci_total) != 0)
+ return true;
}
+
+ return false;
}
-static void handle_keepalives(void)
+static bool deadline_passed(const struct timespec * abs)
{
- struct list_head * p;
- struct list_head * h;
+ struct timespec now;
- pthread_rwlock_rdlock(&proc.lock);
+ if (abs == NULL)
+ return false;
- list_for_each_safe(p, h, &proc.flow_list) {
- struct flow * flow;
- flow = list_entry(p, struct flow, next);
- _flow_keepalive(flow);
- }
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
- pthread_rwlock_unlock(&proc.lock);
+ return ts_diff_ns(&now, abs) >= 0;
}
-static void __cleanup_fqueue_destroy(void * fq)
+/*
+ * Compute the absolute time until which the next blocking rx wait may run.
+ *
+ * dl:  caller deadline, or NULL when the caller has none.
+ * out: receives the resulting absolute deadline.
+ *
+ * cap is now + TICTIME (now taken from PTHREAD_COND_CLOCK); expiry is
+ * whatever tw_next_expiry() reports. The value chosen from those two is
+ * then clamped so that it never lies beyond dl.
+ *
+ * NOTE(review): assuming ts_diff_ns(a, b) yields a - b (its use in
+ * deadline_passed suggests so), the ternary below selects the LATER of
+ * cap and expiry, while a min() over dl, expiry and cap would select the
+ * earlier one. Confirm which of the two is intended.
+ */
+static void compute_wait_deadline(const struct timespec * dl,
+ struct timespec * out)
{
- fqueue_destroy((fqueue_t *) fq);
+ struct timespec now;
+ struct timespec cap;
+ struct timespec expiry;
+ struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &tic, &cap);
+
+ tw_next_expiry(&expiry);
+
+ /* expiry when cap lies before it, cap otherwise. */
+ *out = (ts_diff_ns(&cap, &expiry) < 0) ? expiry : cap;
+ /* Never wait past the caller deadline, if there is one. */
+ if (dl != NULL && ts_diff_ns(out, dl) > 0)
+ *out = *dl;
}
-void * flow_rx(void * o)
+/*
+ * proc.lock rdlock held across each iteration so flow_fini's wrlock
+ * waits for us to finish; FLOWDOWN already set means we exit promptly.
+ */
+static void flow_drain_rx_nb(struct flow * flow)
{
- struct timespec tic = TIMESPEC_INIT_NS(TICTIME);
- int ret;
- struct fqueue * fq;
+ ssize_t idx;
+ struct ssm_pk_buff * spb;
+ struct ssm_rbuff * rx_rb;
+ struct frcti * frcti;
+#ifdef PROC_FLOW_STATS
+ struct timespec t_a;
+ struct timespec t_b;
+#endif
+
+ if (flow->frcti != NULL)
+ STAT_BUMP(flow->frcti, drain_calls);
- (void) o;
+ while (true) {
+ pthread_rwlock_rdlock(&proc.lock);
- fq = fqueue_create();
+ rx_rb = flow->rx_rb;
+ if (rx_rb == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
- pthread_cleanup_push(__cleanup_fqueue_destroy, fq);
+ idx = ssm_rbuff_read(rx_rb);
+ if (idx < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ return;
+ }
- /* fevent will filter all FRCT packets for us */
- while ((ret = fevent(proc.frct_set, fq, &tic)) != 0) {
- if (ret == -ETIMEDOUT) {
- handle_keepalives();
+ spb = ssm_pool_get(proc.pool, idx);
+ if (invalid_pkt(flow, spb)) {
+ ssm_pool_remove(proc.pool, idx);
+ pthread_rwlock_unlock(&proc.lock);
continue;
}
- while (fqueue_next(fq) >= 0)
- ; /* no need to act */
+ frcti = flow->frcti;
+ if (frcti != NULL) {
+#ifdef PROC_FLOW_STATS
+ clock_gettime(CLOCK_MONOTONIC, &t_a);
+ FRCTI_RCV(frcti, spb);
+ clock_gettime(CLOCK_MONOTONIC, &t_b);
+ STAT_ADD(frcti, rcv_proc_ns,
+ (size_t) ts_diff_ns(&t_b, &t_a));
+#else
+ FRCTI_RCV(frcti, spb);
+#endif
+ } else {
+ ssm_pool_remove(proc.pool, idx);
+ }
+
+ pthread_rwlock_unlock(&proc.lock);
+
+ /* Per-packet so the delayed-ACK fires on time in a burst. */
+#ifdef PROC_FLOW_STATS
+ clock_gettime(CLOCK_MONOTONIC, &t_a);
+ tw_move_safe();
+ clock_gettime(CLOCK_MONOTONIC, &t_b);
+ if (frcti != NULL)
+ STAT_ADD(frcti, tw_move_ns,
+ (size_t) ts_diff_ns(&t_b, &t_a));
+#else
+ tw_move_safe();
+#endif
}
+}
- pthread_cleanup_pop(true);
+/*
+ * Block for one packet on the flow's rx ring buffer and process it.
+ *
+ * Every wait runs until the time computed by compute_wait_deadline(abs).
+ * A wait that times out before abs has passed means timer work may be
+ * due rather than the caller deadline: tw_move_safe() runs and the wait
+ * is retried. Packets rejected by invalid_pkt() are removed from the
+ * pool and the wait is retried as well.
+ *
+ * abs: caller deadline; with NULL, deadline_passed() never reports a
+ * timeout, so only a packet or an error ends the loop.
+ *
+ * Returns 0 after one valid packet was passed to FRCTI_RCV (or removed
+ * from the pool when the flow has no frcti), -EFLOWDOWN when rx_rb is
+ * NULL, -ETIMEDOUT once abs has passed, or any other negative value
+ * returned by ssm_rbuff_read_b.
+ */
+static int flow_rx_one(struct flow * flow,
+ struct timespec * abs)
+{
+ struct timespec wait_abs;
+ struct ssm_pk_buff * spb;
+ struct ssm_rbuff * rx_rb;
+ ssize_t idx;
+
+ while (true) {
+ compute_wait_deadline(abs, &wait_abs);
+
+ /* rdlock gates flow_fini; FLOWDOWN preempts the block. */
+ pthread_rwlock_rdlock(&proc.lock);
+
+ rx_rb = flow->rx_rb;
+ if (rx_rb == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return -EFLOWDOWN;
+ }
+
+ idx = ssm_rbuff_read_b(rx_rb, &wait_abs);
+ if (idx == -ETIMEDOUT) {
+ pthread_rwlock_unlock(&proc.lock);
+ /* Only the caller deadline ends the call. */
+ if (deadline_passed(abs))
+ return -ETIMEDOUT;
+ tw_move_safe();
+ continue;
+ }
+ if (idx < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ return idx;
+ }
+
+ spb = ssm_pool_get(proc.pool, idx);
+ /* Drop packets that fail validation and wait again. */
+ if (invalid_pkt(flow, spb)) {
+ ssm_pool_remove(proc.pool, idx);
+ pthread_rwlock_unlock(&proc.lock);
+ continue;
+ }
+
+ if (flow->frcti != NULL)
+ FRCTI_RCV(flow->frcti, spb);
+ else
+ ssm_pool_remove(proc.pool, idx);
+
+ pthread_rwlock_unlock(&proc.lock);
- return (void *) 0;
+ /* tw_move_safe takes the rdlock itself; call it unlocked. */
+ tw_move_safe();
+ return 0;
+ }
+}
+
+/*
+ * Pump the rx path until FRCTI_IS_WINDOW_OPEN_N(flow->frcti, n) holds.
+ *
+ * flow:  flow to drain and test.
+ * n:     count handed to FRCTI_IS_WINDOW_OPEN_N (presumably the number
+ *        of packets the caller wants to send - confirm in frct.c).
+ * block: when false, give up after one non-blocking drain.
+ * dl:    caller deadline forwarded to flow_rx_one, may be NULL.
+ *
+ * Returns 0 when the window test succeeds, -EAGAIN when !block and the
+ * test fails, otherwise the negative return code of flow_rx_one.
+ */
+static __inline__ int flow_wait_window(struct flow * flow,
+ size_t n,
+ bool block,
+ struct timespec * dl)
+{
+ int rc;
+
+ while (true) {
+ /* Process whatever is already queued before testing. */
+ flow_drain_rx_nb(flow);
+ if (FRCTI_IS_WINDOW_OPEN_N(flow->frcti, n))
+ return 0;
+ if (!block)
+ return -EAGAIN;
+ /* Wait for one more packet, then test again. */
+ rc = flow_rx_one(flow, dl);
+ if (rc < 0)
+ return rc;
+ }
}
static void flow_clear(int fd)
@@ -451,36 +592,40 @@ static void flow_clear(int fd)
proc.flows[fd].info.id = -1;
}
-static void __flow_fini(int fd)
+/*
+ * Set ACL_FLOWDOWN on rx/tx so any in-flight blocking reads or writes
+ * wake up and drop their proc.lock rdlock. Must run BEFORE flow_fini's
+ * wrlock, else the wrlock blocks on those rdlock holders and the
+ * in-flight calls never see the FLOWDOWN signal.
+ */
+static void flow_quiesce(int fd)
{
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ struct ssm_rbuff * rx_rb = proc.flows[fd].rx_rb;
+ struct ssm_rbuff * tx_rb = proc.flows[fd].tx_rb;
- if (proc.flows[fd].frcti != NULL) {
- proc.n_frcti--;
- if (proc.n_frcti == 0) {
- pthread_cancel(proc.tx);
- pthread_join(proc.tx, NULL);
- }
+ if (rx_rb != NULL)
+ ssm_rbuff_set_acl(rx_rb, ACL_FLOWDOWN);
+ if (tx_rb != NULL)
+ ssm_rbuff_set_acl(tx_rb, ACL_FLOWDOWN);
+}
- ssm_flow_set_del(proc.fqset, 0, proc.flows[fd].info.id);
+static void do_flow_fini(int fd)
+{
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
+ if (proc.flows[fd].frcti != NULL)
frcti_destroy(proc.flows[fd].frcti);
- }
if (proc.flows[fd].info.id != -1) {
flow_destroy(&proc.id_to_fd[proc.flows[fd].info.id]);
bmp_release(proc.fds, fd);
}
- if (proc.flows[fd].rx_rb != NULL) {
- ssm_rbuff_set_acl(proc.flows[fd].rx_rb, ACL_FLOWDOWN);
+ if (proc.flows[fd].rx_rb != NULL)
ssm_rbuff_close(proc.flows[fd].rx_rb);
- }
- if (proc.flows[fd].tx_rb != NULL) {
- ssm_rbuff_set_acl(proc.flows[fd].tx_rb, ACL_FLOWDOWN);
+ if (proc.flows[fd].tx_rb != NULL)
ssm_rbuff_close(proc.flows[fd].tx_rb);
- }
if (proc.flows[fd].set != NULL) {
ssm_flow_set_notify(proc.flows[fd].set,
@@ -491,24 +636,40 @@ static void __flow_fini(int fd)
crypt_destroy_ctx(proc.flows[fd].crypt);
- list_del(&proc.flows[fd].next);
-
flow_clear(fd);
}
static void flow_fini(int fd)
{
+ flow_quiesce(fd);
+
pthread_rwlock_wrlock(&proc.lock);
- __flow_fini(fd);
+ do_flow_fini(fd);
pthread_rwlock_unlock(&proc.lock);
}
#define IS_ENCRYPTED(crypt) ((crypt)->nid != NID_undef)
-#define IS_ORDERED(flow) (flow.qs.in_order != 0)
+#define IS_ORDERED(info) ((info)->qs.service != SVC_RAW)
+#define IS_STREAM(info) ((info)->qs.service == SVC_STREAM)
+
+/*
+ * Payload room left in a frame of raw bytes after subtracting the
+ * per-packet overhead: headsz + tailsz, plus CRCLEN when the flow has
+ * ber == 0 and no crypt context. Returns 0 when the overhead does not
+ * fit in raw.
+ *
+ * NOTE(review): flow_tx_spb appends the CRC whenever ber == 0, also on
+ * encrypted flows, while CRCLEN is only reserved here when crypt is
+ * NULL - confirm the two agree.
+ */
+static __inline__ size_t flow_user_mtu(const struct flow * flow,
+                                       size_t              raw)
+{
+	size_t overhead = flow->headsz + flow->tailsz;
+
+	if (flow->crypt == NULL && flow->info.qs.ber == 0)
+		overhead += CRCLEN;
+
+	if (raw <= overhead)
+		return 0;
+
+	return raw - overhead;
+}
+
static int flow_init(struct flow_info * info,
- struct crypt_sk * sk)
+ struct crypt_sk * sk,
+ time_t rtt_hint)
{
struct timespec now;
struct flow * flow;
@@ -550,7 +711,6 @@ static int flow_init(struct flow_info * info,
flow->tailsz = 0;
if (IS_ENCRYPTED(sk)) {
- /* Set to lower value in tests, should we make configurable? */
sk->rot_bit = KEY_ROTATION_BIT;
flow->crypt = crypt_create_ctx(sk);
if (flow->crypt == NULL)
@@ -561,22 +721,16 @@ static int flow_init(struct flow_info * info,
assert(flow->frcti == NULL);
- if (IS_ORDERED(flow->info)) {
- flow->frcti = frcti_create(fd, DELT_A, DELT_R, info->mpl);
+ if (IS_ORDERED(&flow->info)) {
+ uint32_t frct_mtu = flow_user_mtu(flow, info->mtu);
+
+ flow->frcti = frcti_create(fd, DELT_A, DELT_R,
+ info->mpl, rtt_hint,
+ info->qs, frct_mtu);
if (flow->frcti == NULL)
goto fail_frcti;
-
- if (ssm_flow_set_add(proc.fqset, 0, info->id))
- goto fail_flow_set_add;
-
- ++proc.n_frcti;
- if (proc.n_frcti == 1 &&
- pthread_create(&proc.tx, NULL, flow_tx, NULL) < 0)
- goto fail_tx_thread;
}
- list_add_tail(&flow->next, &proc.flow_list);
-
proc.id_to_fd[info->id].fd = fd;
flow_set_state(&proc.id_to_fd[info->id], FLOW_ALLOCATED);
@@ -585,10 +739,6 @@ static int flow_init(struct flow_info * info,
return fd;
- fail_tx_thread:
- ssm_flow_set_del(proc.fqset, 0, info->id);
- fail_flow_set_add:
- frcti_destroy(flow->frcti);
fail_frcti:
crypt_destroy_ctx(flow->crypt);
fail_crypt:
@@ -655,13 +805,13 @@ static void init(int argc,
gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);
}
#endif
- proc.fds = bmp_create(PROG_MAX_FLOWS - PROG_RES_FDS, PROG_RES_FDS);
+ proc.fds = bmp_create(PROC_MAX_FLOWS - PROC_RES_FDS, PROC_RES_FDS);
if (proc.fds == NULL) {
fprintf(stderr, "FATAL: Could not create fd bitmap.\n");
goto fail_fds;
}
- proc.fqueues = bmp_create(PROG_MAX_FQUEUES, 0);
+ proc.fqueues = bmp_create(PROC_MAX_FQUEUES, 0);
if (proc.fqueues == NULL) {
fprintf(stderr, "FATAL: Could not create fqueue bitmap.\n");
goto fail_fqueues;
@@ -677,13 +827,13 @@ static void init(int argc,
goto fail_rdrb;
}
- proc.flows = malloc(sizeof(*proc.flows) * PROG_MAX_FLOWS);
+ proc.flows = malloc(sizeof(*proc.flows) * PROC_MAX_FLOWS);
if (proc.flows == NULL) {
fprintf(stderr, "FATAL: Could not malloc flows.\n");
goto fail_flows;
}
- for (i = 0; i < PROG_MAX_FLOWS; ++i)
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
flow_clear(i);
proc.id_to_fd = malloc(sizeof(*proc.id_to_fd) * SYS_MAX_FLOWS);
@@ -716,20 +866,14 @@ static void init(int argc,
goto fail_fqset;
}
- proc.frct_set = fset_create();
- if (proc.frct_set == NULL || proc.frct_set->idx != 0) {
- fprintf(stderr, "FATAL: Could not create FRCT set.\n");
- goto fail_frct_set;
- }
-
- if (timerwheel_init() < 0) {
+ if (tw_init() < 0) {
fprintf(stderr, "FATAL: Could not initialize timerwheel.\n");
goto fail_timerwheel;
}
if (crypt_secure_malloc_init(PROC_SECMEM_MAX) < 0) {
fprintf(stderr, "FATAL: Could not init secure malloc.\n");
- goto fail_timerwheel;
+ goto fail_secmem;
}
#if defined PROC_FLOW_STATS
@@ -741,24 +885,15 @@ static void init(int argc,
}
}
#endif
- if (pthread_create(&proc.rx, NULL, flow_rx, NULL) < 0) {
- fprintf(stderr, "FATAL: Could not start monitor thread.\n");
- goto fail_monitor;
- }
-
- list_head_init(&proc.flow_list);
-
return;
- fail_monitor:
#if defined PROC_FLOW_STATS
- rib_fini();
fail_rib_init:
+ crypt_secure_malloc_fini();
#endif
- timerwheel_fini();
+ fail_secmem:
+ tw_fini();
fail_timerwheel:
- fset_destroy(proc.frct_set);
- fail_frct_set:
ssm_flow_set_close(proc.fqset);
fail_fqset:
pthread_rwlock_destroy(&proc.lock);
@@ -789,19 +924,20 @@ static void fini(void)
if (proc.fds == NULL)
return;
- pthread_cancel(proc.rx);
- pthread_join(proc.rx, NULL);
+ /* Wake all in-flight readers/writers BEFORE wrlock acquire. */
+ for (i = 0; i < PROC_MAX_FLOWS; ++i)
+ if (proc.flows[i].info.id != -1)
+ flow_quiesce(i);
pthread_rwlock_wrlock(&proc.lock);
- for (i = 0; i < PROG_MAX_FLOWS; ++i) {
+ for (i = 0; i < PROC_MAX_FLOWS; ++i) {
struct flow * flow = &proc.flows[i];
if (flow->info.id != -1) {
ssize_t idx;
- ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWDOWN);
while ((idx = ssm_rbuff_read(flow->rx_rb)) >= 0)
ssm_pool_remove(proc.pool, idx);
- __flow_fini(i);
+ do_flow_fini(i);
}
}
@@ -813,9 +949,9 @@ static void fini(void)
#ifdef PROC_FLOW_STATS
rib_fini();
#endif
- timerwheel_fini();
+ crypt_secure_malloc_fini();
- fset_destroy(proc.frct_set);
+ tw_fini();
ssm_flow_set_close(proc.fqset);
@@ -860,6 +996,10 @@ int flow_accept(qosspec_t * qs,
if (qs != NULL)
qs->ber = 1;
#endif
+ /* STREAM cannot tolerate loss: drops create silent gaps. */
+ if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0)
+ return -EINVAL;
+
memset(&flow, 0, sizeof(flow));
flow.n_pid = getpid();
@@ -878,7 +1018,8 @@ int flow_accept(qosspec_t * qs,
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ /* No RTT in accept; rtt_hint=0 bootstraps from first ACK. */
+ fd = flow_init(&flow, &crypt, 0);
crypt_secure_clear(key, SYMMKEYSZ);
@@ -899,11 +1040,16 @@ int flow_alloc(const char * dst,
uint8_t key[SYMMKEYSZ];
int fd;
int err;
+ struct timespec t0;
+ struct timespec t1;
#ifdef QOS_DISABLE_CRC
if (qs != NULL)
qs->ber = 1;
#endif
+ /* STREAM cannot tolerate loss: drops create silent gaps. */
+ if (qs != NULL && qs->service == SVC_STREAM && qs->loss != 0)
+ return -EINVAL;
memset(&flow, 0, sizeof(flow));
@@ -913,11 +1059,13 @@ int flow_alloc(const char * dst,
if (flow_alloc__irm_req_ser(&msg, &flow, dst, timeo))
return -ENOMEM;
+ clock_gettime(PTHREAD_COND_CLOCK, &t0);
+
err = send_recv_msg(&msg);
- if (err < 0) {
- printf("send_recv_msg error %d\n", err);
+ if (err < 0)
return err;
- }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &t1);
crypt.key = key;
@@ -925,7 +1073,7 @@ int flow_alloc(const char * dst,
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ fd = flow_init(&flow, &crypt, ts_diff_ns(&t1, &t0));
crypt_secure_clear(key, SYMMKEYSZ);
@@ -964,7 +1112,7 @@ int flow_join(const char * dst,
if (err < 0)
return err;
- fd = flow_init(&flow, &crypt);
+ fd = flow_init(&flow, &crypt, 0);
crypt_secure_clear(key, SYMMKEYSZ);
@@ -983,10 +1131,10 @@ int flow_dealloc(int fd)
struct flow * flow;
int err;
- if (fd < 0 || fd >= SYS_MAX_FLOWS )
+ if (fd < 0 || fd >= PROC_MAX_FLOWS )
return -EINVAL;
- memset(&info, 0, sizeof(flow));
+ memset(&info, 0, sizeof(info));
flow = &proc.flows[fd];
@@ -1008,9 +1156,8 @@ int flow_dealloc(int fd)
pthread_rwlock_rdlock(&proc.lock);
- timeo.tv_sec = frcti_dealloc(flow->frcti);
- while (timeo.tv_sec < 0) { /* keep the flow active for rtx */
- ssize_t ret;
+ while (FRCTI_LINGERING(flow->frcti)) {
+ ssize_t ret;
pthread_rwlock_unlock(&proc.lock);
@@ -1018,12 +1165,12 @@ int flow_dealloc(int fd)
pthread_rwlock_rdlock(&proc.lock);
- timeo.tv_sec = frcti_dealloc(flow->frcti);
-
- if (ret == -EFLOWDOWN && timeo.tv_sec < 0)
- timeo.tv_sec = -timeo.tv_sec;
+ if (ret == -EFLOWDOWN)
+ break;
}
+ timeo.tv_sec = FRCTI_DEALLOC(flow->frcti);
+
pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
ssm_rbuff_fini(flow->tx_rb);
@@ -1033,15 +1180,18 @@ int flow_dealloc(int fd)
info.id = flow->info.id;
info.n_pid = getpid();
- if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0)
- return -ENOMEM;
+ if (flow_dealloc__irm_req_ser(&msg, &info, &timeo) < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
err = send_recv_msg(&msg);
if (err < 0)
- return err;
+ goto out;
err = irm__irm_result_des(&msg);
+ out:
flow_fini(fd);
return err;
@@ -1055,12 +1205,12 @@ int ipcp_flow_dealloc(int fd)
struct flow * flow;
int err;
- if (fd < 0 || fd >= SYS_MAX_FLOWS )
+ if (fd < 0 || fd >= PROC_MAX_FLOWS )
return -EINVAL;
flow = &proc.flows[fd];
- memset(&info, 0, sizeof(flow));
+ memset(&info, 0, sizeof(info));
pthread_rwlock_rdlock(&proc.lock);
@@ -1074,15 +1224,18 @@ int ipcp_flow_dealloc(int fd)
pthread_rwlock_unlock(&proc.lock);
- if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0)
- return -ENOMEM;
+ if (ipcp_flow_dealloc__irm_req_ser(&msg, &info) < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
err = send_recv_msg(&msg);
if (err < 0)
- return err;
+ goto out;
err = irm__irm_result_des(&msg);
+ out:
flow_fini(fd);
return err;
@@ -1102,8 +1255,18 @@ int fccntl(int fd,
uint32_t tx_acl;
size_t * qlen;
struct flow * flow;
-
- if (fd < 0 || fd >= SYS_MAX_FLOWS)
+ uint16_t old_acc;
+ uint16_t new_acc;
+ size_t max;
+ size_t * maxp;
+ size_t rsz;
+ size_t * rszp;
+ time_t rto;
+ time_t * rtop;
+ int rc;
+ bool emit_eos = false;
+
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
@@ -1167,14 +1330,27 @@ int fccntl(int fd,
qlen = va_arg(l, size_t *);
*qlen = ssm_rbuff_queued(flow->tx_rb);
break;
+ case FLOWGMTU:
+ maxp = va_arg(l, size_t *);
+ if (maxp == NULL)
+ goto einval;
+ *maxp = flow_user_mtu(flow, flow->info.mtu);
+ break;
case FLOWSFLAGS:
+ old_acc = flow->oflags & FLOWFACCMODE;
flow->oflags = va_arg(l, uint32_t);
+ new_acc = flow->oflags & FLOWFACCMODE;
+
+ /* Defer EOS emit until after proc.lock is dropped: */
+ /* frcti_fin_snd may block on shm-pool/tx-rb. */
+ if (new_acc == FLOWFRDONLY
+ && old_acc != FLOWFRDONLY
+ && flow->frcti != NULL)
+ emit_eos = true;
+
rx_acl = ssm_rbuff_get_acl(flow->rx_rb);
- tx_acl = ssm_rbuff_get_acl(flow->rx_rb);
- /*
- * Making our own flow write only means making the
- * the other side of the flow read only.
- */
+ tx_acl = ssm_rbuff_get_acl(flow->tx_rb);
+ /* Our flow write-only -> peer's read-only. */
if (flow->oflags & FLOWFWRONLY)
rx_acl |= ACL_RDONLY;
if (flow->oflags & FLOWFRDWR)
@@ -1218,6 +1394,59 @@ int fccntl(int fd,
goto eperm;
*cflags = frcti_getflags(flow->frcti);
break;
+ case FRCTSMAXSDU:
+ max = va_arg(l, size_t);
+ if (flow->frcti == NULL)
+ goto eperm;
+ if (frcti_set_max_rcv_sdu(flow->frcti, max) < 0)
+ goto einval;
+ break;
+ case FRCTGMAXSDU:
+ maxp = va_arg(l, size_t *);
+ if (maxp == NULL)
+ goto einval;
+ if (flow->frcti == NULL)
+ goto eperm;
+ *maxp = frcti_get_max_rcv_sdu(flow->frcti);
+ break;
+ case FRCTSRRINGSZ:
+ rsz = va_arg(l, size_t);
+ if (flow->frcti == NULL)
+ goto eperm;
+ rc = frcti_set_rcv_ring_sz(flow->frcti, rsz);
+ if (rc < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ va_end(l);
+ return rc;
+ }
+ break;
+ case FRCTGRRINGSZ:
+ rszp = va_arg(l, size_t *);
+ if (rszp == NULL)
+ goto einval;
+ if (flow->frcti == NULL)
+ goto eperm;
+ *rszp = frcti_get_rcv_ring_sz(flow->frcti);
+ break;
+ case FRCTSRTOMIN:
+ if (flow->frcti == NULL)
+ goto eperm;
+ rto = va_arg(l, time_t);
+ rc = frcti_set_rto_min(flow->frcti, rto);
+ if (rc < 0) {
+ pthread_rwlock_unlock(&proc.lock);
+ va_end(l);
+ return rc;
+ }
+ break;
+ case FRCTGRTOMIN:
+ if (flow->frcti == NULL)
+ goto eperm;
+ rtop = va_arg(l, time_t *);
+ if (rtop == NULL)
+ goto einval;
+ *rtop = frcti_get_rto_min(flow->frcti);
+ break;
default:
pthread_rwlock_unlock(&proc.lock);
va_end(l);
@@ -1227,6 +1456,9 @@ int fccntl(int fd,
pthread_rwlock_unlock(&proc.lock);
+ if (emit_eos)
+ frcti_fin_snd(flow->frcti);
+
va_end(l);
return 0;
@@ -1241,86 +1473,195 @@ int fccntl(int fd,
return -EPERM;
}
-static int chk_crc(struct ssm_pk_buff * spb)
-{
- uint32_t crc;
- uint8_t * head = ssm_pk_buff_head(spb);
- uint8_t * tail = ssm_pk_buff_tail_release(spb, CRCLEN);
-
- mem_hash(HASH_CRC32, &crc, head, tail - head);
-
- return !(crc == *((uint32_t *) tail));
-}
-
-static int add_crc(struct ssm_pk_buff * spb)
-{
- uint8_t * head;
- uint8_t * tail;
-
- tail = ssm_pk_buff_tail_alloc(spb, CRCLEN);
- if (tail == NULL)
- return -ENOMEM;
-
- head = ssm_pk_buff_head(spb);
- mem_hash(HASH_CRC32, tail, head, tail - head);
-
- return 0;
-}
-
static int flow_tx_spb(struct flow * flow,
struct ssm_pk_buff * spb,
+ uint16_t flags,
bool block,
struct timespec * abstime)
{
struct timespec now;
ssize_t idx;
+ size_t pci_total;
int ret;
clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_wrlock(&proc.lock);
-
flow->snd_act = now;
- pthread_rwlock_unlock(&proc.lock);
-
- idx = ssm_pk_buff_get_idx(spb);
-
- pthread_rwlock_rdlock(&proc.lock);
+ idx = ssm_pk_buff_get_off(spb);
if (ssm_pk_buff_len(spb) > 0) {
- if (frcti_snd(flow->frcti, spb) < 0)
+ if (FRCTI_SND(flow->frcti, spb, flags) < 0)
goto enomem;
- if (spb_encrypt(flow, spb) < 0)
- goto enomem;
+ if (flow->info.qs.ber == 0) {
+ pci_total = flow->frcti != NULL
+ ? frcti_data_hdr_len(flow->frcti) : 0;
+ if (crc_add(spb, pci_total) != 0)
+ goto enomem;
+ }
- if (flow->info.qs.ber == 0 && add_crc(spb) != 0)
+ if (spb_encrypt(flow, spb) < 0)
goto enomem;
}
- pthread_cleanup_push(__cleanup_rwlock_unlock, &proc.lock);
-
if (!block)
ret = ssm_rbuff_write(flow->tx_rb, idx);
else
ret = ssm_rbuff_write_b(flow->tx_rb, idx, abstime);
- if (ret < 0)
+ if (ret < 0) {
ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
-
- pthread_cleanup_pop(true);
+ return ret;
+ }
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
return 0;
-enomem:
- pthread_rwlock_unlock(&proc.lock);
+ enomem:
ssm_pool_remove(proc.pool, idx);
return -ENOMEM;
}
+/*
+ * Fragment role flag for fragment i (0-based) of an SDU split into n
+ * fragments: SOLE when n == 1, FIRST for i == 0, LAST for i == n - 1,
+ * MID for everything in between.
+ */
+static __inline__ uint16_t flow_frag_role(size_t i, size_t n)
+{
+	uint16_t role = FRCT_FR_MID;
+
+	if (n == 1)
+		role = FRCT_FR_SOLE;
+	else if (i == 0)
+		role = FRCT_FR_FIRST;
+	else if (i + 1 == n)
+		role = FRCT_FR_LAST;
+
+	return role;
+}
+
+/*
+ * Stream-mode write: send buf as consecutive chunks of at most
+ * FRCTI_PAYLOAD_CAP(flow->frcti) bytes. Every chunk waits for one free
+ * window slot, is copied into a freshly allocated pool buffer and goes
+ * through flow_tx_spb() with flags 0 (no fragment role bits). Per the
+ * original note, frcti_snd adds the [start,end) stream extension.
+ *
+ * flow:   flow to write on; the caller only gets here with an frcti.
+ * buf:    source bytes.
+ * count:  number of bytes in buf.
+ * oflags: flow flags; FLOWFWNOBLOCK selects the non-blocking calls.
+ * dl:     absolute deadline for the blocking calls, may be NULL.
+ *
+ * Returns count when everything was queued. When a step fails after
+ * some chunks were queued, the number of bytes queued so far is
+ * returned (short write); when it fails before any chunk was queued,
+ * the negative error of that step is returned. Returns -EMSGSIZE when
+ * FRCTI_IS_FRTX is false or the payload cap is zero.
+ */
+static ssize_t flow_write_stream(struct flow *     flow,
+                                 const void *      buf,
+                                 size_t            count,
+                                 int               oflags,
+                                 struct timespec * dl)
+{
+	const uint8_t * src = buf;
+	size_t          payload;
+	size_t          off = 0;
+	bool            block = !(oflags & FLOWFWNOBLOCK);
+
+	if (!FRCTI_IS_FRTX(flow->frcti))
+		return -EMSGSIZE;
+
+	payload = FRCTI_PAYLOAD_CAP(flow->frcti);
+
+	/* A zero cap would queue empty chunks forever: off never moves. */
+	if (payload == 0)
+		return -EMSGSIZE;
+
+	while (off < count) {
+		struct ssm_pk_buff * spb;
+		uint8_t *            ptr;
+		ssize_t              idx;
+		size_t               clen;
+		int                  ret;
+
+		ret = flow_wait_window(flow, 1, block, dl);
+		if (ret < 0)
+			return off > 0 ? (ssize_t) off : (ssize_t) ret;
+
+		clen = MIN(count - off, payload);
+
+		if (block)
+			idx = ssm_pool_alloc_b(proc.pool, clen, &ptr,
+					       &spb, dl);
+		else
+			idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb);
+		if (idx < 0)
+			return off > 0 ? (ssize_t) off : idx;
+
+		memcpy(ptr, src + off, clen);
+
+		/* flow_tx_spb removes the buffer from the pool on failure. */
+		ret = flow_tx_spb(flow, spb, 0, block, dl);
+		if (ret < 0)
+			return off > 0 ? (ssize_t) off : (ssize_t) ret;
+
+		off += clen;
+	}
+
+	return (ssize_t) count;
+}
+
+/*
+ * Fragmenting write: split one SDU of count bytes into
+ * n = ceil(count / FRCTI_PAYLOAD_CAP) fragments and send each through
+ * flow_tx_spb() with its role flag from flow_frag_role(). The send
+ * window is awaited once for all n fragments before the first one is
+ * sent, so the SDU is not stalled halfway by flow control.
+ *
+ * flow:   flow to write on.
+ * buf:    source bytes.
+ * count:  SDU length in bytes.
+ * oflags: flow flags; FLOWFWNOBLOCK selects the non-blocking calls.
+ * dl:     absolute deadline for the blocking calls, may be NULL.
+ *
+ * Returns count when every fragment was queued. When allocation or
+ * transmission fails after some fragments were queued, the number of
+ * bytes queued so far is returned; when it fails before that, the
+ * negative error is returned. Returns -EMSGSIZE for flows without
+ * frcti, for a zero payload cap, when the fragment count would
+ * overflow, or when n exceeds RQ_SIZE.
+ */
+static ssize_t flow_write_frag(struct flow *     flow,
+                               const void *      buf,
+                               size_t            count,
+                               int               oflags,
+                               struct timespec * dl)
+{
+	const uint8_t * src = buf;
+	size_t          frag_payload;
+	size_t          n;
+	size_t          off = 0;
+	size_t          i;
+	int             ret;
+	bool            block = !(oflags & FLOWFWNOBLOCK);
+
+	/* Raw flows carry no PCI; cannot fragment. */
+	if (flow->frcti == NULL)
+		return -EMSGSIZE;
+
+	frag_payload = FRCTI_PAYLOAD_CAP(flow->frcti);
+
+	/*
+	 * No room for payload: nothing can be fragmented. This also keeps
+	 * the overflow guard from wrapping to 0 and the ceil-divide from
+	 * dividing by zero.
+	 */
+	if (frag_payload == 0)
+		return -EMSGSIZE;
+
+	/* Guard the ceil-divide against size_t overflow. */
+	if (count > SIZE_MAX - frag_payload + 1)
+		return -EMSGSIZE;
+	n = (count + frag_payload - 1) / frag_payload;
+
+	/* SDU larger than the FC window can ever offer would deadlock. */
+	if (n > RQ_SIZE)
+		return -EMSGSIZE;
+
+	/* SDU-atomic FC: wait for n seqnos to avoid overshoot mid-SDU. */
+	ret = flow_wait_window(flow, n, block, dl);
+	if (ret < 0)
+		return (ssize_t) ret;
+
+	STAT_BUMP(flow->frcti, sdu_snd_frag);
+
+	for (i = 0; i < n; ++i) {
+		struct ssm_pk_buff * spb;
+		uint8_t *            ptr;
+		ssize_t              idx;
+		size_t               clen;
+
+		/* Only the last fragment may be shorter than the cap. */
+		clen = (i + 1 == n) ? (count - off) : frag_payload;
+
+		if (block)
+			idx = ssm_pool_alloc_b(proc.pool, clen, &ptr,
+					       &spb, dl);
+		else
+			idx = ssm_pool_alloc(proc.pool, clen, &ptr, &spb);
+		if (idx < 0) {
+			if (off > 0)
+				STAT_BUMP(flow->frcti, sdu_snd_alloc);
+			return off > 0 ? (ssize_t) off : idx;
+		}
+
+		memcpy(ptr, src + off, clen);
+
+		ret = flow_tx_spb(flow, spb, flow_frag_role(i, n),
+				  block, dl);
+		if (ret < 0) {
+			if (off > 0)
+				STAT_BUMP(flow->frcti, sdu_snd_tx);
+			return off > 0 ? (ssize_t) off : (ssize_t) ret;
+		}
+
+		off += clen;
+	}
+
+	return (ssize_t) count;
+}
+
ssize_t flow_write(int fd,
const void * buf,
size_t count)
@@ -1330,76 +1671,75 @@ ssize_t flow_write(int fd,
int ret;
int flags;
struct timespec abs;
- struct timespec * abstime = NULL;
+ struct timespec now;
+ struct timespec * dl = NULL;
struct ssm_pk_buff * spb;
uint8_t * ptr;
if (buf == NULL && count != 0)
return -EINVAL;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
- clock_gettime(PTHREAD_COND_CLOCK, &abs);
-
- pthread_rwlock_wrlock(&proc.lock);
+ pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
return -ENOTALLOC;
}
+ flags = flow->oflags;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
if (flow->snd_timesout) {
- ts_add(&abs, &flow->snd_timeo, &abs);
- abstime = &abs;
+ ts_add(&now, &flow->snd_timeo, &abs);
+ dl = &abs;
}
- flags = flow->oflags;
-
pthread_rwlock_unlock(&proc.lock);
if ((flags & FLOWFACCMODE) == FLOWFRDONLY)
return -EPERM;
- if (flags & FLOWFWNOBLOCK) {
- if (!frcti_is_window_open(flow->frcti))
- return -EAGAIN;
- idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb);
- } else {
- ret = frcti_window_wait(flow->frcti, abstime);
+ tw_move_safe();
+
+ if (flow->frcti != NULL) {
+ /* Pump rx_rb so a pure-writer processes ACKs. */
+ ret = flow_wait_window(flow, 1, !(flags & FLOWFWNOBLOCK), dl);
if (ret < 0)
return ret;
- idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, abstime);
+
+ if (count > 0 && FRCTI_IS_STREAM(flow->frcti))
+ return flow_write_stream(flow, buf, count, flags, dl);
+
+ if (FRCTI_NEEDS_FRAG(flow->frcti, count))
+ return flow_write_frag(flow, buf, count, flags, dl);
+ } else if (flow->info.mtu > 0
+ && count > flow_user_mtu(flow, flow->info.mtu)) {
+ /* Raw flows carry no PCI; refuse anything > one n-1 frame. */
+ return -EMSGSIZE;
}
+ if (flags & FLOWFWNOBLOCK)
+ idx = ssm_pool_alloc(proc.pool, count, &ptr, &spb);
+ else
+ idx = ssm_pool_alloc_b(proc.pool, count, &ptr, &spb, dl);
if (idx < 0)
return idx;
if (count > 0)
memcpy(ptr, buf, count);
- ret = flow_tx_spb(flow, spb, !(flags & FLOWFWNOBLOCK), abstime);
+ ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE,
+ !(flags & FLOWFWNOBLOCK), dl);
return ret < 0 ? (ssize_t) ret : (ssize_t) count;
}
-static bool invalid_pkt(struct flow * flow,
- struct ssm_pk_buff * spb)
-{
- if (spb == NULL || ssm_pk_buff_len(spb) == 0)
- return true;
-
- if (flow->info.qs.ber == 0 && chk_crc(spb) != 0)
- return true;
-
- if (spb_decrypt(flow, spb) < 0)
- return true;
-
- return false;
-}
-
static ssize_t flow_rx_spb(struct flow * flow,
struct ssm_pk_buff ** spb,
bool block,
@@ -1408,19 +1748,14 @@ static ssize_t flow_rx_spb(struct flow * flow,
ssize_t idx;
struct timespec now;
- idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime) :
- ssm_rbuff_read(flow->rx_rb);
+ idx = block ? ssm_rbuff_read_b(flow->rx_rb, abstime)
+ : ssm_rbuff_read(flow->rx_rb);
if (idx < 0)
return idx;
clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_wrlock(&proc.lock);
-
flow->rcv_act = now;
- pthread_rwlock_unlock(&proc.lock);
-
*spb = ssm_pool_get(proc.pool, idx);
if (invalid_pkt(flow, *spb)) {
@@ -1431,28 +1766,124 @@ static ssize_t flow_rx_spb(struct flow * flow,
return idx;
}
+/*
+ * Fetch the index of the next valid packet on a flow without FRCT.
+ *
+ * Packets rejected by invalid_pkt() are removed from the pool. A
+ * blocking caller then keeps waiting for the next packet, while a
+ * non-blocking caller gets -EAGAIN.
+ *
+ * flow:  flow whose rx_rb is read.
+ * block: true to wait for a packet, false to poll exactly once.
+ * dl:    deadline of a blocking read; -ETIMEDOUT is only reported
+ *        once deadline_passed(dl) holds, earlier timeouts of a single
+ *        wait are retried.
+ *
+ * Returns the packet index (>= 0) or a negative error code.
+ */
+static ssize_t raw_flow_read_pkt(struct flow *     flow,
+                                 bool              block,
+                                 struct timespec * dl)
+{
+        struct ssm_pk_buff * pkt;
+        struct timespec      slice;
+        ssize_t              pos;
+
+        for (;;) {
+                if (block) {
+                        /* NOTE(review): presumably bounds one wait. */
+                        compute_wait_deadline(dl, &slice);
+                        pos = ssm_rbuff_read_b(flow->rx_rb, &slice);
+                        if (pos == -ETIMEDOUT && !deadline_passed(dl))
+                                continue;
+                        if (pos < 0)
+                                return pos;
+                } else {
+                        pos = ssm_rbuff_read(flow->rx_rb);
+                        if (pos < 0)
+                                return -EAGAIN;
+                }
+
+                pkt = ssm_pool_get(proc.pool, pos);
+                if (invalid_pkt(flow, pkt)) {
+                        /* Drop the bad packet; only blockers retry. */
+                        ssm_pool_remove(proc.pool, pos);
+                        if (block)
+                                continue;
+                        return -EAGAIN;
+                }
+
+                return pos;
+        }
+}
+
+/*
+ * Copy the payload of spb into the caller's buffer.
+ *
+ * flow:   flow the packet belongs to; only part_idx is updated here.
+ * spb:    packet buffer to deliver.
+ * idx:    index of spb, remembered in part_idx if a remainder is kept.
+ * buf:    destination buffer.
+ * count:  capacity of buf in bytes.
+ * partrd: true if partial reads are enabled for the flow.
+ *
+ * Returns the number of bytes copied. If the packet is larger than
+ * count and partial reads are disabled, the packet is released and
+ * -EMSGSIZE is returned.
+ */
+static ssize_t deliver_pkt(struct flow *        flow,
+                           struct ssm_pk_buff * spb,
+                           ssize_t              idx,
+                           void *               buf,
+                           size_t               count,
+                           bool                 partrd)
+{
+        uint8_t * packet = ssm_pk_buff_head(spb);
+        ssize_t   n      = ssm_pk_buff_len(spb);
+
+        assert(n >= 0);
+
+        if (n <= (ssize_t) count) {
+                memcpy(buf, packet, n);
+                ipcp_spb_release(spb);
+                /* DONE_PART makes the next flow_read return 0. */
+                if (partrd && n == (ssize_t) count)
+                        flow->part_idx = DONE_PART;
+                else
+                        flow->part_idx = NO_PART;
+
+                return n;
+        }
+
+        if (partrd) {
+                memcpy(buf, packet, count);
+                /* Consume only what was copied; keep the remainder. */
+                ssm_pk_buff_pop(spb, count);
+                flow->part_idx = idx;
+                return count;
+        }
+
+        /* The packet is gone: never leave its index in part_idx. */
+        ipcp_spb_release(spb);
+        flow->part_idx = NO_PART;
+
+        return -EMSGSIZE;
+}
+
+/*
+ * Read from a flow that has FRCT: keep draining the rx ring and
+ * calling FRCTI_CONSUME until it yields data or a hard error.
+ *
+ * On -EAGAIN a non-blocking caller returns immediately, while a
+ * blocking caller waits in flow_rx_one() (bounded by dl) and retries.
+ * rcv_act is stamped with the current time on successful delivery.
+ *
+ * Returns the value of FRCTI_CONSUME (>= 0) or a negative error code.
+ */
+static ssize_t flow_read_frcti(struct flow *     flow,
+                               void *            buf,
+                               size_t            count,
+                               bool              block,
+                               struct timespec * dl)
+{
+        struct timespec stamp;
+        ssize_t         got;
+        int             err;
+
+        for (;;) {
+                flow_drain_rx_nb(flow);
+
+                got = FRCTI_CONSUME(flow->frcti, buf, count);
+                if (got >= 0) {
+                        clock_gettime(PTHREAD_COND_CLOCK, &stamp);
+                        flow->rcv_act = stamp;
+                        return got;
+                }
+
+                /* Hard error, or nothing ready and not allowed to wait. */
+                if (got != -EAGAIN || !block)
+                        return got;
+
+                err = flow_rx_one(flow, dl);
+                if (err < 0)
+                        return err;
+        }
+}
+
ssize_t flow_read(int fd,
void * buf,
size_t count)
{
- ssize_t idx;
- ssize_t n;
- uint8_t * packet;
+ struct flow * flow;
struct ssm_pk_buff * spb;
struct timespec abs;
struct timespec now;
- struct timespec * abstime = NULL;
- struct flow * flow;
+ struct timespec * dl = NULL;
+ ssize_t idx;
bool block;
bool partrd;
- if (fd < 0 || fd >= PROG_MAX_FLOWS)
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
return -EBADF;
flow = &proc.flows[fd];
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
@@ -1461,8 +1892,8 @@ ssize_t flow_read(int fd,
}
if (flow->part_idx == DONE_PART) {
- pthread_rwlock_unlock(&proc.lock);
flow->part_idx = NO_PART;
+ pthread_rwlock_unlock(&proc.lock);
return 0;
}
@@ -1470,75 +1901,33 @@ ssize_t flow_read(int fd,
partrd = !(flow->oflags & FLOWFRNOPART);
if (flow->rcv_timesout) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
ts_add(&now, &flow->rcv_timeo, &abs);
- abstime = &abs;
+ dl = &abs;
}
- idx = flow->part_idx;
- if (idx < 0) {
- while ((idx = frcti_queued_pdu(flow->frcti)) < 0) {
- pthread_rwlock_unlock(&proc.lock);
-
- idx = flow_rx_spb(flow, &spb, block, abstime);
- if (idx < 0) {
- if (block && idx != -EAGAIN)
- return idx;
- if (!block)
- return idx;
+ pthread_rwlock_unlock(&proc.lock);
- pthread_rwlock_rdlock(&proc.lock);
- continue;
- }
+ tw_move_safe();
- pthread_rwlock_rdlock(&proc.lock);
+ idx = flow->part_idx;
+ if (idx < 0 && flow->frcti != NULL)
+ return flow_read_frcti(flow, buf, count, block, dl);
- frcti_rcv(flow->frcti, spb);
- }
+ if (idx < 0) {
+ idx = raw_flow_read_pkt(flow, block, dl);
+ if (idx < 0)
+ return idx;
}
spb = ssm_pool_get(proc.pool, idx);
- pthread_rwlock_unlock(&proc.lock);
-
- packet = ssm_pk_buff_head(spb);
-
- n = ssm_pk_buff_len(spb);
-
- assert(n >= 0);
-
- if (n <= (ssize_t) count) {
- memcpy(buf, packet, n);
- ipcp_spb_release(spb);
-
- pthread_rwlock_wrlock(&proc.lock);
-
- flow->part_idx = (partrd && n == (ssize_t) count) ?
- DONE_PART : NO_PART;
-
- flow->rcv_act = now;
-
- pthread_rwlock_unlock(&proc.lock);
- return n;
- } else {
- if (partrd) {
- memcpy(buf, packet, count);
- ssm_pk_buff_head_release(spb, n);
- pthread_rwlock_wrlock(&proc.lock);
- flow->part_idx = idx;
-
- flow->rcv_act = now;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ flow->rcv_act = now;
- pthread_rwlock_unlock(&proc.lock);
- return count;
- } else {
- ipcp_spb_release(spb);
- return -EMSGSIZE;
- }
- }
+ return deliver_pkt(flow, spb, idx, buf, count, partrd);
}
-/* fqueue functions. */
-
struct flow_set * fset_create(void)
{
struct flow_set * set;
@@ -1614,7 +2003,7 @@ int fset_add(struct flow_set * set,
struct flow * flow;
int ret;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return -EINVAL;
flow = &proc.flows[fd];
@@ -1650,7 +2039,7 @@ void fset_del(struct flow_set * set,
{
struct flow * flow;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return;
flow = &proc.flows[fd];
@@ -1661,7 +2050,7 @@ void fset_del(struct flow_set * set,
ssm_flow_set_del(proc.fqset, set->idx, flow->info.id);
if (flow->frcti != NULL)
- ssm_flow_set_add(proc.fqset, 0, proc.flows[fd].info.id);
+ ssm_flow_set_add(proc.fqset, 0, flow->info.id);
pthread_rwlock_unlock(&proc.lock);
}
@@ -1672,7 +2061,7 @@ bool fset_has(const struct flow_set * set,
struct flow * flow;
bool ret;
- if (set == NULL || fd < 0 || fd >= SYS_MAX_FLOWS)
+ if (set == NULL || fd < 0 || fd >= PROC_MAX_FLOWS)
return false;
flow = &proc.flows[fd];
@@ -1691,61 +2080,59 @@ bool fset_has(const struct flow_set * set,
return ret;
}
-/* Filter fqueue events for non-data packets */
static int fqueue_filter(struct fqueue * fq)
{
struct ssm_pk_buff * spb;
int fd;
ssize_t idx;
struct frcti * frcti;
+ int ret = 0;
- while (fq->next < fq->fqsize) {
- if (fq->fqueue[fq->next].event != FLOW_PKT)
- return 1;
+ /* proc.lock rdlock gates frcti_destroy via flow_fini wrlock. */
+ pthread_rwlock_rdlock(&proc.lock);
- pthread_rwlock_rdlock(&proc.lock);
+ while (fq->next < fq->fqsize) {
+ if (fq->fqueue[fq->next].event != FLOW_PKT) {
+ ret = 1;
+ goto out;
+ }
fd = proc.id_to_fd[fq->fqueue[fq->next].flow_id].fd;
if (fd < 0) {
++fq->next;
- pthread_rwlock_unlock(&proc.lock);
continue;
}
frcti = proc.flows[fd].frcti;
if (frcti == NULL) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ ret = 1;
+ goto out;
}
- if (__frcti_pdu_ready(frcti) >= 0) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ if (FRCTI_PDU_READY(frcti)) {
+ ret = 1;
+ goto out;
}
- pthread_rwlock_unlock(&proc.lock);
-
idx = flow_rx_spb(&proc.flows[fd], &spb, false, NULL);
if (idx < 0)
- return 0;
-
- pthread_rwlock_rdlock(&proc.lock);
+ goto out;
spb = ssm_pool_get(proc.pool, idx);
- __frcti_rcv(frcti, spb);
+ FRCTI_RCV(frcti, spb);
- if (__frcti_pdu_ready(frcti) >= 0) {
- pthread_rwlock_unlock(&proc.lock);
- return 1;
+ if (FRCTI_PDU_READY(frcti)) {
+ ret = 1;
+ goto out;
}
- pthread_rwlock_unlock(&proc.lock);
-
++fq->next;
}
- return 0;
+ out:
+ pthread_rwlock_unlock(&proc.lock);
+ return ret;
}
int fqueue_next(struct fqueue * fq)
@@ -1792,7 +2179,8 @@ ssize_t fevent(struct flow_set * set,
{
ssize_t ret = 0;
struct timespec abs;
- struct timespec * t = NULL;
+ struct timespec * dl = NULL;
+ struct timespec wait_abs;
if (set == NULL || fq == NULL)
return -EINVAL;
@@ -1800,17 +2188,26 @@ ssize_t fevent(struct flow_set * set,
if (fq->fqsize > 0 && fq->next != fq->fqsize)
return 1;
- clock_gettime(PTHREAD_COND_CLOCK, &abs);
-
if (timeo != NULL) {
- ts_add(&abs, timeo, &abs);
- t = &abs;
+ struct timespec now;
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, timeo, &abs);
+ dl = &abs;
}
while (ret == 0) {
- ret = ssm_flow_set_wait(proc.fqset, set->idx, fq->fqueue, t);
- if (ret == -ETIMEDOUT)
- return -ETIMEDOUT;
+ tw_move_safe();
+
+ compute_wait_deadline(dl, &wait_abs);
+
+ ret = ssm_flow_set_wait(proc.fqset, set->idx,
+ fq->fqueue, &wait_abs);
+ if (ret == -ETIMEDOUT) {
+ if (deadline_passed(dl))
+ return -ETIMEDOUT;
+ ret = 0;
+ continue;
+ }
fq->fqsize = ret;
fq->next = 0;
@@ -1823,8 +2220,6 @@ ssize_t fevent(struct flow_set * set,
return 1;
}
-/* ipcp-dev functions. */
-
int np1_flow_alloc(pid_t n_pid,
int flow_id)
{
@@ -1837,9 +2232,10 @@ int np1_flow_alloc(pid_t n_pid,
flow.n_pid = getpid();
flow.qs = qos_np1;
flow.mpl = 0;
- flow.n_1_pid = n_pid; /* This "flow" is upside-down! */
+ /* np1 flow: n_1_pid is the upper. */
+ flow.n_1_pid = n_pid;
- return flow_init(&flow, &crypt);
+ return flow_init(&flow, &crypt, 0);
}
int np1_flow_dealloc(int flow_id,
@@ -1847,12 +2243,7 @@ int np1_flow_dealloc(int flow_id,
{
int fd;
- /*
- * TODO: Don't pass timeo to the IPCP but wait in IRMd.
- * This will need async ops, waiting until we bootstrap
- * the IRMd over ouroboros.
- */
-
+ /* TODO: wait in IRMd, not here; needs async ops. */
sleep(timeo);
pthread_rwlock_rdlock(&proc.lock);
@@ -1900,6 +2291,7 @@ int ipcp_create_r(const struct ipcp_info * info)
int ipcp_flow_req_arr(const buffer_t * dst,
qosspec_t qs,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct flow_info flow;
@@ -1916,6 +2308,7 @@ int ipcp_flow_req_arr(const buffer_t * dst,
flow.n_1_pid = getpid();
flow.qs = qs;
flow.mpl = mpl;
+ flow.mtu = mtu;
if (ipcp_flow_req_arr__irm_req_ser(&msg, dst, &flow, data) < 0)
return -ENOMEM;
@@ -1930,22 +2323,25 @@ int ipcp_flow_req_arr(const buffer_t * dst,
if (err < 0)
return err;
- assert(crypt.nid == NID_undef); /* np1 flows are not encrypted */
+ /* np1 flows are not encrypted. */
+ assert(crypt.nid == NID_undef);
- /* inverted for np1_flow */
+ /* Inverted for np1_flow. */
flow.n_1_pid = flow.n_pid;
flow.n_pid = getpid();
flow.mpl = 0;
+ flow.mtu = 0;
flow.qs = qos_np1;
crypt.nid = NID_undef;
- return flow_init(&flow, &crypt);
+ return flow_init(&flow, &crypt, 0);
}
int ipcp_flow_alloc_reply(int fd,
int response,
time_t mpl,
+ uint32_t mtu,
const buffer_t * data)
{
struct flow_info flow;
@@ -1953,7 +2349,7 @@ int ipcp_flow_alloc_reply(int fd,
buffer_t msg = {SOCK_BUF_SIZE, buf};
int err;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
pthread_rwlock_rdlock(&proc.lock);
@@ -1962,6 +2358,7 @@ int ipcp_flow_alloc_reply(int fd,
pthread_rwlock_unlock(&proc.lock);
flow.mpl = mpl;
+ flow.mtu = mtu;
if (ipcp_flow_alloc_reply__irm_msg_ser(&msg, &flow, response, data) < 0)
return -ENOMEM;
@@ -1979,7 +2376,7 @@ int ipcp_flow_read(int fd,
struct flow * flow;
ssize_t idx = -1;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -1988,7 +2385,14 @@ int ipcp_flow_read(int fd,
assert(flow->info.id >= 0);
- while (frcti_queued_pdu(flow->frcti) < 0) {
+ /* Raw flow: deliver the popped pkt directly (no FRCT rq). */
+ if (flow->frcti == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ idx = flow_rx_spb(flow, spb, false, NULL);
+ return idx < 0 ? (int) idx : 0;
+ }
+
+ while (!FRCTI_PDU_READY(flow->frcti)) {
pthread_rwlock_unlock(&proc.lock);
idx = flow_rx_spb(flow, spb, false, NULL);
@@ -1997,7 +2401,7 @@ int ipcp_flow_read(int fd,
pthread_rwlock_rdlock(&proc.lock);
- frcti_rcv(flow->frcti, *spb);
+ FRCTI_RCV(flow->frcti, *spb);
}
pthread_rwlock_unlock(&proc.lock);
@@ -2011,12 +2415,12 @@ int ipcp_flow_write(int fd,
struct flow * flow;
int ret;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
- pthread_rwlock_wrlock(&proc.lock);
+ pthread_rwlock_rdlock(&proc.lock);
if (flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
@@ -2030,30 +2434,28 @@ int ipcp_flow_write(int fd,
pthread_rwlock_unlock(&proc.lock);
- ret = flow_tx_spb(flow, spb, true, NULL);
+ ret = flow_tx_spb(flow, spb, FRCT_FR_SOLE, true, NULL);
return ret;
}
-static int pool_copy_spb(struct ssm_pool * src_pool,
- ssize_t src_idx,
- struct ssm_pool * dst_pool,
- struct ssm_pk_buff ** dst_spb)
+/* Copy src into dst_pool without consuming src. Caller owns both halves. */
+static int pool_dup_spb(struct ssm_pool * src_pool,
+ size_t src_off,
+ struct ssm_pool * dst_pool,
+ struct ssm_pk_buff ** dst_spb)
{
struct ssm_pk_buff * src;
uint8_t * ptr;
size_t len;
- src = ssm_pool_get(src_pool, src_idx);
+ src = ssm_pool_get(src_pool, src_off);
len = ssm_pk_buff_len(src);
- if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0) {
- ssm_pool_remove(src_pool, src_idx);
+ if (ssm_pool_alloc(dst_pool, len, &ptr, dst_spb) < 0)
return -ENOMEM;
- }
memcpy(ptr, ssm_pk_buff_head(src), len);
- ssm_pool_remove(src_pool, src_idx);
return 0;
}
@@ -2063,9 +2465,9 @@ int np1_flow_read(int fd,
struct ssm_pool * pool)
{
struct flow * flow;
- ssize_t idx = -1;
+ ssize_t off;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -2074,20 +2476,23 @@ int np1_flow_read(int fd,
pthread_rwlock_rdlock(&proc.lock);
- idx = ssm_rbuff_read(flow->rx_rb);
- if (idx < 0) {
+ off = ssm_rbuff_read(flow->rx_rb);
+ if (off < 0) {
pthread_rwlock_unlock(&proc.lock);
- return idx;
+ return off;
}
pthread_rwlock_unlock(&proc.lock);
if (pool == NULL) {
- *spb = ssm_pool_get(proc.pool, idx);
+ *spb = ssm_pool_get(proc.pool, off);
} else {
/* Cross-pool copy: PUP -> GSPP */
- if (pool_copy_spb(pool, idx, proc.pool, spb) < 0)
+ if (pool_dup_spb(pool, off, proc.pool, spb) < 0) {
+ ssm_pool_remove(pool, off);
return -ENOMEM;
+ }
+ ssm_pool_remove(pool, off);
}
return 0;
@@ -2100,9 +2505,10 @@ int np1_flow_write(int fd,
struct flow * flow;
struct ssm_pk_buff * dst;
int ret;
- ssize_t idx;
+ size_t off;
+ size_t dst_off;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(spb);
flow = &proc.flows[fd];
@@ -2121,45 +2527,47 @@ int np1_flow_write(int fd,
pthread_rwlock_unlock(&proc.lock);
- idx = ssm_pk_buff_get_idx(spb);
+ off = ssm_pk_buff_get_off(spb);
if (pool == NULL) {
- ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL);
+ ret = ssm_rbuff_write_b(flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(proc.pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ return ret;
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
} else {
- /* Cross-pool copy: GSPP -> PUP */
- if (pool_copy_spb(proc.pool, idx, pool, &dst) < 0)
+ /* Cross-pool copy: GSPP -> PUP. Src kept on error. */
+ if (pool_dup_spb(proc.pool, off, pool, &dst) < 0)
return -ENOMEM;
- idx = ssm_pk_buff_get_idx(dst);
- ret = ssm_rbuff_write_b(flow->tx_rb, idx, NULL);
- if (ret < 0)
- ssm_pool_remove(pool, idx);
- else
- ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ dst_off = ssm_pk_buff_get_off(dst);
+ ret = ssm_rbuff_write_b(flow->tx_rb, dst_off, NULL);
+ if (ret < 0) {
+ ssm_pool_remove(pool, dst_off);
+ return ret;
+ }
+ ssm_flow_set_notify(flow->set, flow->info.id, FLOW_PKT);
+ ssm_pool_remove(proc.pool, off);
}
- return ret;
+ return 0;
}
int ipcp_spb_reserve(struct ssm_pk_buff ** spb,
size_t len)
{
- return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0 ? -1 : 0;
+ return ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL) < 0
+ ? -1 : 0;
}
void ipcp_spb_release(struct ssm_pk_buff * spb)
{
- ssm_pool_remove(proc.pool, ssm_pk_buff_get_idx(spb));
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
}
int ipcp_flow_fini(int fd)
{
struct ssm_rbuff * rx_rb;
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
pthread_rwlock_rdlock(&proc.lock);
@@ -2188,7 +2596,7 @@ int ipcp_flow_fini(int fd)
int ipcp_flow_get_qoscube(int fd,
qoscube_t * cube)
{
- assert(fd >= 0 && fd < SYS_MAX_FLOWS);
+ assert(fd >= 0 && fd < PROC_MAX_FLOWS);
assert(cube);
pthread_rwlock_rdlock(&proc.lock);
@@ -2227,7 +2635,7 @@ int local_flow_transfer(int src_fd,
struct ssm_pk_buff * dst_spb;
struct ssm_pool * sp;
struct ssm_pool * dp;
- ssize_t idx;
+ ssize_t off;
int ret;
assert(src_fd >= 0);
@@ -2241,15 +2649,15 @@ int local_flow_transfer(int src_fd,
pthread_rwlock_rdlock(&proc.lock);
- idx = ssm_rbuff_read(src_flow->rx_rb);
- if (idx < 0) {
+ off = ssm_rbuff_read(src_flow->rx_rb);
+ if (off < 0) {
pthread_rwlock_unlock(&proc.lock);
- return idx;
+ return off;
}
if (dst_flow->info.id < 0) {
pthread_rwlock_unlock(&proc.lock);
- ssm_pool_remove(sp, idx);
+ ssm_pool_remove(sp, off);
return -ENOTALLOC;
}
@@ -2257,21 +2665,24 @@ int local_flow_transfer(int src_fd,
if (sp == dp) {
/* Same pool: zero-copy */
- ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL);
+ ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(sp, idx);
+ ssm_pool_remove(sp, off);
else
ssm_flow_set_notify(dst_flow->set,
dst_flow->info.id, FLOW_PKT);
} else {
/* Different pools: single copy */
- if (pool_copy_spb(sp, idx, dp, &dst_spb) < 0)
+ if (pool_dup_spb(sp, off, dp, &dst_spb) < 0) {
+ ssm_pool_remove(sp, off);
return -ENOMEM;
+ }
- idx = ssm_pk_buff_get_idx(dst_spb);
- ret = ssm_rbuff_write_b(dst_flow->tx_rb, idx, NULL);
+ ssm_pool_remove(sp, off);
+ off = ssm_pk_buff_get_off(dst_spb);
+ ret = ssm_rbuff_write_b(dst_flow->tx_rb, off, NULL);
if (ret < 0)
- ssm_pool_remove(dp, idx);
+ ssm_pool_remove(dp, off);
else
ssm_flow_set_notify(dst_flow->set,
dst_flow->info.id, FLOW_PKT);
diff --git a/src/lib/frct.c b/src/lib/frct.c
index fad2cf69..2e8955e3 100644
--- a/src/lib/frct.c
+++ b/src/lib/frct.c
@@ -1,7 +1,7 @@
/*
* Ouroboros - Copyright (C) 2016 - 2026
*
- * Flow and Retransmission Control
+ * Flow and Retransmission Control Task (FRCT)
*
* Dimitri Staessens <dimitri@ouroboros.rocks>
* Sander Vrijders <sander@ouroboros.rocks>
@@ -20,97 +20,416 @@
* Foundation, Inc., http://www.fsf.org/about/contact/.
*/
-#include <ouroboros/endian.h>
+/* Included by dev.c; uses dev.c statics (proc, spb_encrypt, ...). */
#define DELT_RDV (100 * MILLION) /* ns */
-#define MAX_RDV (1 * BILLION) /* ns */
+#define MAX_RDV (1 * BILLION) /* ns */
+
+#define MAX_RTO_MUL 8 /* caps the RTO backoff shift */
+#define MAX_TLP_PER_EP 2 /* RFC 8985 §7.3: up to 2 TLPs */
+#define INITIAL_RTO (1 * BILLION) /* RFC 6298 §2.1: 1 s default */
+#define RTT_BOOT_NS (10 * MILLION) /* rtt_hint floor + initial mdev */
+#define SRTT_FLOOR_NS 1000L /* 1 us; smoothed RTT floor */
+#define MDEV_FLOOR_NS 100L /* 100 ns; mdev sanity floor */
+#define RTT_CLAMP_MUL 16 /* probe sample cap = N * srtt */
+#define MIN_RTT_WIN_NS (300ULL * BILLION) /* 5 min, Linux tcp default */
+#define NACK_COOLDOWN_NS (100 * MILLION) /* pre-DRF NACK cooldown */
+#define FRCT_TX_TIMEO_NS (250 * 1000) /* tx ring write deadline */
+#define ACK_DELAY_NS (2ULL * TICTIME) /* delayed-ACK fire delay */
#define FRCT "frct"
#define FRCT_PCILEN (sizeof(struct frct_pci))
#define FRCT_NAME_STRLEN 32
-struct frct_cr {
- uint32_t lwe; /* Left window edge */
- uint32_t rwe; /* Right window edge */
+/* Wire-protocol cap on SACK blocks per packet; binds both peers. */
+#define SACK_MAX_BLOCKS 2048
+#define SACK_BLOCK_SIZE (2 * sizeof(uint32_t))
+/* 2B count + 2B pad to 4-byte align the block list. */
+#define SACK_HDR_SIZE (sizeof(uint32_t))
+#define SACK_MIN_GAP_NS (250u * 1000u) /* 250 us SACK gap */
+#define MIN_REORDER_NS (250u * 1000u) /* 250 us RACK floor */
+#define SACK_RXM_MAX 32 /* Cap on retransmits staged from single SACK.*/
+#define DUP_THRESH 3 /* RFC 8985 §6.2 step 2.2 SACK count gate. */
+
+/* RFC 8985 §7.2 RACK reorder-window scaling cap. */
+#define REO_WND_MULT_MAX 20
+/* RFC 8985 §7.2 step 5: round trips of no DSACK before halving. */
+#define REO_DECAY_PKTS 16
+/* DSACK seqno sanity: reject reports older/farther than one rcv window. */
+#define MAX_DSACK_LAG RQ_SIZE
+
+/*
+ * Nanoseconds elapsed from then_ns to now_ns, as a signed value.
+ * The subtraction is done unsigned, so a then_ns that is ahead of
+ * now_ns (concurrent update) gives a negative age, not an underflow.
+ */
+static __inline__ int64_t ts_age_ns(uint64_t now_ns,
+                                    uint64_t then_ns)
+{
+        uint64_t delta = now_ns - then_ns;
+
+        return (int64_t) delta;
+}
- uint8_t cflags;
- uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */
+/*
+ * Check whether the time since then_ns exceeds thr_ns.
+ * Returns true only if the signed age is strictly greater than the
+ * threshold; an age equal to thr_ns is not yet aged.
+ */
+static __inline__ bool ts_aged_ns(uint64_t now_ns,
+                                  uint64_t then_ns,
+                                  uint64_t thr_ns)
+{
+        int64_t age   = ts_age_ns(now_ns, then_ns);
+        int64_t limit = (int64_t) thr_ns;
+
+        return age > limit;
+}
- struct timespec act; /* Last seen activity */
- time_t inact; /* Inactivity (s) */
-};
+/* FRCT r-timer: do not retransmit packet older than t_r (from first send). */
+#define RXM_AGED_OUT(t0, now_ns, t_r) \
+ ts_aged_ns((now_ns), (t0), (uint64_t)(t_r))
-struct frcti {
- int fd;
+/* FRCT a-timer: do not (re)transmit ACK after t_a from last data receive. */
+#define ACK_AGED_OUT(act, now_ns, t_a) \
+ ts_aged_ns((now_ns), (act), (uint64_t)(t_a))
- time_t mpl;
- time_t a;
- time_t r;
- time_t rdv;
-
- time_t srtt; /* Smoothed rtt */
- time_t mdev; /* Deviation */
- time_t rto; /* Retransmission timeout */
- uint32_t rttseq;
- struct timespec t_probe; /* Probe time */
- bool probe; /* Probe active */
-#ifdef PROC_FLOW_STATS
- size_t n_rtx; /* Number of rxm packets */
- size_t n_prb; /* Number of rtt probes */
- size_t n_rtt; /* Number of estimates */
- size_t n_dup; /* Duplicates received */
- size_t n_dak; /* Delayed ACKs received */
- size_t n_rdv; /* Number of rdv packets */
- size_t n_out; /* Packets out of window */
- size_t n_rqo; /* Packets out of rqueue */
-#endif
- struct frct_cr snd_cr;
- struct frct_cr rcv_cr;
+struct sack_args { /* SACK arguments; NOTE(review): producer/consumer not visible here */
+ uint16_t n; /* presumably the number of entries in blocks[] - TODO confirm */
+ bool dsack; /* RFC 2883: block[0] is a DSACK report */
+ uint32_t ack; /* presumably the cumulative ACK seqno - TODO confirm */
+ uint32_t rwe; /* presumably the right window edge - TODO confirm */
+ uint32_t blocks[][2]; /* flexible — sized at alloc time */
+};
+/* NewReno-careful (RFC 6582) exit pad; gates RTT samples post-signal. */
+#define RTT_QUARANTINE 32
+#define RTTP_NONCE_LEN 16
- ssize_t rq[RQ_SIZE];
- pthread_rwlock_t lock;
+/* RTT-probe wire payload (after the FRCT PCI). */
+struct frct_rttp {
+ uint32_t probe_id; /* sender counter; 0 on reply */
+ uint32_t echo_id; /* peer's probe_id; 0 outbound */
+ uint8_t nonce[RTTP_NONCE_LEN]; /* random; echoed verbatim */
+} __attribute__((packed));
- bool open; /* Window open/closed */
- struct timespec t_wnd; /* Window closed time */
- struct timespec t_rdvs; /* Last rendez-vous sent */
- pthread_cond_t cond;
- pthread_mutex_t mtx;
-};
+#define RTTP_PAYLOAD sizeof(struct frct_rttp)
+#define RTTP_POS(id) ((id) & (RTTP_RING - 1))
+/*
+ * Flag values are assigned MSB-first on the wire (RFC convention):
+ * bit 0 = 0x8000 occupies wire-position 0 of the 16-bit flags
+ * field, bit 12 = 0x0008 is the last assigned bit, and the three
+ * LSBs (0x0007) are reserved.
+ */
enum frct_flags {
- FRCT_DATA = 0x01, /* PDU carries data */
- FRCT_DRF = 0x02, /* Data run flag */
- FRCT_ACK = 0x04, /* ACK field valid */
- FRCT_FC = 0x08, /* FC window valid */
- FRCT_RDVS = 0x10, /* Rendez-vous */
- FRCT_FFGM = 0x20, /* First Fragment */
- FRCT_MFGM = 0x40, /* More fragments */
+ FRCT_DATA = 0x8000, /* PDU carries data */
+ FRCT_DRF = 0x4000, /* Data run flag */
+ FRCT_ACK = 0x2000, /* ACK field valid */
+ FRCT_NACK = 0x1000, /* Neg-ACK: pci->seqno is arrival_seqno - 1 */
+ FRCT_FC = 0x0800, /* FC window valid */
+ FRCT_RDVS = 0x0400, /* Rendez-vous */
+ FRCT_FFGM = 0x0200, /* First fragment (begin) */
+ FRCT_LFGM = 0x0100, /* Last fragment (end) */
+ FRCT_RXM = 0x0080, /* Retransmission */
+ FRCT_SACK = 0x0040, /* SACK block list follows */
+ FRCT_RTTP = 0x0020, /* RTT probe / echo */
+ FRCT_KA = 0x0010, /* Keepalive */
+ FRCT_FIN = 0x0008, /* End of stream */
};
-struct frct_pci {
- uint8_t flags;
+/*
+ * DATA-packet fragment role (FFGM = begin, LFGM = end), SCTP-style:
+ * 1 1 = sole / un-fragmented SDU (begin AND end)
+ * 1 0 = first fragment of a multi-fragment SDU
+ * 0 0 = middle fragment
+ * 0 1 = last fragment
+ */
+#define FRCT_FR_MASK (FRCT_FFGM | FRCT_LFGM)
+#define FRCT_FR_SOLE (FRCT_FFGM | FRCT_LFGM)
+#define FRCT_FR_FIRST (FRCT_FFGM)
+#define FRCT_FR_MID (0)
+#define FRCT_FR_LAST (FRCT_LFGM)
+
+/* Default cap on a single reassembled SDU. App can raise via FRCTSMAXSDU. */
+#define FRCT_MAX_SDU (1U << 20)
+
+/* Stream-mode PCI extension: [start, end) byte range on every DATA pkt. */
+struct frct_pci_stream {
+ uint32_t start;
+ uint32_t end;
+} __attribute__((packed));
+
+#define FRCT_PCI_STREAM_LEN (sizeof(struct frct_pci_stream))
- uint8_t pad; /* 24 bit window! */
- uint16_t window;
+/* Bytes following PCI: SACK list / RTTP nonce / control payload. */
+#define FRCT_BODY(pci) ((uint8_t *) (pci) + FRCT_PCILEN)
+/* Typed access to the stream PCI extension on stream DATA packets. */
+#define FRCT_SPCI(pci) \
+ ((struct frct_pci_stream *) ((uint8_t *) (pci) + FRCT_PCILEN))
+/* Push the FRCT header onto spb's head. */
+#define FRCT_HDR_PUSH(spb, frcti) \
+ ((struct frct_pci *) ssm_pk_buff_push((spb), \
+ frcti_data_hdr_len(frcti)))
+
+/* Pop a fixed-size header off spb's head; cast to type *. */
+#define FRCT_HDR_POP(spb, type) \
+ ((struct type *) ssm_pk_buff_pop((spb), sizeof(struct type)))
+
+/* Default / max per-flow stream rx ring (pow2); min N * per_pkt. */
+#define FRCT_STREAM_RING_MIN_PKTS 4
+#define FRCT_STREAM_RING_SZ (1U << 20) /* 1 MiB default */
+#define FRCT_STREAM_RING_SZ_MAX (1U << 27) /* 128 MiB */
+
+struct frct_pci {
+ uint16_t flags;
+ uint16_t hcs;
+
+ uint32_t window;
uint32_t seqno;
uint32_t ackno;
} __attribute__((packed));
+/* Stat counters; fold to no-ops without PROC_FLOW_STATS. */
#ifdef PROC_FLOW_STATS
+struct frcti_stat {
+ size_t rxm_rto; /* RTO-timer driven retransmits */
+ size_t rxm_rcv; /* RXM packets received (all) */
+ size_t rxm_dup_rcv; /* RXM dups (peer already had it) */
+ size_t rxm_sack; /* SACK-mechanism retransmits */
+ size_t rxm_rack; /* RACK-driven retransmits */
+ size_t rxm_dupthresh; /* DupThresh-driven retransmits */
+ size_t rxm_nack; /* NACK-pulled retransmits */
+ size_t rxm_due_count; /* rxm_due entries (pre-bail) */
+ size_t rxm_due_acked; /* bail: seqno < snd_lwe */
+ size_t rxm_due_unowned; /* bail: slot.rxm replaced */
+ size_t rxm_due_aged; /* bail: r->t0 + t_r < now */
+ size_t rxm_due_defer; /* bail: non-HoL, deferred to HoL */
+ size_t rxm_arm_fail; /* rxm_arm: malloc failed */
+ size_t rxm_cancel; /* entries cancelled at teardown */
+ size_t rxm_tx_dead; /* RXM tx into terminal flow */
+ size_t tx_drop; /* frct_tx fail (any cause) */
+ size_t tx_drop_ack; /* bare ACK dropped */
+ size_t tx_drop_sack; /* SACK dropped */
+ size_t tx_drop_ka; /* keepalive dropped */
+ size_t tx_drop_rttp; /* RTT probe/echo dropped */
+ size_t tx_drop_nack; /* pre-DRF NACK dropped */
+ size_t tx_drop_rdv; /* rendez-vous dropped */
+ size_t tx_drop_other; /* anything not matched above */
+ size_t ack_snd; /* ACK packets sent (bare + SACK) */
+ size_t ack_fire; /* delayed-ACK timer fires */
+ size_t ack_supp_seqno; /* fire suppressed: seqno */
+ size_t ack_supp_inact; /* fire suppressed: inact */
+ size_t ack_supp_rate; /* fire suppressed: rate */
+ size_t ack_rcv; /* ACK packets received */
+ size_t ack_rtt; /* ACKs that fed RTT estimator */
+ size_t ack_dup_rcv; /* ACK packet wire dups dropped */
+ size_t dup_rcv; /* duplicates received */
+ size_t out_rcv; /* pkts out of window */
+ size_t rqo_rcv; /* pkts out of rqueue */
+ size_t ooo_rcv; /* OOO arrivals */
+ size_t sack_snd; /* SACK packets sent */
+ size_t sack_rcv; /* SACK packets received */
+ size_t dsack_snd; /* SACK pkts carrying a DSACK */
+ size_t dsack_rcv; /* DSACK blocks parsed */
+ size_t dsack_drop; /* DSACK blocks past MAX_DSACK_LAG */
+ size_t nack_snd; /* pre-DRF NACKs sent */
+ size_t nack_rcv; /* pre-DRF NACKs received */
+ size_t tlp_snd; /* tail loss probes sent */
+ size_t inact_drop; /* inactivity drop (NACK on cd) */
+ size_t drf_rebase; /* DRF-triggered window rebase */
+ size_t rq_released; /* slots cleared by release_rq */
+ size_t rttp_snd; /* RTT probes sent */
+ size_t rttp_rcv; /* RTT probe replies rcvd */
+ size_t rtt_smpl; /* RTT estimator samples */
+ size_t rdv_snd; /* rendez-vous packets sent */
+ size_t rdv_rcv; /* rendez-vous packets rcvd */
+ size_t ka_snd; /* keepalives sent */
+ size_t ka_rcv; /* keepalives received */
+ size_t sdu_snd_frag; /* writes that fragmented */
+ size_t sdu_snd_alloc; /* alloc fail truncated SDU send */
+ size_t sdu_snd_tx; /* tx fail truncated SDU send */
+ size_t frag_snd; /* fragments sent: FIRST/MID/LAST */
+ size_t frag_rcv; /* fragments stashed in rq[] */
+ size_t sdu_reasm; /* SDUs delivered reassembled */
+ size_t sdu_sole; /* SOLE SDUs delivered (n==1) */
+ size_t frag_drop; /* dropped at malformed run */
+ size_t strm_snd_byte; /* bytes sent on stream */
+ size_t strm_rcv_byte; /* bytes copied to ring */
+ size_t strm_dlv_byte; /* bytes delivered to reader */
+ size_t strm_drop; /* stream rcvs dropped */
+ size_t strm_fin_drop; /* stream FIN packets rejected */
+ /* Profiling instrumentation. */
+ size_t rcv_proc_ns; /* time inside FRCTI_RCV (ns) */
+ size_t tw_move_ns; /* time inside tw_move (ns) */
+ size_t drain_calls; /* flow_drain_rx_nb invocations */
+};
+
+#define STAT_BUMP(frcti, field) FETCH_ADD_RELAXED(&(frcti)->stat.field, 1)
+#define STAT_ADD(frcti, field, v) FETCH_ADD_RELAXED(&(frcti)->stat.field, (v))
+#define STAT_LOAD(frcti, field) LOAD_RELAXED(&(frcti)->stat.field)
+#else
+#define STAT_BUMP(frcti, field) ((void) (frcti))
+#define STAT_ADD(frcti, field, v) ((void) (frcti))
+#define STAT_LOAD(frcti, field) ((void) (frcti), (size_t) 0)
+#endif
+
+#define frcti_to_flow(f) (&proc.flows[(f)->fd])
+#define RTTP_RING 8
+#define RTTP_COLD_NS (100 * MILLION) /* cold-probe cadence */
+#define RQ_SLOT(seqno) ((seqno) & (RQ_SIZE - 1))
+
+struct rxm_entry;
+
+enum snd_slot_flags {
+ SND_RTX = 0x01, /* Any retransmit; Karn skips next RTT sample. */
+ SND_FAST_RXM = 0x02, /* Fast-retx one-shot gate per loss event. */
+ SND_TLP = 0x04, /* Tail loss probe; ACK resets rto_mul. */
+};
+
+struct snd_slot {
+ struct rxm_entry * rxm; /* RXM entry, NULL if none. */
+ uint64_t time; /* ts_to_ns of last send (any kind). */
+ uint8_t flags; /* SND_* bits above. */
+};
+
+/* Per-seqno reorder slot (FRTX) and stream-mode byte/FIN metadata. */
+struct rcv_slot {
+ ssize_t idx; /* spb idx; -1 = empty */
+ uint32_t start; /* stream byte start */
+ uint32_t end; /* stream byte end */
+ uint8_t fin; /* stream FIN bit */
+};
+
+struct frct_cr {
+ uint32_t lwe; /* Left window edge */
+ uint32_t rwe; /* Right window edge */
+
+ uint8_t cflags;
+ uint32_t seqno; /* SEQ to send, or last SEQ Ack'd */
+ uint32_t ackno; /* snd: ACK-pkt seqno; rcv: dedup */
+
+ uint64_t act; /* ts_to_ns of last activity */
+ uint64_t inact; /* Inactivity threshold (ns) */
+};
+
+struct frcti {
+ /* IMM: set once in frcti_create; read-only thereafter. */
+ int fd;
+ uint64_t t_mpl; /* MPL (ns) */
+ uint64_t t_a; /* a-timer (ns) */
+ uint64_t t_r; /* r-timer (ns) */
+ uint64_t t_rdv; /* RDV cooldown (ns) */
+ time_t ber; /* cached qs.ber */
+ bool lossy; /* qs.loss != 0 */
+ time_t qs_timeout; /* cached qs.timeout (ms) */
+ size_t frag_mtu; /* max FRCT pkt: PCI + payload */
+ uint16_t sack_n_max; /* SACK blocks that fit MTU */
+ bool stream;
+
+ /* All fields below are protected by lock (rwlock/LOAD_ACQUIRE). */
+ struct {
+ struct frct_cr snd_cr;
+ struct frct_cr rcv_cr;
+
+ /* RTT/RACK estimator */
+ time_t srtt; /* smoothed RTT */
+ time_t mdev; /* mean deviation */
+ time_t min_rtt; /* RACK base, ns */
+ uint64_t t_min_rtt; /* min_rtt last set */
+ time_t rto; /* retransmit TO */
+ time_t rto_min; /* RTO floor (ns) */
+ uint8_t rto_mul; /* RTO backoff bits */
+ uint32_t rtt_lwe; /* RTT-sample fence */
+ uint64_t t_rcv_rtt; /* last RTT feed */
+ uint64_t t_snd_probe; /* last probe sent */
+ uint64_t t_latest_ack; /* RACK.fack snd-ts */
+ uint32_t probe_id_next;
+ struct {
+ uint32_t id;
+ uint64_t ts; /* ts_to_ns send */
+ uint8_t nonce[RTTP_NONCE_LEN]; /* echoed back */
+ } probes[RTTP_RING];
+
+ /* rcv reassembly */
+ size_t max_rcv_sdu; /* max reasm bytes */
+ uint8_t * rcv_ring; /* lazy alloc */
+ size_t rcv_ring_sz; /* power of 2 */
+ uint32_t ring_seq_cap; /* ring/per_pkt */
+
+ uint32_t snd_byte_next;
+ bool snd_fin_sent;
+ uint32_t snd_fin_seqno;
+ uint32_t rcv_byte_next;
+ uint32_t rcv_byte_high; /* contiguous high */
+ uint32_t rcv_byte_fin; /* set when FIN */
+ bool rcv_fin_seen;
+
+ struct rcv_slot rcv_slots[RQ_SIZE];
+ struct snd_slot snd_slots[RQ_SIZE]; /* .rxm is ATOM */
+
+ /* rcv SACK dedup */
+ uint64_t t_snd_sack;
+ uint32_t sack_lwe; /* rcv lwe at SACK */
+ uint16_t sack_n; /* SACK block count */
+
+ /* RFC 2883 D-SACK: pending report (single-slot, latest). */
+ uint32_t dsack_seqno;
+ bool dsack_valid;
+
+ /* RFC 8985 §7.2 RACK reorder-window scaling. */
+ uint8_t reo_wnd_mult; /* REO_WND_MULT_MAX */
+ uint32_t dsack_lwe_snap; /* lwe @ last DSACK */
+ uint64_t t_last_reo_widen; /* once-per-RTT */
+
+ uint32_t dup_thresh; /* RFC 8985 */
+ uint32_t tlp_high_seq; /* §7.3: 0 = none */
+ uint8_t tlp_count; /* §7.3 per-episode */
+ uint64_t t_nack;
+ bool open; /* FC window state */
+ bool in_recovery;
+ uint32_t recovery_high; /* seqno @ entry */
+ uint32_t rack_fired_lwe; /* lwe @ last RACK */
+ struct timespec t_wnd; /* window-closed ts */
+ struct timespec t_last_rdv; /* last RDV sent */
+ struct list_head rxm_list; /* live rxm entries */
+
+ pthread_rwlock_t lock;
+ };
+
+ /* Read/written via __atomic without holding lock. */
+ uint64_t t_ka_rcv; /* ts_to_ns of last KA rx */
+ uint8_t ack_pending; /* delayed-ACK dedup */
+ uint8_t tlp_pending; /* TLP arm dedup (lazy) */
+
+ /* Timer entries; ownership belongs to the tw module. */
+ struct tw_entry ack_tw; /* delayed-ACK timer */
+ struct tw_entry ka_tw; /* keepalive timer */
+ struct tw_entry tlp_tw; /* tail-loss probe timer */
+
+#ifdef PROC_FLOW_STATS
+ /* STAT: lock-free relaxed atomic counters. */
+ struct frcti_stat stat;
+#endif
+};
+
+#ifdef PROC_FLOW_STATS
+
+__attribute__((cold))
static int frct_rib_read(const char * path,
char * buf,
size_t len)
{
+ struct frcti * frcti;
struct timespec now;
+ uint64_t now_ns;
char * entry;
- struct flow * flow;
- struct frcti * frcti;
int fd;
-
- (void) len;
+ int written;
+ /* Snapshot under the locks; format outside (pure userspace). */
+ struct {
+ uint64_t t_mpl;
+ uint64_t t_a;
+ uint64_t t_r;
+ time_t srtt;
+ time_t mdev;
+ time_t rto;
+ time_t min_rtt;
+ struct frct_cr snd_cr;
+ struct frct_cr rcv_cr;
+ size_t rx_q_now;
+ size_t tx_q_now;
+ struct frcti_stat stat;
+ } s;
entry = strstr(path, RIB_SEPARATOR);
assert(entry);
@@ -118,23 +437,50 @@ static int frct_rib_read(const char * path,
fd = atoi(path);
- flow = &proc.flows[fd];
-
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (fd < 0 || fd >= PROC_MAX_FLOWS)
+ return 0;
pthread_rwlock_rdlock(&proc.lock);
- frcti = flow->frcti;
+ frcti = proc.flows[fd].frcti;
+ if (frcti == NULL) {
+ pthread_rwlock_unlock(&proc.lock);
+ return 0;
+ }
+
+ s.t_mpl = frcti->t_mpl;
+ s.t_a = frcti->t_a;
+ s.t_r = frcti->t_r;
+
+ s.rx_q_now = proc.flows[fd].rx_rb != NULL
+ ? ssm_rbuff_queued(proc.flows[fd].rx_rb) : 0;
+ s.tx_q_now = proc.flows[fd].tx_rb != NULL
+ ? ssm_rbuff_queued(proc.flows[fd].tx_rb) : 0;
pthread_rwlock_rdlock(&frcti->lock);
- sprintf(buf,
- "Maximum packet lifetime (ns): %20ld\n"
- "Max time to Ack (ns): %20ld\n"
- "Max time to Retransmit (ns): %20ld\n"
+ s.srtt = frcti->srtt;
+ s.mdev = frcti->mdev;
+ s.rto = frcti->rto;
+ s.min_rtt = frcti->min_rtt;
+ s.snd_cr = frcti->snd_cr;
+ s.rcv_cr = frcti->rcv_cr;
+ s.stat = frcti->stat;
+
+ pthread_rwlock_unlock(&frcti->lock);
+ pthread_rwlock_unlock(&proc.lock);
+
+ written = snprintf(buf, len,
+ "Maximum packet lifetime (ns): %20" PRIu64 "\n"
+ "Max time to Ack (ns): %20" PRIu64 "\n"
+ "Max time to Retransmit (ns): %20" PRIu64 "\n"
"Smoothed rtt (ns): %20ld\n"
"RTT standard deviation (ns): %20ld\n"
"Retransmit timeout RTO (ns): %20ld\n"
+ "Minimum RTT (RACK base, ns): %20ld\n"
"Sender left window edge: %20u\n"
"Sender right window edge: %20u\n"
"Sender inactive (ns): %20lld\n"
@@ -143,44 +489,132 @@ static int frct_rib_read(const char * path,
"Receiver right window edge: %20u\n"
"Receiver inactive (ns): %20lld\n"
"Receiver last ack: %20u\n"
- "Number of pkt retransmissions: %20zu\n"
- "Number of rtt probes: %20zu\n"
- "Number of rtt estimates: %20zu\n"
- "Number of duplicates received: %20zu\n"
- "Number of delayed acks received: %20zu\n"
- "Number of rendez-vous sent: %20zu\n"
- "Number of packets out of window: %20zu\n"
- "Number of packets out of rqueue: %20zu\n",
- frcti->mpl,
- frcti->a,
- frcti->r,
- frcti->srtt,
- frcti->mdev,
- frcti->rto,
- frcti->snd_cr.lwe,
- frcti->snd_cr.rwe,
- ts_diff_ns(&now, &frcti->snd_cr.act),
- frcti->snd_cr.seqno,
- frcti->rcv_cr.lwe,
- frcti->rcv_cr.rwe,
- ts_diff_ns(&now, &frcti->rcv_cr.act),
- frcti->rcv_cr.seqno,
- frcti->n_rtx,
- frcti->n_prb,
- frcti->n_rtt,
- frcti->n_dup,
- frcti->n_dak,
- frcti->n_rdv,
- frcti->n_out,
- frcti->n_rqo);
-
- pthread_rwlock_unlock(&flow->frcti->lock);
+ "RXM (RTO-driven) sent: %20zu\n"
+ "RXM packets received: %20zu\n"
+ " duplicates received: %20zu\n"
+ "RXM (SACK mechanism) sent: %20zu\n"
+ "RXM (RACK-driven) sent: %20zu\n"
+ "RXM (DupThresh-driven) sent: %20zu\n"
+ "RXM (NACK-driven) sent: %20zu\n"
+ "ACK packets sent: %20zu\n"
+ "Delayed-ACK timer fires: %20zu\n"
+ " suppressed (seqno): %20zu\n"
+ " suppressed (inact): %20zu\n"
+ " suppressed (rate): %20zu\n"
+ "ACK packets received: %20zu\n"
+ " fed RTT estimator: %20zu\n"
+ " wire dups dropped: %20zu\n"
+ "Duplicates received: %20zu\n"
+ "Out-of-window pkts received: %20zu\n"
+ "Out-of-rqueue pkts received: %20zu\n"
+ "OOO arrivals: %20zu\n"
+ "SACKs sent: %20zu\n"
+ "SACKs received: %20zu\n"
+ "D-SACKs sent: %20zu\n"
+ "D-SACKs received: %20zu\n"
+ "D-SACK out-of-range dropped: %20zu\n"
+ "Pre-DRF NACKs sent: %20zu\n"
+ "Pre-DRF NACKs received: %20zu\n"
+ "Tail loss probes sent: %20zu\n"
+ "Inactivity drops (silent): %20zu\n"
+ "DRF window rebases: %20zu\n"
+ "rq slots cleared by release_rq: %20zu\n"
+ "RTT probes sent: %20zu\n"
+ "RTT probe replies received: %20zu\n"
+ "RTT estimator samples: %20zu\n"
+ "Rendez-vous packets sent: %20zu\n"
+ "Rendez-vous packets received: %20zu\n"
+ "Keepalives sent: %20zu\n"
+ "Keepalives received: %20zu\n"
+ "SDU writes fragmented: %20zu\n"
+ " alloc fail mid-SDU: %20zu\n"
+ " tx fail mid-SDU: %20zu\n"
+ "Fragments sent: %20zu\n"
+ "Fragments received: %20zu\n"
+ "SDUs delivered reassembled: %20zu\n"
+ "SDUs delivered (SOLE): %20zu\n"
+ "Fragments dropped (malformed): %20zu\n"
+ "Stream bytes sent: %20zu\n"
+ "Stream bytes received: %20zu\n"
+ "Stream bytes delivered: %20zu\n"
+ "Stream packets dropped: %20zu\n"
+ "Stream FINs dropped: %20zu\n"
+ "FRCTI_RCV time (ns): %20zu\n"
+ "tw_move time (ns): %20zu\n"
+ "drain_rx_nb calls: %20zu\n"
+ "RX rbuff queued: %20zu\n"
+ "TX rbuff queued: %20zu\n"
+ "RXM-due entries: %20zu\n"
+ " bail (acked): %20zu\n"
+ " bail (unowned): %20zu\n"
+ " bail (aged): %20zu\n"
+ " bail (defer): %20zu\n"
+ "RXM-arm malloc failures: %20zu\n"
+ "RXM cancels (teardown): %20zu\n"
+ "RXM tx into dead flow: %20zu\n"
+ "Tx ring drops (any cause): %20zu\n"
+ " ack: %20zu\n"
+ " sack: %20zu\n"
+ " ka: %20zu\n"
+ " rttp: %20zu\n"
+ " nack: %20zu\n"
+ " rdv: %20zu\n"
+ " other: %20zu\n",
+ /* Check getattr size below when adding stats. */
+ s.t_mpl, s.t_a, s.t_r,
+ s.srtt, s.mdev, s.rto, s.min_rtt,
+ s.snd_cr.lwe, s.snd_cr.rwe,
+ (long long)(now_ns - s.snd_cr.act),
+ s.snd_cr.seqno,
+ s.rcv_cr.lwe, s.rcv_cr.rwe,
+ (long long)(now_ns - s.rcv_cr.act),
+ s.rcv_cr.seqno,
+ s.stat.rxm_rto, s.stat.rxm_rcv, s.stat.rxm_dup_rcv,
+ s.stat.rxm_sack, s.stat.rxm_rack, s.stat.rxm_dupthresh,
+ s.stat.rxm_nack,
+ s.stat.ack_snd, s.stat.ack_fire,
+ s.stat.ack_supp_seqno, s.stat.ack_supp_inact,
+ s.stat.ack_supp_rate,
+ s.stat.ack_rcv, s.stat.ack_rtt, s.stat.ack_dup_rcv,
+ s.stat.dup_rcv, s.stat.out_rcv, s.stat.rqo_rcv,
+ s.stat.ooo_rcv,
+ s.stat.sack_snd, s.stat.sack_rcv,
+ s.stat.dsack_snd, s.stat.dsack_rcv, s.stat.dsack_drop,
+ s.stat.nack_snd, s.stat.nack_rcv, s.stat.tlp_snd,
+ s.stat.inact_drop, s.stat.drf_rebase, s.stat.rq_released,
+ s.stat.rttp_snd, s.stat.rttp_rcv, s.stat.rtt_smpl,
+ s.stat.rdv_snd, s.stat.rdv_rcv,
+ s.stat.ka_snd, s.stat.ka_rcv,
+ s.stat.sdu_snd_frag, s.stat.sdu_snd_alloc, s.stat.sdu_snd_tx,
+ s.stat.frag_snd, s.stat.frag_rcv,
+ s.stat.sdu_reasm, s.stat.sdu_sole, s.stat.frag_drop,
+ s.stat.strm_snd_byte, s.stat.strm_rcv_byte,
+ s.stat.strm_dlv_byte,
+ s.stat.strm_drop, s.stat.strm_fin_drop,
+ s.stat.rcv_proc_ns, s.stat.tw_move_ns,
+ s.stat.drain_calls,
+ s.rx_q_now, s.tx_q_now,
+ s.stat.rxm_due_count,
+ s.stat.rxm_due_acked, s.stat.rxm_due_unowned,
+ s.stat.rxm_due_aged, s.stat.rxm_due_defer,
+ s.stat.rxm_arm_fail,
+ s.stat.rxm_cancel,
+ s.stat.rxm_tx_dead, s.stat.tx_drop,
+ s.stat.tx_drop_ack, s.stat.tx_drop_sack,
+ s.stat.tx_drop_ka, s.stat.tx_drop_rttp,
+ s.stat.tx_drop_nack, s.stat.tx_drop_rdv,
+ s.stat.tx_drop_other);
+
+ if (written < 0)
+ return 0;
- pthread_rwlock_unlock(&proc.lock);
+ if ((size_t) written >= len)
+ return (int) (len - 1);
- return strlen(buf);
+ return written;
}
+__attribute__((cold))
static int frct_rib_readdir(char *** buf)
{
*buf = malloc(sizeof(**buf));
@@ -199,13 +633,14 @@ static int frct_rib_readdir(char *** buf)
return -ENOMEM;
}
+__attribute__((cold))
static int frct_rib_getattr(const char * path,
struct rib_attr * attr)
{
(void) path;
- (void) attr;
- attr->size = 1189;
+ /* Must be >= the snprintf output in frct_rib_read. */
+ attr->size = 8192;
attr->mtime = 0;
return 0;
@@ -220,128 +655,1172 @@ static struct rib_ops r_ops = {
#endif /* PROC_FLOW_STATS */
-static bool before(uint32_t seq1,
- uint32_t seq2)
+/*
+ * Wrap-safe sequence compare: true when the 32-bit difference s1 - s2,
+ * reinterpreted as int32_t, is negative (s1 comes before s2).
+ */
+static __inline__ bool before(uint32_t s1, uint32_t s2)
{
- return (int32_t)(seq1 - seq2) < 0;
+ return (int32_t)(s1 - s2) < 0;
}
-static bool after(uint32_t seq1,
- uint32_t seq2)
+/*
+ * Wrap-safe sequence compare: true when the 32-bit difference s2 - s1,
+ * reinterpreted as int32_t, is negative (s1 comes after s2).
+ */
+static __inline__ bool after(uint32_t s1, uint32_t s2)
{
- return (int32_t)(seq2 - seq1) < 0;
+ return (int32_t)(s2 - s1) < 0;
}
-static void __send_frct_pkt(int fd,
- uint8_t flags,
- uint32_t ackno,
- uint32_t rwe)
+/* True when seq is strictly after lo and not after hi, i.e. in (lo, hi]. */
+static __inline__ bool within(uint32_t seq, uint32_t lo, uint32_t hi)
{
- struct ssm_pk_buff * spb;
- struct frct_pci * pci;
- ssize_t idx;
- struct flow * f;
+ return after(seq, lo) && !after(seq, hi);
+}
- /* Raw calls needed to bypass frcti. */
-#ifdef RXM_BLOCKING
- idx = ssm_pool_alloc_b(proc.pool, sizeof(*pci), NULL, &spb, NULL);
-#else
- idx = ssm_pool_alloc(proc.pool, sizeof(*pci), NULL, &spb);
-#endif
- if (idx < 0)
+/* True when seq is not before cr->lwe and is before cr->rwe: [lwe, rwe). */
+static __inline__ bool in_window(uint32_t seq, const struct frct_cr * cr)
+{
+        if (before(seq, cr->lwe))
+                return false;
+
+        return before(seq, cr->rwe);
+}
+
+/*
+ * Does a DRF arrival stay within the current receive epoch?
+ * Never when the window is empty (lwe == rwe); otherwise yes when the
+ * packet carries FRCT_RXM or its seqno falls inside [lwe, rwe).
+ */
+static __inline__ bool same_epoch_drf(uint32_t                seq,
+                                      uint16_t                flags,
+                                      const struct frct_cr *  cr)
+{
+        bool window_empty = cr->lwe == cr->rwe;
+
+        if (window_empty)
+                return false;
+
+        if (flags & FRCT_RXM)
+                return true;
+
+        return in_window(seq, cr);
+}
+
+/*
+ * RACK reorder window (RFC 8985 §6.2):
+ *     R = MIN(reo_wnd_mult * RACK.min_RTT / 4, SRTT)
+ * The multiplier (treated as 1 when unset) grows on D-SACK evidence
+ * (§7.2). srtt stands in for min_rtt while no min_rtt sample exists.
+ * The result is first raised to MIN_REORDER_NS and then capped at srtt.
+ * NOTE(review): because the cap is applied last, the result is 0 while
+ * srtt is still 0 - confirm callers never need a window before the
+ * first RTT sample.
+ */
+static __inline__ uint64_t rack_reorder_window(struct frcti * frcti)
+{
+        uint64_t scale;
+        uint64_t rtt;
+        uint64_t wnd;
+
+        scale = frcti->reo_wnd_mult > 0 ? frcti->reo_wnd_mult : 1;
+
+        if (frcti->min_rtt > 0)
+                rtt = (uint64_t) frcti->min_rtt;
+        else
+                rtt = (uint64_t) frcti->srtt;
+
+        wnd = scale * (rtt / 4);
+
+        /* Floor first ... */
+        if (wnd < (uint64_t) MIN_REORDER_NS)
+                wnd = (uint64_t) MIN_REORDER_NS;
+
+        /* ... then the SRTT cap, which has the last word. */
+        if (wnd > (uint64_t) frcti->srtt)
+                wnd = (uint64_t) frcti->srtt;
+
+        return wnd;
+}
+
+/*
+ * Allocate a packet buffer of len bytes from proc.pool via
+ * ssm_pool_alloc_b. Returns 0 and sets *spb on success, or the negative
+ * result of the allocator on failure.
+ */
+static __inline__ int frct_spb_reserve(size_t                len,
+                                       struct ssm_pk_buff ** spb)
+{
+        ssize_t ret;
+
+        ret = ssm_pool_alloc_b(proc.pool, len, NULL, spb, NULL);
+        if (ret < 0)
+                return (int) ret;
+
+        return 0;
+}
+
+/* Return spb to proc.pool, addressed by its pool offset. */
+static __inline__ void frct_spb_release(struct ssm_pk_buff * spb)
+{
+ ssm_pool_remove(proc.pool, ssm_pk_buff_get_off(spb));
+}
+
+/* Same as frct_spb_release, for callers that only hold the pool index. */
+static __inline__ void frct_spb_release_idx(size_t idx)
+{
+ ssm_pool_remove(proc.pool, idx);
+}
+
+/*
+ * Look up the packet buffer whose pool index is stored in the receive
+ * slot that seqno maps to (RQ_SLOT).
+ */
+static __inline__ struct ssm_pk_buff * rq_frag(const struct frcti * frcti,
+                                               uint32_t             seqno)
+{
+        const struct rcv_slot * slot = &frcti->rcv_slots[RQ_SLOT(seqno)];
+
+        return ssm_pool_get(proc.pool, slot->idx);
+}
+
+/* DATA header length: the base PCI, plus the stream PCI in stream mode. */
+static __inline__ size_t frcti_data_hdr_len(const struct frcti * frcti)
+{
+        size_t len = FRCT_PCILEN;
+
+        if (frcti->stream)
+                len += FRCT_PCI_STREAM_LEN;
+
+        return len;
+}
+
+/*
+ * Control header length: always the base PCI. The frcti argument is
+ * unused; the signature mirrors frcti_data_hdr_len.
+ */
+static __inline__ size_t frcti_ctrl_hdr_len(const struct frcti * frcti)
+{
+ (void) frcti;
+
+ return FRCT_PCILEN;
+}
+
+/*
+ * Compute and store the header checksum (CRC-16, network byte order).
+ * The HCS sits at offset 2 inside the PCI. The checksum runs over the
+ * flags field (bytes 0..1) and then over everything from the window
+ * field to the end of the PCI (bytes 4..15), skipping the hcs field
+ * itself. With stream set, the stream PCI that follows is covered too.
+ */
+static void frct_hcs_set(struct frct_pci * pci,
+                         bool              stream)
+{
+        size_t   covered;
+        uint16_t crc = 0;
+
+        covered = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs);
+        covered += stream ? FRCT_PCI_STREAM_LEN : 0;
+
+        crc16_ccitt_false(&crc, pci, sizeof(pci->flags));
+        crc16_ccitt_false(&crc, &pci->window, covered);
+
+        pci->hcs = hton16(crc);
+}
+
+/*
+ * Verify the header checksum of a received PCI.
+ * Returns 0 when the recomputed CRC matches pci->hcs, 1 otherwise.
+ * The stream PCI is included when the flow is in stream mode and the
+ * packet claims FRCT_DATA. The flags are read before they are verified;
+ * a corrupted flags field shows up as a checksum mismatch.
+ * NOTE(review): presumably the caller has already checked that the
+ * packet is long enough for the stream PCI - confirm.
+ */
+static int frct_hcs_check(const struct frct_pci * pci,
+                          const struct frcti *    frcti)
+{
+        size_t   covered;
+        uint16_t crc = 0;
+        bool     has_spci;
+
+        has_spci = frcti->stream && (ntoh16(pci->flags) & FRCT_DATA);
+
+        covered = sizeof(*pci) - sizeof(pci->flags) - sizeof(pci->hcs);
+        if (has_spci)
+                covered += FRCT_PCI_STREAM_LEN;
+
+        crc16_ccitt_false(&crc, pci, sizeof(pci->flags));
+        crc16_ccitt_false(&crc, &pci->window, covered);
+
+        if (crc == ntoh16(pci->hcs))
+                return 0;
+
+        return 1;
+}
+
+/*
+ * Bump tx_drop plus the per-frame-type counter matching `flags`.
+ * Exactly one per-type counter is bumped: the flags are tested in the
+ * order SACK, KA, RTTP, NACK, RDVS, ACK and the first match wins;
+ * anything else is counted as tx_drop_other. SACK is tested before ACK
+ * because frcti_sack_snd sets both flags on a SACK packet.
+ */
+static void frct_tx_drop_bump(struct frcti * frcti,
+ uint16_t flags)
+{
+ STAT_BUMP(frcti, tx_drop);
+
+ if (flags & FRCT_SACK) {
+ STAT_BUMP(frcti, tx_drop_sack);
return;
+ }
- pci = (struct frct_pci *) ssm_pk_buff_head(spb);
- memset(pci, 0, sizeof(*pci));
+ if (flags & FRCT_KA) {
+ STAT_BUMP(frcti, tx_drop_ka);
+ return;
+ }
- *((uint32_t *) pci) = hton32(rwe);
+ if (flags & FRCT_RTTP) {
+ STAT_BUMP(frcti, tx_drop_rttp);
+ return;
+ }
- pci->flags = flags;
- pci->ackno = hton32(ackno);
+ if (flags & FRCT_NACK) {
+ STAT_BUMP(frcti, tx_drop_nack);
+ return;
+ }
- f = &proc.flows[fd];
+ if (flags & FRCT_RDVS) {
+ STAT_BUMP(frcti, tx_drop_rdv);
+ return;
+ }
+
+ if (flags & FRCT_ACK) {
+ STAT_BUMP(frcti, tx_drop_ack);
+ return;
+ }
+
+ STAT_BUMP(frcti, tx_drop_other);
+}
+
+/*
+ * Seal one FRCT packet and queue it on the flow's tx ring.
+ *
+ * A CRC32 trailer is added before encryption: always for SACK packets,
+ * and for DATA packets only when the flow's qs.ber is 0. The packet is
+ * then encrypted and its pool index written to tx_rb. DATA writes pass
+ * no deadline; every other packet type gives up FRCT_TX_TIMEO_NS after
+ * the call so a full ring cannot stall the timer wheel.
+ *
+ * spb is consumed in every case: on success it is owned by the ring,
+ * on failure the drop counters are bumped and it is returned to the
+ * pool.
+ *
+ * Returns 0 on success, -ENOMEM when crc_add or spb_encrypt fails,
+ * otherwise the negative result of ssm_rbuff_write_b.
+ */
+static int frct_tx(struct frcti * frcti, struct ssm_pk_buff * spb)
+{
+ struct flow * f = frcti_to_flow(frcti);
+ const struct frct_pci * pci;
+ const struct timespec * dl = NULL;
+ struct timespec now;
+ struct timespec intv = TIMESPEC_INIT_NS(FRCT_TX_TIMEO_NS);
+ struct timespec deadline;
+ uint16_t flags;
+ ssize_t idx;
+ int ret = -ENOMEM;
+
+ pci = (const struct frct_pci *) ssm_pk_buff_head(spb);
+ flags = ntoh16(pci->flags);
+
+ /* CRC32 covers plaintext body; PCI is in HCS. Pre-encrypt. */
+ if (flags & FRCT_SACK) {
+ if (crc_add(spb, frcti_ctrl_hdr_len(frcti)) != 0)
+ goto fail;
+ } else if ((flags & FRCT_DATA) && f->info.qs.ber == 0) {
+ if (crc_add(spb, frcti_data_hdr_len(frcti)) != 0)
+ goto fail;
+ }
if (spb_encrypt(f, spb) < 0)
goto fail;
-#ifdef RXM_BLOCKING
- if (ssm_rbuff_write_b(f->tx_rb, idx, NULL))
-#else
- if (ssm_rbuff_write(f->tx_rb, idx))
-#endif
+ idx = ssm_pk_buff_get_off(spb);
+
+ /* DATA blocks; control times out so a full ring can't stall wheel. */
+ if (!(flags & FRCT_DATA)) {
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ ts_add(&now, &intv, &deadline);
+ dl = &deadline;
+ }
+
+ ret = ssm_rbuff_write_b(f->tx_rb, idx, dl);
+ if (ret < 0)
goto fail;
ssm_flow_set_notify(f->set, f->info.id, FLOW_PKT);
- return;
+ return 0;
fail:
- ipcp_spb_release(spb);
- return;
+ frct_tx_drop_bump(frcti, flags);
+ /* Same pool release as everywhere else in this file. */
+ frct_spb_release(spb);
+ return ret;
+}
+
+/*
+ * Flag the flow as down: set ACL_FLOWDOWN on whichever of the rx and tx
+ * ring buffers exist.
+ */
+__attribute__((cold))
+static void frct_mark_flow_down(struct frcti * frcti)
+{
+        struct flow * flow;
+
+        flow = frcti_to_flow(frcti);
+
+        if (flow->rx_rb != NULL)
+                ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWDOWN);
+
+        if (flow->tx_rb != NULL)
+                ssm_rbuff_set_acl(flow->tx_rb, ACL_FLOWDOWN);
+}
+
+/*
+ * Flag the peer as gone: set ACL_FLOWPEER on the rx ring buffer (if
+ * any) and, when a process-wide fqset exists, raise FLOW_PEER on it for
+ * this flow id.
+ */
+__attribute__((cold))
+static void frct_mark_peer_dead(struct frcti * frcti)
+{
+        struct flow * flow;
+
+        flow = frcti_to_flow(frcti);
+
+        if (flow->rx_rb != NULL)
+                ssm_rbuff_set_acl(flow->rx_rb, ACL_FLOWPEER);
+
+        if (proc.fqset == NULL)
+                return;
+
+        ssm_flow_set_notify(proc.fqset, flow->info.id, FLOW_PEER);
+}
+
+/*
+ * Reserve a buffer for a control packet with payload_len bytes after
+ * the PCI. On success *spb is the buffer, *pci points at its head and
+ * the first FRCT_PCILEN bytes are zeroed (the payload is not).
+ * Returns 0 on success, -1 when the reservation fails.
+ */
+static __inline__ int frct_ctrl_alloc(struct ssm_pk_buff ** spb,
+                                      struct frct_pci **    pci,
+                                      size_t                payload_len)
+{
+        struct frct_pci * hdr;
+
+        if (frct_spb_reserve(FRCT_PCILEN + payload_len, spb) < 0)
+                return -1;
+
+        hdr = (struct frct_pci *) ssm_pk_buff_head(*spb);
+        memset(hdr, 0, FRCT_PCILEN);
+        *pci = hdr;
+
+        return 0;
+}
+
+/*
+ * Right window edge to advertise to the peer. Outside stream mode this
+ * is rcv_cr.rwe as is. In stream mode it is limited to
+ * rcv_cr.lwe + ring_seq_cap, so that the byte-equivalent fits the rx
+ * ring. Caller holds at least the rdlock.
+ */
+static __inline__ uint32_t frcti_advert_rwe(struct frcti * frcti)
+{
+        uint32_t edge = frcti->rcv_cr.rwe;
+        uint32_t limit;
+
+        if (frcti->stream) {
+                limit = frcti->rcv_cr.lwe + frcti->ring_seq_cap;
+                if (before(limit, edge))
+                        edge = limit;
+        }
+
+        return edge;
+}
+
+/*
+ * Build and send a payload-less control packet carrying flags, ackno
+ * and the advertised window rwe. When FRCT_ACK is set, the packet is
+ * also stamped with a sequence number taken from the snd_cr.ackno
+ * counter (post-increment value). Allocation and transmit failures are
+ * not reported; frct_tx counts the drop.
+ */
+static void frcti_pkt_snd(struct frcti * frcti,
+                          uint16_t       flags,
+                          uint32_t       ackno,
+                          uint32_t       rwe)
+{
+        struct ssm_pk_buff * buf;
+        struct frct_pci *    hdr;
+        uint32_t             prev;
+
+        if (frct_ctrl_alloc(&buf, &hdr, 0) < 0)
+                return;
+
+        hdr->flags  = hton16(flags);
+        hdr->window = hton32(rwe);
+        hdr->ackno  = hton32(ackno);
+
+        if (flags & FRCT_ACK) {
+                /* Delayed ACKs carry their own running sequence number. */
+                prev = FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1);
+                hdr->seqno = hton32(prev + 1);
+        }
+
+        frct_hcs_set(hdr, false);
+
+        frct_tx(frcti, buf);
+}
+
+/*
+ * Initialise the RTT estimator and reset the RTO backoff.
+ * Without a hint (rtt_hint <= 0) the estimator waits for the first ACK:
+ * srtt and min_rtt are 0, mdev is RTT_BOOT_NS and rto is the larger of
+ * INITIAL_RTO and rto_min.
+ * With a hint, the hint (raised to at least RTT_BOOT_NS) seeds srtt and
+ * min_rtt, mdev is an eighth of it, and rto is the largest of rto_min,
+ * 2 * srtt and hint + (mdev << MDEV_MUL).
+ */
+static void rtt_init(struct frcti * frcti,
+                     time_t         rtt_hint)
+{
+        time_t lower;
+
+        if (rtt_hint <= 0) {
+                /* Boot from first ACK. */
+                frcti->srtt    = 0;
+                frcti->mdev    = RTT_BOOT_NS;
+                frcti->rto     = MAX((time_t) INITIAL_RTO, frcti->rto_min);
+                frcti->min_rtt = 0;
+        } else {
+                rtt_hint = MAX(rtt_hint, (time_t) RTT_BOOT_NS);
+
+                frcti->srtt = rtt_hint;
+                frcti->mdev = rtt_hint >> 3;
+
+                lower = MAX(frcti->rto_min, 2 * frcti->srtt);
+                frcti->rto = MAX(lower,
+                                 rtt_hint + (frcti->mdev << MDEV_MUL));
+                frcti->min_rtt = rtt_hint;
+        }
+
+        frcti->rto_mul = 0;
+}
+
+/*
+ * RFC 8985 §6.2: should min_rtt be replaced by the sample mrtt?
+ * Yes when no minimum is recorded yet, when the sample is smaller, or
+ * when the recorded minimum is older than MIN_RTT_WIN_NS.
+ */
+static __inline__ bool min_rtt_stale(struct frcti * frcti,
+                                     time_t         mrtt,
+                                     uint64_t       now_ns)
+{
+        return frcti->min_rtt == 0
+                || mrtt < frcti->min_rtt
+                || ts_aged_ns(now_ns, frcti->t_min_rtt, MIN_RTT_WIN_NS);
+}
+
+/*
+ * Windowed-minimum refresh of RACK.min_RTT: when min_rtt_stale says so,
+ * record mrtt as the new minimum together with its timestamp.
+ */
+static __inline__ void min_rtt_update(struct frcti * frcti,
+                                      time_t         mrtt,
+                                      uint64_t       now_ns)
+{
+        if (min_rtt_stale(frcti, mrtt, now_ns)) {
+                frcti->min_rtt   = mrtt;
+                frcti->t_min_rtt = now_ns;
+        }
+}
+
+/*
+ * Feed one RTT measurement (mrtt, ns) into the estimator.
+ * Updates srtt and mdev (both clamped to their floors), refreshes the
+ * windowed min_rtt, recomputes rto as the largest of rto_min, 2 * srtt
+ * and srtt + (mdev << MDEV_MUL), and clears the RTO backoff multiplier.
+ * rto and rto_mul are published with release stores.
+ */
+static void rtt_update(struct frcti * frcti,
+ time_t mrtt,
+ uint64_t now_ns)
+{
+ time_t srtt = frcti->srtt;
+ time_t rttvar = frcti->mdev;
+ time_t floor;
+ time_t rto;
+
+ /* First sample: seed srtt with it and the variance with half of it. */
+ if (srtt == 0) {
+ srtt = mrtt;
+ rttvar = mrtt >> 1;
+ } else {
+ /* RFC 6298 symmetric EWMA. */
+ time_t delta = mrtt - srtt;
+ srtt += (delta >> 3);
+ delta = (ABS(delta) - rttvar) >> 2;
+#ifdef FRCT_LINUX_RTT_ESTIMATOR
+ /* Shrink the variance more slowly than it grows. */
+ if (delta < 0)
+ delta >>= 3;
+#endif
+ rttvar += delta;
+ }
+ STAT_BUMP(frcti, rtt_smpl);
+ frcti->srtt = MAX(SRTT_FLOOR_NS, srtt);
+ frcti->mdev = MAX(MDEV_FLOOR_NS, rttvar);
+
+ min_rtt_update(frcti, mrtt, now_ns);
+
+ floor = MAX(frcti->rto_min, 2 * frcti->srtt);
+ rto = MAX(floor, frcti->srtt + (frcti->mdev << MDEV_MUL));
+
+ STORE_RELEASE(&frcti->rto, rto);
+ STORE_RELEASE(&frcti->rto_mul, 0);
+}
+
+/*
+ * Fill probes[pos], return new probe_id; 0 on entropy failure. Wrlock.
+ * The caller's nonce buffer receives RTTP_NONCE_LEN random bytes, which
+ * are also stored in the probe slot together with the id and now_ns.
+ * t_snd_probe is set to now_ns.
+ */
+static uint32_t rttp_alloc_probe(struct frcti * frcti,
+ uint64_t now_ns,
+ uint8_t nonce[RTTP_NONCE_LEN])
+{
+ uint32_t probe_id;
+ size_t pos;
+
+ if (random_buffer(nonce, RTTP_NONCE_LEN) < 0)
+ return 0;
+
+ /* Id 0 is the failure value, so skip it when the counter yields 0. */
+ probe_id = frcti->probe_id_next++;
+ if (probe_id == 0)
+ probe_id = frcti->probe_id_next++;
+
+ pos = RTTP_POS(probe_id);
+ frcti->probes[pos].id = probe_id;
+ frcti->probes[pos].ts = now_ns;
+ memcpy(frcti->probes[pos].nonce, nonce, RTTP_NONCE_LEN);
+ frcti->t_snd_probe = now_ns;
+
+ STAT_BUMP(frcti, rttp_snd);
+
+ return probe_id;
+}
+
+/*
+ * Decide whether an RTT probe should go out now and, if so, allocate it.
+ * A probe is armed only when an srtt exists, snd_cr.seqno is after
+ * snd_cr.lwe, t_rcv_rtt is at least 2 * srtt old and t_snd_probe is at
+ * least srtt old. Caller holds the wrlock. *probe_id and nonce are
+ * valid only when true is returned; the caller sends the probe after
+ * unlocking.
+ */
+static bool rtt_probe_arm(struct frcti * frcti,
+                          uint64_t       now_ns,
+                          uint32_t *     probe_id,
+                          uint8_t        nonce[RTTP_NONCE_LEN])
+{
+        uint64_t srtt_ns;
+
+        if (frcti->srtt == 0)
+                return false;
+
+        srtt_ns = (uint64_t) frcti->srtt;
+
+        if (!after(frcti->snd_cr.seqno, frcti->snd_cr.lwe)
+            || !ts_aged_ns(now_ns, frcti->t_rcv_rtt, 2u * srtt_ns)
+            || !ts_aged_ns(now_ns, frcti->t_snd_probe, srtt_ns))
+                return false;
+
+        *probe_id = rttp_alloc_probe(frcti, now_ns, nonce);
+
+        return *probe_id != 0;
+}
+
+/*
+ * Send an RTT probe packet: a PCI with only FRCT_RTTP set, followed by
+ * probe_id, echo_id (both in network byte order) and the nonce.
+ * Nothing is sent when the buffer cannot be allocated.
+ */
+static void frcti_rttp_snd(struct frcti *  frcti,
+                           uint32_t        probe_id,
+                           uint32_t        echo_id,
+                           const uint8_t * nonce)
+{
+        struct ssm_pk_buff * buf;
+        struct frct_pci *    hdr;
+        struct frct_rttp *   body;
+
+        if (frct_ctrl_alloc(&buf, &hdr, RTTP_PAYLOAD) < 0)
+                return;
+
+        hdr->flags = hton16(FRCT_RTTP);
+        frct_hcs_set(hdr, false);
+
+        body = (struct frct_rttp *) FRCT_BODY(hdr);
+        body->probe_id = hton32(probe_id);
+        body->echo_id  = hton32(echo_id);
+        memcpy(body->nonce, nonce, sizeof(body->nonce));
+
+        frct_tx(frcti, buf);
+}
+
+/*
+ * One pending retransmission: a private copy of a sent packet plus the
+ * timer that fires rxm_due for it.
+ */
+struct rxm_entry {
+ struct tw_entry tw; /* timer handle (tw_post/tw_cancel) */
+ struct list_head next; /* in frcti->rxm_list */
+ struct frcti * frcti; /* owning flow instance */
+ uint32_t seqno; /* sequence number of the packet */
+ uint64_t t0; /* creation time (ns) */
+ size_t len; /* bytes valid in pkt[] */
+ uint8_t pkt[]; /* flexible — sized at alloc time */
+};
+
+/*
+ * Allocate a retransmission entry holding a copy of the whole packet in
+ * spb, stamped with the current PTHREAD_COND_CLOCK time. The entry is
+ * neither linked into rxm_list nor scheduled. Returns NULL (and bumps
+ * rxm_arm_fail) when malloc fails.
+ */
+static struct rxm_entry * rxm_entry_create(struct frcti *             frcti,
+                                           uint32_t                   seqno,
+                                           const struct ssm_pk_buff * spb)
+{
+        size_t             pkt_len = ssm_pk_buff_len(spb);
+        struct rxm_entry * e;
+        struct timespec    ts;
+
+        e = malloc(sizeof(*e) + pkt_len);
+        if (e == NULL) {
+                STAT_BUMP(frcti, rxm_arm_fail);
+                return NULL;
+        }
+
+        e->frcti = frcti;
+        e->seqno = seqno;
+        e->len   = pkt_len;
+        memcpy(e->pkt, ssm_pk_buff_head(spb), pkt_len);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &ts);
+        e->t0 = TS_TO_UINT64(ts);
+
+        tw_init_entry(&e->tw);
+
+        return e;
+}
+
+/* Free an entry. It does not unlink the entry or cancel its timer. */
+static void rxm_entry_destroy(struct rxm_entry * r)
+{
+ free(r);
+}
+
+/*
+ * True when send slot pos still points at entry r (acquire load), i.e.
+ * nobody has cleared or replaced the slot's retransmission entry.
+ */
+static bool rxm_still_owned(struct frcti * frcti,
+ size_t pos,
+ struct rxm_entry * r)
+{
+ return LOAD_ACQUIRE(&frcti->snd_slots[pos].rxm) == r;
+}
+
+/*
+ * Next retransmission deadline: now_ns + (rto << rto_mul).
+ * All in-flight slots share the HoL backoff; otherwise non-HoL timers
+ * cycle at base RTO and storm the wire while HoL is still backing off.
+ */
+static uint64_t rxm_next_deadline(struct frcti * frcti,
+                                  uint64_t       now_ns)
+{
+        time_t   rto     = LOAD_RELAXED(&frcti->rto);
+        uint8_t  shift   = LOAD_RELAXED(&frcti->rto_mul);
+        uint64_t backoff = (uint64_t) rto << shift;
+
+        return now_ns + backoff;
+}
+
+/*
+ * Build the packet that goes on the wire for a retransmission: a fresh
+ * pool buffer holding a copy of pkt with FRCT_RXM added to the flags,
+ * ackno refreshed to rcv_lwe and the header checksum recomputed.
+ * Returns NULL when no buffer can be reserved.
+ */
+static struct ssm_pk_buff * rxm_pkt_prepare(const void * pkt,
+                                            size_t       len,
+                                            uint32_t     rcv_lwe,
+                                            bool         stream)
+{
+        struct ssm_pk_buff * copy;
+        struct frct_pci *    hdr;
+        uint16_t             fl;
+
+        if (frct_spb_reserve(len, &copy) < 0)
+                return NULL;
+
+        hdr = (struct frct_pci *) ssm_pk_buff_head(copy);
+        memcpy(hdr, pkt, len);
+
+        fl = ntoh16(hdr->flags) | FRCT_RXM;
+
+        hdr->flags = hton16(fl);
+        hdr->ackno = hton32(rcv_lwe);
+
+        frct_hcs_set(hdr, stream);
+
+        return copy;
+}
+
+/*
+ * RTO-driven retransmission of seqno. Caller must NOT hold frcti->lock.
+ * Under the wrlock: restamp the send slot, mark it SND_RTX (clearing
+ * SND_FAST_RXM and SND_TLP), end the TLP episode, set rtt_lwe to
+ * seqno + 1, and - only when seqno is the left window edge - bump the
+ * RTO backoff (up to MAX_RTO_MUL) and reset reo_wnd_mult to 1.
+ * After unlocking, a copy of pkt is prepared and handed to frct_tx.
+ * snd_lwe and rcv_lwe are sampled with relaxed loads before the lock is
+ * taken.
+ */
+static void rxm_snd(struct frcti * frcti,
+ uint32_t seqno,
+ const void * pkt,
+ size_t len)
+{
+ struct ssm_pk_buff * spb;
+ struct timespec now;
+ struct snd_slot * slot;
+ uint32_t snd_lwe;
+ uint32_t rcv_lwe;
+ size_t pos;
+ int ret;
+
+ snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe);
+ rcv_lwe = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ pos = RQ_SLOT(seqno);
+ slot = &frcti->snd_slots[pos];
+
+ slot->time = TS_TO_UINT64(now);
+ /* RTO supersedes any pending TLP/fast-rxm on this slot. */
+ slot->flags = (slot->flags & ~(SND_FAST_RXM | SND_TLP)) | SND_RTX;
+ /* §7.3: RTO supersedes TLP probes and ends the probe episode. */
+ frcti->tlp_high_seq = 0;
+ frcti->tlp_count = 0;
+
+ frcti->rtt_lwe = seqno + 1;
+
+ /* Only the HoL retransmit bumps the global RTO backoff. */
+ if (seqno == snd_lwe && frcti->rto_mul < MAX_RTO_MUL)
+ STORE_RELEASE(&frcti->rto_mul, frcti->rto_mul + 1);
+
+ /* RFC 8985 §7.2 step 4: RTO on HoL resets RACK reo scaling. */
+ if (seqno == snd_lwe)
+ frcti->reo_wnd_mult = 1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ STAT_BUMP(frcti, rxm_rto);
+
+ /* The counter above is bumped even when no buffer is available. */
+ spb = rxm_pkt_prepare(pkt, len, rcv_lwe, frcti->stream);
+ if (spb == NULL)
+ return;
+
+ /* ETIMEDOUT/ENOMEM: let r-timer drive teardown. */
+ ret = frct_tx(frcti, spb);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
}
-static void send_frct_pkt(struct frcti * frcti)
+/*
+ * Timer callback for one retransmission entry (arg is the rxm_entry).
+ * The entry is unlinked and freed when its seqno is already below the
+ * send lwe, when the send slot no longer points at it, or when it has
+ * aged past t_r (the flow is then also marked down). An entry that is
+ * not at the left window edge is re-posted one base rto from now
+ * without sending. Otherwise the packet is retransmitted via rxm_snd
+ * and the timer re-posted from the slot's timestamp, unless ownership
+ * was lost in the meantime.
+ */
+static void rxm_due(void * arg)
+{
+ struct rxm_entry * r = arg;
+ struct frcti * frcti = r->frcti;
+ struct timespec now;
+ uint64_t now_ns;
+ uint32_t snd_lwe;
+ size_t pos = RQ_SLOT(r->seqno);
+
+ STAT_BUMP(frcti, rxm_due_count);
+
+ snd_lwe = LOAD_RELAXED(&frcti->snd_cr.lwe);
+
+ /* Already ACK'd: expected for the steady-state majority. */
+ if (before(r->seqno, snd_lwe)) {
+ STAT_BUMP(frcti, rxm_due_acked);
+ goto cleanup;
+ }
+
+ /* SACK/RACK-cleared the slot (caller NULL'd snd_slots[pos].rxm). */
+ if (!rxm_still_owned(frcti, pos, r)) {
+ STAT_BUMP(frcti, rxm_due_unowned);
+ goto cleanup;
+ }
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ /* R-timer expired: peer unreachable. */
+ if (RXM_AGED_OUT(r->t0, now_ns, frcti->t_r)) {
+ STAT_BUMP(frcti, rxm_due_aged);
+ frct_mark_flow_down(frcti);
+ goto cleanup;
+ }
+
+ /* HoL-only retx; defer at base rto so HoL transitions react. */
+ if (r->seqno != snd_lwe) {
+ STAT_BUMP(frcti, rxm_due_defer);
+ tw_post(&r->tw, now_ns + LOAD_RELAXED(&frcti->rto),
+ rxm_due, r);
+ return;
+ }
+
+ rxm_snd(frcti, r->seqno, r->pkt, r->len);
+
+ /* Re-check ownership: fire path may have replaced our entry. */
+ if (rxm_still_owned(frcti, pos, r)) {
+ uint64_t anchor;
+
+ /* Per-slot anchor breaks co-fire re-bin. */
+ /* NOTE(review): slot time is read here without frcti->lock. */
+ anchor = frcti->snd_slots[pos].time;
+ tw_post(&r->tw, rxm_next_deadline(frcti, anchor), rxm_due, r);
+ return;
+ }
+
+ cleanup:
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ /* Clear the slot only if it still refers to this entry. */
+ if (rxm_still_owned(frcti, pos, r))
+ STORE_RELEASE(&frcti->snd_slots[pos].rxm, NULL);
+
+ list_del(&r->next);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ rxm_entry_destroy(r);
+}
+
+/*
+ * Arm the retransmission timer for a packet that was just sent.
+ * Copies the packet into a new entry, links it into rxm_list and
+ * publishes it in the send slot for seqno (both under the wrlock), then
+ * posts the timer for t0 + (rto << rto_mul). The lock is taken here, so
+ * the caller must not hold it.
+ * Returns 0 on success, -ENOMEM when the entry cannot be allocated.
+ */
+static int rxm_arm(struct frcti * frcti,
+ uint32_t seqno,
+ const struct ssm_pk_buff * spb)
+{
+ struct rxm_entry * r;
+ time_t rto;
+ uint8_t rto_mul;
+ uint64_t deadline;
+
+ r = rxm_entry_create(frcti, seqno, spb);
+ if (r == NULL)
+ return -ENOMEM;
+
+ /* Deadline is anchored at the entry's creation time, with backoff. */
+ rto = LOAD_RELAXED(&frcti->rto);
+ rto_mul = LOAD_RELAXED(&frcti->rto_mul);
+ deadline = r->t0 + ((uint64_t) rto << rto_mul);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ list_add_tail(&r->next, &frcti->rxm_list);
+ STORE_RELEASE(&frcti->snd_slots[RQ_SLOT(seqno)].rxm, r);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ /* The timer is posted only after the entry is linked and published. */
+ tw_post(&r->tw, deadline, rxm_due, r);
+
+ return 0;
+}
+
+/*
+ * Drop every pending retransmission: unlink each entry from rxm_list,
+ * cancel its timer, free it and bump rxm_cancel.
+ * NOTE(review): this takes no lock and leaves snd_slots[].rxm pointing
+ * at the freed entries - presumably it only runs at teardown, when
+ * nothing else can reach frcti; confirm against the caller.
+ */
+static void rxm_cancel_all(struct frcti * frcti)
+{
+ struct list_head * p;
+ struct list_head * t;
+
+ list_for_each_safe(p, t, &frcti->rxm_list) {
+ struct rxm_entry * r = list_entry(p, struct rxm_entry, next);
+ list_del(&r->next);
+ tw_cancel(&r->tw);
+ rxm_entry_destroy(r);
+ STAT_BUMP(frcti, rxm_cancel);
+ }
+}
+
+/*
+ * Write SACK block i (start s, end e) into a SACK payload, both values
+ * in network byte order. Block i lives at
+ * payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE.
+ * The bytes are stored with memcpy: payload is a byte buffer with no
+ * guaranteed 32-bit alignment, so it must not be written through a
+ * uint32_t pointer.
+ */
+static __inline__ void sack_block_put(uint8_t * payload,
+                                      uint16_t  i,
+                                      uint32_t  s,
+                                      uint32_t  e)
+{
+        uint8_t * blk = payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE;
+
+        s = hton32(s);
+        e = hton32(e);
+
+        memcpy(blk, &s, sizeof(s));
+        memcpy(blk + sizeof(s), &e, sizeof(e));
+}
+
+/*
+ * Read SACK block i from a SACK payload into *s (start) and *e (end),
+ * converted to host byte order. Block i lives at
+ * payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE.
+ * The bytes are fetched with memcpy: payload is a byte buffer with no
+ * guaranteed 32-bit alignment, so it must not be read through a
+ * uint32_t pointer.
+ */
+static __inline__ void sack_block_get(const uint8_t * payload,
+                                      uint16_t        i,
+                                      uint32_t *      s,
+                                      uint32_t *      e)
+{
+        const uint8_t * blk = payload + SACK_HDR_SIZE + i * SACK_BLOCK_SIZE;
+        uint32_t        start;
+        uint32_t        end;
+
+        memcpy(&start, blk, sizeof(start));
+        memcpy(&end, blk + sizeof(start), sizeof(end));
+
+        *s = ntoh32(start);
+        *e = ntoh32(end);
+}
+
+/*
+ * Build SACK blocks for ranges *above* rcv_cr.lwe. Wire invariant
+ * (see doc/frct.txt §1.3): every block produced here satisfies
+ * blocks[i].start > rcv_cr.lwe = ackno, which makes the "first block
+ * below ackno" convention used to mark a D-SACK (RFC 2883 §4 case 1)
+ * unambiguous. Caller holds frcti->lock.
+ *
+ * Each block is [start, end): start is the first occupied slot of a
+ * run, end the first empty slot after it (or the scan limit). At most
+ * max_n blocks are written; the number written is returned.
+ */
+static uint16_t sack_blocks_build(struct frcti * frcti,
+ uint32_t blocks[][2],
+ uint16_t max_n)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ uint32_t s;
+ uint32_t end;
+ uint16_t n = 0;
+
+ /* Scan (lwe, min(lwe + RQ_SIZE, rwe)). */
+ s = frcti->rcv_cr.lwe + 1;
+ end = frcti->rcv_cr.lwe + RQ_SIZE;
+ if (after(end, frcti->rcv_cr.rwe))
+ end = frcti->rcv_cr.rwe;
+
+ while (before(s, end) && n < max_n) {
+ /* Skip the hole: slots with idx == -1 hold no packet. */
+ while (before(s, end) && slots[RQ_SLOT(s)].idx == -1)
+ ++s;
+
+ if (!before(s, end))
+ break;
+
+ /* Record the run of occupied slots that starts here. */
+ blocks[n][0] = s;
+ while (before(s, end) && slots[RQ_SLOT(s)].idx != -1)
+ ++s;
+ blocks[n][1] = s;
+ ++n;
+ }
+
+ return n;
+}
+
+/*
+ * Emit the pending D-SACK report, if there is one, as blocks[0]: the
+ * single-seqno range [dsack_seqno, dsack_seqno + 1). The pending flag
+ * is cleared once the report is written. Nothing is written (and the
+ * flag is left alone) when no report is pending or sack_n_max is 0.
+ * Returns the number of head slots used (0 or 1). Caller holds wrlock.
+ */
+static __inline__ uint16_t dsack_consume(struct frcti * frcti,
+                                         uint32_t       blocks[][2])
+{
+        bool pending = frcti->dsack_valid && frcti->sack_n_max != 0;
+
+        if (pending) {
+                blocks[0][0] = frcti->dsack_seqno;
+                blocks[0][1] = frcti->dsack_seqno + 1;
+                frcti->dsack_valid = false;
+        }
+
+        return pending ? 1 : 0;
+}
+
+/*
+ * Send an ACK carrying the SACK blocks in sa. Caller must NOT hold
+ * frcti->lock.
+ * The packet has FRCT_ACK | FRCT_FC | FRCT_SACK set, window sa->rwe,
+ * ackno sa->ack and a sequence number taken from the snd_cr.ackno
+ * counter. The payload is a zeroed SACK header whose first 16 bits hold
+ * the block count in network byte order, followed by sa->n blocks.
+ * Nothing is sent when the buffer cannot be allocated.
+ */
+static void frcti_sack_snd(struct frcti *           frcti,
+                           const struct sack_args * sa)
+{
+        struct ssm_pk_buff * spb;
+        struct frct_pci *    pci;
+        buffer_t             buf;
+        uint16_t             n_be;
+        uint16_t             i;
+
+        assert(sa->n <= SACK_MAX_BLOCKS);
+
+        buf.len = SACK_HDR_SIZE + sa->n * SACK_BLOCK_SIZE;
+
+        if (frct_ctrl_alloc(&spb, &pci, buf.len) < 0)
+                return;
+
+        pci->flags  = hton16(FRCT_ACK | FRCT_FC | FRCT_SACK);
+        pci->window = hton32(sa->rwe);
+        pci->ackno  = hton32(sa->ack);
+        pci->seqno  = hton32(FETCH_ADD_RELAXED(&frcti->snd_cr.ackno, 1) + 1);
+
+        frct_hcs_set(pci, false);
+
+        buf.data = FRCT_BODY(pci);
+        memset(buf.data, 0, SACK_HDR_SIZE);
+
+        /* Byte-wise store: no type-punned write into the packet body. */
+        n_be = hton16(sa->n);
+        memcpy(buf.data, &n_be, sizeof(n_be));
+
+        for (i = 0; i < sa->n; ++i)
+                sack_block_put(buf.data, i,
+                               sa->blocks[i][0], sa->blocks[i][1]);
+
+        frct_tx(frcti, spb);
+}
+
+static void ack_snd(struct frcti * frcti,
+ bool with_sack)
{
struct timespec now;
+ uint64_t now_ns;
time_t diff;
uint32_t ackno;
uint32_t rwe;
- int fd;
+ struct sack_args * sa = NULL;
+ size_t sa_sz;
+ bool sacking = false;
assert(frcti);
+ STAT_BUMP(frcti, ack_fire);
+
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (with_sack && frcti->sack_n_max > 0) {
+ sa_sz = sizeof(*sa) + frcti->sack_n_max * sizeof(sa->blocks[0]);
+ sa = malloc(sa_sz);
+ /* If alloc fails, fall through and send a bare cum-ACK. */
+ }
pthread_rwlock_wrlock(&frcti->lock);
- if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)) {
+ /* D-SACK rides through cum-ACK freshness; signal is the duplicate. */
+ if (!after(frcti->rcv_cr.lwe, frcti->rcv_cr.seqno)
+ && !frcti->dsack_valid) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_seqno);
+ goto out;
}
- fd = frcti->fd;
ackno = frcti->rcv_cr.lwe;
- rwe = frcti->rcv_cr.rwe;
+ rwe = frcti_advert_rwe(frcti);
- diff = ts_diff_ns(&now, &frcti->rcv_cr.act);
- if (diff > frcti->a) {
+ if (ACK_AGED_OUT(frcti->rcv_cr.act, now_ns, frcti->t_a)) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_inact);
+ goto out;
}
- diff = ts_diff_ns(&now, &frcti->snd_cr.act);
- if (diff < TICTIME) {
+ diff = (time_t) ts_age_ns(now_ns, frcti->snd_cr.act);
+ if (diff < TICTIME && !frcti->dsack_valid) {
pthread_rwlock_unlock(&frcti->lock);
- return;
+ STAT_BUMP(frcti, ack_supp_rate);
+ goto out;
}
+ /* RFC 2018: piggyback SACK on timer ACK; dedup unchanged board. */
+ if (sa == NULL || (frcti->sack_n == 0 && !frcti->dsack_valid))
+ goto no_sack;
+
+ sa->dsack = false;
+ sa->n = dsack_consume(frcti, sa->blocks);
+ if (sa->n == 1)
+ sa->dsack = true;
+
+ sa->n += sack_blocks_build(frcti, sa->blocks + sa->n,
+ frcti->sack_n_max - sa->n);
+ if (sa->n == 0)
+ goto no_sack;
+
+ if (!sa->dsack && ackno == frcti->sack_lwe && sa->n == frcti->sack_n)
+ goto no_sack;
+
+ sa->ack = ackno;
+ sa->rwe = rwe;
+ frcti->sack_lwe = ackno;
+ frcti->sack_n = sa->n;
+ frcti->t_snd_sack = now_ns;
+ sacking = true;
+
+ no_sack:
frcti->rcv_cr.seqno = frcti->rcv_cr.lwe;
pthread_rwlock_unlock(&frcti->lock);
- __send_frct_pkt(fd, FRCT_ACK | FRCT_FC, ackno, rwe);
+ STAT_BUMP(frcti, ack_snd);
+
+ if (sacking) {
+ STAT_BUMP(frcti, sack_snd);
+ if (sa->dsack)
+ STAT_BUMP(frcti, dsack_snd);
+ frcti_sack_snd(frcti, sa);
+ } else {
+ frcti_pkt_snd(frcti, FRCT_ACK | FRCT_FC, ackno, rwe);
+ }
+
+ out:
+ free(sa);
}
-static void __send_rdv(int fd)
+/* Delayed-ACK timer: per-flow, dedup'd via atomic test-and-set. */
+static void ack_due(void * arg)
{
- __send_frct_pkt(fd, FRCT_RDVS, 0, 0);
+ struct frcti * frcti = arg;
+
+ __atomic_clear(&frcti->ack_pending, __ATOMIC_RELAXED);
+
+ ack_snd(frcti, true);
}
-static struct frcti * frcti_create(int fd,
- time_t a,
- time_t r,
- time_t mpl)
+static int ack_arm(struct frcti * frcti)
{
- struct frcti * frcti;
- ssize_t idx;
- struct timespec now;
- pthread_condattr_t cattr;
+ struct timespec now;
+ uint64_t deadline;
+
+ if (__atomic_test_and_set(&frcti->ack_pending, __ATOMIC_RELAXED))
+ return 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ deadline = TS_TO_UINT64(now) + ACK_DELAY_NS;
+
+ tw_post(&frcti->ack_tw, deadline, ack_due, frcti);
+
+ return 0;
+}
+
+/* Forward decl breaks the keepalive cycle: ka_arm <-> ka_due. */
+static void ka_due(void * arg);
+
+/* Post the keepalive timer for the next moment either side is due. */
+static int ka_arm(struct frcti * frcti)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        uint64_t        timeo_ns;
+        uint64_t        snd_due;
+        uint64_t        rcv_due;
+        uint64_t        deadline;
+
+        timeo_ns = (uint64_t) frcti->qs_timeout * MILLION; /* IMM */
+
+        /* Sender side is due after timeo / 4, receiver side after timeo. */
+        snd_due  = LOAD_RELAXED(&frcti->snd_cr.act) + timeo_ns / 4;
+        rcv_due  = LOAD_RELAXED(&frcti->rcv_cr.act) + timeo_ns;
+        deadline = MIN(snd_due, rcv_due);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        /* Already past the earliest deadline: retry timeo / 4 from now. */
+        if (deadline <= now_ns)
+                deadline = now_ns + timeo_ns / 4;
+
+        tw_post(&frcti->ka_tw, deadline, ka_due, frcti);
+
+        return 0;
+}
+
+/*
+ * Keepalive tick. Marks the peer dead when the receive side has been
+ * idle (measured from the later of rcv_cr.act and t_ka_rcv) for more
+ * than qs_timeout. Otherwise sends a KA|ACK packet if the send side has
+ * been idle for more than a quarter of the timeout. The timer is
+ * re-armed on every path except the peer-dead one.
+ */
+__attribute__((cold))
+static void ka_snd(struct frcti * frcti)
+{
+        struct ssm_pk_buff * spb;
+        struct frct_pci *    pci;
+        struct timespec      now;
+        uint64_t             now_ns;
+        int64_t              timeo_ns;
+        uint64_t             rcv_act;
+        uint64_t             ka_rcv;
+        int64_t              rcv_idle;
+        int64_t              snd_idle;
+        uint32_t             ackno;
+
+        assert(frcti);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        /* 64-bit on purpose: ms -> ns overflows a 32-bit time_t. */
+        timeo_ns = (int64_t) frcti->qs_timeout * MILLION; /* IMM */
+        rcv_act  = LOAD_RELAXED(&frcti->rcv_cr.act);
+        ka_rcv   = LOAD_RELAXED(&frcti->t_ka_rcv);
+        rcv_idle = ts_age_ns(now_ns, rcv_act > ka_rcv ? rcv_act : ka_rcv);
+        snd_idle = ts_age_ns(now_ns, LOAD_RELAXED(&frcti->snd_cr.act));
+
+        if (rcv_idle > timeo_ns) {
+                frct_mark_peer_dead(frcti);
+                return;
+        }
+
+        /* Sent something recently enough: no keepalive needed yet. */
+        if (snd_idle <= timeo_ns / 4) {
+                ka_arm(frcti);
+                return;
+        }
+
+        /* No buffer for the keepalive: skip this one, keep the timer. */
+        if (frct_ctrl_alloc(&spb, &pci, 0) < 0) {
+                ka_arm(frcti);
+                return;
+        }
+
+        ackno = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+        pci->flags = hton16(FRCT_KA | FRCT_ACK);
+        pci->ackno = hton32(ackno);
+
+        frct_hcs_set(pci, false);
+
+        STAT_BUMP(frcti, ka_snd);
+        frct_tx(frcti, spb);
+
+        ka_arm(frcti);
+}
+
+/* Keepalive timer callback; ka_snd() re-posts the timer where needed. */
+static void ka_due(void * arg)
+{
+        struct frcti * frcti = arg;
+
+        ka_snd(frcti);
+}
+
+/* Send a rendez-vous (FRCT_RDVS) packet; ackno and window are both 0. */
+static void frcti_rdv_snd(struct frcti * frcti)
+{
+        const uint32_t none = 0;
+
+        frcti_pkt_snd(frcti, FRCT_RDVS, none, none);
+}
+
+#define HAS_RESCNTL(cr) ((cr)->cflags & FRCTFRESCNTL)
+/*
+ * Check whether the send window has room for one more packet. Flows
+ * without resource control are always open. While the window stays
+ * closed, a rendez-vous packet is sent at most once per t_rdv, and none
+ * at all once the window has been closed for longer than MAX_RDV.
+ */
+static bool frcti_is_window_open(struct frcti * frcti)
+{
+        struct frct_cr * snd_cr = &frcti->snd_cr;
+        struct timespec  now;
+        time_t           idle;
+        bool             is_open;
+
+        if (!HAS_RESCNTL(snd_cr))
+                return true;
+
+        /* Fast path: unlocked peek at the right window edge. */
+        if (before(snd_cr->seqno, LOAD_RELAXED(&snd_cr->rwe)))
+                return true;
+
+        /* Window may be closed; wrlock for RDV state mutations. */
+        pthread_rwlock_wrlock(&frcti->lock);
+
+        is_open = before(snd_cr->seqno, snd_cr->rwe);
+        if (!is_open) {
+                clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+                if (frcti->open) {
+                        /* First closed observation: start both clocks. */
+                        frcti->open       = false;
+                        frcti->t_wnd      = now;
+                        frcti->t_last_rdv = now;
+                } else {
+                        idle = ts_diff_ns(&now, &frcti->t_wnd);
+                        if (idle <= MAX_RDV) {
+                                idle = ts_diff_ns(&now, &frcti->t_last_rdv);
+                                if (idle > (time_t) frcti->t_rdv) {
+                                        frcti->t_last_rdv = now;
+                                        frcti_rdv_snd(frcti);
+                                        STAT_BUMP(frcti, rdv_snd);
+                                }
+                        }
+                }
+        }
+
+        pthread_rwlock_unlock(&frcti->lock);
+
+        return is_open;
+}
+
+/*
+ * True when n consecutive seqnos fit in the send window. Flows without
+ * resource control always pass. n <= 1 defers to frcti_is_window_open();
+ * larger n is a plain window check that never sends an RDV.
+ */
+static bool frcti_is_window_open_n(struct frcti * frcti,
+                                   size_t         n)
+{
+        struct frct_cr * snd_cr = &frcti->snd_cr;
+        uint32_t         last;
+
+        if (!HAS_RESCNTL(snd_cr))
+                return true;
+
+        if (n <= 1)
+                return frcti_is_window_open(frcti);
+
+        /* Seqno the last of the n packets would take. */
+        last = snd_cr->seqno + (uint32_t)(n - 1);
+
+        return before(last, LOAD_RELAXED(&snd_cr->rwe));
+}
+
+/* Clear every occupied receive-queue slot, releasing any spb it holds. */
+static void release_rq(struct frcti * frcti)
+{
+        struct rcv_slot * slot;
+        size_t            i;
+
+        for (i = 0; i < RQ_SIZE; ++i) {
+                slot = &frcti->rcv_slots[i];
+
+                if (slot->idx != -1) {
+                        /* Stream rq entries are sentinels (no spb owned). */
+                        if (!frcti->stream)
+                                frct_spb_release_idx(slot->idx);
+
+                        slot->idx = -1;
+                        STAT_BUMP(frcti, rq_released);
+                }
+        }
+}
+
+/*
+ * Validate a requested stream receive-ring size n: non-zero, at most
+ * FRCT_STREAM_RING_SZ_MAX, a power of two, and at least
+ * FRCT_STREAM_RING_MIN_PKTS times the per-packet payload
+ * (frag_mtu minus the data header length).
+ */
+static __inline__ bool stream_ring_sz_ok(struct frcti * frcti,
+                                         size_t         n)
+{
+        size_t hdr;
+        size_t per_pkt;
+
+        /* 0 slips through the power-of-two test below; reject it here. */
+        if (n == 0 || n > FRCT_STREAM_RING_SZ_MAX)
+                return false;
+
+        if ((n & (n - 1)) != 0)
+                return false;
+
+        /* No payload room: per_pkt would be 0 or wrap around. */
+        hdr = frcti_data_hdr_len(frcti);
+        if (frcti->frag_mtu <= hdr)
+                return false;
+
+        per_pkt = frcti->frag_mtu - hdr;
+
+        return n >= FRCT_STREAM_RING_MIN_PKTS * per_pkt;
+}
+
+/*
+ * Default ring size: start at FRCT_STREAM_RING_SZ and double until the
+ * ring covers RQ_SIZE packets of per_pkt bytes, or until doubling
+ * reaches FRCT_STREAM_RING_SZ_MAX.
+ */
+static size_t default_stream_ring_sz(size_t per_pkt)
+{
+        const size_t need = (size_t) RQ_SIZE * per_pkt;
+        size_t       sz;
+
+        for (sz = FRCT_STREAM_RING_SZ; sz < FRCT_STREAM_RING_SZ_MAX; sz <<= 1) {
+                if (sz >= need)
+                        break;
+        }
+
+        return sz;
+}
+
+struct frcti * frcti_create(int fd,
+ uint64_t a,
+ uint64_t r,
+ uint64_t mpl,
+ time_t rtt_hint,
+ qosspec_t qs,
+ uint32_t mtu)
+{
+ struct frcti * frcti;
+ ssize_t idx;
+ struct timespec now;
+ uint64_t now_ns;
+ size_t bb;
+ size_t per_pkt;
#ifdef PROC_FLOW_STATS
- char frctstr[FRCT_NAME_STRLEN + 1];
+ char frctstr[FRCT_NAME_STRLEN + 1];
#endif
- mpl *= MILLION;
- a *= BILLION;
- r *= BILLION;
+ mpl *= MILLION; /* ms -> ns */
+ a *= MILLION; /* ms -> ns */
+ r *= MILLION; /* ms -> ns */
frcti = malloc(sizeof(*frcti));
if (frcti == NULL)
@@ -349,56 +1828,76 @@ static struct frcti * frcti_create(int fd,
memset(frcti, 0, sizeof(*frcti));
+ list_head_init(&frcti->rxm_list);
+
if (pthread_rwlock_init(&frcti->lock, NULL))
goto fail_lock;
- if (pthread_mutex_init(&frcti->mtx, NULL))
- goto fail_mutex;
-
- if (pthread_condattr_init(&cattr))
- goto fail_cattr;
-#ifndef __APPLE__
- pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK);
-#endif
- if (pthread_cond_init(&frcti->cond, &cattr))
- goto fail_cond;
-
#ifdef PROC_FLOW_STATS
sprintf(frctstr, "%d", fd);
if (rib_reg(frctstr, &r_ops))
goto fail_rib_reg;
#endif
- pthread_condattr_destroy(&cattr);
for (idx = 0; idx < RQ_SIZE; ++idx)
- frcti->rq[idx] = -1;
+ frcti->rcv_slots[idx].idx = -1;
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ frcti->t_mpl = mpl;
+ frcti->t_a = a;
+ frcti->t_r = r;
+ frcti->t_rdv = DELT_RDV;
+ frcti->fd = fd;
+ frcti->ber = (time_t) qs.ber;
+ frcti->lossy = (qs.loss != 0);
+ frcti->qs_timeout = (time_t) qs.timeout;
+
+ frcti->frag_mtu = (size_t) mtu;
+
+ /* Cap blocks per SACK at what fits in the per-flow frag_mtu. */
+ bb = (frcti->frag_mtu - FRCT_PCILEN - SACK_HDR_SIZE)
+ / SACK_BLOCK_SIZE;
+ if (bb > SACK_MAX_BLOCKS)
+ bb = SACK_MAX_BLOCKS;
+ frcti->sack_n_max = (uint16_t) bb;
+
+ frcti->max_rcv_sdu = FRCT_MAX_SDU;
+
+ frcti->stream = (qs.service == SVC_STREAM);
+ if (frcti->stream) {
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
+ frcti->rcv_ring_sz = default_stream_ring_sz(per_pkt);
+ frcti->ring_seq_cap =
+ (uint32_t) (frcti->rcv_ring_sz / per_pkt);
+ }
- frcti->mpl = mpl;
- frcti->a = a;
- frcti->r = r;
- frcti->rdv = DELT_RDV;
- frcti->fd = fd;
-
-
- frcti->rttseq = 0;
- frcti->probe = false;
-
- frcti->srtt = 0; /* Updated on first ACK */
- frcti->mdev = 10 * MILLION; /* Updated on first ACK */
- frcti->rto = BILLION; /* Initial rxm will be after 1 s */
-#ifdef PROC_FLOW_STATS
- frcti->n_rtx = 0;
- frcti->n_prb = 0;
- frcti->n_rtt = 0;
- frcti->n_dup = 0;
- frcti->n_dak = 0;
- frcti->n_rdv = 0;
- frcti->n_out = 0;
- frcti->n_rqo = 0;
-#endif
- if (proc.flows[fd].info.qs.loss == 0) {
+ frcti->rto_min = (time_t) MAX(RTO_MIN, 1ULL << RXMQ_RES);
+ rtt_init(frcti, rtt_hint);
+ frcti->t_min_rtt = now_ns;
+ frcti->probe_id_next = 1;
+ frcti->t_rcv_rtt = now_ns;
+ frcti->t_snd_probe = now_ns;
+ frcti->t_snd_sack = 0;
+ frcti->sack_lwe = 0;
+ frcti->sack_n = 0;
+ frcti->dsack_seqno = 0;
+ frcti->dsack_valid = false;
+ frcti->reo_wnd_mult = 1;
+ frcti->dsack_lwe_snap = 0;
+ frcti->t_last_reo_widen = 0;
+ /* So the first pre-DRF NACK fires without waiting cooldown. */
+ frcti->t_nack = now_ns - BILLION;
+ frcti->in_recovery = false;
+ frcti->recovery_high = 0;
+ frcti->rack_fired_lwe = 0;
+
+ tw_init_entry(&frcti->ack_tw);
+ tw_init_entry(&frcti->ka_tw);
+ tw_init_entry(&frcti->tlp_tw);
+
+ if (!frcti->lossy) {
frcti->snd_cr.cflags |= FRCTFRTX | FRCTFLINGER;
frcti->rcv_cr.cflags |= FRCTFRTX;
}
@@ -406,24 +1905,31 @@ static struct frcti * frcti_create(int fd,
frcti->snd_cr.cflags |= FRCTFRESCNTL;
frcti->snd_cr.rwe = START_WINDOW;
+ if (frcti->lossy)
+ frcti->snd_cr.rwe = RQ_SIZE;
- frcti->snd_cr.inact = (3 * mpl + a + r) / BILLION + 1; /* s */
- frcti->snd_cr.act.tv_sec = now.tv_sec - (frcti->snd_cr.inact + 1);
+ frcti->snd_cr.inact = 3 * mpl + a + r + BILLION; /* ns */
+ frcti->snd_cr.act = now_ns - frcti->snd_cr.inact - BILLION;
- frcti->rcv_cr.inact = (2 * mpl + a + r) / BILLION + 1; /* s */
- frcti->rcv_cr.act.tv_sec = now.tv_sec - (frcti->rcv_cr.inact + 1);
+ frcti->rcv_cr.inact = 2 * mpl + a + r + BILLION; /* ns */
+ frcti->rcv_cr.act = now_ns - frcti->rcv_cr.inact - BILLION;
+
+ frcti->t_ka_rcv = now_ns;
+
+ /* qs_timeout == 0: no KA, silent peer crash goes undetected. */
+ if (frcti->qs_timeout > 0) {
+ if (ka_arm(frcti) < 0)
+ goto fail_ka_arm;
+ }
return frcti;
+ fail_ka_arm:
#ifdef PROC_FLOW_STATS
+ sprintf(frctstr, "%d", fd);
+ rib_unreg(frctstr);
fail_rib_reg:
- pthread_cond_destroy(&frcti->cond);
#endif
- fail_cond:
- pthread_condattr_destroy(&cattr);
- fail_cattr:
- pthread_mutex_destroy(&frcti->mtx);
- fail_mutex:
pthread_rwlock_destroy(&frcti->lock);
fail_lock:
free(frcti);
@@ -431,21 +1937,55 @@ static struct frcti * frcti_create(int fd,
return NULL;
}
-static void frcti_destroy(struct frcti * frcti)
+void frcti_destroy(struct frcti * frcti)
{
#ifdef PROC_FLOW_STATS
char frctstr[FRCT_NAME_STRLEN + 1];
+#endif
+ /* Drop every wheel entry referencing frcti before freeing it. */
+ rxm_cancel_all(frcti);
+ tw_cancel(&frcti->ack_tw);
+ tw_cancel(&frcti->ka_tw);
+ tw_cancel(&frcti->tlp_tw);
+
+#if defined(PROC_FLOW_STATS) && defined(FRCT_DEBUG_STDOUT)
+ printf("[FRCT teardown] pid=%d fd=%d "
+ "sdu_snd=%zu sdu_reasm=%zu sdu_sole=%zu "
+ "frag_snd=%zu frag_rcv=%zu frag_drop=%zu "
+ "rxm_rto=%zu rxm_sack=%zu rxm_dup=%zu "
+ "rxm_due=%zu acked=%zu unowned=%zu aged=%zu defer=%zu "
+ "cancel=%zu arm_fail=%zu inflight=%u "
+ "nack_snd=%zu nack_rcv=%zu inact_drop=%zu "
+ "drf_rebase=%zu rq_released=%zu\n",
+ (int) getpid(), frcti->fd,
+ frcti->stat.sdu_snd_frag, frcti->stat.sdu_reasm,
+ frcti->stat.sdu_sole,
+ frcti->stat.frag_snd, frcti->stat.frag_rcv,
+ frcti->stat.frag_drop,
+ frcti->stat.rxm_rto, frcti->stat.rxm_sack,
+ frcti->stat.rxm_dupthresh,
+ frcti->stat.rxm_due_count, frcti->stat.rxm_due_acked,
+ frcti->stat.rxm_due_unowned, frcti->stat.rxm_due_aged,
+ frcti->stat.rxm_due_defer,
+ frcti->stat.rxm_cancel, frcti->stat.rxm_arm_fail,
+ frcti->snd_cr.seqno - frcti->snd_cr.lwe,
+ frcti->stat.nack_snd, frcti->stat.nack_rcv,
+ frcti->stat.inact_drop,
+ frcti->stat.drf_rebase, frcti->stat.rq_released);
+#endif
+
+ release_rq(frcti);
+ free(frcti->rcv_ring);
+#ifdef PROC_FLOW_STATS
sprintf(frctstr, "%d", frcti->fd);
rib_unreg(frctstr);
#endif
- pthread_cond_destroy(&frcti->cond);
- pthread_mutex_destroy(&frcti->mtx);
pthread_rwlock_destroy(&frcti->lock);
free(frcti);
}
-static uint16_t frcti_getflags(struct frcti * frcti)
+uint16_t frcti_getflags(struct frcti * frcti)
{
uint16_t ret;
@@ -453,89 +1993,91 @@ static uint16_t frcti_getflags(struct frcti * frcti)
pthread_rwlock_rdlock(&frcti->lock);
- ret = frcti->snd_cr.cflags;
+ ret = frcti->snd_cr.cflags & FRCTFMASK;
pthread_rwlock_unlock(&frcti->lock);
return ret;
}
-static void frcti_setflags(struct frcti * frcti,
- uint16_t flags)
+void frcti_setflags(struct frcti * frcti,
+ uint16_t flags)
{
- flags |= FRCTFRTX; /* Should not be set by command */
-
assert(frcti);
- pthread_rwlock_wrlock(&frcti->lock);
+ flags &= FRCTFSETMASK;
- frcti->snd_cr.cflags &= FRCTFRTX; /* Zero other flags */
+ pthread_rwlock_wrlock(&frcti->lock);
- frcti->snd_cr.cflags &= flags;
+ frcti->snd_cr.cflags = (frcti->snd_cr.cflags & ~FRCTFSETMASK) | flags;
pthread_rwlock_unlock(&frcti->lock);
}
-#define frcti_queued_pdu(frcti) \
- (frcti == NULL ? idx : __frcti_queued_pdu(frcti))
+size_t frcti_get_max_rcv_sdu(struct frcti * frcti)
+{
+ size_t ret;
-#define frcti_snd(frcti, spb) \
- (frcti == NULL ? 0 : __frcti_snd(frcti, spb))
+ assert(frcti);
-#define frcti_rcv(frcti, spb) \
- (frcti == NULL ? 0 : __frcti_rcv(frcti, spb))
+ pthread_rwlock_rdlock(&frcti->lock);
+ ret = frcti->max_rcv_sdu;
+ pthread_rwlock_unlock(&frcti->lock);
-#define frcti_dealloc(frcti) \
- (frcti == NULL ? 0 : __frcti_dealloc(frcti))
+ return ret;
+}
-#define frcti_is_window_open(frcti) \
- (frcti == NULL ? true : __frcti_is_window_open(frcti))
+int frcti_set_max_rcv_sdu(struct frcti * frcti,
+ size_t max)
+{
+ assert(frcti);
-#define frcti_window_wait(frcti, abstime) \
- (frcti == NULL ? 0 : __frcti_window_wait(frcti, abstime))
+ if (max == 0)
+ return -EINVAL;
+ pthread_rwlock_wrlock(&frcti->lock);
+ frcti->max_rcv_sdu = max;
+ pthread_rwlock_unlock(&frcti->lock);
-static bool __frcti_is_window_open(struct frcti * frcti)
+ return 0;
+}
+
+size_t frcti_get_rcv_ring_sz(struct frcti * frcti)
{
- struct frct_cr * snd_cr = &frcti->snd_cr;
- bool ret = true;
+ size_t ret;
+
+ assert(frcti);
pthread_rwlock_rdlock(&frcti->lock);
+ ret = frcti->rcv_ring_sz;
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ret;
+}
- if (snd_cr->cflags & FRCTFRESCNTL)
- ret = before(snd_cr->seqno, snd_cr->rwe);
+/* Set before any stream byte has been delivered; -EBUSY otherwise. */
+int frcti_set_rcv_ring_sz(struct frcti * frcti,
+ size_t n)
+{
+ int ret = 0;
+ size_t per_pkt;
- if (!ret) {
- struct timespec now;
+ assert(frcti);
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (!frcti->stream)
+ return -ENOTSUP;
+ if (!stream_ring_sz_ok(frcti, n))
+ return -EINVAL;
- pthread_mutex_lock(&frcti->mtx);
- if (frcti->open) {
- frcti->open = false;
- frcti->t_wnd = now;
- frcti->t_rdvs = now;
- } else {
- time_t diff;
- diff = ts_diff_ns(&now, &frcti->t_wnd);
- if (diff > MAX_RDV) {
- pthread_mutex_unlock(&frcti->mtx);
- pthread_rwlock_unlock(&frcti->lock);
- return false;
- }
-
- diff = ts_diff_ns(&now, &frcti->t_rdvs);
- if (diff > frcti->rdv) {
- frcti->t_rdvs = now;
- __send_rdv(frcti->fd);
-#ifdef PROC_FLOW_STATS
- frcti->n_rdv++;
-#endif
+ per_pkt = frcti->frag_mtu - frcti_data_hdr_len(frcti);
- }
- }
+ pthread_rwlock_wrlock(&frcti->lock);
- pthread_mutex_unlock(&frcti->mtx);
+ if (frcti->rcv_ring != NULL) {
+ ret = -EBUSY;
+ } else {
+ frcti->rcv_ring_sz = n;
+ frcti->ring_seq_cap = (uint32_t) (n / per_pkt);
}
pthread_rwlock_unlock(&frcti->lock);
@@ -543,392 +2085,2101 @@ static bool __frcti_is_window_open(struct frcti * frcti)
return ret;
}
-static int __frcti_window_wait(struct frcti * frcti,
- struct timespec * abstime)
+time_t frcti_get_rto_min(struct frcti * frcti)
{
- struct frct_cr * snd_cr = &frcti->snd_cr;
- int ret = 0;
+ time_t v;
+
+ assert(frcti);
pthread_rwlock_rdlock(&frcti->lock);
+ v = frcti->rto_min;
+ pthread_rwlock_unlock(&frcti->lock);
- if (!(snd_cr->cflags & FRCTFRESCNTL)) {
- pthread_rwlock_unlock(&frcti->lock);
+ return v;
+}
+
+/*
+ * Set the lower bound for the retransmission timeout. A bound below the
+ * timer-wheel resolution (1 << RXMQ_RES) is unrepresentable and yields
+ * -EINVAL. The current rto is re-derived against the new bound.
+ */
+int frcti_set_rto_min(struct frcti * frcti,
+                      time_t         rto_min)
+{
+        time_t rto;
+
+        assert(frcti);
+
+        if (rto_min < (time_t) (1ULL << RXMQ_RES))
+                return -EINVAL;
+
+        pthread_rwlock_wrlock(&frcti->lock);
+
+        frcti->rto_min = rto_min;
+
+        if (frcti->srtt <= 0) {
+                /* No RTT estimate yet: only lift an rto below the bound. */
+                if (frcti->rto < rto_min)
+                        STORE_RELEASE(&frcti->rto, rto_min);
+        } else {
+                /* max(rto_min, 2 * srtt, srtt + (mdev << MDEV_MUL)) */
+                rto = MAX(rto_min, 2 * frcti->srtt);
+                rto = MAX(rto, frcti->srtt + (frcti->mdev << MDEV_MUL));
+                STORE_RELEASE(&frcti->rto, rto);
+        }
+
+        pthread_rwlock_unlock(&frcti->lock);
+
+        return 0;
+}
+
+/*
+ * Prepare and send a retransmission of pkt. A fresh rxm entry is armed
+ * before the send, so a lost fast-retx still recovers via RTO; when
+ * arming fails the prepared buffer is released and nothing is sent.
+ */
+static void sack_rxm_snd(struct frcti * frcti,
+                         void *         pkt,
+                         size_t         len)
+{
+        struct ssm_pk_buff *    out;
+        const struct frct_pci * hdr;
+        uint32_t                ack_edge;
+        int                     err;
+
+        ack_edge = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+
+        out = rxm_pkt_prepare(pkt, len, ack_edge, frcti->stream);
+        if (out == NULL)
+                return;
+
+        hdr = (const struct frct_pci *) ssm_pk_buff_head(out);
+
+        /* Register fresh rxm before send; old entry self-cleans. */
+        if (rxm_arm(frcti, ntoh32(hdr->seqno), out) < 0) {
+                frct_spb_release(out);
+                return;
+        }
+
+        STAT_BUMP(frcti, rxm_sack);
+
+        err = frct_tx(frcti, out);
+        if (err == -EFLOWDOWN || err == -ENOTALLOC)
+                STAT_BUMP(frcti, rxm_tx_dead);
+}
+
+/*
+ * Additive HoL emit: the rxm already armed in snd_slots[hp] stays in
+ * place (NewReno). Returns 0 when the packet cannot be prepared,
+ * otherwise the result of frct_tx().
+ */
+static int fast_rxm_send(struct frcti * frcti,
+                         void *         pkt,
+                         size_t         len)
+{
+        struct ssm_pk_buff * out;
+        uint32_t             ack_edge;
+
+        ack_edge = LOAD_RELAXED(&frcti->rcv_cr.lwe);
+        out      = rxm_pkt_prepare(pkt, len, ack_edge, frcti->stream);
+
+        if (out == NULL)
                 return 0;
+
+        return frct_tx(frcti, out);
+}
+
+/*
+ * Read the fragment-role bits of a received packet. The PCI bytes
+ * survive head_release at receive, so they are found FRCT_PCILEN bytes
+ * in front of the current head.
+ */
+static __inline__ uint16_t frag_role_peek(struct ssm_pk_buff * spb)
+{
+        const struct frct_pci * pci;
+        uint16_t                flags;
+
+        assert(ssm_pk_buff_head(spb) != NULL);
+
+        pci   = (const struct frct_pci *) (ssm_pk_buff_head(spb) - FRCT_PCILEN);
+        flags = ntoh16(pci->flags);
+
+        return flags & FRCT_FR_MASK;
+}
+
+enum frag_state {
+ FRAG_NOT_READY, /* nothing to hand over yet: head missing or run incomplete */
+ FRAG_DELIVER, /* *count fragments at the delivery edge form a whole SDU */
+ FRAG_DROP, /* *count slots at the delivery edge are to be discarded */
+};
+
+/*
+ * On a gap in the run: FRTX waits (NOT_READY); best-effort scans forward
+ * for the next FIRST/SOLE and returns DROP for the broken prefix. *count
+ * gets the offset from the trailing edge. NOT_READY if no later run is
+ * in window. Caller rdlock.
+ */
+static enum frag_state frag_inspect_gap(struct frcti * frcti,
+ size_t start,
+ size_t * count)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ struct ssm_pk_buff * spb;
+ uint32_t k;
+ uint16_t role;
+ size_t m;
+
+ if (frcti->rcv_cr.cflags & FRCTFRTX)
+ return FRAG_NOT_READY;
+
+ k = frcti->rcv_cr.rwe - RQ_SIZE;
+
+ for (m = start; m < RQ_SIZE; ++m) {
+ if (slots[RQ_SLOT(k + m)].idx == -1)
+ continue;
+
+ spb = rq_frag(frcti, k + m);
+ role = frag_role_peek(spb);
+
+ if (role == FRCT_FR_SOLE || role == FRCT_FR_FIRST) {
+ if (m == 0)
+ return FRAG_NOT_READY;
+
+ *count = m;
+ return FRAG_DROP;
+ }
}
- while (snd_cr->seqno == snd_cr->rwe && ret != -ETIMEDOUT) {
- struct timespec now;
- pthread_rwlock_unlock(&frcti->lock);
- pthread_mutex_lock(&frcti->mtx);
+ return FRAG_NOT_READY;
+}
+
+/*
+ * Inspect rq[] from the delivery edge (rwe - RQ_SIZE); set *count and
+ * return DELIVER/DROP/NOT_READY. DROP covers broken prefixes (MID/LAST
+ * at HoL, FIRST..[non-LAST]..new-FIRST). Non-FRTX flows skip past gaps
+ * to the next FIRST/SOLE. Caller holds frcti->lock (rdlock suffices).
+ */
+static enum frag_state frag_run_inspect(struct frcti * frcti,
+ size_t * count)
+{
+ const struct rcv_slot * slots = frcti->rcv_slots;
+ struct ssm_pk_buff * spb;
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+ uint16_t role;
+ size_t n = 0;
- if (frcti->open) {
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ if (slots[RQ_SLOT(k)].idx == -1)
+ return frag_inspect_gap(frcti, 0, count);
- frcti->t_wnd = now;
- frcti->t_rdvs = now;
- frcti->open = false;
+ spb = rq_frag(frcti, k);
+ role = frag_role_peek(spb);
+
+ if (role == FRCT_FR_SOLE) {
+ *count = 1;
+ return FRAG_DELIVER;
+ }
+
+ if (role != FRCT_FR_FIRST) {
+ *count = 1;
+ return FRAG_DROP;
+ }
+
+ while (true) {
+ if (n == RQ_SIZE || slots[RQ_SLOT(k + n)].idx == -1)
+ return frag_inspect_gap(frcti, n, count);
+
+ spb = rq_frag(frcti, k + n);
+ role = frag_role_peek(spb);
+ ++n;
+
+ if (role == FRCT_FR_LAST) {
+ *count = n;
+ return FRAG_DELIVER;
}
- pthread_cleanup_push(__cleanup_mutex_unlock, &frcti->mtx);
+ if (n > 1 && role != FRCT_FR_MID) {
+ /* SOLE or new FIRST mid-run: drop the prefix. */
+ *count = n - 1;
+ return FRAG_DROP;
+ }
+ }
+}
- ret = -__timedwait(&frcti->cond, &frcti->mtx, abstime);
+/* Caller wrlock. Delivery edge is implicit: rwe - RQ_SIZE. */
+static void frag_drop(struct frcti * frcti,
+ size_t count)
+{
+ uint32_t k = frcti->rcv_cr.rwe - RQ_SIZE;
+ uint32_t edge;
+ size_t i;
- pthread_cleanup_pop(false);
+ for (i = 0; i < count; ++i) {
+ size_t pos = RQ_SLOT(k + i);
- if (ret == -ETIMEDOUT) {
- time_t diff;
+ if (frcti->rcv_slots[pos].idx == -1)
+ continue;
- clock_gettime(PTHREAD_COND_CLOCK, &now);
+ frct_spb_release_idx(frcti->rcv_slots[pos].idx);
+ frcti->rcv_slots[pos].idx = -1;
+ }
- diff = ts_diff_ns(&now, &frcti->t_wnd);
- if (diff > MAX_RDV) {
- pthread_mutex_unlock(&frcti->mtx);
- return -ECONNRESET; /* write fails! */
- }
+ frcti->rcv_cr.rwe += count;
- diff = ts_diff_ns(&now, &frcti->t_rdvs);
- if (diff > frcti->rdv) {
- frcti->t_rdvs = now;
- __send_rdv(frcti->fd);
- }
+ /* Drop may span a gap; pull lwe up to preserve rwe - RQ_SIZE <= lwe. */
+ edge = frcti->rcv_cr.rwe - RQ_SIZE;
+ if (before(frcti->rcv_cr.lwe, edge))
+ STORE_RELEASE(&frcti->rcv_cr.lwe, edge);
+}
+
+/*
+ * Copy the `count` fragments at the delivery edge (rwe - RQ_SIZE)
+ * back-to-back into buf, releasing each one, then slide rwe forward by
+ * count. Returns the number of bytes copied.
+ */
+static size_t frag_gather(struct frcti * frcti,
+                          size_t         count,
+                          uint8_t *      buf)
+{
+        struct ssm_pk_buff * frag;
+        struct rcv_slot *    slot;
+        uint32_t             base = frcti->rcv_cr.rwe - RQ_SIZE;
+        size_t               copied = 0;
+        size_t               len;
+        size_t               i;
+
+        for (i = 0; i < count; ++i) {
+                slot = &frcti->rcv_slots[RQ_SLOT(base + i)];
+                frag = rq_frag(frcti, base + i);
+                len  = ssm_pk_buff_len(frag);
+
+                memcpy(buf + copied, ssm_pk_buff_head(frag), len);
+                copied += len;
+
+                frct_spb_release_idx(slot->idx);
+                slot->idx = -1;
+        }
+
+        frcti->rcv_cr.rwe += count;
+
+        return copied;
+}
+
+/*
+ * Sum the lengths of the `count` fragments at the delivery edge
+ * (rwe - RQ_SIZE). If the sum would wrap size_t, *overflow is set and
+ * 0 is returned. Caller holds lock.
+ */
+static size_t frag_total_len(struct frcti * frcti,
+                             size_t         count,
+                             bool *         overflow)
+{
+        uint32_t base = frcti->rcv_cr.rwe - RQ_SIZE;
+        size_t   sum  = 0;
+        size_t   len;
+        size_t   i;
+
+        *overflow = false;
+
+        for (i = 0; i < count; ++i) {
+                len = ssm_pk_buff_len(rq_frag(frcti, base + i));
+                if (len > SIZE_MAX - sum) {
+                        *overflow = true;
+                        return 0;
                 }
+                sum += len;
+        }
+
+        return sum;
+}
+
+/*
+ * Account for the delivered slot lp: latch FIN when the slot carries
+ * one, ends exactly at byte_high and no FIN was latched before (any
+ * other FIN is counted as dropped), then move byte_high to the slot's
+ * end, clamped to byte_fin once a FIN is latched.
+ */
+static __inline__ void stream_deliver_slot(struct frcti * frcti,
+                                           size_t         lp)
+{
+        const struct rcv_slot * slot = &frcti->rcv_slots[lp];
+        uint32_t                high = slot->end;
+
+        if (slot->fin) {
+                if (frcti->rcv_fin_seen || high != frcti->rcv_byte_high) {
+                        STAT_BUMP(frcti, strm_fin_drop);
+                } else {
+                        frcti->rcv_fin_seen = true;
+                        frcti->rcv_byte_fin = high;
+                }
+        }
+
+        if (frcti->rcv_fin_seen && after(high, frcti->rcv_byte_fin))
+                high = frcti->rcv_byte_fin;
+
+        frcti->rcv_byte_high = high;
+}
+
+/*
+ * Copy buf into the rx ring at stream byte offset start, wrapping to
+ * the front of the ring when the data runs past its end. The ring size
+ * is used as a mask (assumed power of two).
+ */
+static void stream_ring_write(struct frcti * frcti,
+                              uint32_t       start,
+                              buffer_t       buf)
+{
+        const size_t sz   = frcti->rcv_ring_sz;
+        const size_t off  = start & (sz - 1);
+        const size_t head = MIN(buf.len, sz - off);
+
+        /* Part that fits before the end of the ring. */
+        memcpy(frcti->rcv_ring + off, buf.data, head);
+
+        /* Remainder, if any, continues at the start of the ring. */
+        if (head < buf.len)
+                memcpy(frcti->rcv_ring, buf.data + head, buf.len - head);
+}
- pthread_mutex_unlock(&frcti->mtx);
- pthread_rwlock_rdlock(&frcti->lock);
+/*
+ * Copy buf.len bytes out of the rx ring, starting at stream byte offset
+ * start, into buf.data; wraps to the front of the ring when the range
+ * runs past its end. The ring size is used as a mask (assumed power of
+ * two).
+ */
+static void stream_ring_read(struct frcti * frcti,
+                             uint32_t       start,
+                             buffer_t       buf)
+{
+        const size_t sz   = frcti->rcv_ring_sz;
+        const size_t off  = start & (sz - 1);
+        const size_t head = MIN(buf.len, sz - off);
+
+        memcpy(buf.data, frcti->rcv_ring + off, head);
+
+        if (head < buf.len) {
+                memcpy(buf.data + head, frcti->rcv_ring, buf.len - head);
         }
+}
+
+/*
+ * Retire the stashed slot at lwe: deliver it when it starts exactly at
+ * byte_high, count it as dropped otherwise; then clear the slot and
+ * move lwe and rwe up by one. Caller wrlock.
+ */
+static void stream_advance_lwe(struct frcti * frcti)
+{
+        const size_t      lp   = RQ_SLOT(frcti->rcv_cr.lwe);
+        struct rcv_slot * slot = &frcti->rcv_slots[lp];
+
+        if (slot->start == frcti->rcv_byte_high) {
+                stream_deliver_slot(frcti, lp);
+        } else {
+                STAT_BUMP(frcti, strm_drop);
+        }
+
+        slot->fin = 0;
+        slot->idx = -1;
+
+        STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1);
+        frcti->rcv_cr.rwe++;
+}
+
+/*
+ * Decide whether a stream DATA packet covering bytes [start, end) with
+ * plen payload bytes may be stashed in rcv_ring + rq[]. Returns 0 when
+ * acceptable, -1 when the packet must be rejected.
+ */
+static __inline__ int stream_stash_check(struct frcti * frcti,
+                                         uint32_t       start,
+                                         uint32_t       end,
+                                         size_t         plen,
+                                         uint16_t       flags)
+{
+        const bool fin = (flags & FRCT_FIN) != 0;
+
+        /* Advertised byte range must match the payload length. */
+        if (end - start != (uint32_t) plen)
+                return -1;
+
+        /* FIN MUST be 0-byte. */
+        if (fin && plen != 0)
+                return -1;
+
+        /* Post-EOS: no further FIN, no data at or past byte_fin. */
+        if (frcti->rcv_fin_seen) {
+                if (fin || !before(start, frcti->rcv_byte_fin))
+                        return -1;
+        }
+
+        /* Stale: peer is behind the delivered edge. */
+        if (before(end, frcti->rcv_byte_next))
+                return -1;
+
+        /* Exact-edge: only an empty-stream FIN is meaningful. */
+        if (!fin && end == frcti->rcv_byte_next)
+                return -1;
+
+        /* Range must not reach beyond one ring size past the edge. */
+        if (end - frcti->rcv_byte_next > frcti->rcv_ring_sz)
+                return -1;
+
+        return 0;
+}
+
+/*
+ * Stream-mode DATA receive: validate, stash payload in rcv_ring, mark
+ * rq[pos], advance lwe through any newly-contiguous run. Returns 0
+ * (spb released), -1 for a packet that is too short or fails
+ * stream_stash_check(), or -ENOMEM when the rcv_ring, which is
+ * allocated on first use, cannot be obtained. On a negative return the
+ * spb is not released here (caller releases). Caller wrlock.
+ */
+static int frcti_stream_data_rcv(struct frcti * frcti,
+ struct ssm_pk_buff * spb,
+ size_t pos,
+ uint16_t flags)
+{
+ struct frct_pci_stream * spci;
+ uint32_t start;
+ uint32_t end;
+ buffer_t buf;
+ size_t skip;
+
+ if (ssm_pk_buff_len(spb) < FRCT_PCI_STREAM_LEN)
+ return -1;
+
+ if (frcti->rcv_ring == NULL) {
+ frcti->rcv_ring = calloc(1, frcti->rcv_ring_sz);
+ if (frcti->rcv_ring == NULL)
+ return -ENOMEM;
+ }
+
+ spci = FRCT_HDR_POP(spb, frct_pci_stream);
+ start = ntoh32(spci->start);
+ end = ntoh32(spci->end);
+
+ buf.data = ssm_pk_buff_head(spb);
+ buf.len = ssm_pk_buff_len(spb);
+
+ if (stream_stash_check(frcti, start, end, buf.len, flags) < 0)
+ return -1;
+
+ /* Trim the front part that lies before rcv_byte_next. */
+ if (before(start, frcti->rcv_byte_next)) {
+ skip = frcti->rcv_byte_next - start;
+ buf.data += skip;
+ buf.len -= skip;
+ start = frcti->rcv_byte_next;
+ }
+
+ stream_ring_write(frcti, start, buf);
+ STAT_ADD(frcti, strm_rcv_byte, buf.len);
+
+ frcti->rcv_slots[pos].idx = 1; /* occupied marker; payload is in rcv_ring */
+ frcti->rcv_slots[pos].start = start;
+ frcti->rcv_slots[pos].end = end;
+ frcti->rcv_slots[pos].fin = (flags & FRCT_FIN) ? 1 : 0;
+
+ while (frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx != -1)
+ stream_advance_lwe(frcti);
+
+ frct_spb_release(spb);
+
+ return 0;
+}
+
+/*
+ * DATA receive: record idx in rq[pos], then move lwe forward over every
+ * occupied slot that follows it, stopping at rwe. Caller wrlock.
+ */
+static void frcti_data_stash(struct frcti * frcti,
+                             ssize_t        idx,
+                             size_t         pos,
+                             uint16_t       flags)
+{
+        frcti->rcv_slots[pos].idx = idx;
+
+        if ((flags & FRCT_FR_MASK) != FRCT_FR_SOLE)
+                STAT_BUMP(frcti, frag_rcv);
+
+        /* lwe = cum-ACK edge; advance per fragment through contiguous run. */
+        for (;;) {
+                if (!before(frcti->rcv_cr.lwe, frcti->rcv_cr.rwe))
+                        break;
+
+                if (frcti->rcv_slots[RQ_SLOT(frcti->rcv_cr.lwe)].idx == -1)
+                        break;
+
+                STORE_RELEASE(&frcti->rcv_cr.lwe, frcti->rcv_cr.lwe + 1);
+        }
+}
+
+/*
+ * Stream consume: copy up to `count` bytes of the contiguous range
+ * [rcv_byte_next, rcv_byte_high) from the ring into buf and advance
+ * rcv_byte_next. Returns the number of bytes copied, 0 when the stream
+ * has ended and is fully drained, or -EAGAIN when nothing is available.
+ */
+static ssize_t frcti_consume_stream(struct frcti * frcti,
+                                    uint8_t *      buf,
+                                    size_t         count)
+{
+        size_t   avail;
+        ssize_t  ret;
+        buffer_t chunk;
+
+        assert(frcti);
+
+        pthread_rwlock_wrlock(&frcti->lock);
+
+        avail = (size_t) (frcti->rcv_byte_high - frcti->rcv_byte_next);
+
+        if (avail > 0) {
+                chunk.data = buf;
+                chunk.len  = MIN(avail, count);
+                stream_ring_read(frcti, frcti->rcv_byte_next, chunk);
+
+                frcti->rcv_byte_next += (uint32_t) chunk.len;
+                STAT_ADD(frcti, strm_dlv_byte, chunk.len);
+
+                ret = (ssize_t) chunk.len;
+        } else if (frcti->rcv_fin_seen
+                   && frcti->rcv_byte_next == frcti->rcv_byte_fin) {
+                /* EOS drained: signal EOF to the reader. */
+                ret = 0;
+        } else {
+                ret = -EAGAIN;
+        }
+
         pthread_rwlock_unlock(&frcti->lock);
         return ret;
 }
-static ssize_t __frcti_queued_pdu(struct frcti * frcti)
+/*
+ * FRTX consume: copy next ready PDU (full SDU or nothing). Returns bytes,
+ * -EAGAIN (no PDU), or -EMSGSIZE (oversize: run dropped to unblock flow).
+ * Runs under the write lock. A run reported as FRAG_DROP by
+ * frag_run_inspect is discarded and inspection repeats. A deliverable
+ * run is dropped with -EMSGSIZE when its length overflowed, exceeds
+ * max_rcv_sdu, or does not fit in `count` bytes.
+ */
+static ssize_t frcti_consume(struct frcti * frcti,
+ uint8_t * buf,
+ size_t count)
{
- ssize_t idx;
- size_t pos;
+ size_t n;
+ size_t total;
+ bool overflow;
+ enum frag_state st;
+ ssize_t ret;
assert(frcti);
- /* See if we already have the next PDU. */
pthread_rwlock_wrlock(&frcti->lock);
- pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1);
-
- idx = frcti->rq[pos];
- if (idx != -1) {
- ++frcti->rcv_cr.lwe;
- ++frcti->rcv_cr.rwe;
- frcti->rq[pos] = -1;
+ while (true) {
+ st = frag_run_inspect(frcti, &n);
+ if (st == FRAG_NOT_READY) {
+ ret = -EAGAIN;
+ goto unlock;
+ }
+ if (st == FRAG_DROP) {
+ STAT_ADD(frcti, frag_drop, n);
+ frag_drop(frcti, n);
+ continue;
+ }
+ /* FRAG_DELIVER */
+ total = frag_total_len(frcti, n, &overflow);
+ if (overflow || total > frcti->max_rcv_sdu || total > count) {
+ STAT_ADD(frcti, frag_drop, n);
+ frag_drop(frcti, n);
+ ret = -EMSGSIZE;
+ goto unlock;
+ }
+ ret = (ssize_t) frag_gather(frcti, n, buf);
+ if (n > 1)
+ STAT_BUMP(frcti, sdu_reasm);
+ else
+ STAT_BUMP(frcti, sdu_sole);
+ goto unlock;
}
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
- return idx;
+ return ret;
}
-static ssize_t __frcti_pdu_ready(struct frcti * frcti)
+static bool frcti_pdu_ready(struct frcti * frcti) /* non-consuming poll */
{
- ssize_t idx;
- size_t pos;
+ size_t pos;
+ size_t count; /* run length from frag_run_inspect; not used further */
+ bool ready; /* result, sampled under the read lock */
assert(frcti);
- /* See if we already have the next PDU. */
pthread_rwlock_rdlock(&frcti->lock);
- pos = frcti->rcv_cr.lwe & (RQ_SIZE - 1);
- idx = frcti->rq[pos];
+ if (frcti->stream) { /* stream: ready iff unread bytes are buffered */
+ ready = frcti->rcv_byte_high != frcti->rcv_byte_next;
+ pthread_rwlock_unlock(&frcti->lock);
+ return ready;
+ }
+
+ if (frag_run_inspect(frcti, &count) != FRAG_DELIVER) {
+ /*
+ * Drop case: frcti_consume will handle it; not ready.
+ * FRAG_NOT_READY ends up here as well.
+ */
+ pthread_rwlock_unlock(&frcti->lock);
+ return false;
+ }
+
+ /*
+ * NOTE(review): probes the slot of rwe - RQ_SIZE; presumably this
+ * is the same slot as rcv_cr.lwe - confirm.
+ */
+ pos = RQ_SLOT(frcti->rcv_cr.rwe - RQ_SIZE);
+ ready = frcti->rcv_slots[pos].idx != -1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return ready;
+}
+
+/*
+ * No srtt yet: probe at the cold-probe cadence to seed it.
+ * True when more than RTTP_COLD_NS passed since t_snd_probe.
+ */
+#define PROBE_DUE_COLD(frcti, now_ns) \
+ ((now_ns) - (frcti)->t_snd_probe > (uint64_t) RTTP_COLD_NS)
+
+/*
+ * Have srtt: probe when peer quiet for > 2*srtt and last probe > srtt.
+ * Quiet time is measured from t_rcv_rtt, probe age from t_snd_probe.
+ * Both arguments are evaluated more than once: pass side-effect-free
+ * expressions only.
+ */
+#define PROBE_DUE_WARM(frcti, now_ns) \
+ ((now_ns) - (frcti)->t_rcv_rtt > 2u * (uint64_t)(frcti)->srtt \
+ && (now_ns) - (frcti)->t_snd_probe > (uint64_t)(frcti)->srtt)
+
+/*
+ * Seeds srtt for receive-only sides so they don't fall back to 1 s RTO.
+ * Under the write lock, returns early unless a probe is due
+ * (PROBE_DUE_COLD while srtt == 0, PROBE_DUE_WARM otherwise). The
+ * probe is allocated under the lock and sent after releasing it, and
+ * only when rttp_alloc_probe returned a nonzero id.
+ */
+__attribute__((cold))
+static void frcti_rcv_probe(struct frcti * frcti,
+ uint64_t now_ns)
+{
+ uint32_t probe_id;
+ uint8_t nonce[RTTP_NONCE_LEN] = { 0 };
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (frcti->srtt == 0 && !PROBE_DUE_COLD(frcti, now_ns)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ if (frcti->srtt != 0 && !PROBE_DUE_WARM(frcti, now_ns)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ probe_id = rttp_alloc_probe(frcti, now_ns, nonce);
pthread_rwlock_unlock(&frcti->lock);
- return idx;
+ if (probe_id != 0)
+ frcti_rttp_snd(frcti, probe_id, 0, nonce);
}
-#include <timerwheel.c>
+/*
+ * Check an RTT echo against the probe stored in ring slot `pos`.
+ * The echo is accepted only when the slot carries the echoed id, the
+ * probe is still outstanding (ts != 0) and the nonce is unchanged.
+ */
+static __inline__ bool probe_echo_matches(struct frcti * frcti,
+                                          size_t         pos,
+                                          uint32_t       echo_id,
+                                          const uint8_t  nonce[RTTP_NONCE_LEN])
+{
+        return frcti->probes[pos].id == echo_id
+                && frcti->probes[pos].ts != 0
+                && memcmp(frcti->probes[pos].nonce, nonce,
+                          RTTP_NONCE_LEN) == 0;
+}
/*
- * Send a final ACK for everything that has not been ACK'd.
- * If the flow should be kept active for retransmission,
- * the returned time will be negative.
+ * RTT probe (echo_id == 0): bounce the nonce back to peer.
+ * RTT echo (echo_id != 0): verify nonce + feed sample.
+ * Packets shorter than RTTP_PAYLOAD, or with both ids zero, are
+ * ignored. A matching echo retires its probe slot (ts = 0) and
+ * refreshes t_rcv_rtt; a non-positive age yields no sample.
*/
-static time_t __frcti_dealloc(struct frcti * frcti)
+static void frcti_rttp_rcv(struct frcti * frcti,
+ buffer_t pkt,
+ uint64_t now_ns)
{
- struct timespec now;
- time_t wait;
- int ackno;
- int fd = -1;
+ const struct frct_rttp * rttp;
+ uint32_t probe_id;
+ uint32_t echo_id;
+ uint8_t nonce[RTTP_NONCE_LEN];
+ size_t ring_pos;
+ int64_t elapsed;
+ uint64_t sample;
+
+ if (pkt.len < RTTP_PAYLOAD)
+ return;
+
+ rttp = (const struct frct_rttp *) pkt.data;
+ probe_id = ntoh32(rttp->probe_id);
+ echo_id = ntoh32(rttp->echo_id);
+
+ /* Forged/malformed: bouncing this would loop on echo_id == 0. */
+ if (probe_id == 0 && echo_id == 0)
+ return;
+
+ memcpy(nonce, rttp->nonce, sizeof(nonce));
+
+ if (echo_id == 0) {
+ /* Probe: echo back with same nonce so peer can verify. */
+ STAT_BUMP(frcti, rttp_rcv);
+ frcti_rttp_snd(frcti, 0, probe_id, nonce);
+ return;
+ }
+
+ ring_pos = RTTP_POS(echo_id);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (!probe_echo_matches(frcti, ring_pos, echo_id, nonce)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ elapsed = ts_age_ns(now_ns, frcti->probes[ring_pos].ts);
+ frcti->probes[ring_pos].ts = 0;
+ frcti->t_rcv_rtt = now_ns;
+
+ if (elapsed <= 0) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+ sample = (uint64_t) elapsed;
+
+ /* Clamp probe sample to RTT_CLAMP_MUL * srtt to avoid poisoning. */
+ if (frcti->srtt > 0)
+ sample = MIN(sample, (uint64_t) frcti->srtt * RTT_CLAMP_MUL);
+
+ rtt_update(frcti, sample, now_ns);
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+
+/*
+ * Keepalive received: record its arrival time and, when the packet
+ * also carries FRCT_ACK, advance the send-side lwe to the acked seqno
+ * provided that seqno passes the within() check against lwe..seqno.
+ */
+static void frcti_ka_rcv(struct frcti *          frcti,
+                         const struct frct_pci * pci,
+                         uint64_t                now_ns,
+                         uint16_t                flags)
+{
+        uint32_t acked;
+
+        STORE_RELEASE(&frcti->t_ka_rcv, now_ns);
+        STAT_BUMP(frcti, ka_rcv);
+
+        if ((flags & FRCT_ACK) != 0) {
+                acked = ntoh32(pci->ackno);
+
+                pthread_rwlock_wrlock(&frcti->lock);
+
+                if (within(acked, frcti->snd_cr.lwe, frcti->snd_cr.seqno)) {
+                        STORE_RELEASE(&frcti->snd_cr.lwe, acked);
+                }
+
+                pthread_rwlock_unlock(&frcti->lock);
+        }
+}
+
+/*
+ * Additive HoL re-emit (carries DRF); runs before rcv_cr->act
+ * refresh so it doesn't pre-empt peer's first DRF.
+ * Does nothing when no data is in flight, the HoL slot holds no rxm
+ * entry, or that entry aged out. Otherwise the packet is copied
+ * under the write lock and sent after unlocking; a failed malloc
+ * skips the retransmit without touching the slot flags.
+ */
+__attribute__((cold))
+static void frcti_nack_rcv(struct frcti * frcti)
+{
+ struct timespec now;
+ uint64_t now_ns;
+ size_t hp;
+ struct rxm_entry * rxm;
+ void * pkt_copy = NULL;
+ size_t pkt_len = 0;
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ STAT_BUMP(frcti, nack_rcv);
+
+ if (frcti->snd_cr.seqno == frcti->snd_cr.lwe) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm);
+ if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r)) {
+ pthread_rwlock_unlock(&frcti->lock);
+ return;
+ }
+
+ pkt_copy = malloc(rxm->len);
+ if (pkt_copy != NULL) {
+ memcpy(pkt_copy, rxm->pkt, rxm->len);
+ pkt_len = rxm->len;
+ /* Karn: suppress RTT sample. NACK supersedes pending TLP. */
+ frcti->snd_slots[hp].flags =
+ (frcti->snd_slots[hp].flags & ~SND_TLP)
+ | SND_RTX | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ STAT_BUMP(frcti, rxm_nack);
+ }
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ if (pkt_copy != NULL) {
+ int ret = fast_rxm_send(frcti, pkt_copy, pkt_len);
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+ free(pkt_copy);
+ }
+}
+
+__attribute__((cold))
+static void frcti_rdv_rcv(struct frcti * frcti)
+{
+ uint32_t rwe; /* window edge to advertise, sampled under the read lock */
pthread_rwlock_rdlock(&frcti->lock);
- ackno = frcti->rcv_cr.lwe;
- if (frcti->rcv_cr.lwe != frcti->rcv_cr.seqno)
- fd = frcti->fd;
+ rwe = frcti_advert_rwe(frcti);
- wait = MAX(frcti->rcv_cr.inact - now.tv_sec + frcti->rcv_cr.act.tv_sec,
- frcti->snd_cr.inact - now.tv_sec + frcti->snd_cr.act.tv_sec);
- wait = MAX(wait, 0);
+ pthread_rwlock_unlock(&frcti->lock);
+
+ STAT_BUMP(frcti, rdv_rcv);
- if (frcti->snd_cr.cflags & FRCTFLINGER
- && before(frcti->snd_cr.lwe, frcti->snd_cr.seqno))
- wait = -wait;
+ frcti_pkt_snd(frcti, FRCT_FC, 0, rwe); /* answer with an FC packet carrying rwe */
+}
+
+/*
+ * Probe timeout (PTO) for the tail-loss probe.
+ * With an RTT estimate: 2 * srtt + ACK_DELAY_NS.
+ * Without one (srtt not positive): NACK_COOLDOWN_NS.
+ */
+static __inline__ uint64_t tlp_pto(const struct frcti * frcti)
+{
+        uint64_t srtt;
+
+        if (frcti->srtt <= 0)
+                return NACK_COOLDOWN_NS;
+
+        srtt = (uint64_t) frcti->srtt;
+
+        return 2ULL * srtt + ACK_DELAY_NS;
+}
+
+/*
+ * RFC 8985 §7: lazy probe. Re-evaluate on fire — if sender was active
+ * within PTO, re-post; else probe HoL once and hand off to RTO.
+ * Timer callback; `arg` is the struct frcti. No probe is sent when
+ * nothing is in flight, the sender is flow-control blocked, a probe is
+ * already outstanding, MAX_TLP_PER_EP was reached, the HoL entry is
+ * missing or aged out, or the HoL RTO is already due. The packet copy
+ * is taken under the write lock and sent after unlocking; a send that
+ * hits a dead flow is counted in rxm_tx_dead. tlp_pending is cleared
+ * unless the timer is re-posted.
+ */
+__attribute__((cold))
+static void tlp_due(void * arg)
+{
+ struct frcti * frcti = arg;
+ struct timespec now;
+ uint64_t now_ns;
+ uint64_t pto;
+ uint64_t rto_at;
+ size_t hp;
+ struct rxm_entry * rxm;
+ void * pkt_copy = NULL;
+ size_t pkt_len = 0;
+ bool re_post = false;
+ uint64_t deadline = 0;
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ if (frcti->snd_cr.seqno == frcti->snd_cr.lwe)
+ goto unlock;
+
+ if (!before(frcti->snd_cr.seqno, frcti->snd_cr.rwe))
+ goto unlock; /* FC-blocked: RDV handles it. */
+
+ /* RFC 8985 §7.3: one outstanding probe, MAX_TLP_PER_EP per ep. */
+ if (frcti->tlp_high_seq != 0)
+ goto unlock;
+
+ if (frcti->tlp_count >= MAX_TLP_PER_EP)
+ goto unlock;
+
+ pto = tlp_pto(frcti);
+
+ /* §7.2: anchor PTO on most recent send; defer if still active. */
+ if (now_ns < frcti->snd_cr.act + pto) {
+ deadline = frcti->snd_cr.act + pto;
+ re_post = true;
+ goto unlock;
+ }
+
+ hp = RQ_SLOT(frcti->snd_cr.lwe);
+ rxm = LOAD_ACQUIRE(&frcti->snd_slots[hp].rxm);
+ if (rxm == NULL || RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+ goto unlock;
+
+ /* Cap: if HoL RTO is due, let rxm_due fire instead. */
+ rto_at = rxm->t0 + ((uint64_t) frcti->rto
+ << LOAD_RELAXED(&frcti->rto_mul));
+ if (rto_at <= now_ns)
+ goto unlock;
+
+ pkt_copy = malloc(rxm->len);
+ if (pkt_copy != NULL) {
+ memcpy(pkt_copy, rxm->pkt, rxm->len);
+ pkt_len = rxm->len;
+ frcti->snd_slots[hp].time = now_ns;
+ frcti->snd_slots[hp].flags |= SND_TLP | SND_FAST_RXM;
+ frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+ /* §7.3 outstanding-probe marker; ack_rcv/rxm_snd clear. */
+ frcti->tlp_high_seq = frcti->snd_cr.seqno;
+ frcti->tlp_count++;
+ STAT_BUMP(frcti, tlp_snd);
+ }
+
+ unlock:
pthread_rwlock_unlock(&frcti->lock);
- if (fd != -1)
- __send_frct_pkt(fd, FRCT_ACK, ackno, 0);
+ if (pkt_copy != NULL) {
+ int ret = fast_rxm_send(frcti, pkt_copy, pkt_len);
+ /* Account probes that hit a dead flow, as for NACK/RACK rxm. */
+ if (ret == -EFLOWDOWN || ret == -ENOTALLOC)
+ STAT_BUMP(frcti, rxm_tx_dead);
+ free(pkt_copy);
+ }
+
+ if (re_post)
+ tw_post(&frcti->tlp_tw, deadline, tlp_due, frcti);
+ else
+ __atomic_clear(&frcti->tlp_pending, __ATOMIC_RELAXED);
+}
+
+/*
+ * Arm the tail-loss probe timer, at most once per quiet period.
+ * Nothing is posted while a probe is outstanding, the per-episode
+ * probe budget is spent, or a timer is already pending (tlp_pending
+ * is claimed atomically). The timer fires one PTO after the last
+ * send, or one PTO from now if that moment has passed; tlp_due
+ * re-evaluates on fire. Always returns 0.
+ */
+static int tlp_arm(struct frcti * frcti)
+{
+        struct timespec ts;
+        uint64_t        t_now;
+        uint64_t        t_fire;
+        uint64_t        pto;
+
+        if (LOAD_RELAXED(&frcti->tlp_high_seq) != 0
+            || LOAD_RELAXED(&frcti->tlp_count) >= MAX_TLP_PER_EP)
+                return 0;
+
+        /* Somebody else already posted the timer. */
+        if (__atomic_test_and_set(&frcti->tlp_pending, __ATOMIC_RELAXED))
+                return 0;
+
+        clock_gettime(PTHREAD_COND_CLOCK, &ts);
+        t_now = TS_TO_UINT64(ts);
+
+        pto = tlp_pto(frcti);
+
+        t_fire = LOAD_RELAXED(&frcti->snd_cr.act) + pto;
+        if (t_fire <= t_now)
+                t_fire = t_now + pto;
+
+        tw_post(&frcti->tlp_tw, t_fire, tlp_due, frcti);
+
+        return 0;
+}
+
+/*
+ * Apply the flow-control window advertised in `pci`.
+ * The advertised edge is capped at snd lwe + RQ_SIZE and never moves
+ * below the current snd rwe (a backward edge is treated as forged or
+ * stale). The window is marked open. Caller holds the write lock.
+ */
+static __inline__ void frcti_fc_rcv(struct frcti *          frcti,
+                                    const struct frct_pci * pci)
+{
+        uint32_t adv = ntoh32(pci->window);
+        uint32_t cap = frcti->snd_cr.lwe + RQ_SIZE;
+
+        if (after(adv, cap))
+                adv = cap;
+
+        /* Never shrink the window. */
+        if (before(adv, frcti->snd_cr.rwe))
+                adv = frcti->snd_cr.rwe;
+
+        STORE_RELAXED(&frcti->snd_cr.rwe, adv);
+        frcti->open = true;
+}
+
+/*
+ * Packet copies captured under frcti->lock; emitted after release.
+ * The data pointers are malloc'd copies; pending_flush sends and
+ * frees them.
+ */
+struct pending {
+ buffer_t fast_rxm; /* HoL copy staged by fast_rxm_consider; data NULL if none */
+ buffer_t sack_rxm[SACK_RXM_MAX]; /* copies staged by sack_queue_rxm */
+ size_t sack_rxm_cnt; /* number of valid entries in sack_rxm */
+};
+
+/*
+ * Enter loss recovery (RFC 6582 3.2). recovery_high is fixed at
+ * snd seqno + RTT_QUARANTINE on entry and is not extended by later
+ * calls while recovery is in progress.
+ */
+static void recovery_enter(struct frcti * frcti)
+{
+        if (!frcti->in_recovery) {
+                frcti->in_recovery   = true;
+                frcti->recovery_high = frcti->snd_cr.seqno + RTT_QUARANTINE;
+        }
+}
+
+/*
+ * Tell whether cum-ACK `ackno` ends the current recovery episode:
+ * it is not before recovery_high, or it equals snd seqno (everything
+ * in flight is acked). Always false outside recovery.
+ */
+static bool recovery_exit_reached(struct frcti * frcti,
+                                  uint32_t       ackno)
+{
+        if (!frcti->in_recovery)
+                return false;
+
+        return !before(ackno, frcti->recovery_high)
+                || ackno == frcti->snd_cr.seqno;
+}
+
+/*
+ * Gate for taking an RTT sample from the ACK of slot `p`.
+ * No sample when the ACK rides on a retransmission (FRCT_RXM), the
+ * slot was retransmitted or probed (Karn), the slot holds no rxm
+ * entry, `lwe` lies before rtt_lwe, or srtt is still unseeded
+ * (seeding is left to the RTT probes).
+ */
+static bool rtt_sample_eligible(struct frcti * frcti,
+                                size_t         p,
+                                uint16_t       flags,
+                                uint32_t       lwe)
+{
+        struct snd_slot * slot = &frcti->snd_slots[p];
+
+        if ((flags & FRCT_RXM) != 0)
+                return false;
+
+        if ((slot->flags & (SND_RTX | SND_TLP)) != 0)
+                return false;
+
+        if (LOAD_ACQUIRE(&slot->rxm) == NULL)
+                return false;
+
+        return !before(lwe, frcti->rtt_lwe) && frcti->srtt != 0;
+}
+
+#define RXM_SLOT_EMPTY(rxm) ((rxm) == NULL) /* slot holds no rxm entry */
+#define FAST_RXM_STAGED(pending) ((pending)->fast_rxm.data != NULL) /* a fast rxm copy is already queued */
+#define RXM_FAST_DONE(flags) (((flags) & SND_FAST_RXM) != 0) /* slot flags carry SND_FAST_RXM */
+
+/*
+ * Consider a fast retransmit of the head-of-line (HoL) packet.
+ * The HoL is staged in pending->fast_rxm when it is deemed lost (RACK
+ * age beyond the reorder window, or dup_thresh reached DUP_THRESH),
+ * has not aged past t_r, and was not staged or fast-retransmitted
+ * before. Nothing is sent here; the copy is emitted from `pending`
+ * later. A failed malloc leaves the slot untouched, although recovery
+ * has been entered by then.
+ */
+static void fast_rxm_consider(struct frcti *   frcti,
+                              uint64_t         now_ns,
+                              struct pending * pending)
+{
+        struct snd_slot *  hol;
+        struct rxm_entry * entry;
+        uint64_t           reo_wnd;
+        bool               timed_out;
+
+        hol     = &frcti->snd_slots[RQ_SLOT(frcti->snd_cr.lwe)];
+        entry   = LOAD_ACQUIRE(&hol->rxm);
+        reo_wnd = rack_reorder_window(frcti);
+
+        if (RXM_SLOT_EMPTY(entry))
+                return;
+
+        /* RFC 8985 6.2: lost by time (RACK) or by count (DupThresh). */
+        timed_out = (int64_t) (frcti->t_latest_ack - hol->time)
+                > (int64_t) reo_wnd;
+        if (!timed_out && frcti->dup_thresh < DUP_THRESH)
+                return;
+
+        /* Older than t_r: leave the HoL to rxm_due. */
+        if (RXM_AGED_OUT(entry->t0, now_ns, frcti->t_r))
+                return;
+
+        /* Only one fast retransmit per loss event. */
+        if (FAST_RXM_STAGED(pending))
+                return;
+
+        if (RXM_FAST_DONE(hol->flags))
+                return;
+
+        recovery_enter(frcti);
+
+        pending->fast_rxm.data = malloc(entry->len);
+        if (pending->fast_rxm.data == NULL)
+                return;
+
+        memcpy(pending->fast_rxm.data, entry->pkt, entry->len);
+        pending->fast_rxm.len = entry->len;
+
+        hol->flags    |= SND_RTX | SND_FAST_RXM;
+        frcti->rtt_lwe = frcti->snd_cr.lwe + 1;
+
+        if (timed_out) {
+                STAT_BUMP(frcti, rxm_rack);
+        } else {
+                STAT_BUMP(frcti, rxm_dupthresh);
+        }
+}
+
+/*
+ * Caller holds wrlock; RACK fast retransmit queued in pending.
+ * A duplicate cum-ACK (ackno == snd lwe) only considers a fast
+ * retransmit, at most once per lwe value. An ackno that fails the
+ * within() check against lwe..seqno is ignored. Otherwise lwe
+ * advances to ackno and the TLP, reorder-window, dup_thresh, rto_mul
+ * and recovery state are updated before an RTT sample is considered
+ * for the slot of the old lwe.
+ */
+__attribute__((hot))
+static void frcti_ack_rcv(struct frcti * frcti,
+ const struct frct_pci * pci,
+ uint16_t flags,
+ uint64_t now_ns,
+ struct pending * pending)
+{
+ uint32_t ackno;
+ uint32_t lwe;
+ size_t p;
+ size_t fresh;
+
+ if (!(flags & FRCT_DATA))
+ STAT_BUMP(frcti, ack_rcv);
+
+ ackno = ntoh32(pci->ackno);
+ if (ackno == frcti->snd_cr.lwe) {
+ /* RFC 8985 §6.2: only on scoreboard change. */
+ if (frcti->snd_cr.lwe != frcti->rack_fired_lwe) {
+ fast_rxm_consider(frcti, now_ns, pending);
+ frcti->rack_fired_lwe = frcti->snd_cr.lwe;
+ }
+ return;
+ }
+
+ if (!within(ackno, frcti->snd_cr.lwe, frcti->snd_cr.seqno))
+ return;
+
+ lwe = frcti->snd_cr.lwe;
+ p = RQ_SLOT(lwe);
+
+ STORE_RELEASE(&frcti->snd_cr.lwe, ackno);
+
+ /* §7.3: cum-ACK past the probed seqno resolves the TLP. */
+ if (frcti->tlp_high_seq != 0
+ && !before(ackno, frcti->tlp_high_seq))
+ frcti->tlp_high_seq = 0;
+
+ /* §7.3: end the probe episode once inflight drains. */
+ if (ackno == frcti->snd_cr.seqno)
+ frcti->tlp_count = 0;
+
+ /* RFC 8985 §7.2: halve mult per REO_DECAY_PKTS fresh-ACK'd seqnos. */
+ fresh = ackno - frcti->dsack_lwe_snap;
+ if (frcti->reo_wnd_mult > 1 && fresh >= REO_DECAY_PKTS) {
+ uint8_t half = frcti->reo_wnd_mult >> 1;
+ frcti->reo_wnd_mult = half < 1 ? 1 : half;
+ frcti->dsack_lwe_snap = ackno;
+ }
+
+ /*
+ * RFC 8985: latest cum-ACKed send-time (slot of ackno-1).
+ * NOTE(review): plain overwrite; sack_mark_blocks only ever raises
+ * t_latest_ack - confirm that moving it backwards here is intended.
+ */
+ frcti->t_latest_ack = frcti->snd_slots[RQ_SLOT(ackno - 1)].time;
+
+ /* RFC 8985: SACK-above-lwe count is per-recovery-episode. */
+ frcti->dup_thresh = 0;
+
+ /* Karn-skip on retx; TLP ACK clears rto_mul (no CC backoff). */
+ if ((frcti->snd_slots[p].flags & SND_RTX) == 0
+ || (frcti->snd_slots[p].flags & SND_TLP) != 0)
+ STORE_RELEASE(&frcti->rto_mul, 0);
- return wait;
+ if (recovery_exit_reached(frcti, ackno))
+ frcti->in_recovery = false;
+
+ if (rtt_sample_eligible(frcti, p, flags, lwe)) {
+ int64_t mrtt = ts_age_ns(now_ns, frcti->snd_slots[p].time);
+ if (mrtt > 0) {
+ if (!(flags & FRCT_DATA))
+ STAT_BUMP(frcti, ack_rtt);
+ rtt_update(frcti, (time_t) mrtt, now_ns);
+ frcti->t_rcv_rtt = now_ns;
+ }
+ }
}
-static int __frcti_snd(struct frcti * frcti,
- struct ssm_pk_buff * spb)
+/*
+ * Skip k == lwe under clamp: NULLing HoL from a stale SACK wedges it.
+ * Walks the `n` SACK blocks in `payload`, clamps each to the in-flight
+ * range [snd lwe, snd seqno) and clears the rxm entry and flags of
+ * every covered slot that still holds one. Blocks that are empty on
+ * the wire or empty after clamping are ignored, so a block lying
+ * entirely outside the in-flight range cannot raise the result.
+ * Stores the number of newly cleared slots in *newly_marked and
+ * returns the highest clamped block end seen (snd lwe if none).
+ * Raises t_latest_ack to the latest send-time that was SACKed.
+ */
+static uint32_t sack_mark_blocks(struct frcti * frcti,
+ const uint8_t * payload,
+ uint16_t n,
+ uint32_t * newly_marked)
{
- struct frct_pci * pci;
- struct timespec now;
- struct frct_cr * snd_cr;
- struct frct_cr * rcv_cr;
- uint32_t seqno;
- bool rtx;
+ uint32_t hi_sacked = frcti->snd_cr.lwe;
+ uint32_t marked = 0;
+ uint16_t i;
+
+ for (i = 0; i < n; ++i) {
+ uint32_t s;
+ uint32_t e;
+ uint32_t k;
+ bool clamped;
+
+ sack_block_get(payload, i, &s, &e);
+
+ if (!before(s, e))
+ continue;
+
+ clamped = before(s, frcti->snd_cr.lwe);
+ if (clamped)
+ s = frcti->snd_cr.lwe;
+ if (after(e, frcti->snd_cr.seqno))
+ e = frcti->snd_cr.seqno;
+
+ /* Clamping left nothing in flight: stale or bogus block. */
+ if (!before(s, e))
+ continue;
+
+ for (k = s; before(k, e); ++k) {
+ size_t kp = RQ_SLOT(k);
+ uint64_t t_k;
+ if (clamped && k == frcti->snd_cr.lwe)
+ continue;
+ if (LOAD_ACQUIRE(&frcti->snd_slots[kp].rxm) == NULL)
+ continue;
+ STORE_RELEASE(&frcti->snd_slots[kp].rxm, NULL);
+ frcti->snd_slots[kp].flags = 0;
+ marked++;
+ /* RACK.fack: latest SACK-confirmed send-time. */
+ t_k = frcti->snd_slots[kp].time;
+ if (t_k > frcti->t_latest_ack)
+ frcti->t_latest_ack = t_k;
+ }
+
+ if (after(e, hi_sacked))
+ hi_sacked = e;
+ }
+
+ *newly_marked = marked;
+ return hi_sacked;
+}
+
+/*
+ * Queue once per loss event (SND_FAST_RXM gates). Emit after unlock.
+ * Scans the holes in [snd lwe, hi_sacked) and stages a copy of each
+ * packet deemed lost (RACK age beyond the reorder window, or
+ * dup_thresh reached DUP_THRESH) in pending->sack_rxm, up to
+ * SACK_RXM_MAX entries. Slots without an rxm entry, already
+ * fast-retransmitted, or aged past t_r are skipped. The scan stops
+ * when the queue is full or malloc fails.
+ *
+ * The RACK age is a signed 64-bit difference so that a slot sent after
+ * t_latest_ack compares as negative on every platform. Loss stats are
+ * bumped only once the copy has actually been queued.
+ */
+static void sack_queue_rxm(struct frcti *   frcti,
+                           uint32_t         hi_sacked,
+                           uint64_t         now_ns,
+                           struct pending * pending)
+{
+        uint64_t R = rack_reorder_window(frcti);
+        uint32_t k;
+
+        for (k = frcti->snd_cr.lwe; before(k, hi_sacked); ++k) {
+                struct snd_slot *  slot = &frcti->snd_slots[RQ_SLOT(k)];
+                struct rxm_entry * rxm;
+                buffer_t *         out;
+                int64_t            rack_age;
+                bool               rack_ok;
+
+                rxm = LOAD_ACQUIRE(&slot->rxm);
+
+                if (pending->sack_rxm_cnt >= SACK_RXM_MAX)
+                        break;
+
+                if (rxm == NULL)
+                        continue;
+
+                if (slot->flags & SND_FAST_RXM)
+                        continue;
+
+                if (RXM_AGED_OUT(rxm->t0, now_ns, frcti->t_r))
+                        continue;
+
+                /* RFC 8985 6.2: time-based RACK OR DupThresh count. */
+                rack_age = (int64_t) (frcti->t_latest_ack - slot->time);
+                rack_ok  = rack_age > (int64_t) R;
+                if (!rack_ok && frcti->dup_thresh < DUP_THRESH)
+                        continue;
+
+                out = &pending->sack_rxm[pending->sack_rxm_cnt];
+
+                out->data = malloc(rxm->len);
+                if (out->data == NULL)
+                        break;
+
+                out->len = rxm->len;
+                memcpy(out->data, rxm->pkt, rxm->len);
+                pending->sack_rxm_cnt++;
+
+                if (rack_ok) {
+                        STAT_BUMP(frcti, rxm_rack);
+                } else {
+                        STAT_BUMP(frcti, rxm_dupthresh);
+                }
+
+                /* NULL slot so the original timer self-cleans. */
+                STORE_RELEASE(&slot->rxm, NULL);
+                slot->time     = now_ns;
+                slot->flags   |= SND_RTX | SND_FAST_RXM;
+                frcti->rtt_lwe = k + 1;
+        }
+}
+
+/*
+ * D-SACK detection (RFC 2883). The first block is a D-SACK report
+ * when it is non-empty and either
+ *   1) starts before the packet's cum-ACK, no more than
+ *      MAX_DSACK_LAG seqnos behind it (further back is counted in
+ *      dsack_drop and rejected), or
+ *   2) lies within a later non-empty block without being identical
+ *      to it.
+ * Returns false for an empty block list.
+ */
+static bool sack_is_dsack(struct frcti *  frcti,
+                          const uint8_t * payload,
+                          uint16_t        n,
+                          uint32_t        pkt_ackno)
+{
+        uint32_t first_s;
+        uint32_t first_e;
+        uint32_t s;
+        uint32_t e;
+        uint16_t idx;
+
+        if (n == 0)
+                return false;
+
+        sack_block_get(payload, 0, &first_s, &first_e);
+        if (!before(first_s, first_e))
+                return false;
+
+        /* Case 1: below the cumulative ACK. */
+        if (before(first_s, pkt_ackno)) {
+                if ((pkt_ackno - first_s) > (uint32_t) MAX_DSACK_LAG) {
+                        STAT_BUMP(frcti, dsack_drop);
+                        return false;
+                }
+                return true;
+        }
+
+        /* Case 2: contained in, but not equal to, a later block. */
+        for (idx = 1; idx < n; ++idx) {
+                sack_block_get(payload, idx, &s, &e);
+
+                if (!before(s, e))
+                        continue;
+
+                if (before(first_s, s) || after(first_e, e))
+                        continue;
+
+                if (first_s == s && first_e == e)
+                        continue;
+
+                return true;
+        }
+
+        return false;
+}
+
+/*
+ * D-SACK seen: widen the RACK reorder window (RFC 8985 7.2).
+ * dsack_lwe_snap is always re-based on the current snd lwe. The
+ * multiplier grows by one, up to REO_WND_MULT_MAX, unless srtt is
+ * known and the previous widening attempt was at most one srtt ago.
+ */
+static __inline__ void reo_wnd_on_dsack(struct frcti * frcti,
+                                        uint64_t       now_ns)
+{
+        time_t rtt = frcti->srtt;
+        bool   throttled;
+
+        frcti->dsack_lwe_snap = frcti->snd_cr.lwe;
+
+        throttled = rtt > 0
+                && now_ns - frcti->t_last_reo_widen <= (uint64_t) rtt;
+        if (throttled)
+                return;
+
+        if (frcti->reo_wnd_mult < REO_WND_MULT_MAX)
+                frcti->reo_wnd_mult++;
+
+        frcti->t_last_reo_widen = now_ns;
+}
+
+/*
+ * Caller holds wrlock; retransmits queued for post-unlock emission.
+ * Parses a SACK payload: a 16-bit block count in network byte order
+ * followed by that many blocks. Payloads that are shorter than the
+ * header, announce more than SACK_MAX_BLOCKS blocks, or are too short
+ * for the announced count are ignored. A leading D-SACK block widens
+ * the reorder window; any other block enters recovery. Newly SACKed
+ * slots are added to dup_thresh and holes below the highest SACKed
+ * seqno are considered for retransmission into `pending`.
+ */
+static void frcti_sack_rcv(struct frcti *   frcti,
+                           buffer_t         pkt,
+                           uint32_t         pkt_ackno,
+                           uint64_t         now_ns,
+                           struct pending * pending)
+{
+        uint32_t hi_sacked;
+        uint32_t marked;
+        uint16_t n_wire;
+        uint16_t n;
+        bool     dsack;
+        uint16_t n_real;
+
+        if (pkt.len < SACK_HDR_SIZE)
+                return;
+
+        /* Byte copy: the payload carries no alignment guarantee. */
+        memcpy(&n_wire, pkt.data, sizeof(n_wire));
+        n = ntoh16(n_wire);
+        if (n > SACK_MAX_BLOCKS)
+                return;
+
+        if (pkt.len < SACK_HDR_SIZE + (size_t) n * SACK_BLOCK_SIZE)
+                return;
+
+        STAT_BUMP(frcti, sack_rcv);
+
+        dsack = sack_is_dsack(frcti, pkt.data, n, pkt_ackno);
+        n_real = n - (dsack ? 1 : 0);
+
+        if (dsack) {
+                STAT_BUMP(frcti, dsack_rcv);
+                reo_wnd_on_dsack(frcti, now_ns);
+        }
+
+        /* DSACK-only carries no new gap; don't enter recovery. */
+        if (n_real > 0)
+                recovery_enter(frcti);
+
+        marked = 0;
+        hi_sacked = sack_mark_blocks(frcti, pkt.data, n, &marked);
+        frcti->dup_thresh += marked;
+
+        if (after(hi_sacked, frcti->snd_cr.lwe))
+                sack_queue_rxm(frcti, hi_sacked, now_ns, pending);
+}
+
+/*
+ * Send every packet copy staged in `pending`, then free it.
+ * SACK-triggered retransmits go out first, followed by the fast
+ * retransmit if one is staged. A fast retransmit that fails with
+ * -EFLOWDOWN or -ENOTALLOC is counted in rxm_tx_dead.
+ */
+static void pending_flush(struct frcti *   frcti,
+                          struct pending * pending)
+{
+        buffer_t * q;
+        size_t     k;
+        int        rc;
+
+        for (k = 0; k < pending->sack_rxm_cnt; ++k) {
+                q = &pending->sack_rxm[k];
+                sack_rxm_snd(frcti, q->data, q->len);
+                free(q->data);
+        }
+
+        if (pending->fast_rxm.data == NULL)
+                return;
+
+        rc = fast_rxm_send(frcti, pending->fast_rxm.data,
+                           pending->fast_rxm.len);
+        if (rc == -EFLOWDOWN || rc == -ENOTALLOC) {
+                STAT_BUMP(frcti, rxm_tx_dead);
+        }
+
+        free(pending->fast_rxm.data);
+}
+
+/*
+ * Send a NACK control packet asking the peer to retransmit its HoL.
+ * `seqno_unseen` is carried in the seqno field for information only.
+ * Silently does nothing when no control packet can be allocated.
+ */
+static void frcti_nack_snd(struct frcti * frcti,
+                           uint32_t       seqno_unseen)
+{
+        struct ssm_pk_buff * buf;
+        struct frct_pci *    hdr;
+
+        if (frct_ctrl_alloc(&buf, &hdr, 0) >= 0) {
+                hdr->flags = hton16(FRCT_NACK);
+                hdr->seqno = hton32(seqno_unseen);
+
+                frct_hcs_set(hdr, false);
+
+                frct_tx(frcti, buf);
+        }
+}
+
+enum frct_act { /* verdict of rcv_inact_check */
+ FRCT_ACTIVE, /* keep processing the packet */
+ FRCT_INACT_NEED_NACK, /* rcv side inactive, non-DRF DATA: a NACK is due */
+ FRCT_INACT_DROP, /* as above, but the NACK cooldown has not elapsed */
+};
+
+/*
+ * Receiver inactivity handling for an incoming packet. Caller holds
+ * the write lock.
+ *
+ * While the receiver is still active the packet is processed as is.
+ * Once inactive:
+ *  - a DRF packet of a new epoch releases the queue and rebases the
+ *    window on `seqno`; a same-epoch DRF changes nothing;
+ *  - a non-DATA packet without DRF is processed as is;
+ *  - a DATA packet without DRF asks for a NACK, at most once per
+ *    srtt (NACK_COOLDOWN_NS while srtt is unknown), and is reported
+ *    as FRCT_INACT_DROP in between.
+ */
+static enum frct_act rcv_inact_check(struct frcti * frcti,
+                                     uint16_t       flags,
+                                     uint32_t       seqno,
+                                     uint64_t       now_ns)
+{
+        struct frct_cr * cr = &frcti->rcv_cr;
+        uint64_t         cooldown;
+
+        if (!ts_aged_ns(now_ns, cr->act, cr->inact))
+                return FRCT_ACTIVE;
+
+        if ((flags & FRCT_DRF) != 0) {
+                if (!same_epoch_drf(seqno, flags, cr)) {
+                        /* Bootstrap or fresh epoch: rebase. */
+                        STAT_BUMP(frcti, drf_rebase);
+                        release_rq(frcti);
+                        STORE_RELEASE(&cr->lwe, seqno);
+                        cr->rwe   = seqno + RQ_SIZE;
+                        cr->seqno = seqno;
+                }
+                return FRCT_ACTIVE;
+        }
+
+        if ((flags & FRCT_DATA) == 0)
+                return FRCT_ACTIVE;
+
+        /* Rate-limit the NACKs. */
+        cooldown = NACK_COOLDOWN_NS;
+        if (frcti->srtt > 0)
+                cooldown = (uint64_t) frcti->srtt;
+
+        if (!ts_aged_ns(now_ns, frcti->t_nack, cooldown))
+                return FRCT_INACT_DROP;
+
+        frcti->t_nack = now_ns;
+        STAT_BUMP(frcti, nack_snd);
+
+        return FRCT_INACT_NEED_NACK;
+}
+
+/*
+ * Decide whether `seqno` may be stored in receive slot `pos`.
+ * Rejected (with the matching counter bumped) when the seqno is at or
+ * beyond rwe, at or beyond lwe + RQ_SIZE, or when the slot is already
+ * occupied. An occupied slot also records the seqno as a pending
+ * D-SACK report (RFC 2883 4, case 2). Caller holds the write lock.
+ */
+__attribute__((hot))
+static bool rq_accept(struct frcti * frcti,
+                      uint32_t       seqno,
+                      size_t         pos,
+                      uint16_t       flags)
+{
+        struct frct_cr * cr = &frcti->rcv_cr;
+
+        if (!before(seqno, cr->rwe)) {
+                STAT_BUMP(frcti, out_rcv);
+                return false;
+        }
+
+        if (!before(seqno, cr->lwe + RQ_SIZE)) {
+                STAT_BUMP(frcti, rqo_rcv);
+                return false;
+        }
+
+        if (frcti->rcv_slots[pos].idx == -1)
+                return true;
+
+        /* In-window duplicate. */
+        if ((flags & FRCT_RXM) != 0) {
+                STAT_BUMP(frcti, rxm_dup_rcv);
+        } else {
+                STAT_BUMP(frcti, dup_rcv);
+        }
+
+        frcti->dsack_seqno = seqno;
+        frcti->dsack_valid = true;
+
+        return false;
+}
+
+/*
+ * Out-of-order arrival: decide whether to send a SACK and fill `out`.
+ * No SACK when `seqno` is not after rcv lwe, the ACK deadline t_a has
+ * passed, the previous SACK is less than SACK_MIN_GAP_NS old, or the
+ * scoreboard (lwe and block count) equals the one last sent and no
+ * D-SACK is pending. On success the blocks, cum-ACK and advertised
+ * window are stored in `out` and the dedup state is updated.
+ */
+static bool sack_check(struct frcti *     frcti,
+                       uint32_t           seqno,
+                       uint64_t           now_ns,
+                       struct sack_args * out)
+{
+        struct frct_cr * cr = &frcti->rcv_cr;
+        uint16_t         cnt;
+
+        if (!after(seqno, cr->lwe))
+                return false;
+
+        STAT_BUMP(frcti, ooo_rcv);
+
+        /* A SACK carries the cum-ACK, so it obeys t_a as well. */
+        if (ACK_AGED_OUT(cr->act, now_ns, frcti->t_a))
+                return false;
+
+        /* Fixed floor between SACKs. */
+        if (!ts_aged_ns(now_ns, frcti->t_snd_sack,
+                        (uint64_t) SACK_MIN_GAP_NS))
+                return false;
+
+        out->dsack = false;
+        cnt = dsack_consume(frcti, out->blocks);
+        if (cnt == 1)
+                out->dsack = true;
+        cnt += sack_blocks_build(frcti, out->blocks + cnt,
+                                 frcti->sack_n_max - cnt);
+
+        /* Same scoreboard as last time and nothing new to report. */
+        if (!out->dsack
+            && cr->lwe == frcti->sack_lwe
+            && cnt == frcti->sack_n)
+                return false;
+
+        out->n   = cnt;
+        out->ack = cr->lwe;
+        out->rwe = frcti_advert_rwe(frcti);
+
+        frcti->t_snd_sack = now_ns;
+        frcti->sack_lwe   = cr->lwe;
+        frcti->sack_n     = cnt;
+
+        return true;
+}
+
+/*
+ * True for first-transmission DATA (FRCT_DATA set, FRCT_RXM clear)
+ * whose seqno lies before `lwe`.
+ */
+static __inline__ bool is_dup_data(uint16_t flags,
+                                   uint32_t seqno,
+                                   uint32_t lwe)
+{
+        return (flags & FRCT_DATA) != 0
+                && (flags & FRCT_RXM) == 0
+                && before(seqno, lwe);
+}
+
+/*
+ * Detect a duplicated pure-ACK packet: FRCT_ACK without FRCT_DATA and
+ * the same seqno as the one remembered in rcv_cr.ackno. A fresh ACK
+ * updates the remembered seqno. The caller drops the packet on true.
+ */
+static __inline__ bool is_dup_ack(struct frcti * frcti,
+                                  uint16_t       flags,
+                                  uint32_t       seqno)
+{
+        if ((flags & FRCT_DATA) != 0 || (flags & FRCT_ACK) == 0)
+                return false;
+
+        if (frcti->rcv_cr.ackno == seqno)
+                return true;
+
+        frcti->rcv_cr.ackno = seqno;
+
+        return false;
+}
+
+/*
+ * Start a new send epoch with a random seqno. Only done when the
+ * sender inactivity timer has expired and nothing is in flight, so
+ * that no pending retransmission is orphaned. The new seqno is
+ * redrawn until in_window() rejects it; lwe, rwe (START_WINDOW),
+ * rtt_lwe and the recovery state are reset to match.
+ * Caller holds the write lock.
+ */
+__attribute__((cold))
+static void seqno_rotate(struct frcti * frcti,
+                         uint64_t       now_ns)
+{
+        struct frct_cr * cr = &frcti->snd_cr;
+        uint32_t         fresh;
+
+        if (!ts_aged_ns(now_ns, cr->act, cr->inact)
+            || cr->seqno != cr->lwe)
+                return;
+
+        /* Stay clear of the peer's current rcv window. */
+        for (;;) {
+                random_buffer(&cr->seqno, sizeof(cr->seqno));
+                if (!in_window(cr->seqno, cr))
+                        break;
+        }
+
+        fresh = cr->seqno;
+
+        STORE_RELEASE(&cr->lwe, fresh);
+        STORE_RELAXED(&cr->rwe, fresh + START_WINDOW);
+
+        frcti->rtt_lwe       = fresh;
+        frcti->in_recovery   = false;
+        frcti->recovery_high = fresh;
+}
+
+__attribute__((hot))
+static int frcti_snd(struct frcti * frcti,
+ struct ssm_pk_buff * spb,
+ uint16_t flags)
+{
+ struct frct_pci * pci;
+ struct frct_pci_stream * spci = NULL;
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ uint32_t seqno;
+ uint16_t pci_flags = 0;
+ bool rtx;
+ uint64_t now_ns;
+ int64_t rcv_idle;
+ uint32_t probe_id = 0;
+ uint8_t probe_nonce[RTTP_NONCE_LEN] = { 0 };
+ bool probe;
+ size_t payload_len = 0;
assert(frcti);
- assert(ssm_pk_buff_len(spb) != 0);
+ /*
+ * Pushes and fills the FRCT header of an outgoing DATA packet and
+ * advances snd_cr.seqno. Returns 0, or -ENOMEM when the header
+ * cannot be pushed onto spb. `flags` supplies the FRCT_FR_MASK bits
+ * (message mode) or FRCT_FIN (stream mode).
+ * Stream mode permits 0-byte sends for the EOS marker.
+ */
+ assert(ssm_pk_buff_len(spb) != 0 || frcti->stream);
snd_cr = &frcti->snd_cr;
rcv_cr = &frcti->rcv_cr;
- timerwheel_move();
+ tw_move_safe();
+
+ if (frcti->stream) /* payload length is taken before the header is pushed */
+ payload_len = ssm_pk_buff_len(spb);
- pci = (struct frct_pci *) ssm_pk_buff_head_alloc(spb, FRCT_PCILEN);
+ pci = FRCT_HDR_PUSH(spb, frcti);
if (pci == NULL)
return -ENOMEM;
- memset(pci, 0, sizeof(*pci));
+ memset(pci, 0, FRCT_PCILEN);
+
+ if (frcti->stream)
+ spci = FRCT_SPCI(pci);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
pthread_rwlock_wrlock(&frcti->lock);
rtx = snd_cr->cflags & FRCTFRTX;
- pci->flags |= FRCT_DATA;
+ pci_flags |= FRCT_DATA;
+ if (!frcti->stream)
+ pci_flags |= (flags & FRCT_FR_MASK);
- /* Set DRF if there are no unacknowledged packets. */
- if (snd_cr->seqno == snd_cr->lwe)
- pci->flags |= FRCT_DRF;
+ if (!frcti->stream && (flags & FRCT_FR_MASK) != FRCT_FR_SOLE)
+ STAT_BUMP(frcti, frag_snd);
- /* Choose a new sequence number if sender inactivity expired. */
- if (now.tv_sec - snd_cr->act.tv_sec > snd_cr->inact) {
- /* There are no unacknowledged packets. */
- assert(snd_cr->seqno == snd_cr->lwe);
- random_buffer(&snd_cr->seqno, sizeof(snd_cr->seqno));
- snd_cr->lwe = snd_cr->seqno;
- snd_cr->rwe = snd_cr->lwe + START_WINDOW;
+ if (frcti->stream) {
+ if (flags & FRCT_FIN)
+ pci_flags |= FRCT_FIN;
+
+ spci->start = hton32(frcti->snd_byte_next);
+ frcti->snd_byte_next += (uint32_t) payload_len;
+ spci->end = hton32(frcti->snd_byte_next);
+ STAT_ADD(frcti, strm_snd_byte, payload_len);
}
+ if (snd_cr->seqno == snd_cr->lwe) /* nothing in flight: mark DRF */
+ pci_flags |= FRCT_DRF;
+
+ seqno_rotate(frcti, now_ns); /* may pick a fresh seqno after sender inactivity */
+
seqno = snd_cr->seqno;
pci->seqno = hton32(seqno);
- if (now.tv_sec - rcv_cr->act.tv_sec < rcv_cr->inact) {
- pci->flags |= FRCT_FC;
- *((uint32_t *) pci) |= hton32(rcv_cr->rwe & 0x00FFFFFF);
+ rcv_idle = ts_age_ns(now_ns, rcv_cr->act);
+
+ if (rcv_idle < (int64_t) rcv_cr->inact) { /* rcv side active: piggyback window */
+ pci_flags |= FRCT_FC;
+ pci->window = hton32(frcti_advert_rwe(frcti));
}
if (!rtx) {
- snd_cr->lwe++;
+ STORE_RELEASE(&snd_cr->lwe, snd_cr->lwe + 1); /* no FRTX: lwe moves with every send */
+ STORE_RELEASE(&snd_cr->rwe, snd_cr->lwe + RQ_SIZE);
} else {
- if (!frcti->probe) {
- frcti->rttseq = snd_cr->seqno;
- frcti->t_probe = now;
- frcti->probe = true;
-#ifdef PROC_FLOW_STATS
- frcti->n_prb++;
-#endif
- }
- if ((now.tv_sec - rcv_cr->act.tv_sec) * BILLION <= frcti->a) {
- pci->flags |= FRCT_ACK;
+ size_t p = RQ_SLOT(seqno);
+ frcti->snd_slots[p].time = now_ns;
+ /* Fresh send clears RTX bits. */
+ frcti->snd_slots[p].flags = 0;
+ if (rcv_idle <= (int64_t) frcti->t_a) { /* within t_a: piggyback cum-ACK */
+ pci_flags |= FRCT_ACK;
pci->ackno = hton32(rcv_cr->lwe);
rcv_cr->seqno = rcv_cr->lwe;
}
}
+ pci->flags = hton16(pci_flags);
+
+ frct_hcs_set(pci, frcti->stream);
+
snd_cr->seqno++;
- snd_cr->act = now;
+ STORE_RELEASE(&snd_cr->act, now_ns);
+
+ probe = rtt_probe_arm(frcti, now_ns, &probe_id, probe_nonce);
pthread_rwlock_unlock(&frcti->lock);
- if (rtx)
- timerwheel_rxm(frcti, seqno, spb);
+ if (probe) /* probe was armed under the lock; sent after unlocking */
+ frcti_rttp_snd(frcti, probe_id, 0, probe_nonce);
+
+ if (rtx) { /* FRTX: arm the retransmission and tail-loss-probe timers */
+ rxm_arm(frcti, seqno, spb);
+ tlp_arm(frcti);
+ }
return 0;
}
-static void rtt_estimator(struct frcti * frcti,
- time_t mrtt)
+/*
+ * Stream: 0-byte FRCT_FIN DATA so peer's flow_read returns 0 at this
+ * byte. Msg: control packet with FRCT_FIN flag, snd_cr.seqno carried
+ * in pci->ackno (sender packs via frcti_pkt_snd's ackno parameter).
+ * No-op unless the send side has FRCTFLINGER set. Only the first call
+ * emits a FIN: snd_fin_sent is claimed under the write lock.
+ * NOTE(review): in stream mode snd_fin_sent stays true when reserving,
+ * building or transmitting the FIN fails, and snd_fin_seqno is only
+ * set after a successful frct_tx - confirm that readers of
+ * snd_fin_seqno tolerate this.
+ */
+static void frcti_fin_snd(struct frcti * frcti)
{
- time_t srtt = frcti->srtt;
- time_t rttvar = frcti->mdev;
+ struct ssm_pk_buff * spb;
+ bool already;
+ uint32_t fin_seqno;
- if (srtt == 0) { /* first measurement */
- srtt = mrtt;
- rttvar = mrtt >> 1;
- } else {
- time_t delta = mrtt - srtt;
- srtt += (delta >> 3);
- delta = (ABS(delta) - rttvar) >> 2;
-#ifdef FRCT_LINUX_RTT_ESTIMATOR
- if (delta < 0)
- delta >>= 3;
-#endif
- rttvar += delta;
+ if (!(frcti->snd_cr.cflags & FRCTFLINGER))
+ return;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ already = frcti->snd_fin_sent;
+ frcti->snd_fin_sent = true;
+ fin_seqno = frcti->snd_cr.seqno;
+
+ if (!already && !frcti->stream)
+ frcti->snd_fin_seqno = fin_seqno;
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ if (already)
+ return;
+
+ if (!frcti->stream) {
+ frcti_pkt_snd(frcti, FRCT_FIN, fin_seqno, 0);
+ return;
}
-#ifdef PROC_FLOW_STATS
- frcti->n_rtt++;
-#endif
- frcti->srtt = MAX(1000L, srtt);
- frcti->mdev = MAX(100L, rttvar);
- frcti->rto = MAX(RTO_MIN, frcti->srtt + (frcti->mdev << MDEV_MUL));
-}
-
-/* Always queues the next application packet on the RQ. */
-static void __frcti_rcv(struct frcti * frcti,
- struct ssm_pk_buff * spb)
-{
- ssize_t idx;
- size_t pos;
- struct frct_pci * pci;
- struct timespec now;
- struct frct_cr * rcv_cr;
- struct frct_cr * snd_cr;
- uint32_t seqno;
- uint32_t ackno;
- uint32_t rwe;
- int fd = -1;
- assert(frcti);
+ if (frct_spb_reserve(frcti_data_hdr_len(frcti), &spb) < 0)
+ return;
+ /* Reset spb to 0-len so frcti_snd's head_alloc populates PCI. */
+ ssm_pk_buff_truncate(spb, 0);
+
+ if (frcti_snd(frcti, spb, FRCT_FIN) < 0) {
+ frct_spb_release(spb);
+ return;
+ }
+
+ if (frct_tx(frcti, spb) < 0)
+ return;
+
+ pthread_rwlock_wrlock(&frcti->lock);
+
+ /* frcti_snd advanced seqno by one: the FIN took seqno - 1. */
+ frcti->snd_fin_seqno = frcti->snd_cr.seqno - 1;
+
+ pthread_rwlock_unlock(&frcti->lock);
+}
+
+/*
+ * Tell whether a last ACK should still be sent: true only when lwe
+ * differs from seqno and ACK_AGED_OUT() does not hold for rcv_cr->act.
+ */
+static bool final_ack_due(struct frcti *   frcti,
+                          struct frct_cr * rcv_cr,
+                          uint64_t         now_ns)
+{
+        bool pending;
+
+        /* Nothing to send when lwe already equals seqno. */
+        pending = rcv_cr->lwe != rcv_cr->seqno;
+
+        return pending && !ACK_AGED_OUT(rcv_cr->act, now_ns, frcti->t_a);
+}
+
+/* True when FRCTFLINGER is set and snd_cr->lwe is still before edge. */
+static __inline__ bool snd_drain_pending(struct frct_cr * snd_cr,
+                                         uint32_t         edge)
+{
+        bool linger;
+
+        linger = (snd_cr->cflags & FRCTFLINGER) != 0;
+
+        return linger && before(snd_cr->lwe, edge);
+}
+
+/*
+ * True while the peer's FIN has not been seen and ts_aged_ns() does not
+ * report rcv_cr->act as aged past rcv_cr->inact.
+ */
+static __inline__ bool rcv_drain_pending(struct frcti *   frcti,
+                                         struct frct_cr * rcv_cr,
+                                         uint64_t         now_ns)
+{
+        return !frcti->rcv_fin_seen
+                && !ts_aged_ns(now_ns, rcv_cr->act, rcv_cr->inact);
+}
+
+/*
+ * Drain-loop predicate: snd-side unACK'd data OR peer still active.
+ * Calls frcti_fin_snd() before taking the read lock (that function
+ * takes the write lock itself), samples the clock, then evaluates
+ * snd_drain_pending() and rcv_drain_pending() under the read lock.
+ * The send edge is snd_fin_seqno once snd_fin_sent is set, otherwise
+ * the current snd_cr->seqno. Returns true while either side lingers.
+ */
+static bool frcti_lingering(struct frcti * frcti)
+{
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ uint32_t edge;
+ uint64_t now_ns;
+ bool snd_linger;
+ bool rcv_linger;
+
+ /* Idempotent; emits FIN once per side, both stream and msg. */
+ frcti_fin_snd(frcti);
+
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ pthread_rwlock_rdlock(&frcti->lock);
+
+ snd_cr = &frcti->snd_cr;
rcv_cr = &frcti->rcv_cr;
+
+ if (frcti->snd_fin_sent)
+ edge = frcti->snd_fin_seqno;
+ else
+ edge = snd_cr->seqno;
+
+ snd_linger = snd_drain_pending(snd_cr, edge);
+ rcv_linger = rcv_drain_pending(frcti, rcv_cr, now_ns);
+
+ pthread_rwlock_unlock(&frcti->lock);
+
+ return snd_linger || rcv_linger;
+}
+
+/*
+ * Last FRCT step before a flow is torn down.
+ * Makes sure the FIN went out, then snapshots (under the read lock) the
+ * receive lwe and, for both directions, act + inact minus the current
+ * time. Sends one more ACK for lwe when final_ack_due() says so.
+ * Returns the larger of the two remaining times divided by BILLION,
+ * never less than 0.
+ */
+static time_t frcti_dealloc(struct frcti * frcti)
+{
+ struct timespec now;
+ struct frct_cr * snd_cr;
+ struct frct_cr * rcv_cr;
+ uint32_t ackno; /* unsigned like the seqnos; int narrows above INT_MAX */
+ bool due;
+ int64_t now_ns;
+ int64_t rcv;
+ int64_t snd;
+
snd_cr = &frcti->snd_cr;
+ rcv_cr = &frcti->rcv_cr;
+
+ /* Idempotent; usually already sent by frcti_lingering. */
+ frcti_fin_snd(frcti);
clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
- pci = (struct frct_pci *) ssm_pk_buff_head_release(spb, FRCT_PCILEN);
+ pthread_rwlock_rdlock(&frcti->lock);
- idx = ssm_pk_buff_get_idx(spb);
- seqno = ntoh32(pci->seqno);
- pos = seqno & (RQ_SIZE - 1);
+ ackno = rcv_cr->lwe;
+ /* Signed on purpose: a deadline in the past yields a negative value. */
+ rcv = (int64_t)(rcv_cr->act + rcv_cr->inact) - now_ns;
+ snd = (int64_t)(snd_cr->act + snd_cr->inact) - now_ns;
+ due = final_ack_due(frcti, rcv_cr, now_ns);
- pthread_rwlock_wrlock(&frcti->lock);
+ pthread_rwlock_unlock(&frcti->lock);
- if (now.tv_sec - rcv_cr->act.tv_sec > rcv_cr->inact) {
- if (pci->flags & FRCT_DRF) { /* New run. */
- rcv_cr->lwe = seqno;
- rcv_cr->rwe = seqno + RQ_SIZE;
- rcv_cr->seqno = seqno;
- } else if (pci->flags & FRCT_DATA) {
- goto drop_packet;
- }
- }
+ /* Send outside the lock, using the values sampled above. */
+ if (due)
+ frcti_pkt_snd(frcti, FRCT_ACK, ackno, 0);
- rcv_cr->act = now;
+ return (time_t) MAX((MAX(rcv, snd) / BILLION), 0);
+}
- /* For now, just send an immediate window update. */
- if (pci->flags & FRCT_RDVS) {
- fd = frcti->fd;
- rwe = rcv_cr->rwe;
- pthread_rwlock_unlock(&frcti->lock);
+__attribute__((hot))
+static void frcti_rcv(struct frcti * frcti,
+ struct ssm_pk_buff * spb)
+{
+ ssize_t idx;
+ size_t pos;
+ struct frct_pci * pci;
+ struct timespec now;
+ uint64_t now_ns;
+ struct frct_cr * rcv_cr;
+ uint32_t seqno;
+ uint16_t flags;
+ buffer_t pkt;
+ struct pending pending = { 0 };
+ bool in_order;
+ struct sack_args * sa = NULL;
+ bool send_sack = false;
- __send_frct_pkt(fd, FRCT_FC, 0, rwe);
+ assert(frcti);
+
+ rcv_cr = &frcti->rcv_cr;
- ssm_pool_remove(proc.pool, idx);
+ clock_gettime(PTHREAD_COND_CLOCK, &now);
+ now_ns = TS_TO_UINT64(now);
+
+ if (ssm_pk_buff_len(spb) < FRCT_PCILEN) {
+ frct_spb_release(spb);
return;
}
- if (pci->flags & FRCT_ACK) {
- ackno = ntoh32(pci->ackno);
- if (after(ackno, frcti->snd_cr.lwe))
- frcti->snd_cr.lwe = ackno;
+ pci = FRCT_HDR_POP(spb, frct_pci);
- if (frcti->probe && after(ackno, frcti->rttseq)) {
-#ifdef PROC_FLOW_STATS
- if (!(pci->flags & FRCT_DATA))
- frcti->n_dak++;
-#endif
- rtt_estimator(frcti, ts_diff_ns(&now, &frcti->t_probe));
- frcti->probe = false;
- }
+ idx = ssm_pk_buff_get_off(spb);
+ seqno = ntoh32(pci->seqno);
+ pos = RQ_SLOT(seqno);
+
+ flags = ntoh16(pci->flags);
+
+ pkt.data = ssm_pk_buff_head(spb);
+ pkt.len = ssm_pk_buff_len(spb);
+
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_rcv);
+
+ /* Stateless / lock-free dispatches. spb released via ctrl_done. */
+ if (flags & FRCT_KA) {
+ frcti_ka_rcv(frcti, pci, now_ns, flags);
+ goto ctrl_done;
}
- if (pci->flags & FRCT_FC) {
- uint32_t rwe;
+ if (flags & FRCT_RTTP) {
+ frcti_rttp_rcv(frcti, pkt, now_ns);
+ goto ctrl_done;
+ }
- rwe = ntoh32(*((uint32_t *)pci) & hton32(0x00FFFFFF));
- rwe |= snd_cr->rwe & 0xFF000000;
+ if (flags & FRCT_NACK) {
+ frcti_nack_rcv(frcti);
+ goto ctrl_done;
+ }
+
+ if (flags & FRCT_RDVS) {
+ frcti_rdv_rcv(frcti);
+ goto ctrl_done;
+ }
- /* Rollover for 24 bit */
- if (before(rwe, snd_cr->rwe) && snd_cr->rwe - rwe > 0x007FFFFF)
- rwe += 0x01000000;
+ /* Msg-mode FIN: control packet, FIN seqno carried in pci->ackno. */
+ if ((flags & FRCT_FIN) && !(flags & FRCT_DATA)) {
+ pthread_rwlock_wrlock(&frcti->lock);
+ if (!frcti->rcv_fin_seen) {
+ frcti->rcv_fin_seen = true;
+ frcti->rcv_byte_fin = ntoh32(pci->ackno);
+ }
+ pthread_rwlock_unlock(&frcti->lock);
+ goto ctrl_done;
+ }
- snd_cr->rwe = rwe;
+ pthread_rwlock_wrlock(&frcti->lock);
- pthread_mutex_lock(&frcti->mtx);
- if (!frcti->open) {
- frcti->open = true;
- pthread_cond_broadcast(&frcti->cond);
+ /* rcv_inact_check is a no-op for non-DATA non-DRF packets. */
+ if (flags & (FRCT_DATA | FRCT_DRF)) {
+ switch (rcv_inact_check(frcti, flags, seqno, now_ns)) {
+ case FRCT_INACT_NEED_NACK:
+ pthread_rwlock_unlock(&frcti->lock);
+ frcti_nack_snd(frcti, seqno - 1);
+ frct_spb_release(spb);
+ return;
+ case FRCT_INACT_DROP:
+ STAT_BUMP(frcti, inact_drop);
+ goto drop_packet;
+ case FRCT_ACTIVE:
+ /* FALLTHRU */
+ default:
+ break;
}
- pthread_mutex_unlock(&frcti->mtx);
}
- if (!(pci->flags & FRCT_DATA))
+ /* DATA-only act refresh: non-DATA would lock out DRF rebase. */
+ if (flags & FRCT_DATA)
+ STORE_RELEASE(&rcv_cr->act, now_ns);
+
+ /* Wire-dup ACK packet: same seqno as the previous emission. */
+ if (is_dup_ack(frcti, flags, seqno)) {
+ STAT_BUMP(frcti, ack_dup_rcv);
+ goto drop_packet;
+ }
+
+ /* Wire-dup of DATA: piggybacked ACK info already processed. */
+ if (is_dup_data(flags, seqno, rcv_cr->lwe)) {
+ rcv_cr->seqno = seqno;
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 1: dup below cum-ACK. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
+ goto drop_packet;
+ }
+
+ if (flags & FRCT_ACK)
+ frcti_ack_rcv(frcti, pci, flags, now_ns, &pending);
+
+ if (flags & FRCT_SACK)
+ frcti_sack_rcv(frcti, pkt, ntoh32(pci->ackno),
+ now_ns, &pending);
+
+ if (flags & FRCT_FC)
+ frcti_fc_rcv(frcti, pci);
+
+ if (!(flags & FRCT_DATA))
goto drop_packet;
if (before(seqno, rcv_cr->lwe)) {
- rcv_cr->seqno = seqno; /* Ensures we send a new ACK. */
-#ifdef PROC_FLOW_STATS
- frcti->n_dup++;
-#endif
+ /* Bump rcv_cr.seqno to force ack_snd to fire on the dup. */
+ rcv_cr->seqno = seqno;
+ if (flags & FRCT_RXM)
+ STAT_BUMP(frcti, rxm_dup_rcv);
+ else
+ STAT_BUMP(frcti, dup_rcv);
+ /* RFC 2883 §4 case 1: dup below cum-ACK. */
+ frcti->dsack_seqno = seqno;
+ frcti->dsack_valid = true;
goto drop_packet;
}
- if (rcv_cr->cflags & FRCTFRTX) {
+ if (!rq_accept(frcti, seqno, pos, flags))
+ goto drop_packet;
- if (!before(seqno, rcv_cr->rwe)) { /* Out of window. */
-#ifdef PROC_FLOW_STATS
- frcti->n_out++;
-#endif
+ if (frcti->stream) {
+ if (frcti_stream_data_rcv(frcti, spb, pos, flags) < 0) {
+ STAT_BUMP(frcti, strm_drop);
goto drop_packet;
}
-
- if (!before(seqno, rcv_cr->lwe + RQ_SIZE)) {
-#ifdef PROC_FLOW_STATS
- frcti->n_rqo++;
-#endif
- goto drop_packet; /* Out of rq. */
- }
- if (frcti->rq[pos] != -1) {
-#ifdef PROC_FLOW_STATS
- frcti->n_dup++;
-#endif
- goto drop_packet; /* Duplicate in rq. */
- }
- fd = frcti->fd;
+ /* spb consumed by stash; do not release in drop path. */
+ spb = NULL;
} else {
- rcv_cr->lwe = seqno;
+ frcti_data_stash(frcti, idx, pos, flags);
+ }
+
+ /* Lazy alloc: only OOO arrivals can trigger a SACK send. */
+ if (after(seqno, rcv_cr->lwe) && frcti->sack_n_max > 0) {
+ size_t sa_sz = sizeof(*sa)
+ + frcti->sack_n_max * sizeof(sa->blocks[0]);
+ sa = malloc(sa_sz);
+ /*
+ * If alloc fails, sa stays NULL: sack_check is not called
+ * (short-circuit below) and no SACK is sent.
+ */
}
- frcti->rq[pos] = idx;
+ send_sack = sa != NULL && sack_check(frcti, seqno, now_ns, sa);
+ in_order = !after(seqno, rcv_cr->lwe);
pthread_rwlock_unlock(&frcti->lock);
- if (fd != -1)
- timerwheel_delayed_ack(fd, frcti);
+ if (send_sack) {
+ STAT_BUMP(frcti, sack_snd);
+ if (sa->dsack)
+ STAT_BUMP(frcti, dsack_snd);
+ frcti_sack_snd(frcti, sa);
+ } else if (in_order) {
+ ack_arm(frcti);
+ }
+
+ if ((flags & FRCT_ACK) && frcti->snd_cr.seqno != frcti->snd_cr.lwe)
+ tlp_arm(frcti);
+
+ pending_flush(frcti, &pending);
+
+ frcti_rcv_probe(frcti, now_ns);
+ free(sa);
+ return;
+
+ ctrl_done:
+ frct_spb_release(spb);
return;
drop_packet:
pthread_rwlock_unlock(&frcti->lock);
- ssm_pool_remove(proc.pool, idx);
- send_frct_pkt(frcti);
- return;
+ frct_spb_release(spb);
+ /* with_sack=true: ack_snd no-ops if neither dsack nor SACK is due. */
+ ack_snd(frcti, true);
+
+ pending_flush(frcti, &pending);
+ free(sa);
}
+
+/*
+ * NULL-shim macros for the no-FRCT case.
+ * Each wrapper tests (frcti) against NULL and yields a fixed default
+ * instead of calling into FRCT: 0 for FRCTI_SND, -EAGAIN for
+ * FRCTI_CONSUME, true for the window checks, false for the other
+ * predicates and (time_t) 0 for FRCTI_DEALLOC; FRCTI_RCV does nothing.
+ * FRCTI_CONSUME picks frcti_consume_stream or frcti_consume based on
+ * (frcti)->stream. FRCTI_PAYLOAD_CAP has no NULL test of its own;
+ * FRCTI_NEEDS_FRAG checks (frcti) before expanding it. Arguments can
+ * be evaluated more than once, so pass side-effect-free expressions.
+ */
+
+#define FRCTI_SND(frcti, spb, flags) \
+ ((frcti) == NULL ? 0 : frcti_snd((frcti), (spb), (flags)))
+
+#define FRCTI_RCV(frcti, spb) \
+ do { \
+ if ((frcti) != NULL) \
+ frcti_rcv((frcti), (spb)); \
+ } while (0)
+
+#define FRCTI_PDU_READY(frcti) \
+ ((frcti) != NULL && frcti_pdu_ready(frcti))
+
+#define FRCTI_CONSUME(frcti, buf, count) \
+ ((frcti) == NULL ? (ssize_t) -EAGAIN \
+ : (frcti)->stream \
+ ? frcti_consume_stream((frcti), (buf), (count)) \
+ : frcti_consume((frcti), (buf), (count)))
+
+#define FRCTI_IS_FRTX(frcti) \
+ ((frcti) != NULL && ((frcti)->rcv_cr.cflags & FRCTFRTX))
+
+#define FRCTI_IS_STREAM(frcti) ((frcti) != NULL && (frcti)->stream)
+
+#define FRCTI_PAYLOAD_CAP(frcti) \
+ ((frcti)->frag_mtu - frcti_data_hdr_len(frcti))
+
+#define FRCTI_NEEDS_FRAG(frcti, count) \
+ ((frcti) != NULL && (count) > FRCTI_PAYLOAD_CAP(frcti))
+
+#define FRCTI_IS_WINDOW_OPEN(frcti) \
+ ((frcti) == NULL ? true : frcti_is_window_open(frcti))
+
+#define FRCTI_IS_WINDOW_OPEN_N(frcti, n) \
+ ((frcti) == NULL ? true : frcti_is_window_open_n((frcti), (n)))
+
+#define FRCTI_LINGERING(frcti) \
+ ((frcti) == NULL ? false : frcti_lingering(frcti))
+
+#define FRCTI_DEALLOC(frcti) \
+ ((frcti) == NULL ? (time_t) 0 : frcti_dealloc(frcti))
+
diff --git a/src/lib/hash.c b/src/lib/hash.c
index 7adee968..7ffa5bc1 100644
--- a/src/lib/hash.c
+++ b/src/lib/hash.c
@@ -39,6 +39,9 @@
#include <ouroboros/md5.h>
#include <ouroboros/sha3.h>
#endif
+#include <ouroboros/crc8.h>
+#include <ouroboros/crc16.h>
+#include <ouroboros/crc64.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
@@ -69,6 +72,12 @@ int hash_len_tbl [] = {
uint16_t hash_len(enum hash_algo algo)
{
+ if (algo == HASH_CRC8)
+ return CRC8_HASH_LEN;
+ if (algo == HASH_CRC16)
+ return CRC16_HASH_LEN;
+ if (algo == HASH_CRC64)
+ return CRC64_HASH_LEN;
#ifdef HAVE_LIBGCRYPT
return (uint16_t) gcry_md_get_algo_dlen(gcry_algo_tbl[algo]);
#else
@@ -81,6 +90,27 @@ void mem_hash(enum hash_algo algo,
const uint8_t * buf,
size_t len)
{
+ if (algo == HASH_CRC8) {
+ uint8_t crc = 0;
+
+ crc8_autosar(&crc, buf, len);
+ *(uint8_t *) dst = crc;
+ return;
+ }
+ if (algo == HASH_CRC16) {
+ uint16_t crc = 0;
+
+ crc16_ccitt_false(&crc, buf, len);
+ *(uint16_t *) dst = htobe16(crc);
+ return;
+ }
+ if (algo == HASH_CRC64) {
+ uint64_t crc = 0;
+
+ crc64_nvme(&crc, buf, len);
+ *(uint64_t *) dst = htobe64(crc);
+ return;
+ }
#ifdef HAVE_LIBGCRYPT
gcry_md_hash_buffer(gcry_algo_tbl[algo], dst, buf, len);
#else
diff --git a/src/lib/pb/ipcp.proto b/src/lib/pb/ipcp.proto
index 9dc402f5..406b8d9c 100644
--- a/src/lib/pb/ipcp.proto
+++ b/src/lib/pb/ipcp.proto
@@ -54,7 +54,7 @@ message ipcp_msg {
optional int32 response = 10;
optional string comp = 11;
optional uint32 timeo_sec = 12;
- optional sint32 mpl = 13;
+ optional sint32 mpl = 13; /* MPL in ms. */
optional int32 result = 14;
optional uint32 uid = 15; /* 0 = GSPP, >0 = PUP uid */
}
diff --git a/src/lib/pb/irm.proto b/src/lib/pb/irm.proto
index 9ed0a29b..5de860a5 100644
--- a/src/lib/pb/irm.proto
+++ b/src/lib/pb/irm.proto
@@ -88,12 +88,12 @@ message irm_msg {
repeated ipcp_list_msg ipcps = 17;
repeated name_info_msg names = 18;
optional timespec_msg timeo = 19;
- optional sint32 mpl = 20;
+ optional sint32 mpl = 20; /* MPL in ms. */
optional string comp = 21;
optional bytes pk = 22; /* piggyback */
optional uint32 timeo_sec = 23;
optional uint32 timeo_nsec = 24;
optional sint32 result = 25;
- optional bytes sym_key = 26; /* symmetric encryption key */
- optional sint32 cipher_nid = 27; /* cipher NID */
+ optional bytes sym_key = 26; /* symmetric encryption key */
+ optional sint32 cipher_nid = 27; /* cipher NID */
}
diff --git a/src/lib/pb/model.proto b/src/lib/pb/model.proto
index f1382f3d..4c1564a5 100644
--- a/src/lib/pb/model.proto
+++ b/src/lib/pb/model.proto
@@ -28,7 +28,7 @@ message qosspec_msg {
required uint32 availability = 3; /* Class of 9s. */
required uint32 loss = 4; /* Packet loss. */
required uint32 ber = 5; /* Bit error rate, ppb. */
- required uint32 in_order = 6; /* In-order delivery. */
+ required uint32 service = 6; /* enum qos_service. */
required uint32 max_gap = 7; /* In ms. */
required uint32 timeout = 8; /* Timeout in ms. */
}
@@ -37,10 +37,11 @@ message flow_info_msg {
required uint32 id = 1;
required uint32 n_pid = 2;
required uint32 n_1_pid = 3;
- required uint32 mpl = 4;
+ required uint32 mpl = 4; /* MPL in ms. */
required uint32 state = 5;
required qosspec_msg qos = 6;
required uint32 uid = 7;
+ required uint32 mtu = 8; /* Layer MTU (bytes). */
}
message name_info_msg {
diff --git a/src/lib/protobuf.c b/src/lib/protobuf.c
index 28b3aab2..a824d357 100644
--- a/src/lib/protobuf.c
+++ b/src/lib/protobuf.c
@@ -81,6 +81,7 @@ flow_info_msg_t * flow_info_s_to_msg(const struct flow_info * s)
msg->mpl = s->mpl;
msg->state = s->state;
msg->uid = s->uid;
+ msg->mtu = s->mtu;
msg->qos = qos_spec_s_to_msg(&s->qs);
if (msg->qos == NULL)
goto fail_msg;
@@ -107,6 +108,7 @@ struct flow_info flow_info_msg_to_s(const flow_info_msg_t * msg)
s.mpl = msg->mpl;
s.state = msg->state;
s.uid = msg->uid;
+ s.mtu = msg->mtu;
s.qs = qos_spec_msg_to_s(msg->qos);
return s;
@@ -757,7 +759,7 @@ qosspec_msg_t * qos_spec_s_to_msg(const struct qos_spec * s)
msg->availability = s->availability;
msg->loss = s->loss;
msg->ber = s->ber;
- msg->in_order = s->in_order;
+ msg->service = s->service;
msg->max_gap = s->max_gap;
msg->timeout = s->timeout;
@@ -775,7 +777,7 @@ struct qos_spec qos_spec_msg_to_s(const qosspec_msg_t * msg)
s.availability = msg->availability;
s.loss = msg->loss;
s.ber = msg->ber;
- s.in_order = msg->in_order;
+ s.service = msg->service;
s.max_gap = msg->max_gap;
s.timeout = msg->timeout;
diff --git a/src/lib/qoscube.c b/src/lib/qoscube.c
index 1eaa0d7c..5d7ae17d 100644
--- a/src/lib/qoscube.c
+++ b/src/lib/qoscube.c
@@ -29,15 +29,11 @@
qoscube_t qos_spec_to_cube(qosspec_t qs)
{
- if (qs.delay <= qos_voice.delay &&
- qs.bandwidth <= qos_voice.bandwidth &&
- qs.availability >= qos_voice.availability &&
- qs.max_gap <= qos_voice.max_gap)
+ if (qs.delay <= 50 && qs.bandwidth <= 100000
+ && qs.availability >= 5 && qs.max_gap <= 50)
return QOS_CUBE_VOICE;
- else if (qs.delay <= qos_video.delay &&
- qs.bandwidth <= qos_video.bandwidth &&
- qs.availability >= qos_video.availability &&
- qs.max_gap <= qos_video.max_gap)
+ else if (qs.delay <= 100 && qs.availability >= 3
+ && qs.max_gap <= 100)
return QOS_CUBE_VIDEO;
else
return QOS_CUBE_BE;
diff --git a/src/lib/random.c b/src/lib/random.c
index 96315132..2c9a6c0d 100644
--- a/src/lib/random.c
+++ b/src/lib/random.c
@@ -47,8 +47,9 @@ int random_buffer(void * buf,
gcry_randomize(buf, len, GCRY_STRONG_RANDOM);
return 0;
#elif defined(HAVE_OPENSSL_RNG)
- if (len > 0 && len < INT_MAX)
- return RAND_bytes((unsigned char *) buf, (int) len);
- return -1;
+ if (len == 0 || len >= INT_MAX)
+ return -1;
+
+ return RAND_bytes((unsigned char *) buf, (int) len) == 1 ? 0 : -1;
#endif
}
diff --git a/src/lib/rib.c b/src/lib/rib.c
index a8d535c9..6e421397 100644
--- a/src/lib/rib.c
+++ b/src/lib/rib.c
@@ -112,14 +112,14 @@ static int rib_read(const char * path,
(void) info;
(void) offset;
- pthread_rwlock_wrlock(&rib.lock);
+ pthread_rwlock_rdlock(&rib.lock);
list_for_each(p, &rib.reg_comps) {
struct reg_comp * r = list_entry(p, struct reg_comp, next);
if (strcmp(comp, r->path) == 0) {
- int ret = r->ops->read(path + 1, buf, size);
+ struct rib_ops * ops = r->ops;
pthread_rwlock_unlock(&rib.lock);
- return ret;
+ return ops->read(path + 1, buf, size);
}
}
@@ -160,19 +160,25 @@ static int rib_readdir(const char * path,
ssize_t len;
ssize_t i;
struct reg_comp * c;
+ struct rib_ops * ops;
c = list_entry(p, struct reg_comp, next);
if (strcmp(path + 1, c->path) != 0)
continue;
- assert(c->ops->readdir != NULL);
+ ops = c->ops;
+
+ assert(ops->readdir != NULL);
+
+ pthread_rwlock_unlock(&rib.lock);
- len = c->ops->readdir(&dir_entries);
+ len = ops->readdir(&dir_entries);
if (len < 0)
- break;
+ return 0;
for (i = 0; i < len; ++i)
filler(buf, dir_entries[i], NULL, 0);
freepp(char, dir_entries, len);
+ return 0;
}
}
diff --git a/src/lib/ssm/flow_set.c b/src/lib/ssm/flow_set.c
index 73d0db55..cb38e6fd 100644
--- a/src/lib/ssm/flow_set.c
+++ b/src/lib/ssm/flow_set.c
@@ -58,9 +58,9 @@
#define QUEUESIZE ((SSM_RBUFF_SIZE) * sizeof(struct flowevent))
#define SSM_FSET_FILE_SIZE (SYS_MAX_FLOWS * sizeof(ssize_t) \
- + PROG_MAX_FQUEUES * sizeof(size_t) \
- + PROG_MAX_FQUEUES * sizeof(pthread_cond_t) \
- + PROG_MAX_FQUEUES * QUEUESIZE \
+ + PROC_MAX_FQUEUES * sizeof(size_t) \
+ + PROC_MAX_FQUEUES * sizeof(pthread_cond_t) \
+ + PROC_MAX_FQUEUES * QUEUESIZE \
+ sizeof(pthread_mutex_t))
#define fqueue_ptr(fs, idx) (fs->fqueues + (SSM_RBUFF_SIZE) * idx)
@@ -104,10 +104,10 @@ static struct ssm_flow_set * flow_set_create(pid_t pid,
set->mtable = shm_base;
set->heads = (size_t *) (set->mtable + SYS_MAX_FLOWS);
- set->conds = (pthread_cond_t *)(set->heads + PROG_MAX_FQUEUES);
- set->fqueues = (struct flowevent *) (set->conds + PROG_MAX_FQUEUES);
+ set->conds = (pthread_cond_t *)(set->heads + PROC_MAX_FQUEUES);
+ set->fqueues = (struct flowevent *) (set->conds + PROC_MAX_FQUEUES);
set->lock = (pthread_mutex_t *)
- (set->fqueues + PROG_MAX_FQUEUES * (SSM_RBUFF_SIZE));
+ (set->fqueues + PROC_MAX_FQUEUES * (SSM_RBUFF_SIZE));
return set;
@@ -164,7 +164,7 @@ struct ssm_flow_set * ssm_flow_set_create(pid_t pid)
if (pthread_condattr_setclock(&cattr, PTHREAD_COND_CLOCK))
goto fail_condattr_set;
#endif
- for (i = 0; i < PROG_MAX_FQUEUES; ++i) {
+ for (i = 0; i < PROC_MAX_FQUEUES; ++i) {
set->heads[i] = 0;
if (pthread_cond_init(&set->conds[i], &cattr))
goto fail_init;
@@ -222,7 +222,7 @@ void ssm_flow_set_zero(struct ssm_flow_set * set,
ssize_t i = 0;
assert(set);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -242,7 +242,7 @@ int ssm_flow_set_add(struct ssm_flow_set * set,
{
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -264,7 +264,7 @@ void ssm_flow_set_del(struct ssm_flow_set * set,
{
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -282,7 +282,7 @@ int ssm_flow_set_has(struct ssm_flow_set * set,
assert(set);
assert(!(flow_id < 0) && flow_id < SYS_MAX_FLOWS);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
pthread_mutex_lock(set->lock);
@@ -332,7 +332,7 @@ ssize_t ssm_flow_set_wait(const struct ssm_flow_set * set,
ssize_t ret = 0;
assert(set);
- assert(idx < PROG_MAX_FQUEUES);
+ assert(idx < PROC_MAX_FQUEUES);
assert(fqueue);
#ifndef HAVE_ROBUST_MUTEX
diff --git a/src/lib/ssm/pool.c b/src/lib/ssm/pool.c
index f17a6e65..5607a360 100644
--- a/src/lib/ssm/pool.c
+++ b/src/lib/ssm/pool.c
@@ -24,6 +24,7 @@
#include "config.h"
+#include <ouroboros/atomics.h>
#include <ouroboros/errno.h>
#include <ouroboros/pthread.h>
#include <ouroboros/ssm_pool.h>
@@ -75,26 +76,6 @@ static const struct ssm_size_class_cfg ssm_pup_cfg[SSM_POOL_MAX_CLASSES] = {
#define GET_SHARD_FOR_PID(pid) ((int)((pid) % SSM_POOL_SHARDS))
-#define LOAD_RELAXED(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_RELAXED))
-
-#define LOAD_ACQUIRE(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_ACQUIRE))
-
-#define STORE_RELEASE(ptr, val) \
- (__atomic_store_n(ptr, val, __ATOMIC_RELEASE))
-
-#define LOAD(ptr) \
- (__atomic_load_n(ptr, __ATOMIC_SEQ_CST))
-
-#define STORE(ptr, val) \
- (__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST))
-
-#define FETCH_ADD(ptr, val) \
- (__atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST))
-
-#define FETCH_SUB(ptr, val) \
- (__atomic_fetch_sub(ptr, val, __ATOMIC_SEQ_CST))
#define SSM_FILE_SIZE (SSM_POOL_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr))
#define SSM_GSPP_FILE_SIZE (SSM_GSPP_TOTAL_SIZE + sizeof(struct _ssm_pool_hdr))
@@ -165,29 +146,6 @@ static __inline__ void list_add_head(struct _ssm_list_head * head,
STORE(&head->count, LOAD(&head->count) + 1);
}
-static __inline__ int select_size_class(struct ssm_pool * pool,
- size_t len)
-{
- size_t sz;
- int i;
-
- assert(pool != NULL);
-
- /* Total space needed: header + headspace + data + tailspace */
- sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_HEADSPACE + len
- + SSM_PK_BUFF_TAILSPACE;
-
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- struct _ssm_size_class * sc;
-
- sc = &pool->hdr->size_classes[i];
- if (sc->object_size > 0 && sz <= sc->object_size)
- return i;
- }
-
- return -1;
-}
-
static __inline__ int find_size_class_for_offset(struct ssm_pool * pool,
size_t offset)
{
@@ -702,7 +660,7 @@ ssize_t ssm_pool_alloc(struct ssm_pool * pool,
assert(pool != NULL);
assert(spb != NULL);
- idx = select_size_class(pool, count);
+ idx = select_size_class(pool->hdr, count);
if (idx >= 0)
return alloc_from_sc(pool, idx, count, ptr, spb);
@@ -720,7 +678,7 @@ ssize_t ssm_pool_alloc_b(struct ssm_pool * pool,
assert(pool != NULL);
assert(spb != NULL);
- idx = select_size_class(pool, count);
+ idx = select_size_class(pool->hdr, count);
if (idx >= 0)
return alloc_from_sc_b(pool, idx, count, ptr, spb, abstime);
@@ -746,7 +704,7 @@ ssize_t ssm_pool_read(uint8_t ** dst,
}
struct ssm_pk_buff * ssm_pool_get(struct ssm_pool * pool,
- size_t off)
+ size_t off)
{
struct ssm_pk_buff * blk;
@@ -825,36 +783,36 @@ int ssm_pool_remove(struct ssm_pool * pool,
return 0;
}
-size_t ssm_pk_buff_get_idx(struct ssm_pk_buff * spb)
+size_t ssm_pk_buff_get_off(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
return spb->off;
}
-uint8_t * ssm_pk_buff_head(struct ssm_pk_buff * spb)
+uint8_t * ssm_pk_buff_head(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
- return spb->data + spb->pk_head;
+ return (uint8_t *) spb->data + spb->pk_head;
}
-uint8_t * ssm_pk_buff_tail(struct ssm_pk_buff * spb)
+uint8_t * ssm_pk_buff_tail(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
- return spb->data + spb->pk_tail;
+ return (uint8_t *) spb->data + spb->pk_tail;
}
-size_t ssm_pk_buff_len(struct ssm_pk_buff * spb)
+size_t ssm_pk_buff_len(const struct ssm_pk_buff * spb)
{
assert(spb != NULL);
return spb->pk_tail - spb->pk_head;
}
-uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_push(struct ssm_pk_buff * spb,
+ size_t size)
{
assert(spb != NULL);
@@ -866,8 +824,8 @@ uint8_t * ssm_pk_buff_head_alloc(struct ssm_pk_buff * spb,
return spb->data + spb->pk_head;
}
-uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_push_tail(struct ssm_pk_buff * spb,
+ size_t size)
{
uint8_t * buf;
@@ -883,8 +841,8 @@ uint8_t * ssm_pk_buff_tail_alloc(struct ssm_pk_buff * spb,
return buf;
}
-uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_pop(struct ssm_pk_buff * spb,
+ size_t size)
{
uint8_t * buf;
@@ -898,8 +856,8 @@ uint8_t * ssm_pk_buff_head_release(struct ssm_pk_buff * spb,
return buf;
}
-uint8_t * ssm_pk_buff_tail_release(struct ssm_pk_buff * spb,
- size_t size)
+uint8_t * ssm_pk_buff_pop_tail(struct ssm_pk_buff * spb,
+ size_t size)
{
assert(spb != NULL);
assert(!(size > spb->pk_tail - spb->pk_head));
diff --git a/src/lib/ssm/rbuff.c b/src/lib/ssm/rbuff.c
index e4558c31..c149c306 100644
--- a/src/lib/ssm/rbuff.c
+++ b/src/lib/ssm/rbuff.c
@@ -80,6 +80,7 @@ struct ssm_rbuff {
pthread_cond_t * del; /* signal when data removed */
pid_t pid; /* pid of the owner */
int flow_id; /* flow_id of the flow */
+ size_t n_users; /* in-flight users */
};
#define MM_FLAGS (PROT_READ | PROT_WRITE)
@@ -119,6 +120,7 @@ static struct ssm_rbuff * rbuff_create(pid_t pid,
rb->del = rb->add + 1;
rb->pid = pid;
rb->flow_id = flow_id;
+ rb->n_users = 0;
return rb;
@@ -228,11 +230,20 @@ void ssm_rbuff_close(struct ssm_rbuff * rb)
{
assert(rb);
+ /*
+ * Caller must set ACL_FLOWDOWN first; if a user becomes
+ * cancellable, push a cleanup that decrements n_users.
+ */
+ while (__atomic_load_n(&rb->n_users, __ATOMIC_SEQ_CST) > 0) {
+ struct timespec tic = { 0, 100000 };
+ nanosleep(&tic, NULL);
+ }
+
rbuff_destroy(rb);
}
int ssm_rbuff_write(struct ssm_rbuff * rb,
- size_t idx)
+ size_t off)
{
size_t acl;
bool was_empty;
@@ -240,6 +251,8 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
if (acl != ACL_RDWR) {
if (acl & ACL_FLOWDOWN) {
@@ -261,7 +274,7 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
was_empty = IS_EMPTY(rb);
- HEAD(rb) = (ssize_t) idx;
+ HEAD(rb) = (ssize_t) off;
ADVANCE_HEAD(rb);
if (was_empty)
@@ -269,16 +282,18 @@ int ssm_rbuff_write(struct ssm_rbuff * rb,
pthread_mutex_unlock(rb->mtx);
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return 0;
fail_mutex:
pthread_mutex_unlock(rb->mtx);
fail_acl:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
int ssm_rbuff_write_b(struct ssm_rbuff * rb,
- size_t idx,
+ size_t off,
const struct timespec * abstime)
{
size_t acl;
@@ -287,6 +302,8 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
if (acl != ACL_RDWR) {
if (acl & ACL_FLOWDOWN) {
@@ -316,7 +333,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
if (ret != -ETIMEDOUT && ret != -EFLOWDOWN) {
was_empty = IS_EMPTY(rb);
- HEAD(rb) = (ssize_t) idx;
+ HEAD(rb) = (ssize_t) off;
ADVANCE_HEAD(rb);
if (was_empty)
pthread_cond_broadcast(rb->add);
@@ -325,6 +342,7 @@ int ssm_rbuff_write_b(struct ssm_rbuff * rb,
pthread_mutex_unlock(rb->mtx);
fail_acl:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
@@ -351,11 +369,21 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb)
assert(rb != NULL);
- if (IS_EMPTY(rb))
- return check_rb_acl(rb);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
+ if (IS_EMPTY(rb)) {
+ ret = check_rb_acl(rb);
+ goto out;
+ }
robust_mutex_lock(rb->mtx);
+ if (IS_EMPTY(rb)) {
+ pthread_mutex_unlock(rb->mtx);
+ ret = check_rb_acl(rb);
+ goto out;
+ }
+
ret = TAIL(rb);
ADVANCE_TAIL(rb);
@@ -363,6 +391,8 @@ ssize_t ssm_rbuff_read(struct ssm_rbuff * rb)
pthread_mutex_unlock(rb->mtx);
+ out:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return ret;
}
@@ -374,9 +404,13 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
acl = __atomic_load_n(rb->acl, __ATOMIC_SEQ_CST);
- if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN))
- return -EFLOWDOWN;
+ if (IS_EMPTY(rb) && (acl & ACL_FLOWDOWN)) {
+ idx = -EFLOWDOWN;
+ goto out;
+ }
robust_mutex_lock(rb->mtx);
@@ -402,6 +436,8 @@ ssize_t ssm_rbuff_read_b(struct ssm_rbuff * rb,
assert(idx != -EAGAIN);
+ out:
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
return idx;
}
@@ -410,7 +446,11 @@ void ssm_rbuff_set_acl(struct ssm_rbuff * rb,
{
assert(rb != NULL);
+ robust_mutex_lock(rb->mtx);
__atomic_store_n(rb->acl, (size_t) flags, __ATOMIC_SEQ_CST);
+ pthread_cond_broadcast(rb->add);
+ pthread_cond_broadcast(rb->del);
+ pthread_mutex_unlock(rb->mtx);
}
uint32_t ssm_rbuff_get_acl(struct ssm_rbuff * rb)
@@ -424,6 +464,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb)
{
assert(rb != NULL);
+ __atomic_fetch_add(&rb->n_users, 1, __ATOMIC_SEQ_CST);
+
robust_mutex_lock(rb->mtx);
pthread_cleanup_push(__cleanup_mutex_unlock, rb->mtx);
@@ -432,6 +474,8 @@ void ssm_rbuff_fini(struct ssm_rbuff * rb)
robust_wait(rb->del, rb->mtx, NULL);
pthread_cleanup_pop(true);
+
+ __atomic_fetch_sub(&rb->n_users, 1, __ATOMIC_SEQ_CST);
}
size_t ssm_rbuff_queued(struct ssm_rbuff * rb)
diff --git a/src/lib/ssm/ssm.h.in b/src/lib/ssm/ssm.h.in
index b9246c8b..b86327a1 100644
--- a/src/lib/ssm/ssm.h.in
+++ b/src/lib/ssm/ssm.h.in
@@ -38,7 +38,6 @@
#define SSM_RBUFF_PREFIX "@SSM_RBUFF_PREFIX@"
#define SSM_FLOW_SET_PREFIX "@SSM_FLOW_SET_PREFIX@"
#define SSM_POOL_NAME "@SSM_POOL_NAME@"
-#define SSM_POOL_BLOCKS @SSM_POOL_BLOCKS@
#define SSM_RBUFF_SIZE @SSM_RBUFF_SIZE@
/* Packet buffer space reservation */
@@ -164,6 +163,24 @@ struct _ssm_pool_hdr {
struct _ssm_size_class size_classes[SSM_POOL_MAX_CLASSES];
};
+#define SSM_PK_BUFF_TOTALSPACE (SSM_PK_BUFF_HEADSPACE + SSM_PK_BUFF_TAILSPACE)
+
+/*
+ * Find the first size class, in index order, whose object_size can
+ * hold a packet of len bytes plus the ssm_pk_buff header and the
+ * reserved head and tail space.
+ * Returns the class index, or -1 when no configured class fits or when
+ * the total size does not fit in a size_t.
+ */
+static __inline__ int select_size_class(struct _ssm_pool_hdr * hdr,
+                                        size_t                 len)
+{
+        size_t sz;
+        int    i;
+
+        sz = sizeof(struct ssm_pk_buff) + SSM_PK_BUFF_TOTALSPACE + len;
+
+        /* Unsigned wrap-around: a huge len must not match a small class. */
+        if (sz < len)
+                return -1;
+
+        for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
+                struct _ssm_size_class * sc = &hdr->size_classes[i];
+
+                /* object_size == 0 marks a class that is not configured. */
+                if (sc->object_size > 0 && sz <= sc->object_size)
+                        return i;
+        }
+
+        return -1;
+}
+
+
#ifdef __cplusplus
}
#endif
diff --git a/src/lib/ssm/tests/pool_sharding_test.c b/src/lib/ssm/tests/pool_sharding_test.c
index c53105e3..ec464a92 100644
--- a/src/lib/ssm/tests/pool_sharding_test.c
+++ b/src/lib/ssm/tests/pool_sharding_test.c
@@ -80,19 +80,13 @@ static int test_lazy_distribution(void)
goto fail_pool;
}
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc_idx = i;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- printf(" Class %d: count=%zu\n", i,
+ printf(" Class %d: object_size=%zu count=%zu\n", i,
+ hdr->size_classes[i].object_size,
hdr->size_classes[i].object_count);
}
goto fail_pool;
@@ -137,7 +131,6 @@ static int test_shard_migration(void)
ssize_t off;
int shard_idx;
int sc_idx;
- int i;
TEST_START();
@@ -149,18 +142,11 @@ static int test_shard_migration(void)
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc_idx = i;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
- goto fail;
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
+ goto fail_pool;
}
sc = &hdr->size_classes[sc_idx];
@@ -209,7 +195,6 @@ static int test_fallback_stealing(void)
size_t total_free;
size_t i;
int sc_idx;
- int c;
TEST_START();
@@ -221,18 +206,11 @@ static int test_fallback_stealing(void)
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc_idx = -1;
- for (c = 0; c < SSM_POOL_MAX_CLASSES; c++) {
- if (hdr->size_classes[c].object_count > 0) {
- sc_idx = c;
- break;
- }
- }
-
+ /* Inspect the class that TEST_SIZE allocations will use */
+ sc_idx = select_size_class(hdr, TEST_SIZE);
if (sc_idx < 0) {
- printf("No size classes configured.\n");
- goto fail;
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
+ goto fail_pool;
}
sc = &hdr->size_classes[sc_idx];
@@ -261,7 +239,7 @@ static int test_fallback_stealing(void)
/* Free them all - they go to local_shard */
for (i = 0; i < total_blocks / 2; i++) {
- size_t off = ssm_pk_buff_get_idx(spbs[i]);
+ size_t off = ssm_pk_buff_get_off(spbs[i]);
if (ssm_pool_remove(pool, off) != 0) {
printf("Remove %zu failed.\n", i);
free(spbs);
@@ -299,7 +277,7 @@ static int test_fallback_stealing(void)
/* Now all allocated blocks are in use again */
/* Cleanup - free all allocated blocks */
for (i = 0; i < total_blocks / 2; i++) {
- size_t off = ssm_pk_buff_get_idx(spbs[i]);
+ size_t off = ssm_pk_buff_get_off(spbs[i]);
ssm_pool_remove(pool, off);
}
@@ -396,20 +374,15 @@ static int test_multiprocess_sharding(void)
/* Verify blocks distributed across shards */
hdr = get_pool_hdr(pool);
- /* Find the first size class with blocks */
- sc = NULL;
- for (i = 0; i < SSM_POOL_MAX_CLASSES; i++) {
- if (hdr->size_classes[i].object_count > 0) {
- sc = &hdr->size_classes[i];
- break;
- }
- }
-
- if (sc == NULL) {
- printf("No size classes configured.\n");
+ /* Inspect the class that TEST_SIZE allocations used */
+ i = select_size_class(hdr, TEST_SIZE);
+ if (i < 0) {
+ printf("No size class fits TEST_SIZE=%d.\n", TEST_SIZE);
goto fail_pool;
}
+ sc = &hdr->size_classes[i];
+
/* After children allocate and free, blocks should be in shards
* (though exact distribution depends on PID values)
*/
diff --git a/src/lib/ssm/tests/pool_test.c b/src/lib/ssm/tests/pool_test.c
index 3fc19cd5..0f9db24d 100644
--- a/src/lib/ssm/tests/pool_test.c
+++ b/src/lib/ssm/tests/pool_test.c
@@ -741,14 +741,14 @@ static int test_ssm_pk_buff_operations(void)
memcpy(head, data, dlen);
- tail = ssm_pk_buff_tail_alloc(spb, 32);
+ tail = ssm_pk_buff_push_tail(spb, 32);
if (tail == NULL) {
- printf("Tail_alloc failed.\n");
+ printf("push_tail failed.\n");
goto fail_ops;
}
if (ssm_pk_buff_len(spb) != POOL_256 + 32) {
- printf("Length after tail_alloc: %zu.\n",
+ printf("Length after push_tail: %zu.\n",
ssm_pk_buff_len(spb));
goto fail_ops;
}
@@ -758,14 +758,14 @@ static int test_ssm_pk_buff_operations(void)
goto fail_ops;
}
- tail = ssm_pk_buff_tail_release(spb, 32);
+ tail = ssm_pk_buff_pop_tail(spb, 32);
if (tail == NULL) {
- printf("Tail_release failed.\n");
+ printf("pop_tail failed.\n");
goto fail_ops;
}
if (ssm_pk_buff_len(spb) != POOL_256) {
- printf("Length after tail_release: %zu.\n",
+ printf("Length after pop_tail: %zu.\n",
ssm_pk_buff_len(spb));
goto fail_ops;
}
diff --git a/src/lib/tests/CMakeLists.txt b/src/lib/tests/CMakeLists.txt
index 5a2f2c52..32836589 100644
--- a/src/lib/tests/CMakeLists.txt
+++ b/src/lib/tests/CMakeLists.txt
@@ -10,7 +10,6 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
auth_test_slh_dsa.c
bitmap_test.c
btree_test.c
- crc32_test.c
crypt_test.c
hash_test.c
kex_test.c
@@ -20,6 +19,7 @@ create_test_sourcelist(${PARENT_DIR}_tests test_suite.c
sockets_test.c
time_test.c
tpm_test.c
+ tw_test.c
)
add_executable(${PARENT_DIR}_test ${${PARENT_DIR}_tests})
diff --git a/src/lib/tests/hash_test.c b/src/lib/tests/hash_test.c
index e43847e1..451d3c25 100644
--- a/src/lib/tests/hash_test.c
+++ b/src/lib/tests/hash_test.c
@@ -39,6 +39,74 @@ struct vec_entry {
char * out;
};
+/* Check the CRC-8 output of str_hash against known test vectors. */
+static int test_crc8(void)
+{
+        struct vec_entry   vectors [] = {
+                { "",          "00" },
+                { "123456789", "df" },
+                { NULL,        NULL }
+        };
+        struct vec_entry * v;
+        int                ret = 0;
+
+        TEST_START();
+
+        /* The { NULL, NULL } entry terminates the table. */
+        for (v = vectors; v->in != NULL; ++v) {
+                uint8_t digest;
+                char    hex[3];
+
+                str_hash(HASH_CRC8, &digest, v->in);
+
+                sprintf(hex, "%02x", digest);
+                if (strcmp(hex, v->out) == 0)
+                        continue;
+
+                printf("Hash failed %s != %s.\n", hex, v->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
+/* Check the CRC-16 output of str_hash against known test vectors. */
+static int test_crc16(void)
+{
+        struct vec_entry   vectors [] = {
+                { "",          "ffff" },
+                { "123456789", "29b1" },
+                { NULL,        NULL   }
+        };
+        struct vec_entry * v;
+        int                ret = 0;
+
+        TEST_START();
+
+        /* The { NULL, NULL } entry terminates the table. */
+        for (v = vectors; v->in != NULL; ++v) {
+                uint8_t digest[2];
+                char    hex[5];
+
+                str_hash(HASH_CRC16, digest, v->in);
+
+                /* Bytes are printed in the order str_hash stored them. */
+                sprintf(hex, "%02x%02x", digest[0], digest[1]);
+                if (strcmp(hex, v->out) == 0)
+                        continue;
+
+                printf("Hash failed %s != %s.\n", hex, v->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
static int test_crc32(void)
{
int ret = 0;
@@ -74,6 +142,42 @@ static int test_crc32(void)
return ret;
}
+/* Check the CRC-64 output of str_hash against known test vectors. */
+static int test_crc64(void)
+{
+        struct vec_entry   vectors [] = {
+                { "",                 "0000000000000000" },
+                { "123456789",        "ae8b14860a799888" },
+                { "0123456789abcdef", "091485ca7018730e" },
+                { NULL,               NULL               }
+        };
+        struct vec_entry * v;
+        int                ret = 0;
+
+        TEST_START();
+
+        /* The { NULL, NULL } entry terminates the table. */
+        for (v = vectors; v->in != NULL; ++v) {
+                uint8_t digest[8];
+                char    hex[17];
+
+                str_hash(HASH_CRC64, digest, v->in);
+
+                sprintf(hex, HASH_FMT64, HASH_VAL64(digest));
+                if (strcmp(hex, v->out) == 0)
+                        continue;
+
+                printf("Hash failed %s != %s.\n", hex, v->out);
+                ret |= -1;
+        }
+
+        TEST_END(ret);
+
+        return ret;
+}
+
static int test_md5(void)
{
int ret = 0;
@@ -192,8 +296,14 @@ int hash_test(int argc,
(void) argc;
(void) argv;
+ ret |= test_crc8();
+
+ ret |= test_crc16();
+
ret |= test_crc32();
+ ret |= test_crc64();
+
ret |= test_md5();
ret |= test_sha3();
diff --git a/src/lib/tests/tpm_test.c b/src/lib/tests/tpm_test.c
index df1d8850..7cc049cd 100644
--- a/src/lib/tests/tpm_test.c
+++ b/src/lib/tests/tpm_test.c
@@ -21,7 +21,7 @@
*/
-#include "tpm.c"
+#include <ouroboros/tpm.h>
#include <test/test.h>
diff --git a/src/lib/tests/tw_test.c b/src/lib/tests/tw_test.c
new file mode 100644
index 00000000..32c302c4
--- /dev/null
+++ b/src/lib/tests/tw_test.c
@@ -0,0 +1,663 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic timing-wheel tests
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#define _DEFAULT_SOURCE
+#else
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "config.h"
+
+#include <test/test.h>
+
+#include <ouroboros/time.h>
+#include <ouroboros/tw.h>
+
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+/* Test entry: a wheel entry plus a counter that cb_count increments. */
+struct payload {
+ struct tw_entry tw;
+ int fired;
+};
+
+/* Test entry whose callback (cb_cancel_sibling) cancels another entry. */
+struct cancel_payload {
+ struct tw_entry tw;
+ int fired;
+ struct tw_entry * sibling; /* Entry passed to tw_cancel on fire. */
+};
+
+/* Test entry whose callback (cb_repost_sibling) re-posts another entry. */
+struct repost_payload {
+ struct tw_entry tw;
+ int fired;
+ struct payload * sibling; /* Payload re-posted with cb_count on fire. */
+ uint64_t repost_at; /* Deadline handed to tw_post for the sibling. */
+};
+
+/* Fire callback: bump the fire counter of the payload passed as arg. */
+static void cb_count(void * arg)
+{
+        struct payload * pl = (struct payload *) arg;
+
+        pl->fired += 1;
+}
+
+/* Fire callback: count the fire, then cancel the configured sibling. */
+static void cb_cancel_sibling(void * arg)
+{
+        struct cancel_payload * cp = (struct cancel_payload *) arg;
+
+        cp->fired += 1;
+
+        tw_cancel(cp->sibling);
+}
+
+/* Fire callback: count the fire, then re-post the sibling at repost_at. */
+static void cb_repost_sibling(void * arg)
+{
+        struct repost_payload * rp  = (struct repost_payload *) arg;
+        struct payload *        sib;
+
+        rp->fired += 1;
+
+        sib = rp->sibling;
+
+        tw_post(&sib->tw, rp->repost_at, cb_count, sib);
+}
+
+/* Read PTHREAD_COND_CLOCK and convert the timespec with TS_TO_UINT64. */
+static uint64_t now_ns(void)
+{
+        struct timespec now;
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+
+        return TS_TO_UINT64(now);
+}
+
+/* Sleep for ns, converted to a timespec with UINT64_TO_TS. */
+static void sleep_ns(uint64_t ns)
+{
+        struct timespec delay;
+
+        UINT64_TO_TS(ns, &delay);
+
+        /* The remaining-time output is not requested; no retry on EINTR. */
+        nanosleep(&delay, NULL);
+}
+
+/* tw_init followed directly by tw_fini must succeed. */
+static int test_tw_init_fini(void)
+{
+        TEST_START();
+
+        if (tw_init() >= 0) {
+                tw_fini();
+
+                TEST_SUCCESS();
+
+                return TEST_RC_SUCCESS;
+        }
+
+        printf("tw_init failed.\n");
+
+        TEST_FAIL();
+
+        return TEST_RC_FAIL;
+}
+
+/* An entry posted 5 ms ahead must fire once when tw_move runs 20 ms later. */
+static int test_tw_post_fires_after_deadline(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p);
+
+ /* Sleep well past the deadline before advancing the wheel. */
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("expected 1 fire, got %d\n", p.fired);
+ goto fail_post;
+ }
+
+ /* Cancel before tw_fini, which asserts that every slot is empty. */
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* An entry posted 100 ms ahead must not fire when tw_move runs after 2 ms. */
+static int test_tw_no_fire_before_deadline(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 100 * MILLION, cb_count, &p);
+
+ /* Advance the wheel long before the deadline. */
+ sleep_ns(2 * MILLION);
+ tw_move();
+
+ if (p.fired != 0) {
+ printf("expected 0 fires, got %d\n", p.fired);
+ goto fail_post;
+ }
+
+ /* The entry is still pending; remove it so tw_fini sees an empty wheel. */
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* An entry cancelled right after posting must not fire on a later tw_move. */
+static int test_tw_cancel_prevents_fire(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 5 * MILLION, cb_count, &p);
+ tw_cancel(&p.tw);
+
+ /* Move the wheel past the original deadline. */
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (p.fired != 0) {
+ printf("cancelled entry fired %d times\n", p.fired);
+ goto fail_init;
+ }
+
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_init:
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Cancelling an initialised but never-posted entry, twice, is harmless. */
+static int test_tw_cancel_unposted_is_noop(void)
+{
+        struct tw_entry entry;
+
+        TEST_START();
+
+        if (tw_init() < 0) {
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        tw_init_entry(&entry);
+
+        tw_cancel(&entry);
+        tw_cancel(&entry);
+
+        tw_fini();
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+}
+
+/* A fired entry must not fire again on repeated tw_move calls. */
+static int test_tw_fire_only_once(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+
+ sleep_ns(20 * MILLION);
+ /* Three back-to-back moves; only the first should find the entry. */
+ tw_move();
+ tw_move();
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("expected 1 fire, got %d after 3 moves\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/*
+ * Multi-level: post a level-1 deadline, i.e. at least 256 level-0 slots
+ * of 2^20 ns (about 268 ms) ahead; it should still fire.
+ */
+static int test_tw_post_level1_fires(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ tw_post(&p.tw, now_ns() + 300 * MILLION, cb_count, &p);
+
+ /* tw_post records the chosen level in the entry. */
+ if (p.tw.lvl != 1) {
+ printf("expected level 1 placement, got %zu\n", p.tw.lvl);
+ goto fail_post;
+ }
+
+ /* 320 ms leaves room for the coarser level-1 slot after the deadline. */
+ sleep_ns(320 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("level-1 entry didn't fire (got %d)\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Sixteen entries with staggered 1..16 ms deadlines must all fire once. */
+static int test_tw_many_entries_all_fire(void)
+{
+        struct payload entries[16];
+        const size_t   n     = sizeof(entries) / sizeof(entries[0]);
+        size_t         fired = 0;
+        size_t         k;
+        int            rc    = TEST_RC_FAIL;
+
+        TEST_START();
+
+        if (tw_init() < 0) {
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        for (k = 0; k < n; ++k) {
+                struct payload * e = &entries[k];
+
+                tw_init_entry(&e->tw);
+                e->fired = 0;
+                tw_post(&e->tw, now_ns() + (1 + k) * MILLION, cb_count, e);
+        }
+
+        /* One move after 40 ms has to cover every deadline. */
+        sleep_ns(40 * MILLION);
+        tw_move();
+
+        for (k = 0; k < n; ++k)
+                fired += entries[k].fired;
+
+        if (fired == n)
+                rc = TEST_RC_SUCCESS;
+        else
+                printf("expected 16 fires, got %zu\n", fired);
+
+        /* Empty the wheel on both outcomes before tearing it down. */
+        for (k = 0; k < n; ++k)
+                tw_cancel(&entries[k].tw);
+
+        tw_fini();
+
+        if (rc != TEST_RC_SUCCESS) {
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+}
+
+/* With nothing posted, tw_next_expiry must report tv_nsec == -1. */
+static int test_tw_next_expiry_empty(void)
+{
+        struct timespec next;
+        int             empty;
+
+        TEST_START();
+
+        if (tw_init() < 0) {
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        tw_next_expiry(&next);
+
+        empty = next.tv_nsec == -1;
+        if (!empty)
+                printf("expected tv_nsec=-1, got %ld\n", (long) next.tv_nsec);
+
+        tw_fini();
+
+        if (!empty) {
+                TEST_FAIL();
+                return TEST_RC_FAIL;
+        }
+
+        TEST_SUCCESS();
+
+        return TEST_RC_SUCCESS;
+}
+
+/*
+ * tw_next_expiry returns a deadline within the right ballpark: for a single
+ * entry posted 50 ms ahead, the reported time must lie between 2 ms before
+ * and 4 ms after the requested deadline.
+ */
+static int test_tw_next_expiry_returns_deadline(void)
+{
+ struct payload p;
+ struct timespec out;
+ uint64_t target;
+ uint64_t out_ns;
+ int64_t skew;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ target = now_ns() + 50 * MILLION;
+ tw_post(&p.tw, target, cb_count, &p);
+
+ tw_next_expiry(&out);
+ out_ns = TS_TO_UINT64(out);
+
+ /* Level-0 quantization gives ±1 slot of skew. */
+ skew = (int64_t)(out_ns) - (int64_t)(target);
+ if (skew < -2 * MILLION || skew > 4 * MILLION) {
+ printf("deadline not in -2..+4 ms, skew=%ld ns\n", (long) skew);
+ goto fail_post;
+ }
+
+ /* The entry never fired; remove it so tw_fini sees an empty wheel. */
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Repost: fire, then post again; the same entry must fire a second time. */
+static int test_tw_repost_after_fire(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ /* First round. */
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+ sleep_ns(20 * MILLION);
+ tw_move();
+ if (p.fired != 1) {
+ printf("first fire missed\n");
+ goto fail_post;
+ }
+
+ /* Second round reuses the entry without calling tw_init_entry again. */
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+ sleep_ns(20 * MILLION);
+ tw_move();
+ if (p.fired != 2) {
+ printf("second fire missed (fired=%d)\n", p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Double-post replaces the schedule; only the second fires. */
+static int test_tw_double_post_replaces(void)
+{
+ struct payload p;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&p.tw);
+ p.fired = 0;
+
+ /* The 30 ms schedule is overwritten by the 3 ms one. */
+ tw_post(&p.tw, now_ns() + 30 * MILLION, cb_count, &p);
+ tw_post(&p.tw, now_ns() + 3 * MILLION, cb_count, &p);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("expected 1 fire after replace, got %d\n", p.fired);
+ goto fail_post;
+ }
+
+ /* Move past the first (replaced) deadline; the count must not change. */
+ sleep_ns(40 * MILLION);
+ tw_move();
+
+ if (p.fired != 1) {
+ printf("first schedule fired after replace (got %d)\n",
+ p.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&p.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&p.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Fire callback may safely cancel a sibling in the same slot. */
+static int test_tw_fire_cancels_sibling(void)
+{
+ struct cancel_payload a;
+ struct payload b;
+ uint64_t deadline;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&a.tw);
+ tw_init_entry(&b.tw);
+ a.fired = 0;
+ a.sibling = &b.tw;
+ b.fired = 0;
+
+ /* Same deadline for both; a is posted first and cancels b when it fires. */
+ deadline = now_ns() + 3 * MILLION;
+ tw_post(&a.tw, deadline, cb_cancel_sibling, &a);
+ tw_post(&b.tw, deadline, cb_count, &b);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (a.fired != 1) {
+ printf("a expected 1 fire, got %d\n", a.fired);
+ goto fail_post;
+ }
+ if (b.fired != 0) {
+ printf("b should not have fired (got %d)\n", b.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Fire callback may safely repost a sibling to a future slot. */
+static int test_tw_fire_posts_sibling(void)
+{
+ struct repost_payload a;
+ struct payload b;
+ uint64_t deadline;
+
+ TEST_START();
+
+ if (tw_init() < 0)
+ goto fail;
+
+ tw_init_entry(&a.tw);
+ tw_init_entry(&b.tw);
+ a.fired = 0;
+ a.sibling = &b;
+ a.repost_at = now_ns() + 30 * MILLION;
+ b.fired = 0;
+
+ /* Same deadline for both; a is posted first and moves b to repost_at. */
+ deadline = now_ns() + 3 * MILLION;
+ tw_post(&a.tw, deadline, cb_repost_sibling, &a);
+ tw_post(&b.tw, deadline, cb_count, &b);
+
+ sleep_ns(20 * MILLION);
+ tw_move();
+
+ if (a.fired != 1) {
+ printf("a expected 1 fire, got %d\n", a.fired);
+ goto fail_post;
+ }
+ if (b.fired != 0) {
+ printf("b fired before reposted deadline (got %d)\n",
+ b.fired);
+ goto fail_post;
+ }
+
+ /* About 45 ms in total have passed, beyond the 30 ms repost deadline. */
+ sleep_ns(25 * MILLION);
+ tw_move();
+
+ if (b.fired != 1) {
+ printf("b expected 1 fire after repost, got %d\n",
+ b.fired);
+ goto fail_post;
+ }
+
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+
+ TEST_SUCCESS();
+
+ return TEST_RC_SUCCESS;
+ fail_post:
+ tw_cancel(&a.tw);
+ tw_cancel(&b.tw);
+ tw_fini();
+ fail:
+ TEST_FAIL();
+ return TEST_RC_FAIL;
+}
+
+/* Suite entry point: run every timing-wheel test and OR the results. */
+int tw_test(int     argc,
+            char ** argv)
+{
+        /* Executed in table order. */
+        static int (* const tests[])(void) = {
+                test_tw_init_fini,
+                test_tw_post_fires_after_deadline,
+                test_tw_no_fire_before_deadline,
+                test_tw_cancel_prevents_fire,
+                test_tw_cancel_unposted_is_noop,
+                test_tw_fire_only_once,
+                test_tw_post_level1_fires,
+                test_tw_many_entries_all_fire,
+                test_tw_next_expiry_empty,
+                test_tw_next_expiry_returns_deadline,
+                test_tw_repost_after_fire,
+                test_tw_double_post_replaces,
+                test_tw_fire_cancels_sibling,
+                test_tw_fire_posts_sibling
+        };
+        size_t k;
+        int    ret = 0;
+
+        (void) argc;
+        (void) argv;
+
+        for (k = 0; k < sizeof(tests) / sizeof(tests[0]); ++k)
+                ret |= tests[k]();
+
+        return ret;
+}
diff --git a/src/lib/timerwheel.c b/src/lib/timerwheel.c
deleted file mode 100644
index 2c796c96..00000000
--- a/src/lib/timerwheel.c
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- * Ouroboros - Copyright (C) 2016 - 2026
- *
- * Timerwheel
- *
- * Dimitri Staessens <dimitri@ouroboros.rocks>
- * Sander Vrijders <sander@ouroboros.rocks>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * version 2.1 as published by the Free Software Foundation.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., http://www.fsf.org/about/contact/.
- */
-
-#include <ouroboros/list.h>
-
-/* Overflow limits range to about 6 hours. */
-#define ts_to_ns(ts) (ts.tv_sec * BILLION + ts.tv_nsec)
-#define ts_to_rxm_slot(ts) (ts_to_ns(ts) >> RXMQ_RES)
-#define ts_to_ack_slot(ts) (ts_to_ns(ts) >> ACKQ_RES)
-
-struct rxm {
- struct list_head next;
- uint32_t seqno;
-#ifndef RXM_BUFFER_ON_HEAP
- struct ssm_pk_buff * spb;
-#endif
- struct frct_pci * pkt;
- size_t len;
- time_t t0; /* Time when original was sent (us). */
- struct frcti * frcti;
- int fd;
- int flow_id; /* Prevent rtx when fd reused. */
-};
-
-struct ack {
- struct list_head next;
- struct frcti * frcti;
- int fd;
- int flow_id;
-};
-
-struct {
- /*
- * At a 1 ms min resolution, every level bumps the
- * resolution by a factor of 16.
- */
- struct list_head rxms[RXMQ_LVLS][RXMQ_SLOTS];
-
- struct list_head acks[ACKQ_SLOTS];
- bool map[ACKQ_SLOTS][PROG_MAX_FLOWS];
-
- size_t prv_rxm[RXMQ_LVLS]; /* Last processed rxm slots. */
- size_t prv_ack; /* Last processed ack slot. */
- pthread_mutex_t lock;
-} rw;
-
-static void timerwheel_fini(void)
-{
- size_t i;
- size_t j;
- struct list_head * p;
- struct list_head * h;
-
- pthread_mutex_lock(&rw.lock);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- for (j = 0; j < RXMQ_SLOTS; j++) {
- list_for_each_safe(p, h, &rw.rxms[i][j]) {
- struct rxm * rxm;
- rxm = list_entry(p, struct rxm, next);
- list_del(&rxm->next);
-#ifdef RXM_BUFFER_ON_HEAP
- free(rxm->pkt);
-#else
- ssm_pk_buff_ack(rxm->spb);
- ipcp_spb_release(rxm->spb);
-#endif
- free(rxm);
- }
- }
- }
-
- for (i = 0; i < ACKQ_SLOTS; ++i) {
- list_for_each_safe(p, h, &rw.acks[i]) {
- struct ack * a = list_entry(p, struct ack, next);
- list_del(&a->next);
- free(a);
- }
- }
-
- pthread_mutex_unlock(&rw.lock);
-
- pthread_mutex_destroy(&rw.lock);
-}
-
-static int timerwheel_init(void)
-{
- struct timespec now;
- size_t i;
- size_t j;
-
- if (pthread_mutex_init(&rw.lock, NULL))
- return -1;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- rw.prv_rxm[i] = (ts_to_rxm_slot(now) - 1);
- rw.prv_rxm[i] >>= (RXMQ_BUMP * i);
- rw.prv_rxm[i] &= (RXMQ_SLOTS - 1);
- for (j = 0; j < RXMQ_SLOTS; ++j)
- list_head_init(&rw.rxms[i][j]);
- }
-
- rw.prv_ack = (ts_to_ack_slot(now) - 1) & (ACKQ_SLOTS - 1);
- for (i = 0; i < ACKQ_SLOTS; ++i)
- list_head_init(&rw.acks[i]);
-
- return 0;
-}
-
-static void timerwheel_move(void)
-{
- struct timespec now;
- struct list_head * p;
- struct list_head * h;
- size_t rxm_slot;
- size_t ack_slot;
- size_t i;
- size_t j;
-
- pthread_mutex_lock(&rw.lock);
-
- pthread_cleanup_push(__cleanup_mutex_unlock, &rw.lock);
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- rxm_slot = ts_to_rxm_slot(now);
-
- for (i = 0; i < RXMQ_LVLS; ++i) {
- size_t j_max_slot = rxm_slot & (RXMQ_SLOTS - 1);
- j = rw.prv_rxm[i];
- if (j_max_slot < j)
- j_max_slot += RXMQ_SLOTS;
- while (j++ < j_max_slot) {
- list_for_each_safe(p, h,
- &rw.rxms[i][j & (RXMQ_SLOTS - 1)]) {
- struct rxm * r;
- struct frct_cr * snd_cr;
- struct frct_cr * rcv_cr;
- size_t slot;
- size_t rslot;
- ssize_t idx;
- struct ssm_pk_buff * spb;
- struct frct_pci * pci;
- struct flow * f;
- uint32_t snd_lwe;
- uint32_t rcv_lwe;
- size_t lvl = 0;
-
- r = list_entry(p, struct rxm, next);
-
- list_del(&r->next);
-
- snd_cr = &r->frcti->snd_cr;
- rcv_cr = &r->frcti->rcv_cr;
- f = &proc.flows[r->fd];
-#ifndef RXM_BUFFER_ON_HEAP
- ssm_pk_buff_ack(r->spb);
-#endif
- if (f->frcti == NULL
- || f->info.id != r->flow_id)
- goto cleanup;
-
- pthread_rwlock_rdlock(&r->frcti->lock);
-
- snd_lwe = snd_cr->lwe;
- rcv_lwe = rcv_cr->lwe;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- /* Has been ack'd, remove. */
- if (before(r->seqno, snd_lwe))
- goto cleanup;
-
- /* Check for r-timer expiry. */
- if (ts_to_ns(now) - r->t0 > r->frcti->r)
- goto flow_down;
-
- pthread_rwlock_wrlock(&r->frcti->lock);
-
- if (r->seqno == r->frcti->rttseq) {
- r->frcti->rto +=
- r->frcti->rto >> RTO_DIV;
- r->frcti->probe = false;
- }
-#ifdef PROC_FLOW_STATS
- r->frcti->n_rtx++;
-#endif
- rslot = r->frcti->rto >> RXMQ_RES;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- /* Schedule at least in the next time slot. */
- slot = ts_to_ns(now) >> RXMQ_RES;
-
- while (rslot >= RXMQ_SLOTS) {
- ++lvl;
- rslot >>= RXMQ_BUMP;
- slot >>= RXMQ_BUMP;
- }
-
- if (lvl >= RXMQ_LVLS) /* Can't reschedule */
- goto flow_down;
-
- rslot = (rslot + slot + 1) & (RXMQ_SLOTS - 1);
-#ifdef RXM_BLOCKING
- if (ipcp_spb_reserve(&spb, r->len) < 0)
-#else
- if (ssm_pool_alloc(proc.pool, r->len, NULL,
- &spb) < 0)
-#endif
- goto reschedule; /* rdrbuff full */
-
- pci = (struct frct_pci *) ssm_pk_buff_head(spb);
- memcpy(pci, r->pkt, r->len);
-#ifndef RXM_BUFFER_ON_HEAP
- ipcp_spb_release(r->spb);
- r->spb = spb;
- r->pkt = pci;
- ssm_pk_buff_wait_ack(spb);
-#endif
- idx = ssm_pk_buff_get_idx(spb);
-
- /* Retransmit the copy. */
- pci->ackno = hton32(rcv_lwe);
-#ifdef RXM_BLOCKING
- if (ssm_rbuff_write_b(f->tx_rb, idx, NULL) < 0)
-#else
- if (ssm_rbuff_write(f->tx_rb, idx) < 0)
-#endif
- goto flow_down;
- ssm_flow_set_notify(f->set, f->info.id,
- FLOW_PKT);
- reschedule:
- list_add(&r->next, &rw.rxms[lvl][rslot]);
- continue;
-
- flow_down:
- ssm_rbuff_set_acl(f->tx_rb, ACL_FLOWDOWN);
- ssm_rbuff_set_acl(f->rx_rb, ACL_FLOWDOWN);
- cleanup:
-#ifdef RXM_BUFFER_ON_HEAP
- free(r->pkt);
-#else
- ipcp_spb_release(r->spb);
-#endif
- free(r);
- }
- }
- rw.prv_rxm[i] = rxm_slot & (RXMQ_SLOTS - 1);
- /* Move up a level in the wheel. */
- rxm_slot >>= RXMQ_BUMP;
- }
-
- ack_slot = ts_to_ack_slot(now) & (ACKQ_SLOTS - 1) ;
-
- j = rw.prv_ack;
-
- if (ack_slot < j)
- ack_slot += ACKQ_SLOTS;
-
- while (j++ < ack_slot) {
- list_for_each_safe(p, h, &rw.acks[j & (ACKQ_SLOTS - 1)]) {
- struct ack * a;
- struct flow * f;
-
- a = list_entry(p, struct ack, next);
-
- list_del(&a->next);
-
- f = &proc.flows[a->fd];
-
- rw.map[j & (ACKQ_SLOTS - 1)][a->fd] = false;
-
- if (f->info.id == a->flow_id && f->frcti != NULL)
- send_frct_pkt(a->frcti);
-
- free(a);
- }
- }
-
- rw.prv_ack = ack_slot & (ACKQ_SLOTS - 1);
-
- pthread_cleanup_pop(true);
-}
-
-static int timerwheel_rxm(struct frcti * frcti,
- uint32_t seqno,
- struct ssm_pk_buff * spb)
-{
- struct timespec now;
- struct rxm * r;
- size_t slot;
- size_t lvl = 0;
- time_t rto_slot;
-
- r = malloc(sizeof(*r));
- if (r == NULL)
- return -ENOMEM;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- r->t0 = ts_to_ns(now);
- r->seqno = seqno;
- r->frcti = frcti;
- r->len = ssm_pk_buff_len(spb);
-#ifdef RXM_BUFFER_ON_HEAP
- r->pkt = malloc(r->len);
- if (r->pkt == NULL) {
- free(r);
- return -ENOMEM;
- }
- memcpy(r->pkt, ssm_pk_buff_head(spb), r->len);
-#else
- r->spb = spb;
- r->pkt = (struct frct_pci *) ssm_pk_buff_head(spb);
-#endif
- pthread_rwlock_rdlock(&r->frcti->lock);
-
- rto_slot = frcti->rto >> RXMQ_RES;
- slot = r->t0 >> RXMQ_RES;
-
- r->fd = frcti->fd;
- r->flow_id = proc.flows[r->fd].info.id;
-
- pthread_rwlock_unlock(&r->frcti->lock);
-
- while (rto_slot >= RXMQ_SLOTS) {
- ++lvl;
- rto_slot >>= RXMQ_BUMP;
- slot >>= RXMQ_BUMP;
- }
-
- if (lvl >= RXMQ_LVLS) { /* Out of timerwheel range. */
-#ifdef RXM_BUFFER_ON_HEAP
- free(r->pkt);
-#endif
- free(r);
- return -EPERM;
- }
-
- slot = (slot + rto_slot + 1) & (RXMQ_SLOTS - 1);
-
- pthread_mutex_lock(&rw.lock);
-
- list_add_tail(&r->next, &rw.rxms[lvl][slot]);
-#ifndef RXM_BUFFER_ON_HEAP
- ssm_pk_buff_wait_ack(spb);
-#endif
- pthread_mutex_unlock(&rw.lock);
-
- return 0;
-}
-
-static int timerwheel_delayed_ack(int fd,
- struct frcti * frcti)
-{
- struct timespec now;
- struct ack * a;
- size_t slot;
-
- a = malloc(sizeof(*a));
- if (a == NULL)
- return -ENOMEM;
-
- clock_gettime(PTHREAD_COND_CLOCK, &now);
-
- pthread_rwlock_rdlock(&frcti->lock);
-
- slot = (((ts_to_ns(now) + (TICTIME << 1)) >> ACKQ_RES) + 1)
- & (ACKQ_SLOTS - 1);
-
- pthread_rwlock_unlock(&frcti->lock);
-
- a->fd = fd;
- a->frcti = frcti;
- a->flow_id = proc.flows[fd].info.id;
-
- pthread_mutex_lock(&rw.lock);
-
- if (rw.map[slot][fd]) {
- pthread_mutex_unlock(&rw.lock);
- free(a);
- return 0;
- }
-
- rw.map[slot][fd] = true;
-
- list_add_tail(&a->next, &rw.acks[slot]);
-
- pthread_mutex_unlock(&rw.lock);
-
- return 0;
-}
diff --git a/src/lib/tw.c b/src/lib/tw.c
new file mode 100644
index 00000000..ccde7dd1
--- /dev/null
+++ b/src/lib/tw.c
@@ -0,0 +1,307 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * Generic deadline-ordered callback queue (timing wheel)
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ * Sander Vrijders <sander@ouroboros.rocks>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * version 2.1 as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., http://www.fsf.org/about/contact/.
+ */
+
+#if defined(__linux__) || defined(__CYGWIN__)
+#define _DEFAULT_SOURCE
+#else
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#include "config.h"
+
+#include <ouroboros/list.h>
+#include <ouroboros/pthread.h>
+#include <ouroboros/time.h>
+#include <ouroboros/tw.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+/* 3 levels × 256 slots, 1 ms / 16 ms / 256 ms per-slot resolution. */
+#define TW_LVLS 3
+#define TW_SLOTS 256
+#define TW_BUMP 4
+#define TW_RES 20 /* 2^20 ns ≈ 1 ms per slot at level 0. */
+
+#define TW_SLOT(x) ((x) & (TW_SLOTS - 1))
+
+/* Process-wide wheel state shared by all tw_* functions in this file. */
+static struct {
+ struct list_head levels[TW_LVLS][TW_SLOTS]; /* Entry lists per slot. */
+ size_t prv[TW_LVLS]; /* Last slot handled by tw_move, per level. */
+ pthread_mutex_t mtx; /* Held around accesses to the slot lists. */
+ pthread_mutex_t move_mtx; /* trylock'ed so only one tw_move runs. */
+ bool initialised; /* Set by tw_init, cleared by tw_fini. */
+} tw;
+
+/* Shift (log2 of the slot width in ns) that applies at wheel level lvl. */
+static size_t tw_lvl_res(size_t lvl)
+{
+        size_t shift = TW_RES;
+
+        shift += lvl * TW_BUMP;
+
+        return shift;
+}
+
+/*
+ * Lowest level on which the distance from now_ns to deadline_ns spans
+ * fewer than TW_SLOTS slots; the top level is used when none fits.
+ * A deadline at or before now_ns counts as distance zero.
+ */
+static size_t tw_pick_lvl(uint64_t now_ns,
+                          uint64_t deadline_ns)
+{
+        uint64_t delta = 0;
+        size_t   lvl;
+
+        if (deadline_ns > now_ns)
+                delta = deadline_ns - now_ns;
+
+        for (lvl = 0; lvl < TW_LVLS - 1; ++lvl) {
+                if ((delta >> tw_lvl_res(lvl)) < TW_SLOTS)
+                        break;
+        }
+
+        return lvl;
+}
+
+/* Slot index, wrapped to TW_SLOTS, that time ns falls into at level lvl. */
+static size_t tw_slot(uint64_t ns,
+                      size_t   lvl)
+{
+        uint64_t ticks = ns >> tw_lvl_res(lvl);
+
+        return TW_SLOT(ticks);
+}
+
+/*
+ * Initialise the wheel: create both mutexes, empty every slot and set the
+ * per-level cursor to the slot just before the current time.
+ * Returns 0 on success, -1 if a mutex could not be initialised.
+ */
+int tw_init(void)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        size_t          lvl;
+        size_t          slot;
+
+        assert(!tw.initialised);
+
+        if (pthread_mutex_init(&tw.mtx, NULL) != 0)
+                return -1;
+
+        if (pthread_mutex_init(&tw.move_mtx, NULL) != 0) {
+                pthread_mutex_destroy(&tw.mtx);
+                return -1;
+        }
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        for (lvl = 0; lvl < TW_LVLS; ++lvl) {
+                for (slot = 0; slot < TW_SLOTS; ++slot)
+                        list_head_init(&tw.levels[lvl][slot]);
+
+                /* One behind "now", so the first tw_move starts at "now". */
+                tw.prv[lvl] = TW_SLOT(tw_slot(now_ns, lvl) - 1);
+        }
+
+        tw.initialised = true;
+
+        return 0;
+}
+
+/*
+ * Tear down the wheel. All entries must have been cancelled or fired
+ * beforehand: every slot is asserted to be empty.
+ */
+void tw_fini(void)
+{
+        size_t lvl;
+        size_t slot;
+
+        assert(tw.initialised);
+
+        for (lvl = 0; lvl < TW_LVLS; ++lvl) {
+                slot = 0;
+                while (slot < TW_SLOTS) {
+                        assert(list_is_empty(&tw.levels[lvl][slot]));
+                        ++slot;
+                }
+        }
+
+        pthread_mutex_destroy(&tw.move_mtx);
+        pthread_mutex_destroy(&tw.mtx);
+
+        tw.initialised = false;
+}
+
+/* Put entry e in the unposted state: self-linked list node, fields cleared. */
+void tw_init_entry(struct tw_entry * e)
+{
+        e->lvl         = 0;
+        e->arg         = NULL;
+        e->fire        = NULL;
+        e->deadline_ns = 0;
+
+        list_head_init(&e->next);
+}
+
+/*
+ * Schedule entry e so that tw_move calls fire(arg) once deadline_ns has
+ * passed. Posting an entry that is already queued replaces its previous
+ * schedule.
+ *
+ * e           - entry prepared with tw_init_entry (or previously posted).
+ * deadline_ns - deadline, in ns on PTHREAD_COND_CLOCK as converted by
+ *               TS_TO_UINT64.
+ * fire, arg   - callback and its argument.
+ *
+ * The clock is read and the entry is updated with tw.mtx held, so a
+ * concurrent tw_move never sees a half-updated entry.
+ */
+void tw_post(struct tw_entry * e,
+             uint64_t          deadline_ns,
+             tw_fire_fn_t      fire,
+             void *            arg)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        uint64_t        slot_ns;
+        size_t          lvl;
+        size_t          slot;
+
+        assert(tw.initialised);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        /*
+         * An overdue deadline is placed as if it were due now. Its own
+         * slot lies behind the wheel position and would only be reached
+         * after a full revolution.
+         */
+        slot_ns = deadline_ns > now_ns ? deadline_ns : now_ns;
+
+        lvl = tw_pick_lvl(now_ns, deadline_ns);
+        /* +1 so deadline <= slot_start; lands later in slot. */
+        slot = TW_SLOT(tw_slot(slot_ns, lvl) + 1);
+
+        if (!list_is_empty(&e->next))
+                list_del(&e->next);
+
+        e->deadline_ns = deadline_ns;
+        e->fire        = fire;
+        e->arg         = arg;
+        e->lvl         = lvl;
+
+        list_add_tail(&e->next, &tw.levels[lvl][slot]);
+
+        pthread_mutex_unlock(&tw.mtx);
+}
+
+/*
+ * Remove entry e from the wheel if it is queued. A NULL or unposted entry
+ * is left alone, so the call may be repeated.
+ */
+void tw_cancel(struct tw_entry * e)
+{
+        bool queued;
+
+        if (e == NULL)
+                return;
+
+        assert(tw.initialised);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        queued = !list_is_empty(&e->next);
+        if (queued) {
+                list_del(&e->next);
+                /* Self-link again so the entry reads as unposted. */
+                list_head_init(&e->next);
+        }
+
+        pthread_mutex_unlock(&tw.mtx);
+}
+
+/*
+ * Advance the wheel to the current time and fire every due entry.
+ *
+ * For each level, the slots after the last processed one up to the slot
+ * of "now" are drained. Entries whose deadline has passed are removed and
+ * their callback is called with tw.mtx released, so a callback may call
+ * tw_post or tw_cancel. Entries whose deadline is still ahead are put
+ * back into their slot.
+ *
+ * Only one caller moves the wheel at a time; a concurrent call returns
+ * immediately.
+ *
+ * Only move_mtx is covered by a cancellation cleanup handler. tw.mtx is
+ * released around each callback, so a handler that unlocks it would
+ * unlock an unowned mutex if the thread were cancelled inside a callback.
+ * The remaining tw.mtx critical sections call nothing that POSIX lists
+ * as a cancellation point.
+ *
+ * NOTE(review): if the thread is cancelled inside a callback, entries
+ * parked on the local deferred list stay linked to this stack frame.
+ */
+void tw_move(void)
+{
+        struct timespec    now;
+        struct list_head   deferred;
+        struct list_head * p;
+        uint64_t           now_ns;
+        size_t             i;
+        size_t             j;
+        size_t             cur;
+
+        assert(tw.initialised);
+
+        if (pthread_mutex_trylock(&tw.move_mtx) != 0)
+                return;
+
+        pthread_cleanup_push(__cleanup_mutex_unlock, &tw.move_mtx);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        for (i = 0; i < TW_LVLS; ++i) {
+                cur = tw_slot(now_ns, i);
+
+                j = tw.prv[i];
+                if (cur < j) /* Slot index wrapped around. */
+                        cur += TW_SLOTS;
+
+                while (j++ < cur) {
+                        size_t s = TW_SLOT(j);
+
+                        /* Pop-front so fire may mutate any entry. */
+                        list_head_init(&deferred);
+
+                        while (!list_is_empty(&tw.levels[i][s])) {
+                                struct tw_entry * e;
+                                tw_fire_fn_t      fire;
+                                void *            arg;
+
+                                p = tw.levels[i][s].nxt;
+                                e = list_entry(p, struct tw_entry, next);
+                                list_del(&e->next);
+
+                                if (e->deadline_ns > now_ns) {
+                                        list_add_tail(&e->next, &deferred);
+                                        continue;
+                                }
+
+                                /* Same unposted state as after tw_cancel. */
+                                list_head_init(&e->next);
+
+                                /* Snapshot: tw_post may rewrite e once unlocked. */
+                                fire = e->fire;
+                                arg  = e->arg;
+
+                                pthread_mutex_unlock(&tw.mtx);
+                                fire(arg);
+                                pthread_mutex_lock(&tw.mtx);
+                        }
+
+                        /* Not yet due: back into the slot for a later pass. */
+                        while (!list_is_empty(&deferred)) {
+                                p = deferred.nxt;
+                                list_del(p);
+                                list_add_tail(p, &tw.levels[i][s]);
+                        }
+                }
+
+                tw.prv[i] = TW_SLOT(cur);
+        }
+
+        pthread_mutex_unlock(&tw.mtx);
+
+        pthread_cleanup_pop(true); /* tw.move_mtx */
+}
+
+/*
+ * Estimate the earliest expiry at level lvl: now_ns plus the distance, in
+ * whole slots, to the first non-empty slot after the current one. The
+ * search covers one revolution; INT64_MAX means the level is empty.
+ */
+static int64_t tw_lvl_earliest(size_t   lvl,
+                               uint64_t now_ns)
+{
+        size_t base  = tw_slot(now_ns, lvl);
+        size_t ahead = 1;
+
+        while (ahead <= TW_SLOTS) {
+                size_t s = TW_SLOT(base + ahead);
+
+                if (!list_is_empty(&tw.levels[lvl][s])) {
+                        uint64_t wait = (uint64_t) ahead << tw_lvl_res(lvl);
+
+                        return (int64_t)(now_ns + wait);
+                }
+
+                ++ahead;
+        }
+
+        return INT64_MAX;
+}
+
+/*
+ * Write the earliest estimated expiry over all levels to out. When no
+ * entry is queued, out is set to tv_sec = 0, tv_nsec = -1.
+ */
+void tw_next_expiry(struct timespec * out)
+{
+        struct timespec now;
+        uint64_t        now_ns;
+        int64_t         best;
+        size_t          lvl;
+
+        assert(tw.initialised);
+
+        clock_gettime(PTHREAD_COND_CLOCK, &now);
+        now_ns = TS_TO_UINT64(now);
+
+        pthread_mutex_lock(&tw.mtx);
+
+        best = tw_lvl_earliest(0, now_ns);
+        for (lvl = 1; lvl < TW_LVLS; ++lvl) {
+                int64_t cand = tw_lvl_earliest(lvl, now_ns);
+
+                if (cand < best)
+                        best = cand;
+        }
+
+        pthread_mutex_unlock(&tw.mtx);
+
+        if (best != INT64_MAX) {
+                UINT64_TO_TS((uint64_t) best, out);
+                return;
+        }
+
+        /* Empty wheel: tv_nsec=-1 is an invalid normalised value. */
+        out->tv_sec  = 0;
+        out->tv_nsec = -1;
+}
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
index 3cec8172..6b418838 100644
--- a/src/tools/CMakeLists.txt
+++ b/src/tools/CMakeLists.txt
@@ -63,6 +63,11 @@ target_include_directories(operf PRIVATE ${TOOLS_INCLUDE_DIRS})
target_link_libraries(operf PRIVATE ouroboros-dev)
install(TARGETS operf RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+add_executable(oftp oftp/oftp.c)
+target_include_directories(oftp PRIVATE ${TOOLS_INCLUDE_DIRS})
+target_link_libraries(oftp PRIVATE ouroboros-dev)
+install(TARGETS oftp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+
if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
add_executable(ovpn ovpn/ovpn.c)
target_include_directories(ovpn PRIVATE ${TOOLS_INCLUDE_DIRS})
diff --git a/src/tools/irm/irm_ipcp_connect.c b/src/tools/irm/irm_ipcp_connect.c
index f88c36dc..fb21faec 100644
--- a/src/tools/irm/irm_ipcp_connect.c
+++ b/src/tools/irm/irm_ipcp_connect.c
@@ -100,16 +100,18 @@ int do_connect_ipcp(int argc,
}
if (qos != NULL) {
- if (strcmp(qos, "best") == 0)
- qs = qos_best_effort;
- else if (strcmp(qos, "raw") == 0)
+ if (strcmp(qos, "raw") == 0)
qs = qos_raw;
- else if (strcmp(qos, "video") == 0)
- qs = qos_video;
- else if (strcmp(qos, "voice") == 0)
- qs = qos_voice;
- else if (strcmp(qos, "data") == 0)
- qs = qos_data;
+ else if (strcmp(qos, "safe") == 0)
+ qs = qos_raw_safe;
+ else if (strcmp(qos, "rt") == 0)
+ qs = qos_rt;
+ else if (strcmp(qos, "rt-safe") == 0)
+ qs = qos_rt_safe;
+ else if (strcmp(qos, "msg") == 0)
+ qs = qos_msg;
+ else if (strcmp(qos, "stream") == 0)
+ qs = qos_stream;
else
printf("Unknown QoS cube, defaulting to raw.\n");
}
@@ -126,7 +128,7 @@ int do_connect_ipcp(int argc,
if (wildcard_match(comp, MGMT) == 0) {
component = MGMT_COMP;
- /* FIXME: move to qos_data when stable */
+ /* FIXME: move to qos_msg when stable */
if (irm_connect_ipcp(pid, dst, component, qos_raw))
return -1;
}
diff --git a/src/tools/ocbr/ocbr_client.c b/src/tools/ocbr/ocbr_client.c
index 9dd9904c..36c07d43 100644
--- a/src/tools/ocbr/ocbr_client.c
+++ b/src/tools/ocbr/ocbr_client.c
@@ -37,8 +37,11 @@
*/
#include <ouroboros/dev.h>
+#include <ouroboros/qos.h>
#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
volatile bool stop;
@@ -86,6 +89,11 @@ int client_main(char * server,
struct timespec end;
struct timespec intv = {(gap / BILLION), gap % BILLION};
int ms;
+ const char * qenv;
+ qosspec_t qs;
+ qosspec_t * qsp;
+
+ qsp = NULL;
stop = false;
@@ -98,16 +106,38 @@ int client_main(char * server,
sigaction(SIGHUP, &sig_act, NULL) ||
sigaction(SIGPIPE, &sig_act, NULL)) {
printf("Failed to install sighandler.\n");
- return -1;
+ return 2;
}
printf("Client started, duration %d, rate %lu b/s, size %d B.\n",
duration, rate, size);
- fd = flow_alloc(server, NULL, NULL);
+ qenv = getenv("OCBR_QOS");
+ if (qenv != NULL) {
+ if (strcmp(qenv, "raw") == 0)
+ qs = qos_raw;
+ else if (strcmp(qenv, "safe") == 0)
+ qs = qos_raw_safe;
+ else if (strcmp(qenv, "rt") == 0)
+ qs = qos_rt;
+ else if (strcmp(qenv, "rt_safe") == 0)
+ qs = qos_rt_safe;
+ else if (strcmp(qenv, "msg") == 0)
+ qs = qos_msg;
+ else if (strcmp(qenv, "stream") == 0)
+ qs = qos_stream;
+ else {
+ fprintf(stderr,
+ "Unknown OCBR_QOS='%s', using raw.\n", qenv);
+ qs = qos_raw;
+ }
+ qsp = &qs;
+ printf("OCBR_QOS=%s\n", qenv);
+ }
+ fd = flow_alloc(server, qsp, NULL);
if (fd < 0) {
printf("Failed to allocate flow.\n");
- return -1;
+ return 2;
}
clock_gettime(CLOCK_REALTIME, &start);
diff --git a/src/tools/oecho/oecho.c b/src/tools/oecho/oecho.c
index 14caab53..ef0a168f 100644
--- a/src/tools/oecho/oecho.c
+++ b/src/tools/oecho/oecho.c
@@ -101,20 +101,20 @@ static int client_main(void)
fd = flow_alloc("oecho", NULL, NULL);
if (fd < 0) {
printf("Failed to allocate flow.\n");
- return -1;
+ return 2;
}
if (flow_write(fd, message, strlen(message) + 1) < 0) {
printf("Failed to write packet.\n");
flow_dealloc(fd);
- return -1;
+ return 1;
}
count = flow_read(fd, buf, BUF_SIZE);
if (count < 0) {
printf("Failed to read packet.\n");
flow_dealloc(fd);
- return -1;
+ return 1;
}
printf("Server replied with %.*s\n", (int) count, buf);
@@ -126,7 +126,7 @@ static int client_main(void)
int main(int argc, char ** argv)
{
- int ret = -1;
+ int ret = 0;
bool server = false;
argc--;
diff --git a/src/tools/oftp/oftp.c b/src/tools/oftp/oftp.c
new file mode 100644
index 00000000..1ae99403
--- /dev/null
+++ b/src/tools/oftp/oftp.c
@@ -0,0 +1,441 @@
+/*
+ * Ouroboros - Copyright (C) 2016 - 2026
+ *
+ * A minimal file-transfer tool over an FRCT stream flow
+ *
+ * Dimitri Staessens <dimitri@ouroboros.rocks>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <ouroboros/crc64.h>
+#include <ouroboros/dev.h>
+#include <ouroboros/errno.h>
+#include <ouroboros/fccntl.h>
+#include <ouroboros/qos.h>
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#define BUF_SIZE 16384
+
+static volatile sig_atomic_t stop = 0;
+
+/*
+ * Apply the OFTP_FRCT_RTO_MIN environment override to flow fd.
+ *
+ * An unset variable is a no-op. The value must be a plain decimal
+ * integer: anything with trailing characters ("10ms"), without digits,
+ * or out of range for long is reported on stderr and ignored instead of
+ * being applied as a truncated number. A well-formed value <= 0 is
+ * ignored silently. Positive values are passed to fccntl(FRCTSRTOMIN);
+ * a failure there is reported but not fatal.
+ */
+static void apply_rto_min_env(int fd)
+{
+        const char * env;
+        char *       end;
+        long         v;
+
+        env = getenv("OFTP_FRCT_RTO_MIN");
+        if (env == NULL)
+                return;
+
+        errno = 0;
+        v = strtol(env, &end, 10);
+        if (end == env || *end != '\0' || errno == ERANGE) {
+                fprintf(stderr,
+                        "oftp: ignoring invalid OFTP_FRCT_RTO_MIN='%s'\n",
+                        env);
+                return;
+        }
+
+        if (v <= 0)
+                return;
+
+        if (fccntl(fd, FRCTSRTOMIN, (time_t) v) < 0)
+                fprintf(stderr,
+                        "oftp: failed to set RTO_MIN=%ld ns\n", v);
+}
+
+/*
+ * Apply the OFTP_FRCT_STREAM_RING_SZ environment override to flow fd.
+ *
+ * An unset variable is a no-op. The value must be a plain decimal
+ * integer: anything with trailing characters ("64K"), without digits,
+ * or out of range for long is reported on stderr and ignored instead of
+ * being applied as a truncated number. A well-formed value <= 0 is
+ * ignored silently. Positive values are passed to
+ * fccntl(FRCTSRRINGSZ); a failure there is reported but not fatal.
+ */
+static void apply_stream_ring_sz_env(int fd)
+{
+        const char * env;
+        char *       end;
+        long         v;
+
+        env = getenv("OFTP_FRCT_STREAM_RING_SZ");
+        if (env == NULL)
+                return;
+
+        errno = 0;
+        v = strtol(env, &end, 10);
+        if (end == env || *end != '\0' || errno == ERANGE) {
+                fprintf(stderr,
+                        "oftp: ignoring invalid "
+                        "OFTP_FRCT_STREAM_RING_SZ='%s'\n", env);
+                return;
+        }
+
+        if (v <= 0)
+                return;
+
+        if (fccntl(fd, FRCTSRRINGSZ, (size_t) v) < 0)
+                fprintf(stderr,
+                        "oftp: failed to set STREAM_RING_SZ=%ld\n", v);
+}
+
+/*
+ * Signal handler: raise the file-scope 'stop' flag.
+ *
+ * The signal number is not inspected; the only action is a store to a
+ * volatile sig_atomic_t, which the transfer loops poll.
+ */
+static void on_signal(int signo)
+{
+        stop = 1;
+
+        (void) signo;
+}
+
+/*
+ * Print the command-line synopsis and option list to stdout.
+ *
+ * The text is a single printf() of adjacent string literals. main()
+ * passes -o only to the receiver and -n/-i/-N only to the sender.
+ */
+static void usage(void)
+{
+ printf("Usage: oftp [OPTION]...\n"
+ "Stream-mode file transfer over an Ouroboros flow.\n\n"
+ " -l, --listen Run as the receiver (server)\n"
+ " -n, --name NAME Destination service name (client)\n"
+ " -i, --in FILE Read input from FILE (default stdin)\n"
+ " -o, --out FILE Write output to FILE (default stdout)\n"
+ " -N, --bytes SIZE Stop after SIZE bytes "
+ "(K/M/G suffix; client only)\n"
+ " --help Display this help text and exit\n");
+}
+
+/*
+ * Parse a byte count with an optional binary suffix.
+ *
+ * Accepts an unsigned integer in any form strtoul() understands with
+ * base 0 (decimal, 0x hex, leading-0 octal), optionally followed by
+ * exactly one of k/K, m/M or g/G for 2^10, 2^20 or 2^30.
+ *
+ * On success stores the product in *out and returns 0. Returns -1 and
+ * leaves *out untouched when the string has no digits, contains a
+ * minus sign, has anything after the suffix, or the result does not
+ * fit in a size_t.
+ */
+static int parse_size(const char * s, size_t * out)
+{
+        char *        end;
+        unsigned long v;
+        size_t        mul;
+
+        /* strtoul() would quietly negate "-1" into ULONG_MAX. */
+        if (strchr(s, '-') != NULL)
+                return -1;
+
+        errno = 0;
+        v = strtoul(s, &end, 0);
+        if (end == s || errno == ERANGE)
+                return -1;
+
+        mul = 1;
+        if (*end != '\0') {
+                if (*end == 'k' || *end == 'K')
+                        mul = 1024UL;
+                else if (*end == 'm' || *end == 'M')
+                        mul = 1024UL * 1024UL;
+                else if (*end == 'g' || *end == 'G')
+                        mul = 1024UL * 1024UL * 1024UL;
+                else
+                        return -1;
+
+                /* Reject trailing junk such as "4Kb" or "1M2". */
+                if (end[1] != '\0')
+                        return -1;
+        }
+
+        /* Refuse values whose scaled size would wrap around. */
+        if (v > SIZE_MAX / mul)
+                return -1;
+
+        *out = (size_t) v * mul;
+
+        return 0;
+}
+
+/*
+ * Print a one-line transfer summary on stderr.
+ *
+ * tag labels the direction, total is the byte count, crc is printed as
+ * 16 hex digits, and t0/t1 bracket the transfer. A non-positive elapsed
+ * time is replaced by 1e-9 s so the rate division stays defined.
+ */
+static void report_xfer(const char *            tag,
+                        size_t                  total,
+                        uint64_t                crc,
+                        const struct timespec * t0,
+                        const struct timespec * t1)
+{
+        double secs;
+        double mib;
+        double rate;
+
+        secs = (t1->tv_sec - t0->tv_sec)
+                + (t1->tv_nsec - t0->tv_nsec) / 1e9;
+        if (!(secs > 0.0))
+                secs = 1e-9;
+
+        mib  = (double) total / (1024.0 * 1024.0);
+        rate = mib / secs;
+
+        fprintf(stderr,
+                "oftp: %s %zu bytes in %.3f s (%.2f MiB/s) "
+                "crc64=%016" PRIx64 "\n",
+                tag, total, secs, rate, crc);
+}
+
+/*
+ * Copy stream 'in' onto flow fd until EOF, the byte limit, or 'stop'.
+ *
+ * max_bytes == 0 means no limit; otherwise at most max_bytes are read.
+ * Each chunk read is fed to crc64_nvme() and then written out,
+ * retrying until flow_write() has taken all of it.
+ *
+ * Returns 0 after printing the "sent" summary, or 1 if flow_write()
+ * fails or the input stream has its error flag set when the loop ends.
+ *
+ * NOTE(review): a flow_write() that returned 0 would make the inner
+ * loop spin without progress; confirm it cannot do so on this flow.
+ */
+static int xfer_to_flow(int fd, FILE * in, size_t max_bytes)
+{
+        char            chunk[BUF_SIZE];
+        struct timespec begin;
+        struct timespec finish;
+        uint64_t        sum  = 0;
+        size_t          sent = 0;
+
+        clock_gettime(CLOCK_MONOTONIC, &begin);
+
+        for (;;) {
+                size_t quota = sizeof(chunk);
+                size_t got;
+                size_t pos;
+
+                if (stop)
+                        break;
+
+                /* Shrink the last read so the limit is hit exactly. */
+                if (max_bytes > 0 && max_bytes - sent < quota)
+                        quota = max_bytes - sent;
+                if (quota == 0)
+                        break;
+
+                got = fread(chunk, 1, quota, in);
+                if (got == 0)
+                        break;
+
+                crc64_nvme(&sum, chunk, got);
+
+                for (pos = 0; pos < got; ) {
+                        ssize_t w;
+
+                        w = flow_write(fd, chunk + pos, got - pos);
+                        if (w < 0) {
+                                fprintf(stderr,
+                                        "flow_write failed: %zd\n", w);
+                                return 1;
+                        }
+
+                        pos  += (size_t) w;
+                        sent += (size_t) w;
+                }
+        }
+
+        clock_gettime(CLOCK_MONOTONIC, &finish);
+
+        /* fread() returning 0 does not distinguish EOF from error. */
+        if (ferror(in)) {
+                fprintf(stderr, "Input read error.\n");
+                return 1;
+        }
+
+        report_xfer("sent", sent, sum, &begin, &finish);
+
+        return 0;
+}
+
+/*
+ * Drain flow fd into stream 'out' until the flow reports end-of-stream.
+ *
+ * Every chunk read is fed to crc64_nvme() and written with fwrite().
+ * Timing starts at the first data chunk, so time spent waiting for the
+ * sender is not counted.
+ *
+ * Returns:
+ *   0  flow_read() returned 0 and the output was flushed successfully;
+ *      the "received" summary has been printed.
+ *   1  flow_read() failed with an unexpected error, or writing or
+ *      flushing the output failed.
+ *   2  the peer went away (-EFLOWDOWN / -EFLOWPEER) or 'stop' was set
+ *      before end-of-stream.
+ */
+static int xfer_from_flow(int fd, FILE * out)
+{
+        char            buf[BUF_SIZE];
+        size_t          total;
+        ssize_t         n;
+        uint64_t        crc;
+        struct timespec timeout;
+        struct timespec t0;
+        struct timespec t1;
+        bool            started;
+
+        total           = 0;
+        crc             = 0;
+        started         = false;
+        timeout.tv_sec  = 1;
+        timeout.tv_nsec = 0;
+
+        /* Short timeout so SIGTERM/SIGINT 'stop' is observed promptly. */
+        fccntl(fd, FLOWSRCVTIMEO, &timeout);
+
+        while (!stop) {
+                n = flow_read(fd, buf, sizeof(buf));
+                if (n == 0) {
+                        /* Clean EOF: peer sent EOS and we drained it. */
+                        clock_gettime(CLOCK_MONOTONIC, &t1);
+                        /*
+                         * fwrite() buffers, so an error such as a full
+                         * disk may only surface here; do not report a
+                         * truncated file as a successful transfer.
+                         */
+                        if (fflush(out) != 0) {
+                                fprintf(stderr, "Output write error.\n");
+                                return 1;
+                        }
+                        if (!started)
+                                t0 = t1;
+                        report_xfer("received", total, crc, &t0, &t1);
+                        return 0;
+                }
+                if (n == -ETIMEDOUT)
+                        continue;
+                if (n < 0) {
+                        /* Peer aborted before EOS: partial transfer. */
+                        if (n == -EFLOWDOWN || n == -EFLOWPEER) {
+                                fprintf(stderr,
+                                        "oftp: peer aborted at %zu B\n",
+                                        total);
+                                return 2;
+                        }
+                        fprintf(stderr,
+                                "flow_read failed: %zd\n", n);
+                        return 1;
+                }
+                if (!started) {
+                        clock_gettime(CLOCK_MONOTONIC, &t0);
+                        started = true;
+                }
+                crc64_nvme(&crc, buf, (size_t) n);
+                if (fwrite(buf, 1, (size_t) n, out) != (size_t) n) {
+                        fprintf(stderr, "Output write error.\n");
+                        return 1;
+                }
+                total += (size_t) n;
+        }
+
+        /* Receiver was signalled (SIGINT/SIGTERM) before EOF. */
+        fflush(out);
+        fprintf(stderr, "oftp: interrupted at %zu B\n", total);
+
+        return 2;
+}
+
+/*
+ * Receiver: accept one flow and store its payload.
+ *
+ * With outpath == NULL the data goes to stdout. Otherwise the file is
+ * created exclusively (O_CREAT | O_EXCL | O_NOFOLLOW, mode 0600), so an
+ * existing path is never overwritten.
+ *
+ * Because the file is always one this call created, it is removed again
+ * on every failure path, including a failed flow_accept(); leaving it
+ * behind would make the next run with the same -o fail on O_EXCL.
+ *
+ * Returns the xfer_from_flow() result, or 1 when the output cannot be
+ * opened or closed, flow_accept() fails, or the accepted flow is not
+ * SVC_STREAM.
+ */
+static int server_main(const char * outpath)
+{
+        FILE *    out = stdout;
+        int       fd;
+        int       ofd;
+        int       rc;
+        qosspec_t qs;
+
+        if (outpath != NULL) {
+                ofd = open(outpath,
+                           O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW,
+                           0600);
+                if (ofd < 0) {
+                        perror("open");
+                        return 1;
+                }
+                out = fdopen(ofd, "wb");
+                if (out == NULL) {
+                        perror("fdopen");
+                        close(ofd);
+                        unlink(outpath);
+                        return 1;
+                }
+        }
+
+        fprintf(stderr, "oftp: listening...\n");
+
+        fd = flow_accept(&qs, NULL);
+        if (fd < 0) {
+                fprintf(stderr, "flow_accept failed: %d\n", fd);
+                rc = 1;
+                goto cleanup;
+        }
+
+        if (qs.service != SVC_STREAM) {
+                fprintf(stderr,
+                        "oftp: rejecting non-stream flow (service=%u)\n",
+                        qs.service);
+                flow_dealloc(fd);
+                rc = 1;
+                goto cleanup;
+        }
+
+        apply_rto_min_env(fd);
+        apply_stream_ring_sz_env(fd);
+
+        rc = xfer_from_flow(fd, out);
+
+        flow_dealloc(fd);
+
+ cleanup:
+        if (out != stdout) {
+                /* A failed close can still lose buffered data. */
+                if (fclose(out) != 0 && rc == 0) {
+                        perror("fclose");
+                        rc = 1;
+                }
+                /* Drop the empty or half-written file on any failure. */
+                if (rc != 0)
+                        unlink(outpath);
+        }
+
+        return rc;
+}
+
+/*
+ * Sender: allocate a stream flow to 'name' and push the input over it.
+ *
+ * Input comes from inpath, or from stdin when inpath is NULL. The flow
+ * is requested with a local copy of qos_stream. max_bytes is forwarded
+ * unchanged to xfer_to_flow().
+ *
+ * Returns 1 if the input file cannot be opened, 2 if flow_alloc()
+ * fails, and otherwise whatever xfer_to_flow() returns.
+ */
+static int client_main(const char * name,
+                       const char * inpath,
+                       size_t       max_bytes)
+{
+        FILE *    src  = stdin;
+        qosspec_t spec = qos_stream;
+        int       flow;
+        int       status;
+
+        if (inpath != NULL) {
+                src = fopen(inpath, "rb");
+                if (src == NULL) {
+                        perror("fopen");
+                        return 1;
+                }
+        }
+
+        flow = flow_alloc(name, &spec, NULL);
+        if (flow < 0) {
+                fprintf(stderr, "flow_alloc failed: %d\n", flow);
+                status = 2;
+                goto close_input;
+        }
+
+        apply_rto_min_env(flow);
+        apply_stream_ring_sz_env(flow);
+
+        status = xfer_to_flow(flow, src, max_bytes);
+
+        flow_dealloc(flow);
+
+ close_input:
+        /* stdin is not ours to close. */
+        if (src != stdin)
+                fclose(src);
+
+        return status;
+}
+
+/*
+ * Entry point: install signal handling, parse options, then run as
+ * receiver (-l) or sender.
+ *
+ * Returns 0 for --help, 1 after printing usage for an unknown option,
+ * an option missing its argument, a bad -N size, or a sender without
+ * -n; otherwise the result of server_main() or client_main().
+ */
+int main(int argc, char ** argv)
+{
+ bool server;
+ const char * name;
+ const char * inpath;
+ const char * outpath;
+ size_t max_bytes;
+ struct sigaction sa;
+
+ server = false;
+ name = NULL;
+ inpath = NULL;
+ outpath = NULL;
+ max_bytes = 0;
+
+ /* SIGINT/SIGTERM set 'stop' via on_signal; sa_flags stays 0. */
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = on_signal;
+ sigaction(SIGINT, &sa, NULL);
+ sigaction(SIGTERM, &sa, NULL);
+ signal(SIGPIPE, SIG_IGN);
+
+ /* Skip argv[0]; each pass consumes one option and its value. */
+ argc--; argv++;
+ while (argc > 0) {
+ if (strcmp(*argv, "-l") == 0 ||
+ strcmp(*argv, "--listen") == 0) {
+ server = true;
+ } else if ((strcmp(*argv, "-n") == 0 ||
+ strcmp(*argv, "--name") == 0) && argc > 1) {
+ name = *(++argv); argc--;
+ } else if ((strcmp(*argv, "-i") == 0 ||
+ strcmp(*argv, "--in") == 0) && argc > 1) {
+ inpath = *(++argv); argc--;
+ } else if ((strcmp(*argv, "-o") == 0 ||
+ strcmp(*argv, "--out") == 0) && argc > 1) {
+ outpath = *(++argv); argc--;
+ } else if ((strcmp(*argv, "-N") == 0 ||
+ strcmp(*argv, "--bytes") == 0) && argc > 1) {
+ if (parse_size(*(++argv), &max_bytes) < 0) {
+ fprintf(stderr,
+ "oftp: bad size '%s'\n", *argv);
+ return 1;
+ }
+ argc--;
+ } else if (strcmp(*argv, "--help") == 0) {
+ usage();
+ return 0;
+ } else {
+ /*
+ * Unknown option, or a value-taking option given as the
+ * last argument (its 'argc > 1' guard failed above).
+ */
+ usage();
+ return 1;
+ }
+ argc--; argv++;
+ }
+
+ /* Receiver only uses outpath; name, inpath, max_bytes are unused. */
+ if (server)
+ return server_main(outpath);
+
+ /* Sender needs a destination; outpath is not used in this mode. */
+ if (name == NULL) {
+ usage();
+ return 1;
+ }
+
+ return client_main(name, inpath, max_bytes);
+}
diff --git a/src/tools/operf/operf.c b/src/tools/operf/operf.c
index 1872b351..0198e871 100644
--- a/src/tools/operf/operf.c
+++ b/src/tools/operf/operf.c
@@ -248,5 +248,5 @@ int main(int argc, char ** argv)
if (ret < 0)
exit(EXIT_FAILURE);
- exit(EXIT_SUCCESS);
+ exit(ret);
}
diff --git a/src/tools/operf/operf_client.c b/src/tools/operf/operf_client.c
index 7e8f1a9b..e478aeff 100644
--- a/src/tools/operf/operf_client.c
+++ b/src/tools/operf/operf_client.c
@@ -185,7 +185,7 @@ int client_main(void)
sigaction(SIGHUP, &sig_act, NULL) ||
sigaction(SIGPIPE, &sig_act, NULL)) {
printf("Failed to install sighandler.\n");
- return -1;
+ return 2;
}
client.sent = 0;
@@ -196,7 +196,7 @@ int client_main(void)
fd = flow_alloc(client.server_name, NULL, NULL);
if (fd < 0) {
printf("Failed to allocate flow.\n");
- return -1;
+ return 2;
}
if (client.conf.test_type == TEST_TYPE_BI)
@@ -207,7 +207,7 @@ int client_main(void)
if (flow_write(fd, &client.conf, sizeof(client.conf)) < 0) {
printf("Failed to send configuration.\n");
flow_dealloc(fd);
- return -1;
+ return 1;
}
sleep(1);
diff --git a/src/tools/oping/oping.c b/src/tools/oping/oping.c
index 763c0d62..10e1e23c 100644
--- a/src/tools/oping/oping.c
+++ b/src/tools/oping/oping.c
@@ -60,7 +60,7 @@
#include <errno.h>
#include <float.h>
-#define OPING_BUF_SIZE 1500
+#define OPING_BUF_SIZE 16384
#define ECHO_REQUEST 0
#define ECHO_REPLY 1
#define OPING_MAX_FLOWS 256
@@ -81,8 +81,9 @@
" -F, --flood-busy Flood with busy-polling (lower latency)\n" \
" -i, --interval Interval (default 1000ms)\n" \
" -n, --server-name Name of the oping server\n" \
-" -q, --qos QoS (raw, best, video, voice, data)\n" \
+" -q, --qos QoS (raw, safe, rt, rt-safe, msg)\n" \
" -s, --size Payload size (B, default 64)\n" \
+" -W, --timeout Per-packet recv timeout, ms (default 2000)\n" \
" -Q, --quiet Only print final statistics\n" \
" -D, --timeofday Print time of day before each line\n" \
"\n" \
@@ -93,9 +94,11 @@ struct {
int interval;
uint32_t count;
int size;
+ int timeout; /* per-packet recv timeout, ms */
bool timestamp;
bool flood;
bool flood_busy;
+ long duration;
qosspec_t qs;
/* stats */
@@ -175,18 +178,20 @@ int main(int argc,
argc--;
argv++;
- client.s_apn = NULL;
- client.interval = 1000;
- client.size = 64;
- client.count = INT_MAX;
- client.timestamp = false;
- client.flood = false;
+ client.s_apn = NULL;
+ client.interval = 1000;
+ client.size = 64;
+ client.count = INT_MAX;
+ client.timeout = 2000;
+ client.timestamp = false;
+ client.flood = false;
client.flood_busy = false;
- client.qs = qos_raw;
- client.quiet = false;
- server.quiet = false;
- server.poll = false;
- server.busy = false;
+ client.duration = 0;
+ client.qs = qos_raw;
+ client.quiet = false;
+ server.quiet = false;
+ server.poll = false;
+ server.busy = false;
while (argc > 0) {
if ((strcmp(*argv, "-i") == 0 ||
@@ -216,6 +221,12 @@ int main(int argc,
argc > 1) {
client.size = strtol(*(++argv), &rem, 10);
--argc;
+ } else if ((strcmp(*argv, "-W") == 0 ||
+ strcmp(*argv, "--timeout") == 0) &&
+ argc > 1) {
+ client.timeout = strtol(*(++argv), &rem, 10);
+ client.timeout *= time_mul(rem);
+ --argc;
} else if ((strcmp(*argv, "-q") == 0 ||
strcmp(*argv, "--qos") == 0) &&
argc > 1) {
@@ -249,23 +260,25 @@ int main(int argc,
}
if (duration > 0) {
- if (client.interval == 0)
+ if (client.flood || client.flood_busy)
+ client.duration = duration;
+ else if (client.interval == 0)
client.count = duration * 10;
else
client.count = duration / client.interval;
}
if (qos != NULL) {
- if (strcmp(qos, "best") == 0)
- client.qs = qos_best_effort;
- else if (strcmp(qos, "raw") == 0)
+ if (strcmp(qos, "raw") == 0)
client.qs = qos_raw;
- else if (strcmp(qos, "video") == 0)
- client.qs = qos_video;
- else if (strcmp(qos, "voice") == 0)
- client.qs = qos_voice;
- else if (strcmp(qos, "data") == 0)
- client.qs = qos_data;
+ else if (strcmp(qos, "safe") == 0)
+ client.qs = qos_raw_safe;
+ else if (strcmp(qos, "rt") == 0)
+ client.qs = qos_rt;
+ else if (strcmp(qos, "rt-safe") == 0)
+ client.qs = qos_rt_safe;
+ else if (strcmp(qos, "msg") == 0)
+ client.qs = qos_msg;
else
printf("Unknown QoS cube, defaulting to raw.\n");
}
@@ -298,7 +311,7 @@ int main(int argc,
if (ret < 0)
exit(EXIT_FAILURE);
- exit(EXIT_SUCCESS);
+ exit(ret);
fail:
usage();
diff --git a/src/tools/oping/oping_client.c b/src/tools/oping/oping_client.c
index 23807f65..4b01315d 100644
--- a/src/tools/oping/oping_client.c
+++ b/src/tools/oping/oping_client.c
@@ -47,6 +47,7 @@ void shutdown_client(int signo, siginfo_t * info, void * c)
case SIGINT:
case SIGTERM:
case SIGHUP:
+ case SIGALRM:
stop = true;
default:
return;
@@ -89,7 +90,7 @@ static void print_rtt(int len, int seq,
void * reader(void * o)
{
- struct timespec timeout = {client.interval / 1000 + 2, 0};
+ struct timespec timeout;
struct timespec now = {0, 0};
struct timespec sent;
@@ -100,6 +101,9 @@ void * reader(void * o)
double ms = 0;
uint32_t exp_id = 0;
+ timeout.tv_sec = client.timeout / 1000;
+ timeout.tv_nsec = (client.timeout % 1000) * MILLION;
+
fccntl(fd, FLOWSRCVTIMEO, &timeout);
while (!stop && client.rcvd != client.count) {
@@ -284,18 +288,15 @@ static int flood_busy_ping(int fd)
msg->tv_sec = sent.tv_sec;
msg->tv_nsec = sent.tv_nsec;
- if (flow_write(fd, buf,
- client.size) < 0) {
- printf("Failed to send "
- "packet.\n");
+ if (flow_write(fd, buf, client.size) < 0) {
+ printf("Failed to send packet.\n");
break;
}
++client.sent;
do {
- n = flow_read(fd, buf,
- OPING_BUF_SIZE);
+ n = flow_read(fd, buf, OPING_BUF_SIZE);
} while (n == -EAGAIN && !stop);
if (n < 0)
@@ -315,9 +316,7 @@ static int flood_busy_ping(int fd)
update_rtt_stats(ms);
if (!client.quiet)
- print_rtt(client.size,
- ntohl(msg->id), ms,
- NULL);
+ print_rtt(client.size, ntohl(msg->id), ms, NULL);
}
return 0;
@@ -371,9 +370,7 @@ static int flood_ping(int fd)
update_rtt_stats(ms);
if (!client.quiet)
- print_rtt(client.size,
- ntohl(msg->id), ms,
- NULL);
+ print_rtt(client.size, ntohl(msg->id), ms, NULL);
}
return 0;
@@ -404,25 +401,34 @@ static int client_main(void)
if (sigaction(SIGINT, &sig_act, NULL) ||
sigaction(SIGTERM, &sig_act, NULL) ||
sigaction(SIGHUP, &sig_act, NULL) ||
- sigaction(SIGPIPE, &sig_act, NULL)) {
+ sigaction(SIGPIPE, &sig_act, NULL) ||
+ sigaction(SIGALRM, &sig_act, NULL)) {
printf("Failed to install sighandler.\n");
- return -1;
+ return 2;
}
if (client_init()) {
printf("Failed to initialize client.\n");
- return -1;
+ return 2;
}
fd = flow_alloc(client.s_apn, &client.qs, NULL);
if (fd < 0) {
printf("Failed to allocate flow: %d.\n", fd);
client_fini();
- return -1;
+ return 2;
}
fccntl(fd, FLOWSFLAGS, FLOWFRDWR | FLOWFRNOPART);
+ if (client.duration > 0) {
+ struct itimerval it;
+ memset(&it, 0, sizeof(it));
+ it.it_value.tv_sec = client.duration / 1000;
+ it.it_value.tv_usec = (client.duration % 1000) * 1000;
+ setitimer(ITIMER_REAL, &it, NULL);
+ }
+
clock_gettime(CLOCK_REALTIME, &tic);
if (client.flood_busy)
@@ -439,5 +445,5 @@ static int client_main(void)
flow_dealloc(fd);
client_fini();
- return 0;
+ return client.rcvd == client.sent ? 0 : 1;
}
diff --git a/src/tools/oping/oping_server.c b/src/tools/oping/oping_server.c
index 33af28c4..e98ca040 100644
--- a/src/tools/oping/oping_server.c
+++ b/src/tools/oping/oping_server.c
@@ -237,6 +237,14 @@ int server_main(void)
return -1;
}
+ if (pthread_mutex_init(&server.lock, NULL)) {
+ fqueue_destroy(server.fq);
+ fset_destroy(server.flows);
+ return -1;
+ }
+
+ memset(server.times, 0, sizeof(server.times));
+
pthread_create(&server.cleaner_pt, NULL, cleaner_thread, NULL);
if (server.busy) {
@@ -255,11 +263,13 @@ int server_main(void)
pthread_cancel(server.cleaner_pt);
- fset_destroy(server.flows);
- fqueue_destroy(server.fq);
-
+ /* Join cancellable threads before tearing down their fset. */
pthread_join(server.server_pt, NULL);
pthread_join(server.cleaner_pt, NULL);
+ pthread_mutex_destroy(&server.lock);
+ fset_destroy(server.flows);
+ fqueue_destroy(server.fq);
+
return 0;
}